xref: /llvm-project/llvm/test/CodeGen/AMDGPU/flat-scratch.ll (revision 68694259b298614f16f87d83a56be1207f36fa53)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX12 %s
6; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9-PAL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX940 %s
8; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
9; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
10; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX11-PAL %s
11; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX12-PAL %s
12
; Test zero_init_kernel - an amdgpu_kernel entry point that zero-fills a
; 64-byte ([32 x i16], align 2) addrspace(5) alloca with a non-volatile memset.
; For every check prefix below, the expected code materializes zero in v[0:3]
; and writes it with four 16-byte scratch stores at offsets 48, 32, 16 and 0.
; The assertions are autogenerated (see the header of this file); regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.
13define amdgpu_kernel void @zero_init_kernel() {
14; GFX9-LABEL: zero_init_kernel:
15; GFX9:       ; %bb.0:
16; GFX9-NEXT:    s_mov_b32 s0, 0
17; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
18; GFX9-NEXT:    s_mov_b32 s1, s0
19; GFX9-NEXT:    s_mov_b32 s2, s0
20; GFX9-NEXT:    s_mov_b32 s3, s0
21; GFX9-NEXT:    v_mov_b32_e32 v0, s0
22; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
23; GFX9-NEXT:    v_mov_b32_e32 v1, s1
24; GFX9-NEXT:    v_mov_b32_e32 v2, s2
25; GFX9-NEXT:    v_mov_b32_e32 v3, s3
26; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
27; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
28; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
29; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
30; GFX9-NEXT:    s_endpgm
31;
32; GFX10-LABEL: zero_init_kernel:
33; GFX10:       ; %bb.0:
34; GFX10-NEXT:    s_add_u32 s8, s8, s13
35; GFX10-NEXT:    s_addc_u32 s9, s9, 0
36; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
37; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
38; GFX10-NEXT:    s_mov_b32 s0, 0
39; GFX10-NEXT:    s_mov_b32 s1, s0
40; GFX10-NEXT:    s_mov_b32 s2, s0
41; GFX10-NEXT:    s_mov_b32 s3, s0
42; GFX10-NEXT:    v_mov_b32_e32 v0, s0
43; GFX10-NEXT:    v_mov_b32_e32 v1, s1
44; GFX10-NEXT:    v_mov_b32_e32 v2, s2
45; GFX10-NEXT:    v_mov_b32_e32 v3, s3
46; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
47; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
48; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
49; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off
50; GFX10-NEXT:    s_endpgm
51;
52; GFX11-LABEL: zero_init_kernel:
53; GFX11:       ; %bb.0:
54; GFX11-NEXT:    s_mov_b32 s0, 0
55; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
56; GFX11-NEXT:    s_mov_b32 s1, s0
57; GFX11-NEXT:    s_mov_b32 s2, s0
58; GFX11-NEXT:    s_mov_b32 s3, s0
59; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
60; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
61; GFX11-NEXT:    s_clause 0x3
62; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
63; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
64; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
65; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off
66; GFX11-NEXT:    s_endpgm
67;
68; GFX12-LABEL: zero_init_kernel:
69; GFX12:       ; %bb.0:
70; GFX12-NEXT:    s_mov_b32 s0, 0
71; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
72; GFX12-NEXT:    s_mov_b32 s1, s0
73; GFX12-NEXT:    s_mov_b32 s2, s0
74; GFX12-NEXT:    s_mov_b32 s3, s0
75; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
76; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
77; GFX12-NEXT:    s_clause 0x3
78; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
79; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
80; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
81; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off
82; GFX12-NEXT:    s_endpgm
83;
84; GFX9-PAL-LABEL: zero_init_kernel:
85; GFX9-PAL:       ; %bb.0:
86; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
87; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
88; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
89; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
90; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
91; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
92; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
93; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
95; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
96; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
97; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
98; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
99; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
100; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
101; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
102; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
103; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
104; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
105; GFX9-PAL-NEXT:    s_endpgm
106;
107; GFX940-LABEL: zero_init_kernel:
108; GFX940:       ; %bb.0:
109; GFX940-NEXT:    s_mov_b32 s0, 0
110; GFX940-NEXT:    s_mov_b32 s1, s0
111; GFX940-NEXT:    s_mov_b32 s2, s0
112; GFX940-NEXT:    s_mov_b32 s3, s0
113; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
114; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
115; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48 sc0 sc1
116; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32 sc0 sc1
117; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16 sc0 sc1
118; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off sc0 sc1
119; GFX940-NEXT:    s_endpgm
120;
121; GFX1010-PAL-LABEL: zero_init_kernel:
122; GFX1010-PAL:       ; %bb.0:
123; GFX1010-PAL-NEXT:    s_getpc_b64 s[12:13]
124; GFX1010-PAL-NEXT:    s_mov_b32 s12, s0
125; GFX1010-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
126; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
127; GFX1010-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
128; GFX1010-PAL-NEXT:    s_add_u32 s12, s12, s11
129; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
130; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
131; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
132; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
133; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
134; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
135; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
136; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
137; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
138; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
139; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
140; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
141; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
142; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
143; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
144; GFX1010-PAL-NEXT:    s_endpgm
145;
146; GFX1030-PAL-LABEL: zero_init_kernel:
147; GFX1030-PAL:       ; %bb.0:
148; GFX1030-PAL-NEXT:    s_getpc_b64 s[12:13]
149; GFX1030-PAL-NEXT:    s_mov_b32 s12, s0
150; GFX1030-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
151; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
152; GFX1030-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
153; GFX1030-PAL-NEXT:    s_add_u32 s12, s12, s11
154; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
155; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
156; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
157; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
158; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
159; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
160; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
161; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
162; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
163; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
164; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
165; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:48
166; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:32
167; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:16
168; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off
169; GFX1030-PAL-NEXT:    s_endpgm
170;
171; GFX11-PAL-LABEL: zero_init_kernel:
172; GFX11-PAL:       ; %bb.0:
173; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
174; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
175; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
176; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
177; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
178; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
179; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
180; GFX11-PAL-NEXT:    s_clause 0x3
181; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
182; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
183; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
184; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off
185; GFX11-PAL-NEXT:    s_endpgm
186;
187; GFX12-PAL-LABEL: zero_init_kernel:
188; GFX12-PAL:       ; %bb.0:
189; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
190; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
191; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
192; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
193; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
194; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
195; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
196; GFX12-PAL-NEXT:    s_clause 0x3
197; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:48
198; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:32
199; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16
200; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off
201; GFX12-PAL-NEXT:    s_endpgm
; 32 x i16 = 64 bytes; the memset length below covers the whole object.
202  %alloca = alloca [32 x i16], align 2, addrspace(5)
; Fill value 0, length 64, last operand (i1 false) = not volatile.
203  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
204  ret void
205}
206
; Test zero_init_foo - the same 64-byte zeroing memset of an addrspace(5)
; alloca as in zero_init_kernel, but in a function with the default calling
; convention instead of a kernel. The expected code for every check prefix
; addresses the four 16-byte scratch stores relative to s32 and returns with
; s_setpc_b64 s[30:31] rather than ending the program.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
207define void @zero_init_foo() {
208; GFX9-LABEL: zero_init_foo:
209; GFX9:       ; %bb.0:
210; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
211; GFX9-NEXT:    s_mov_b32 s0, 0
212; GFX9-NEXT:    s_mov_b32 s1, s0
213; GFX9-NEXT:    s_mov_b32 s2, s0
214; GFX9-NEXT:    s_mov_b32 s3, s0
215; GFX9-NEXT:    v_mov_b32_e32 v0, s0
216; GFX9-NEXT:    v_mov_b32_e32 v1, s1
217; GFX9-NEXT:    v_mov_b32_e32 v2, s2
218; GFX9-NEXT:    v_mov_b32_e32 v3, s3
219; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
220; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
221; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
222; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
223; GFX9-NEXT:    s_waitcnt vmcnt(0)
224; GFX9-NEXT:    s_setpc_b64 s[30:31]
225;
226; GFX10-LABEL: zero_init_foo:
227; GFX10:       ; %bb.0:
228; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229; GFX10-NEXT:    s_mov_b32 s0, 0
230; GFX10-NEXT:    s_mov_b32 s1, s0
231; GFX10-NEXT:    s_mov_b32 s2, s0
232; GFX10-NEXT:    s_mov_b32 s3, s0
233; GFX10-NEXT:    v_mov_b32_e32 v0, s0
234; GFX10-NEXT:    v_mov_b32_e32 v1, s1
235; GFX10-NEXT:    v_mov_b32_e32 v2, s2
236; GFX10-NEXT:    v_mov_b32_e32 v3, s3
237; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
238; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
239; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
240; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
241; GFX10-NEXT:    s_setpc_b64 s[30:31]
242;
243; GFX11-LABEL: zero_init_foo:
244; GFX11:       ; %bb.0:
245; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
246; GFX11-NEXT:    s_mov_b32 s0, 0
247; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
248; GFX11-NEXT:    s_mov_b32 s1, s0
249; GFX11-NEXT:    s_mov_b32 s2, s0
250; GFX11-NEXT:    s_mov_b32 s3, s0
251; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
252; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
253; GFX11-NEXT:    s_clause 0x3
254; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
255; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
256; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
257; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
258; GFX11-NEXT:    s_setpc_b64 s[30:31]
259;
260; GFX12-LABEL: zero_init_foo:
261; GFX12:       ; %bb.0:
262; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
263; GFX12-NEXT:    s_wait_expcnt 0x0
264; GFX12-NEXT:    s_wait_samplecnt 0x0
265; GFX12-NEXT:    s_wait_bvhcnt 0x0
266; GFX12-NEXT:    s_wait_kmcnt 0x0
267; GFX12-NEXT:    s_mov_b32 s0, 0
268; GFX12-NEXT:    s_wait_alu 0xfffe
269; GFX12-NEXT:    s_mov_b32 s1, s0
270; GFX12-NEXT:    s_mov_b32 s2, s0
271; GFX12-NEXT:    s_mov_b32 s3, s0
272; GFX12-NEXT:    s_wait_alu 0xfffe
273; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
274; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
275; GFX12-NEXT:    s_clause 0x3
276; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
277; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
278; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
279; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32
280; GFX12-NEXT:    s_setpc_b64 s[30:31]
281;
282; GFX9-PAL-LABEL: zero_init_foo:
283; GFX9-PAL:       ; %bb.0:
284; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
285; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
286; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
287; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
288; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
289; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
290; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
291; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
292; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
293; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
294; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
295; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
296; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
297; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
298; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
299;
300; GFX940-LABEL: zero_init_foo:
301; GFX940:       ; %bb.0:
302; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
303; GFX940-NEXT:    s_mov_b32 s0, 0
304; GFX940-NEXT:    s_mov_b32 s1, s0
305; GFX940-NEXT:    s_mov_b32 s2, s0
306; GFX940-NEXT:    s_mov_b32 s3, s0
307; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
308; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
309; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48 sc0 sc1
310; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32 sc0 sc1
311; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16 sc0 sc1
312; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 sc0 sc1
313; GFX940-NEXT:    s_waitcnt vmcnt(0)
314; GFX940-NEXT:    s_setpc_b64 s[30:31]
315;
316; GFX10-PAL-LABEL: zero_init_foo:
317; GFX10-PAL:       ; %bb.0:
318; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
319; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
320; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
321; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
322; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
323; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
324; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
325; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
326; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
327; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
328; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
329; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
330; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
331; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
332;
333; GFX11-PAL-LABEL: zero_init_foo:
334; GFX11-PAL:       ; %bb.0:
335; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
337; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
338; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
339; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
340; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
341; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
342; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
343; GFX11-PAL-NEXT:    s_clause 0x3
344; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
345; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
346; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
347; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
348; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
349;
350; GFX12-PAL-LABEL: zero_init_foo:
351; GFX12-PAL:       ; %bb.0:
352; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
353; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
354; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
355; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
356; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
357; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
358; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
359; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
360; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
361; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
362; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
363; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
364; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
365; GFX12-PAL-NEXT:    s_clause 0x3
366; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:48
367; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
368; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
369; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
370; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
; 32 x i16 = 64 bytes; the memset length below covers the whole object.
371  %alloca = alloca [32 x i16], align 2, addrspace(5)
; Fill value 0, length 64, last operand (i1 false) = not volatile.
372  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
373  ret void
374}
375
; Test store_load_sindex_kernel - kernel taking an i32 argument %idx. It does
; a volatile store of the constant 15 to element %idx of a [32 x float]
; addrspace(5) alloca, then a volatile load from element (%idx & 15) of the
; same alloca. In the expected code for every check prefix the index is loaded
; into an SGPR, both byte offsets are formed with scalar shifts by 2, and the
; scratch store/load take their address from an SGPR with the VGPR address
; operand "off".
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
376define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
377; GFX9-LABEL: store_load_sindex_kernel:
378; GFX9:       ; %bb.0: ; %bb
379; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x24
380; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
381; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
382; GFX9-NEXT:    v_mov_b32_e32 v0, 15
383; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
384; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
385; GFX9-NEXT:    s_and_b32 s0, s0, 15
386; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
387; GFX9-NEXT:    scratch_store_dword off, v0, s1
388; GFX9-NEXT:    s_waitcnt vmcnt(0)
389; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
390; GFX9-NEXT:    s_waitcnt vmcnt(0)
391; GFX9-NEXT:    s_endpgm
392;
393; GFX10-LABEL: store_load_sindex_kernel:
394; GFX10:       ; %bb.0: ; %bb
395; GFX10-NEXT:    s_add_u32 s8, s8, s13
396; GFX10-NEXT:    s_addc_u32 s9, s9, 0
397; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
398; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
399; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x24
400; GFX10-NEXT:    v_mov_b32_e32 v0, 15
401; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
402; GFX10-NEXT:    s_and_b32 s1, s0, 15
403; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
404; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
405; GFX10-NEXT:    scratch_store_dword off, v0, s0
406; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
407; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
408; GFX10-NEXT:    s_waitcnt vmcnt(0)
409; GFX10-NEXT:    s_endpgm
410;
411; GFX11-LABEL: store_load_sindex_kernel:
412; GFX11:       ; %bb.0: ; %bb
413; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
414; GFX11-NEXT:    v_mov_b32_e32 v0, 15
415; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
416; GFX11-NEXT:    s_and_b32 s1, s0, 15
417; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
418; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
419; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
420; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
421; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
422; GFX11-NEXT:    s_waitcnt vmcnt(0)
423; GFX11-NEXT:    s_endpgm
424;
425; GFX12-LABEL: store_load_sindex_kernel:
426; GFX12:       ; %bb.0: ; %bb
427; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x24
428; GFX12-NEXT:    v_mov_b32_e32 v0, 15
429; GFX12-NEXT:    s_wait_kmcnt 0x0
430; GFX12-NEXT:    s_and_b32 s1, s0, 15
431; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
432; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
433; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
434; GFX12-NEXT:    s_wait_storecnt 0x0
435; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
436; GFX12-NEXT:    s_wait_loadcnt 0x0
437; GFX12-NEXT:    s_endpgm
438;
439; GFX9-PAL-LABEL: store_load_sindex_kernel:
440; GFX9-PAL:       ; %bb.0: ; %bb
441; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
442; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
443; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
444; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
445; GFX9-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
446; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
447; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
448; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
449; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
450; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
451; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
452; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
453; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
454; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
455; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
456; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
457; GFX9-PAL-NEXT:    s_endpgm
458;
459; GFX940-LABEL: store_load_sindex_kernel:
460; GFX940:       ; %bb.0: ; %bb
461; GFX940-NEXT:    s_load_dword s0, s[4:5], 0x24
462; GFX940-NEXT:    v_mov_b32_e32 v0, 15
463; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
464; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
465; GFX940-NEXT:    s_and_b32 s0, s0, 15
466; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
467; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
468; GFX940-NEXT:    s_waitcnt vmcnt(0)
469; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
470; GFX940-NEXT:    s_waitcnt vmcnt(0)
471; GFX940-NEXT:    s_endpgm
472;
473; GFX10-PAL-LABEL: store_load_sindex_kernel:
474; GFX10-PAL:       ; %bb.0: ; %bb
475; GFX10-PAL-NEXT:    s_getpc_b64 s[12:13]
476; GFX10-PAL-NEXT:    s_mov_b32 s12, s0
477; GFX10-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
478; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
479; GFX10-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
480; GFX10-PAL-NEXT:    s_add_u32 s12, s12, s11
481; GFX10-PAL-NEXT:    s_addc_u32 s13, s13, 0
482; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
483; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
484; GFX10-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
485; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
486; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
487; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
488; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
489; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
490; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
491; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
492; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
493; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
494; GFX10-PAL-NEXT:    s_endpgm
495;
496; GFX11-PAL-LABEL: store_load_sindex_kernel:
497; GFX11-PAL:       ; %bb.0: ; %bb
498; GFX11-PAL-NEXT:    s_load_b32 s0, s[4:5], 0x0
499; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
500; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
501; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
502; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
503; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
504; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
505; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
506; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
507; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
508; GFX11-PAL-NEXT:    s_endpgm
509;
510; GFX12-PAL-LABEL: store_load_sindex_kernel:
511; GFX12-PAL:       ; %bb.0: ; %bb
512; GFX12-PAL-NEXT:    s_load_b32 s0, s[4:5], 0x0
513; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
514; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
515; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
516; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
517; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
518; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
519; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
520; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
521; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
522; GFX12-PAL-NEXT:    s_endpgm
523bb:
524  %i = alloca [32 x float], align 4, addrspace(5)
; Store address: element %idx of the alloca (index used as passed in).
525  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
526  store volatile i32 15, ptr addrspace(5) %i7, align 4
; Load address: element (%idx & 15), i.e. an index in the range 0..15.
527  %i9 = and i32 %idx, 15
528  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
; The loaded value is unused; the load is volatile.
529  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
530  ret void
531}
532
; Test store_load_sindex_foo - same store/load body as
; store_load_sindex_kernel, but declared with the amdgpu_ps calling convention
; and taking %idx as an "inreg" argument. The expected code for every check
; prefix has no kernel-argument load: the index is read directly from an SGPR
; (s2 or s0 depending on the configuration), shifted left by 2, and used as
; the scalar address of the scratch store/load.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
533define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
534; GFX9-LABEL: store_load_sindex_foo:
535; GFX9:       ; %bb.0: ; %bb
536; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
537; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
538; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
539; GFX9-NEXT:    v_mov_b32_e32 v0, 15
540; GFX9-NEXT:    scratch_store_dword off, v0, s0
541; GFX9-NEXT:    s_waitcnt vmcnt(0)
542; GFX9-NEXT:    s_and_b32 s0, s2, 15
543; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
544; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
545; GFX9-NEXT:    s_waitcnt vmcnt(0)
546; GFX9-NEXT:    s_endpgm
547;
548; GFX10-LABEL: store_load_sindex_foo:
549; GFX10:       ; %bb.0: ; %bb
550; GFX10-NEXT:    s_add_u32 s0, s0, s3
551; GFX10-NEXT:    s_addc_u32 s1, s1, 0
552; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
553; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
554; GFX10-NEXT:    v_mov_b32_e32 v0, 15
555; GFX10-NEXT:    s_and_b32 s0, s2, 15
556; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
557; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
558; GFX10-NEXT:    scratch_store_dword off, v0, s1
559; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
560; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
561; GFX10-NEXT:    s_waitcnt vmcnt(0)
562; GFX10-NEXT:    s_endpgm
563;
564; GFX11-LABEL: store_load_sindex_foo:
565; GFX11:       ; %bb.0: ; %bb
566; GFX11-NEXT:    v_mov_b32_e32 v0, 15
567; GFX11-NEXT:    s_and_b32 s1, s0, 15
568; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
569; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
570; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
571; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
572; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
573; GFX11-NEXT:    s_waitcnt vmcnt(0)
574; GFX11-NEXT:    s_endpgm
575;
576; GFX12-LABEL: store_load_sindex_foo:
577; GFX12:       ; %bb.0: ; %bb
578; GFX12-NEXT:    v_mov_b32_e32 v0, 15
579; GFX12-NEXT:    s_and_b32 s1, s0, 15
580; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
581; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
582; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
583; GFX12-NEXT:    s_wait_storecnt 0x0
584; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
585; GFX12-NEXT:    s_wait_loadcnt 0x0
586; GFX12-NEXT:    s_endpgm
587;
588; GFX9-PAL-LABEL: store_load_sindex_foo:
589; GFX9-PAL:       ; %bb.0: ; %bb
590; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
591; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
592; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
593; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
594; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
595; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
596; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
597; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
598; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
599; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
600; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
601; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
602; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
603; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
604; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
605; GFX9-PAL-NEXT:    s_endpgm
606;
607; GFX940-LABEL: store_load_sindex_foo:
608; GFX940:       ; %bb.0: ; %bb
609; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
610; GFX940-NEXT:    v_mov_b32_e32 v0, 15
611; GFX940-NEXT:    s_and_b32 s0, s0, 15
612; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
613; GFX940-NEXT:    s_waitcnt vmcnt(0)
614; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
615; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
616; GFX940-NEXT:    s_waitcnt vmcnt(0)
617; GFX940-NEXT:    s_endpgm
618;
619; GFX10-PAL-LABEL: store_load_sindex_foo:
620; GFX10-PAL:       ; %bb.0: ; %bb
621; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
622; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
623; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
624; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
625; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
626; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s1
627; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
628; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
629; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
630; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 15
631; GFX10-PAL-NEXT:    s_and_b32 s1, s0, 15
632; GFX10-PAL-NEXT:    s_lshl_b32 s0, s0, 2
633; GFX10-PAL-NEXT:    s_lshl_b32 s1, s1, 2
634; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s0
635; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
636; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
637; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
638; GFX10-PAL-NEXT:    s_endpgm
639;
640; GFX11-PAL-LABEL: store_load_sindex_foo:
641; GFX11-PAL:       ; %bb.0: ; %bb
642; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
643; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
644; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
645; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
646; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
647; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
648; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
649; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
650; GFX11-PAL-NEXT:    s_endpgm
651;
652; GFX12-PAL-LABEL: store_load_sindex_foo:
653; GFX12-PAL:       ; %bb.0: ; %bb
654; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
655; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
656; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
657; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
658; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
659; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
660; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
661; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
662; GFX12-PAL-NEXT:    s_endpgm
663bb:
664  %i = alloca [32 x float], align 4, addrspace(5)
; Store address: element %idx of the alloca (index used as passed in).
665  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
666  store volatile i32 15, ptr addrspace(5) %i7, align 4
; Load address: element (%idx & 15), i.e. an index in the range 0..15.
667  %i9 = and i32 %idx, 15
668  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
; The loaded value is unused; the load is volatile.
669  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
670  ret void
671}
672
; Kernel test: volatile store of the constant 15 into element [workitem.id.x]
; of a 32-element private (addrspace(5)) alloca, followed by a volatile load
; from element [31 - workitem.id.x] of the same alloca.
; The per-target expected ISA below is autogenerated (see the first line of
; this file); regenerate it with utils/update_llc_test_checks.py instead of
; editing the assertion lines by hand.
673define amdgpu_kernel void @store_load_vindex_kernel() {
674; GFX9-LABEL: store_load_vindex_kernel:
675; GFX9:       ; %bb.0: ; %bb
676; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
677; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
678; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
679; GFX9-NEXT:    v_mov_b32_e32 v1, v0
680; GFX9-NEXT:    v_mov_b32_e32 v2, 15
681; GFX9-NEXT:    scratch_store_dword v1, v2, off
682; GFX9-NEXT:    s_waitcnt vmcnt(0)
683; GFX9-NEXT:    v_sub_u32_e32 v0, 0, v0
684; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
685; GFX9-NEXT:    s_waitcnt vmcnt(0)
686; GFX9-NEXT:    s_endpgm
687;
688; GFX10-LABEL: store_load_vindex_kernel:
689; GFX10:       ; %bb.0: ; %bb
690; GFX10-NEXT:    s_add_u32 s8, s8, s13
691; GFX10-NEXT:    s_addc_u32 s9, s9, 0
692; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
693; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
694; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
695; GFX10-NEXT:    v_mov_b32_e32 v2, 15
696; GFX10-NEXT:    v_mov_b32_e32 v1, v0
697; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
698; GFX10-NEXT:    scratch_store_dword v1, v2, off
699; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
700; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
701; GFX10-NEXT:    s_waitcnt vmcnt(0)
702; GFX10-NEXT:    s_endpgm
703;
704; GFX11-LABEL: store_load_vindex_kernel:
705; GFX11:       ; %bb.0: ; %bb
706; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
707; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
708; GFX11-NEXT:    v_and_b32_e32 v0, 0xffc, v0
709; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
710; GFX11-NEXT:    scratch_store_b32 v0, v1, off dlc
711; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
712; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
713; GFX11-NEXT:    s_waitcnt vmcnt(0)
714; GFX11-NEXT:    s_endpgm
715;
716; GFX12-LABEL: store_load_vindex_kernel:
717; GFX12:       ; %bb.0: ; %bb
718; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
719; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
720; GFX12-NEXT:    v_and_b32_e32 v0, 0xffc, v0
721; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
722; GFX12-NEXT:    scratch_store_b32 v0, v1, off scope:SCOPE_SYS
723; GFX12-NEXT:    s_wait_storecnt 0x0
724; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
725; GFX12-NEXT:    s_wait_loadcnt 0x0
726; GFX12-NEXT:    s_endpgm
727;
728; GFX9-PAL-LABEL: store_load_vindex_kernel:
729; GFX9-PAL:       ; %bb.0: ; %bb
730; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
731; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
732; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
733; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
734; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
735; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
736; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0, v0
737; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
738; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
739; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
740; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
741; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
742; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
743; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
744; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
745; GFX9-PAL-NEXT:    s_endpgm
746;
747; GFX940-LABEL: store_load_vindex_kernel:
748; GFX940:       ; %bb.0: ; %bb
749; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
750; GFX940-NEXT:    v_and_b32_e32 v0, 0xffc, v0
751; GFX940-NEXT:    v_mov_b32_e32 v1, 15
752; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
753; GFX940-NEXT:    s_waitcnt vmcnt(0)
754; GFX940-NEXT:    v_sub_u32_e32 v0, 0, v0
755; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
756; GFX940-NEXT:    s_waitcnt vmcnt(0)
757; GFX940-NEXT:    s_endpgm
758;
759; GFX10-PAL-LABEL: store_load_vindex_kernel:
760; GFX10-PAL:       ; %bb.0: ; %bb
761; GFX10-PAL-NEXT:    s_getpc_b64 s[12:13]
762; GFX10-PAL-NEXT:    s_mov_b32 s12, s0
763; GFX10-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
764; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX10-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
766; GFX10-PAL-NEXT:    s_add_u32 s12, s12, s11
767; GFX10-PAL-NEXT:    s_addc_u32 s13, s13, 0
768; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
769; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
770; GFX10-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
771; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
772; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, v0
773; GFX10-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0, v0
774; GFX10-PAL-NEXT:    scratch_store_dword v1, v2, off
775; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
776; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
777; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
778; GFX10-PAL-NEXT:    s_endpgm
779;
780; GFX11-PAL-LABEL: store_load_vindex_kernel:
781; GFX11-PAL:       ; %bb.0: ; %bb
782; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
783; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
784; GFX11-PAL-NEXT:    v_and_b32_e32 v0, 0xffc, v0
785; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
786; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off dlc
787; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
788; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
789; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
790; GFX11-PAL-NEXT:    s_endpgm
791;
792; GFX12-PAL-LABEL: store_load_vindex_kernel:
793; GFX12-PAL:       ; %bb.0: ; %bb
794; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
795; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
796; GFX12-PAL-NEXT:    v_and_b32_e32 v0, 0xffc, v0
797; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0, v0
798; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off scope:SCOPE_SYS
799; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
800; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
801; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
802; GFX12-PAL-NEXT:    s_endpgm
803bb:
; 32 x float = 128 bytes of private (addrspace(5)) storage.
804  %i = alloca [32 x float], align 4, addrspace(5)
805  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
; %i3 is not referenced by any later instruction in this function.
806  %i3 = zext i32 %i2 to i64
; Store target: element [tid] of the alloca.
807  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
808  store volatile i32 15, ptr addrspace(5) %i7, align 4
; Load target: element [31 - tid], i.e. byte offset 124 - 4*tid; this is the
; "offset:124" plus negated, shifted index seen in the expected loads above.
809  %i9 = sub nsw i32 31, %i2
810  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
; The loaded value is unused; the load is volatile.
811  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
812  ret void
813}
814
; Non-kernel function test: volatile store of the constant 15 into element
; [%idx] of a 32-element private (addrspace(5)) alloca, followed by a volatile
; load from element [%idx & 15] of the same alloca.
; The expected ISA below forms both addresses from s32 and ends in
; s_setpc_b64 s[30:31] rather than s_endpgm.
; The per-target expected ISA is autogenerated (see the first line of this
; file); regenerate it with utils/update_llc_test_checks.py instead of editing
; the assertion lines by hand.
815define void @store_load_vindex_foo(i32 %idx) {
816; GFX9-LABEL: store_load_vindex_foo:
817; GFX9:       ; %bb.0: ; %bb
818; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
819; GFX9-NEXT:    v_mov_b32_e32 v1, s32
820; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
821; GFX9-NEXT:    v_mov_b32_e32 v3, 15
822; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
823; GFX9-NEXT:    scratch_store_dword v2, v3, off
824; GFX9-NEXT:    s_waitcnt vmcnt(0)
825; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
826; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
827; GFX9-NEXT:    s_waitcnt vmcnt(0)
828; GFX9-NEXT:    s_setpc_b64 s[30:31]
829;
830; GFX10-LABEL: store_load_vindex_foo:
831; GFX10:       ; %bb.0: ; %bb
832; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
833; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
834; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
835; GFX10-NEXT:    v_mov_b32_e32 v2, 15
836; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
837; GFX10-NEXT:    scratch_store_dword v0, v2, off
838; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
839; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
840; GFX10-NEXT:    s_waitcnt vmcnt(0)
841; GFX10-NEXT:    s_setpc_b64 s[30:31]
842;
843; GFX11-LABEL: store_load_vindex_foo:
844; GFX11:       ; %bb.0: ; %bb
845; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
846; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
847; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
848; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
849; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
850; GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
851; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
852; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
853; GFX11-NEXT:    s_waitcnt vmcnt(0)
854; GFX11-NEXT:    s_setpc_b64 s[30:31]
855;
856; GFX12-LABEL: store_load_vindex_foo:
857; GFX12:       ; %bb.0: ; %bb
858; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
859; GFX12-NEXT:    s_wait_expcnt 0x0
860; GFX12-NEXT:    s_wait_samplecnt 0x0
861; GFX12-NEXT:    s_wait_bvhcnt 0x0
862; GFX12-NEXT:    s_wait_kmcnt 0x0
863; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
864; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
865; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
866; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
867; GFX12-NEXT:    s_wait_storecnt 0x0
868; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
869; GFX12-NEXT:    s_wait_storecnt 0x0
870; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
871; GFX12-NEXT:    s_wait_loadcnt 0x0
872; GFX12-NEXT:    s_setpc_b64 s[30:31]
873;
874; GFX9-PAL-LABEL: store_load_vindex_foo:
875; GFX9-PAL:       ; %bb.0: ; %bb
876; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
877; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s32
878; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
879; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
880; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
881; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
882; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
883; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
884; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
885; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
886; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
887;
888; GFX940-LABEL: store_load_vindex_foo:
889; GFX940:       ; %bb.0: ; %bb
890; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
891; GFX940-NEXT:    v_mov_b32_e32 v1, s32
892; GFX940-NEXT:    v_lshl_add_u32 v1, v0, 2, v1
893; GFX940-NEXT:    v_mov_b32_e32 v2, 15
894; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
895; GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
896; GFX940-NEXT:    s_waitcnt vmcnt(0)
897; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
898; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
899; GFX940-NEXT:    s_waitcnt vmcnt(0)
900; GFX940-NEXT:    s_setpc_b64 s[30:31]
901;
902; GFX10-PAL-LABEL: store_load_vindex_foo:
903; GFX10-PAL:       ; %bb.0: ; %bb
904; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
905; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
906; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
907; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
908; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s32
909; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
910; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
911; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
912; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
913; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
914;
915; GFX11-PAL-LABEL: store_load_vindex_foo:
916; GFX11-PAL:       ; %bb.0: ; %bb
917; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
918; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
919; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s32
920; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
921; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
922; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off dlc
923; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
924; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
925; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
926; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
927;
928; GFX12-PAL-LABEL: store_load_vindex_foo:
929; GFX12-PAL:       ; %bb.0: ; %bb
930; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
931; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
932; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
933; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
934; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
935; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
936; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
937; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
938; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
939; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
940; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
941; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
942; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
943; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
944; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
945bb:
; 32 x float = 128 bytes of private (addrspace(5)) storage.
946  %i = alloca [32 x float], align 4, addrspace(5)
; Store target: element [%idx]; the index is used unmasked.
947  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
948  store volatile i32 15, ptr addrspace(5) %i7, align 4
; Load target: element [%idx & 15], so the load index is limited to 0..15.
949  %i9 = and i32 %idx, 15
950  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
; The loaded value is unused; the load is volatile.
951  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
952  ret void
953}
954
; Non-kernel function test: store the float constant 10.0 through a private
; (addrspace(5)) pointer argument at element index 1.
; In the expected ISA below the element-1 displacement appears as "offset:4"
; (one 4-byte float) and the stored value as 0x41200000, the IEEE-754
; single-precision encoding of 10.0.
; The per-target expected ISA is autogenerated (see the first line of this
; file); regenerate it with utils/update_llc_test_checks.py instead of editing
; the assertion lines by hand.
955define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
956; GFX9-LABEL: private_ptr_foo:
957; GFX9:       ; %bb.0:
958; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
959; GFX9-NEXT:    v_mov_b32_e32 v1, 0x41200000
960; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:4
961; GFX9-NEXT:    s_waitcnt vmcnt(0)
962; GFX9-NEXT:    s_setpc_b64 s[30:31]
963;
964; GFX10-LABEL: private_ptr_foo:
965; GFX10:       ; %bb.0:
966; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
967; GFX10-NEXT:    v_mov_b32_e32 v1, 0x41200000
968; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:4
969; GFX10-NEXT:    s_setpc_b64 s[30:31]
970;
971; GFX11-LABEL: private_ptr_foo:
972; GFX11:       ; %bb.0:
973; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
974; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41200000
975; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:4
976; GFX11-NEXT:    s_setpc_b64 s[30:31]
977;
978; GFX12-LABEL: private_ptr_foo:
979; GFX12:       ; %bb.0:
980; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
981; GFX12-NEXT:    s_wait_expcnt 0x0
982; GFX12-NEXT:    s_wait_samplecnt 0x0
983; GFX12-NEXT:    s_wait_bvhcnt 0x0
984; GFX12-NEXT:    s_wait_kmcnt 0x0
985; GFX12-NEXT:    v_mov_b32_e32 v1, 0x41200000
986; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:4
987; GFX12-NEXT:    s_setpc_b64 s[30:31]
988;
989; GFX9-PAL-LABEL: private_ptr_foo:
990; GFX9-PAL:       ; %bb.0:
991; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
992; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
993; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
994; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
995; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
996;
997; GFX940-LABEL: private_ptr_foo:
998; GFX940:       ; %bb.0:
999; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1000; GFX940-NEXT:    v_mov_b32_e32 v1, 0x41200000
1001; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
1002; GFX940-NEXT:    s_waitcnt vmcnt(0)
1003; GFX940-NEXT:    s_setpc_b64 s[30:31]
1004;
1005; GFX10-PAL-LABEL: private_ptr_foo:
1006; GFX10-PAL:       ; %bb.0:
1007; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1008; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
1009; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:4
1010; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1011;
1012; GFX11-PAL-LABEL: private_ptr_foo:
1013; GFX11-PAL:       ; %bb.0:
1014; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1015; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
1016; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
1017; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
1018;
1019; GFX12-PAL-LABEL: private_ptr_foo:
1020; GFX12-PAL:       ; %bb.0:
1021; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
1022; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
1023; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
1024; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
1025; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
1026; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 0x41200000
1027; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
1028; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
; Address of element 1 (4 bytes past %arg).
1029  %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
; Plain (non-volatile) store of 10.0.
1030  store float 1.000000e+01, ptr addrspace(5) %gep, align 4
1031  ret void
1032}
1033
; Kernel test: zero-initialise a 64-byte private (addrspace(5)) alloca that is
; declared after a 256-byte padding alloca ([64 x i32]).
; In the expected ISA below the memset appears as four 16-byte stores at
; offsets 256, 272, 288 and 304, so the first store offset equals the size of
; the padding alloca. A volatile load from the padding precedes the stores.
; The per-target expected ISA is autogenerated (see the first line of this
; file); regenerate it with utils/update_llc_test_checks.py instead of editing
; the assertion lines by hand.
1034define amdgpu_kernel void @zero_init_small_offset_kernel() {
1035; GFX9-LABEL: zero_init_small_offset_kernel:
1036; GFX9:       ; %bb.0:
1037; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
1038; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
1039; GFX9-NEXT:    s_mov_b32 s0, 0
1040; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1041; GFX9-NEXT:    s_waitcnt vmcnt(0)
1042; GFX9-NEXT:    s_mov_b32 s1, s0
1043; GFX9-NEXT:    s_mov_b32 s2, s0
1044; GFX9-NEXT:    s_mov_b32 s3, s0
1045; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1046; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1047; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1048; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1049; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:256
1050; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:272
1051; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:288
1052; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:304
1053; GFX9-NEXT:    s_endpgm
1054;
1055; GFX10-LABEL: zero_init_small_offset_kernel:
1056; GFX10:       ; %bb.0:
1057; GFX10-NEXT:    s_add_u32 s8, s8, s13
1058; GFX10-NEXT:    s_addc_u32 s9, s9, 0
1059; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
1060; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
1061; GFX10-NEXT:    scratch_load_dword v0, off, off glc dlc
1062; GFX10-NEXT:    s_waitcnt vmcnt(0)
1063; GFX10-NEXT:    s_mov_b32 s0, 0
1064; GFX10-NEXT:    s_mov_b32 s1, s0
1065; GFX10-NEXT:    s_mov_b32 s2, s0
1066; GFX10-NEXT:    s_mov_b32 s3, s0
1067; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1068; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1069; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1070; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1071; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:256
1072; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
1073; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
1074; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
1075; GFX10-NEXT:    s_endpgm
1076;
1077; GFX11-LABEL: zero_init_small_offset_kernel:
1078; GFX11:       ; %bb.0:
1079; GFX11-NEXT:    scratch_load_b32 v0, off, off glc dlc
1080; GFX11-NEXT:    s_waitcnt vmcnt(0)
1081; GFX11-NEXT:    s_mov_b32 s0, 0
1082; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1083; GFX11-NEXT:    s_mov_b32 s1, s0
1084; GFX11-NEXT:    s_mov_b32 s2, s0
1085; GFX11-NEXT:    s_mov_b32 s3, s0
1086; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1087; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1088; GFX11-NEXT:    s_clause 0x3
1089; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:256
1090; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
1091; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
1092; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
1093; GFX11-NEXT:    s_endpgm
1094;
1095; GFX12-LABEL: zero_init_small_offset_kernel:
1096; GFX12:       ; %bb.0:
1097; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
1098; GFX12-NEXT:    s_wait_loadcnt 0x0
1099; GFX12-NEXT:    s_mov_b32 s0, 0
1100; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1101; GFX12-NEXT:    s_mov_b32 s1, s0
1102; GFX12-NEXT:    s_mov_b32 s2, s0
1103; GFX12-NEXT:    s_mov_b32 s3, s0
1104; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1105; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1106; GFX12-NEXT:    s_clause 0x3
1107; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:256
1108; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
1109; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
1110; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
1111; GFX12-NEXT:    s_endpgm
1112;
1113; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
1114; GFX9-PAL:       ; %bb.0:
1115; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
1116; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
1117; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
1118; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1119; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1120; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1121; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1122; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1123; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
1124; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
1125; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
1126; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1127; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1128; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1129; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1130; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1131; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1132; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:256
1133; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:272
1134; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:288
1135; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:304
1136; GFX9-PAL-NEXT:    s_endpgm
1137;
1138; GFX940-LABEL: zero_init_small_offset_kernel:
1139; GFX940:       ; %bb.0:
1140; GFX940-NEXT:    scratch_load_dword v0, off, off sc0 sc1
1141; GFX940-NEXT:    s_waitcnt vmcnt(0)
1142; GFX940-NEXT:    s_mov_b32 s0, 0
1143; GFX940-NEXT:    s_mov_b32 s1, s0
1144; GFX940-NEXT:    s_mov_b32 s2, s0
1145; GFX940-NEXT:    s_mov_b32 s3, s0
1146; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1147; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
1148; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:256 sc0 sc1
1149; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272 sc0 sc1
1150; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288 sc0 sc1
1151; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304 sc0 sc1
1152; GFX940-NEXT:    s_endpgm
1153;
1154; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
1155; GFX1010-PAL:       ; %bb.0:
1156; GFX1010-PAL-NEXT:    s_getpc_b64 s[12:13]
1157; GFX1010-PAL-NEXT:    s_mov_b32 s12, s0
1158; GFX1010-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
1159; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX1010-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
1161; GFX1010-PAL-NEXT:    s_add_u32 s12, s12, s11
1162; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
1163; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
1164; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
1165; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
1166; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 glc dlc
1167; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1168; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
1169; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1170; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
1171; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
1172; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
1173; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
1174; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
1175; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:256
1176; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:272
1177; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:288
1178; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:304
1179; GFX1010-PAL-NEXT:    s_endpgm
1180;
1181; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
1182; GFX1030-PAL:       ; %bb.0:
1183; GFX1030-PAL-NEXT:    s_getpc_b64 s[12:13]
1184; GFX1030-PAL-NEXT:    s_mov_b32 s12, s0
1185; GFX1030-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
1186; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX1030-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
1188; GFX1030-PAL-NEXT:    s_add_u32 s12, s12, s11
1189; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
1190; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
1191; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
1192; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off glc dlc
1193; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1194; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
1195; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
1196; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1197; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
1198; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
1199; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
1200; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
1201; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
1202; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:256
1203; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:272
1204; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:288
1205; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:304
1206; GFX1030-PAL-NEXT:    s_endpgm
1207;
1208; GFX11-PAL-LABEL: zero_init_small_offset_kernel:
1209; GFX11-PAL:       ; %bb.0:
1210; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off glc dlc
1211; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1212; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
1213; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1214; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
1215; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
1216; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
1217; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1218; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1219; GFX11-PAL-NEXT:    s_clause 0x3
1220; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:256
1221; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
1222; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
1223; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
1224; GFX11-PAL-NEXT:    s_endpgm
1225;
1226; GFX12-PAL-LABEL: zero_init_small_offset_kernel:
1227; GFX12-PAL:       ; %bb.0:
1228; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
1229; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
1230; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
1231; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1232; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
1233; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
1234; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
1235; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1236; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1237; GFX12-PAL-NEXT:    s_clause 0x3
1238; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:256
1239; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:272
1240; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:288
1241; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:304
1242; GFX12-PAL-NEXT:    s_endpgm
; 64 x i32 = 256 bytes of padding declared ahead of the buffer under test.
1243  %padding = alloca [64 x i32], align 4, addrspace(5)
; 32 x i16 = 64 bytes; this is the buffer that gets zeroed.
1244  %alloca = alloca [32 x i16], align 2, addrspace(5)
; The element index is undef, so the IR does not pin which padding element is
; read. The loaded value is unused; the load is volatile.
1245  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1246  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
; Zero all 64 bytes of %alloca (non-volatile memset, i1 false).
1247  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
1248  ret void
1249}
1250
1251define void @zero_init_small_offset_foo() {
1252; GFX9-LABEL: zero_init_small_offset_foo:
1253; GFX9:       ; %bb.0:
1254; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1255; GFX9-NEXT:    scratch_load_dword v0, off, s32 glc
1256; GFX9-NEXT:    s_waitcnt vmcnt(0)
1257; GFX9-NEXT:    s_mov_b32 s0, 0
1258; GFX9-NEXT:    s_mov_b32 s1, s0
1259; GFX9-NEXT:    s_mov_b32 s2, s0
1260; GFX9-NEXT:    s_mov_b32 s3, s0
1261; GFX9-NEXT:    v_mov_b32_e32 v0, s0
1262; GFX9-NEXT:    v_mov_b32_e32 v1, s1
1263; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1264; GFX9-NEXT:    v_mov_b32_e32 v3, s3
1265; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
1266; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
1267; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
1268; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
1269; GFX9-NEXT:    s_waitcnt vmcnt(0)
1270; GFX9-NEXT:    s_setpc_b64 s[30:31]
1271;
1272; GFX10-LABEL: zero_init_small_offset_foo:
1273; GFX10:       ; %bb.0:
1274; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1275; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
1276; GFX10-NEXT:    s_waitcnt vmcnt(0)
1277; GFX10-NEXT:    s_mov_b32 s0, 0
1278; GFX10-NEXT:    s_mov_b32 s1, s0
1279; GFX10-NEXT:    s_mov_b32 s2, s0
1280; GFX10-NEXT:    s_mov_b32 s3, s0
1281; GFX10-NEXT:    v_mov_b32_e32 v0, s0
1282; GFX10-NEXT:    v_mov_b32_e32 v1, s1
1283; GFX10-NEXT:    v_mov_b32_e32 v2, s2
1284; GFX10-NEXT:    v_mov_b32_e32 v3, s3
1285; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
1286; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
1287; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
1288; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
1289; GFX10-NEXT:    s_setpc_b64 s[30:31]
1290;
1291; GFX11-LABEL: zero_init_small_offset_foo:
1292; GFX11:       ; %bb.0:
1293; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1294; GFX11-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
1295; GFX11-NEXT:    s_waitcnt vmcnt(0)
1296; GFX11-NEXT:    s_mov_b32 s0, 0
1297; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1298; GFX11-NEXT:    s_mov_b32 s1, s0
1299; GFX11-NEXT:    s_mov_b32 s2, s0
1300; GFX11-NEXT:    s_mov_b32 s3, s0
1301; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1302; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1303; GFX11-NEXT:    s_clause 0x3
1304; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
1305; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
1306; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
1307; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
1308; GFX11-NEXT:    s_setpc_b64 s[30:31]
1309;
1310; GFX12-LABEL: zero_init_small_offset_foo:
1311; GFX12:       ; %bb.0:
1312; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
1313; GFX12-NEXT:    s_wait_expcnt 0x0
1314; GFX12-NEXT:    s_wait_samplecnt 0x0
1315; GFX12-NEXT:    s_wait_bvhcnt 0x0
1316; GFX12-NEXT:    s_wait_kmcnt 0x0
1317; GFX12-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
1318; GFX12-NEXT:    s_wait_loadcnt 0x0
1319; GFX12-NEXT:    s_mov_b32 s0, 0
1320; GFX12-NEXT:    s_wait_alu 0xfffe
1321; GFX12-NEXT:    s_mov_b32 s1, s0
1322; GFX12-NEXT:    s_mov_b32 s2, s0
1323; GFX12-NEXT:    s_mov_b32 s3, s0
1324; GFX12-NEXT:    s_wait_alu 0xfffe
1325; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1326; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1327; GFX12-NEXT:    s_clause 0x3
1328; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
1329; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
1330; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
1331; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
1332; GFX12-NEXT:    s_setpc_b64 s[30:31]
1333;
1334; GFX9-PAL-LABEL: zero_init_small_offset_foo:
1335; GFX9-PAL:       ; %bb.0:
1336; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1337; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 glc
1338; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1339; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1340; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
1341; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1342; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
1343; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
1344; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
1345; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
1346; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
1347; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
1348; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
1349; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
1350; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
1351; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1352; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
1353;
1354; GFX940-LABEL: zero_init_small_offset_foo:
1355; GFX940:       ; %bb.0:
1356; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1357; GFX940-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
1358; GFX940-NEXT:    s_waitcnt vmcnt(0)
1359; GFX940-NEXT:    s_mov_b32 s0, 0
1360; GFX940-NEXT:    s_mov_b32 s1, s0
1361; GFX940-NEXT:    s_mov_b32 s2, s0
1362; GFX940-NEXT:    s_mov_b32 s3, s0
1363; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
1364; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
1365; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256 sc0 sc1
1366; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272 sc0 sc1
1367; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288 sc0 sc1
1368; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304 sc0 sc1
1369; GFX940-NEXT:    s_waitcnt vmcnt(0)
1370; GFX940-NEXT:    s_setpc_b64 s[30:31]
1371;
1372; GFX10-PAL-LABEL: zero_init_small_offset_foo:
1373; GFX10-PAL:       ; %bb.0:
1374; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1375; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
1376; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
1377; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
1378; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
1379; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
1380; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
1381; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
1382; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
1383; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
1384; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
1385; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
1386; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
1387; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
1388; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
1389; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
1390;
1391; GFX11-PAL-LABEL: zero_init_small_offset_foo:
1392; GFX11-PAL:       ; %bb.0:
1393; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1394; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
1395; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1396; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
1397; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1398; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
1399; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
1400; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
1401; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1402; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1403; GFX11-PAL-NEXT:    s_clause 0x3
1404; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
1405; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
1406; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
1407; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
1408; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
1409;
1410; GFX12-PAL-LABEL: zero_init_small_offset_foo:
1411; GFX12-PAL:       ; %bb.0:
1412; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
1413; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
1414; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
1415; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
1416; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
1417; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
1418; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
1419; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
1420; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
1421; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
1422; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
1423; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
1424; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
1425; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1426; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1427; GFX12-PAL-NEXT:    s_clause 0x3
1428; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
1429; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
1430; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
1431; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
1432; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
1433  %padding = alloca [64 x i32], align 4, addrspace(5)
1434  %alloca = alloca [32 x i16], align 2, addrspace(5)
1435  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1436  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
1437  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
1438  ret void
1439}
1440
; Kernel test: with a 64 x i32 padding alloca declared ahead of the 32 x float
; alloca %i, does a volatile store of 15 to %i[%idx] and a volatile load from
; %i[%idx & 15]. %idx is a kernel argument; in the expected output below it is
; loaded with an s_load and the address arithmetic stays in SGPRs.
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
1441define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
1442; GFX9-LABEL: store_load_sindex_small_offset_kernel:
1443; GFX9:       ; %bb.0: ; %bb
1444; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x24
1445; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
1446; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
1447; GFX9-NEXT:    s_mov_b32 s1, 0
1448; GFX9-NEXT:    scratch_load_dword v0, off, s1 glc
1449; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
1450; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
1451; GFX9-NEXT:    s_and_b32 s0, s0, 15
1452; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1453; GFX9-NEXT:    s_addk_i32 s1, 0x100
1454; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1455; GFX9-NEXT:    scratch_store_dword off, v0, s1
1456; GFX9-NEXT:    s_waitcnt vmcnt(0)
1457; GFX9-NEXT:    s_addk_i32 s0, 0x100
1458; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1459; GFX9-NEXT:    s_waitcnt vmcnt(0)
1460; GFX9-NEXT:    s_endpgm
1461;
1462; GFX10-LABEL: store_load_sindex_small_offset_kernel:
1463; GFX10:       ; %bb.0: ; %bb
1464; GFX10-NEXT:    s_add_u32 s8, s8, s13
1465; GFX10-NEXT:    s_addc_u32 s9, s9, 0
1466; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
1467; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
1468; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x24
1469; GFX10-NEXT:    scratch_load_dword v0, off, off glc dlc
1470; GFX10-NEXT:    s_waitcnt vmcnt(0)
1471; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1472; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1473; GFX10-NEXT:    s_and_b32 s1, s0, 15
1474; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1475; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
1476; GFX10-NEXT:    s_addk_i32 s0, 0x100
1477; GFX10-NEXT:    s_addk_i32 s1, 0x100
1478; GFX10-NEXT:    scratch_store_dword off, v0, s0
1479; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1480; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1481; GFX10-NEXT:    s_waitcnt vmcnt(0)
1482; GFX10-NEXT:    s_endpgm
1483;
1484; GFX11-LABEL: store_load_sindex_small_offset_kernel:
1485; GFX11:       ; %bb.0: ; %bb
1486; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
1487; GFX11-NEXT:    scratch_load_b32 v0, off, off glc dlc
1488; GFX11-NEXT:    s_waitcnt vmcnt(0)
1489; GFX11-NEXT:    v_mov_b32_e32 v0, 15
1490; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1491; GFX11-NEXT:    s_and_b32 s1, s0, 15
1492; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
1493; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
1494; GFX11-NEXT:    s_addk_i32 s0, 0x100
1495; GFX11-NEXT:    s_addk_i32 s1, 0x100
1496; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
1497; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1498; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
1499; GFX11-NEXT:    s_waitcnt vmcnt(0)
1500; GFX11-NEXT:    s_endpgm
1501;
1502; GFX12-LABEL: store_load_sindex_small_offset_kernel:
1503; GFX12:       ; %bb.0: ; %bb
1504; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x24
1505; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
1506; GFX12-NEXT:    s_wait_loadcnt 0x0
1507; GFX12-NEXT:    v_mov_b32_e32 v0, 15
1508; GFX12-NEXT:    s_wait_kmcnt 0x0
1509; GFX12-NEXT:    s_and_b32 s1, s0, 15
1510; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
1511; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
1512; GFX12-NEXT:    s_addk_co_i32 s0, 0x100
1513; GFX12-NEXT:    s_addk_co_i32 s1, 0x100
1514; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
1515; GFX12-NEXT:    s_wait_storecnt 0x0
1516; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
1517; GFX12-NEXT:    s_wait_loadcnt 0x0
1518; GFX12-NEXT:    s_endpgm
1519;
1520; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
1521; GFX9-PAL:       ; %bb.0: ; %bb
1522; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
1523; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
1524; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
1525; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
1526; GFX9-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
1527; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1528; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
1529; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
1530; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
1531; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s1 glc
1532; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1533; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1534; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1535; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1536; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x100
1537; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1538; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1539; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1540; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x100
1541; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1542; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1543; GFX9-PAL-NEXT:    s_endpgm
1544;
1545; GFX940-LABEL: store_load_sindex_small_offset_kernel:
1546; GFX940:       ; %bb.0: ; %bb
1547; GFX940-NEXT:    s_load_dword s0, s[4:5], 0x24
1548; GFX940-NEXT:    scratch_load_dword v0, off, off sc0 sc1
1549; GFX940-NEXT:    s_waitcnt vmcnt(0)
1550; GFX940-NEXT:    v_mov_b32_e32 v0, 15
1551; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
1552; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
1553; GFX940-NEXT:    s_and_b32 s0, s0, 15
1554; GFX940-NEXT:    s_addk_i32 s1, 0x100
1555; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
1556; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
1557; GFX940-NEXT:    s_waitcnt vmcnt(0)
1558; GFX940-NEXT:    s_addk_i32 s0, 0x100
1559; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
1560; GFX940-NEXT:    s_waitcnt vmcnt(0)
1561; GFX940-NEXT:    s_endpgm
1562;
1563; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
1564; GFX1010-PAL:       ; %bb.0: ; %bb
1565; GFX1010-PAL-NEXT:    s_getpc_b64 s[12:13]
1566; GFX1010-PAL-NEXT:    s_mov_b32 s12, s0
1567; GFX1010-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
1568; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1569; GFX1010-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
1570; GFX1010-PAL-NEXT:    s_add_u32 s12, s12, s11
1571; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
1572; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
1573; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
1574; GFX1010-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
1575; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
1576; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1577; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1578; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
1579; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1580; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
1581; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1582; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1583; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x100
1584; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x100
1585; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
1586; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1587; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1588; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1589; GFX1010-PAL-NEXT:    s_endpgm
1590;
1591; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
1592; GFX1030-PAL:       ; %bb.0: ; %bb
1593; GFX1030-PAL-NEXT:    s_getpc_b64 s[12:13]
1594; GFX1030-PAL-NEXT:    s_mov_b32 s12, s0
1595; GFX1030-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
1596; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1597; GFX1030-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
1598; GFX1030-PAL-NEXT:    s_add_u32 s12, s12, s11
1599; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
1600; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
1601; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
1602; GFX1030-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
1603; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off glc dlc
1604; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1605; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
1606; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1607; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
1608; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1609; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1610; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x100
1611; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x100
1612; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
1613; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1614; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1615; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1616; GFX1030-PAL-NEXT:    s_endpgm
1617;
1618; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel:
1619; GFX11-PAL:       ; %bb.0: ; %bb
1620; GFX11-PAL-NEXT:    s_load_b32 s0, s[4:5], 0x0
1621; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off glc dlc
1622; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1623; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
1624; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1625; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
1626; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1627; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1628; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x100
1629; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x100
1630; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
1631; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1632; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
1633; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1634; GFX11-PAL-NEXT:    s_endpgm
1635;
1636; GFX12-PAL-LABEL: store_load_sindex_small_offset_kernel:
1637; GFX12-PAL:       ; %bb.0: ; %bb
1638; GFX12-PAL-NEXT:    s_load_b32 s0, s[4:5], 0x0
1639; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
1640; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
1641; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
1642; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
1643; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
1644; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1645; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1646; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x100
1647; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x100
1648; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
1649; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
1650; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
1651; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
1652; GFX12-PAL-NEXT:    s_endpgm
1653bb:
  ; 64 x i32 = 256 bytes of padding declared ahead of %i; every expected
  ; sequence above adds 0x100 to the scaled index before accessing %i.
1654  %padding = alloca [64 x i32], align 4, addrspace(5)
1655  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load through an undef index into %padding; presumably this is what
  ; keeps the padding alloca from being dropped (the result is never used).
1656  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1657  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; Volatile store of the constant 15 to element %idx of %i.
1658  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
1659  store volatile i32 15, ptr addrspace(5) %i7, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
1660  %i9 = and i32 %idx, 15
1661  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
1662  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
1663  ret void
1664}
1665
; amdgpu_ps variant of the scalar-index small-offset test: %idx arrives as an
; `inreg` argument (read directly from an SGPR in the expected output below).
; With a 64 x i32 padding alloca declared ahead of the 32 x float alloca %i, it
; does a volatile store of 15 to %i[%idx] and a volatile load from %i[%idx & 15].
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
1666define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
1667; GFX9-LABEL: store_load_sindex_small_offset_foo:
1668; GFX9:       ; %bb.0: ; %bb
1669; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
1670; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
1671; GFX9-NEXT:    s_mov_b32 s0, 0
1672; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1673; GFX9-NEXT:    s_waitcnt vmcnt(0)
1674; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
1675; GFX9-NEXT:    s_addk_i32 s0, 0x100
1676; GFX9-NEXT:    v_mov_b32_e32 v0, 15
1677; GFX9-NEXT:    scratch_store_dword off, v0, s0
1678; GFX9-NEXT:    s_waitcnt vmcnt(0)
1679; GFX9-NEXT:    s_and_b32 s0, s2, 15
1680; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
1681; GFX9-NEXT:    s_addk_i32 s0, 0x100
1682; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
1683; GFX9-NEXT:    s_waitcnt vmcnt(0)
1684; GFX9-NEXT:    s_endpgm
1685;
1686; GFX10-LABEL: store_load_sindex_small_offset_foo:
1687; GFX10:       ; %bb.0: ; %bb
1688; GFX10-NEXT:    s_add_u32 s0, s0, s3
1689; GFX10-NEXT:    s_addc_u32 s1, s1, 0
1690; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1691; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1692; GFX10-NEXT:    scratch_load_dword v0, off, off glc dlc
1693; GFX10-NEXT:    s_waitcnt vmcnt(0)
1694; GFX10-NEXT:    v_mov_b32_e32 v0, 15
1695; GFX10-NEXT:    s_and_b32 s0, s2, 15
1696; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
1697; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
1698; GFX10-NEXT:    s_addk_i32 s1, 0x100
1699; GFX10-NEXT:    s_addk_i32 s0, 0x100
1700; GFX10-NEXT:    scratch_store_dword off, v0, s1
1701; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1702; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
1703; GFX10-NEXT:    s_waitcnt vmcnt(0)
1704; GFX10-NEXT:    s_endpgm
1705;
1706; GFX11-LABEL: store_load_sindex_small_offset_foo:
1707; GFX11:       ; %bb.0: ; %bb
1708; GFX11-NEXT:    scratch_load_b32 v0, off, off glc dlc
1709; GFX11-NEXT:    s_waitcnt vmcnt(0)
1710; GFX11-NEXT:    v_mov_b32_e32 v0, 15
1711; GFX11-NEXT:    s_and_b32 s1, s0, 15
1712; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
1713; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
1714; GFX11-NEXT:    s_addk_i32 s0, 0x100
1715; GFX11-NEXT:    s_addk_i32 s1, 0x100
1716; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
1717; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1718; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
1719; GFX11-NEXT:    s_waitcnt vmcnt(0)
1720; GFX11-NEXT:    s_endpgm
1721;
1722; GFX12-LABEL: store_load_sindex_small_offset_foo:
1723; GFX12:       ; %bb.0: ; %bb
1724; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
1725; GFX12-NEXT:    s_wait_loadcnt 0x0
1726; GFX12-NEXT:    v_mov_b32_e32 v0, 15
1727; GFX12-NEXT:    s_and_b32 s1, s0, 15
1728; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
1729; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
1730; GFX12-NEXT:    s_addk_co_i32 s0, 0x100
1731; GFX12-NEXT:    s_addk_co_i32 s1, 0x100
1732; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
1733; GFX12-NEXT:    s_wait_storecnt 0x0
1734; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
1735; GFX12-NEXT:    s_wait_loadcnt 0x0
1736; GFX12-NEXT:    s_endpgm
1737;
1738; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
1739; GFX9-PAL:       ; %bb.0: ; %bb
1740; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
1741; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
1742; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1743; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1744; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1745; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
1746; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
1747; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
1748; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s1 glc
1749; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1750; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
1751; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
1752; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x100
1753; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
1754; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1755; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
1756; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1757; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x100
1758; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
1759; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1760; GFX9-PAL-NEXT:    s_endpgm
1761;
1762; GFX940-LABEL: store_load_sindex_small_offset_foo:
1763; GFX940:       ; %bb.0: ; %bb
1764; GFX940-NEXT:    scratch_load_dword v0, off, off sc0 sc1
1765; GFX940-NEXT:    s_waitcnt vmcnt(0)
1766; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
1767; GFX940-NEXT:    s_and_b32 s0, s0, 15
1768; GFX940-NEXT:    s_addk_i32 s1, 0x100
1769; GFX940-NEXT:    v_mov_b32_e32 v0, 15
1770; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
1771; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
1772; GFX940-NEXT:    s_waitcnt vmcnt(0)
1773; GFX940-NEXT:    s_addk_i32 s0, 0x100
1774; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
1775; GFX940-NEXT:    s_waitcnt vmcnt(0)
1776; GFX940-NEXT:    s_endpgm
1777;
1778; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
1779; GFX1010-PAL:       ; %bb.0: ; %bb
1780; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
1781; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
1782; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1783; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1784; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1785; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
1786; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
1787; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1788; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1789; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
1790; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1791; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1792; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
1793; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
1794; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1795; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1796; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x100
1797; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x100
1798; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
1799; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1800; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1801; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1802; GFX1010-PAL-NEXT:    s_endpgm
1803;
1804; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
1805; GFX1030-PAL:       ; %bb.0: ; %bb
1806; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
1807; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
1808; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
1809; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1810; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
1811; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
1812; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
1813; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1814; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1815; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off glc dlc
1816; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1817; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
1818; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
1819; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1820; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1821; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x100
1822; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x100
1823; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
1824; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1825; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
1826; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
1827; GFX1030-PAL-NEXT:    s_endpgm
1828;
1829; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo:
1830; GFX11-PAL:       ; %bb.0: ; %bb
1831; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off glc dlc
1832; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1833; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
1834; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
1835; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1836; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1837; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x100
1838; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x100
1839; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
1840; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1841; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
1842; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
1843; GFX11-PAL-NEXT:    s_endpgm
1844;
1845; GFX12-PAL-LABEL: store_load_sindex_small_offset_foo:
1846; GFX12-PAL:       ; %bb.0: ; %bb
1847; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
1848; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
1849; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
1850; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
1851; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
1852; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
1853; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x100
1854; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x100
1855; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
1856; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
1857; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
1858; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
1859; GFX12-PAL-NEXT:    s_endpgm
1860bb:
  ; 64 x i32 = 256 bytes of padding declared ahead of %i; every expected
  ; sequence above adds 0x100 to the scaled index before accessing %i.
1861  %padding = alloca [64 x i32], align 4, addrspace(5)
1862  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load through an undef index into %padding; presumably this is what
  ; keeps the padding alloca from being dropped (the result is never used).
1863  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1864  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; Volatile store of the constant 15 to element %idx of %i.
1865  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
1866  store volatile i32 15, ptr addrspace(5) %i7, align 4
  ; Volatile load from element (%idx & 15) of %i; the loaded value is unused.
1867  %i9 = and i32 %idx, 15
1868  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
1869  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
1870  ret void
1871}
1872
; Kernel test with a per-workitem index: the index comes from
; llvm.amdgcn.workitem.id.x, so the expected output below does the address
; arithmetic in VGPRs. With a 64 x i32 padding alloca declared ahead of the
; 32 x float alloca %i, it does a volatile store of 15 to %i[id] and a volatile
; load from %i[31 - id].
; The assertion lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing by hand.
1873define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
1874; GFX9-LABEL: store_load_vindex_small_offset_kernel:
1875; GFX9:       ; %bb.0: ; %bb
1876; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
1877; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
1878; GFX9-NEXT:    s_mov_b32 s0, 0
1879; GFX9-NEXT:    scratch_load_dword v1, off, s0 glc
1880; GFX9-NEXT:    s_waitcnt vmcnt(0)
1881; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1882; GFX9-NEXT:    v_add_u32_e32 v1, 0x100, v0
1883; GFX9-NEXT:    v_mov_b32_e32 v2, 15
1884; GFX9-NEXT:    scratch_store_dword v1, v2, off
1885; GFX9-NEXT:    s_waitcnt vmcnt(0)
1886; GFX9-NEXT:    v_sub_u32_e32 v0, 0x100, v0
1887; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1888; GFX9-NEXT:    s_waitcnt vmcnt(0)
1889; GFX9-NEXT:    s_endpgm
1890;
1891; GFX10-LABEL: store_load_vindex_small_offset_kernel:
1892; GFX10:       ; %bb.0: ; %bb
1893; GFX10-NEXT:    s_add_u32 s8, s8, s13
1894; GFX10-NEXT:    s_addc_u32 s9, s9, 0
1895; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
1896; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
1897; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1898; GFX10-NEXT:    v_mov_b32_e32 v2, 15
1899; GFX10-NEXT:    scratch_load_dword v3, off, off glc dlc
1900; GFX10-NEXT:    s_waitcnt vmcnt(0)
1901; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x100, v0
1902; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x100, v0
1903; GFX10-NEXT:    scratch_store_dword v1, v2, off
1904; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
1905; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1906; GFX10-NEXT:    s_waitcnt vmcnt(0)
1907; GFX10-NEXT:    s_endpgm
1908;
1909; GFX11-LABEL: store_load_vindex_small_offset_kernel:
1910; GFX11:       ; %bb.0: ; %bb
1911; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
1912; GFX11-NEXT:    scratch_load_b32 v3, off, off glc dlc
1913; GFX11-NEXT:    s_waitcnt vmcnt(0)
1914; GFX11-NEXT:    v_and_b32_e32 v0, 0xffc, v0
1915; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1916; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x100, v0
1917; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:256 dlc
1918; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1919; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
1920; GFX11-NEXT:    s_waitcnt vmcnt(0)
1921; GFX11-NEXT:    s_endpgm
1922;
1923; GFX12-LABEL: store_load_vindex_small_offset_kernel:
1924; GFX12:       ; %bb.0: ; %bb
1925; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
1926; GFX12-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
1927; GFX12-NEXT:    s_wait_loadcnt 0x0
1928; GFX12-NEXT:    v_and_b32_e32 v0, 0xffc, v0
1929; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1930; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 0x100, v0
1931; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
1932; GFX12-NEXT:    s_wait_storecnt 0x0
1933; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
1934; GFX12-NEXT:    s_wait_loadcnt 0x0
1935; GFX12-NEXT:    s_endpgm
1936;
1937; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
1938; GFX9-PAL:       ; %bb.0: ; %bb
1939; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
1940; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
1941; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
1942; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
1943; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1944; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
1945; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1946; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
1947; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
1948; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
1949; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s0 glc
1950; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1951; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x100, v0
1952; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
1953; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1954; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x100, v0
1955; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
1956; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
1957; GFX9-PAL-NEXT:    s_endpgm
1958;
1959; GFX940-LABEL: store_load_vindex_small_offset_kernel:
1960; GFX940:       ; %bb.0: ; %bb
1961; GFX940-NEXT:    scratch_load_dword v1, off, off sc0 sc1
1962; GFX940-NEXT:    s_waitcnt vmcnt(0)
1963; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1964; GFX940-NEXT:    v_and_b32_e32 v0, 0xffc, v0
1965; GFX940-NEXT:    v_mov_b32_e32 v1, 15
1966; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:256 sc0 sc1
1967; GFX940-NEXT:    s_waitcnt vmcnt(0)
1968; GFX940-NEXT:    v_sub_u32_e32 v0, 0x100, v0
1969; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
1970; GFX940-NEXT:    s_waitcnt vmcnt(0)
1971; GFX940-NEXT:    s_endpgm
1972;
1973; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
1974; GFX1010-PAL:       ; %bb.0: ; %bb
1975; GFX1010-PAL-NEXT:    s_getpc_b64 s[12:13]
1976; GFX1010-PAL-NEXT:    s_mov_b32 s12, s0
1977; GFX1010-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
1978; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
1979; GFX1010-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
1980; GFX1010-PAL-NEXT:    s_add_u32 s12, s12, s11
1981; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
1982; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
1983; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
1984; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1985; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
1986; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
1987; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, s0 glc dlc
1988; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1989; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x100, v0
1990; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x100, v0
1991; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
1992; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
1993; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
1994; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
1995; GFX1010-PAL-NEXT:    s_endpgm
1996;
1997; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
1998; GFX1030-PAL:       ; %bb.0: ; %bb
1999; GFX1030-PAL-NEXT:    s_getpc_b64 s[12:13]
2000; GFX1030-PAL-NEXT:    s_mov_b32 s12, s0
2001; GFX1030-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
2002; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2003; GFX1030-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
2004; GFX1030-PAL-NEXT:    s_add_u32 s12, s12, s11
2005; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
2006; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
2007; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
2008; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2009; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
2010; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off glc dlc
2011; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2012; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x100, v0
2013; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x100, v0
2014; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
2015; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2016; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
2017; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2018; GFX1030-PAL-NEXT:    s_endpgm
2019;
2020; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
2021; GFX11-PAL:       ; %bb.0: ; %bb
2022; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
2023; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off glc dlc
2024; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2025; GFX11-PAL-NEXT:    v_and_b32_e32 v0, 0xffc, v0
2026; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2027; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x100, v0
2028; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:256 dlc
2029; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2030; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
2031; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2032; GFX11-PAL-NEXT:    s_endpgm
2033;
2034; GFX12-PAL-LABEL: store_load_vindex_small_offset_kernel:
2035; GFX12-PAL:       ; %bb.0: ; %bb
2036; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
2037; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
2038; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
2039; GFX12-PAL-NEXT:    v_and_b32_e32 v0, 0xffc, v0
2040; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
2041; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x100, v0
2042; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:256 scope:SCOPE_SYS
2043; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
2044; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
2045; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
2046; GFX12-PAL-NEXT:    s_endpgm
2047bb:
  ; 64 x i32 = 256 bytes of padding declared ahead of %i; the expected
  ; sequences above apply a 0x100 / offset:256 displacement when accessing %i.
2048  %padding = alloca [64 x i32], align 4, addrspace(5)
2049  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load through an undef index into %padding; presumably this is what
  ; keeps the padding alloca from being dropped (the result is never used).
2050  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2051  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; Per-workitem index. NOTE(review): %i3 is not referenced by any later
  ; instruction in this function.
2052  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
2053  %i3 = zext i32 %i2 to i64
  ; Volatile store of the constant 15 to element %i2 of %i.
2054  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
2055  store volatile i32 15, ptr addrspace(5) %i7, align 4
  ; Volatile load from element (31 - %i2) of %i; 31 * 4 bytes matches the
  ; offset:124 on the final scratch load in the expected output above.
2056  %i9 = sub nsw i32 31, %i2
2057  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
2058  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
2059  ret void
2060}
2061
; Callable (non-kernel) function that stores to and reloads from a private
; (addrspace(5)) array %i, with a 256-byte alloca %padding declared ahead of it.
; The element index is the function argument %idx. Every expectation below
; addresses %i at 0x100 / offset:256 relative to s32, which matches the
; 64 x 4 = 256 bytes of %padding.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; Observation from the expectations: the gfx11, gfx12 and gfx940 variants fold
; the base into the reload as "s32 offset:256", whereas the gfx9 and gfx10
; variants materialize s32 + 0x100 into a register first.
2062define void @store_load_vindex_small_offset_foo(i32 %idx) {
2063; GFX9-LABEL: store_load_vindex_small_offset_foo:
2064; GFX9:       ; %bb.0: ; %bb
2065; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2066; GFX9-NEXT:    scratch_load_dword v1, off, s32 glc
2067; GFX9-NEXT:    s_waitcnt vmcnt(0)
2068; GFX9-NEXT:    s_add_i32 s0, s32, 0x100
2069; GFX9-NEXT:    v_mov_b32_e32 v1, s0
2070; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
2071; GFX9-NEXT:    v_mov_b32_e32 v3, 15
2072; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
2073; GFX9-NEXT:    scratch_store_dword v2, v3, off
2074; GFX9-NEXT:    s_waitcnt vmcnt(0)
2075; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2076; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
2077; GFX9-NEXT:    s_waitcnt vmcnt(0)
2078; GFX9-NEXT:    s_setpc_b64 s[30:31]
2079;
2080; GFX10-LABEL: store_load_vindex_small_offset_foo:
2081; GFX10:       ; %bb.0: ; %bb
2082; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2083; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
2084; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
2085; GFX10-NEXT:    v_mov_b32_e32 v2, 15
2086; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
2087; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
2088; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
2089; GFX10-NEXT:    s_waitcnt vmcnt(0)
2090; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
2091; GFX10-NEXT:    scratch_store_dword v0, v2, off
2092; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2093; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
2094; GFX10-NEXT:    s_waitcnt vmcnt(0)
2095; GFX10-NEXT:    s_setpc_b64 s[30:31]
2096;
2097; GFX11-LABEL: store_load_vindex_small_offset_foo:
2098; GFX11:       ; %bb.0: ; %bb
2099; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2100; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
2101; GFX11-NEXT:    s_add_i32 s0, s32, 0x100
2102; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
2103; GFX11-NEXT:    s_waitcnt vmcnt(0)
2104; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
2105; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
2106; GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
2107; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2108; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
2109; GFX11-NEXT:    s_waitcnt vmcnt(0)
2110; GFX11-NEXT:    s_setpc_b64 s[30:31]
2111;
2112; GFX12-LABEL: store_load_vindex_small_offset_foo:
2113; GFX12:       ; %bb.0: ; %bb
2114; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2115; GFX12-NEXT:    s_wait_expcnt 0x0
2116; GFX12-NEXT:    s_wait_samplecnt 0x0
2117; GFX12-NEXT:    s_wait_bvhcnt 0x0
2118; GFX12-NEXT:    s_wait_kmcnt 0x0
2119; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
2120; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2121; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
2122; GFX12-NEXT:    s_wait_loadcnt 0x0
2123; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
2124; GFX12-NEXT:    s_wait_storecnt 0x0
2125; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
2126; GFX12-NEXT:    s_wait_storecnt 0x0
2127; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
2128; GFX12-NEXT:    s_wait_loadcnt 0x0
2129; GFX12-NEXT:    s_setpc_b64 s[30:31]
2130;
2131; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
2132; GFX9-PAL:       ; %bb.0: ; %bb
2133; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2134; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 glc
2135; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2136; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x100
2137; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s0
2138; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
2139; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
2140; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
2141; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
2142; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2143; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
2144; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
2145; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2146; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2147;
2148; GFX940-LABEL: store_load_vindex_small_offset_foo:
2149; GFX940:       ; %bb.0: ; %bb
2150; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2151; GFX940-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
2152; GFX940-NEXT:    s_waitcnt vmcnt(0)
2153; GFX940-NEXT:    s_add_i32 s0, s32, 0x100
2154; GFX940-NEXT:    v_mov_b32_e32 v1, s0
2155; GFX940-NEXT:    v_lshl_add_u32 v1, v0, 2, v1
2156; GFX940-NEXT:    v_mov_b32_e32 v2, 15
2157; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
2158; GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
2159; GFX940-NEXT:    s_waitcnt vmcnt(0)
2160; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2161; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
2162; GFX940-NEXT:    s_waitcnt vmcnt(0)
2163; GFX940-NEXT:    s_setpc_b64 s[30:31]
2164;
2165; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
2166; GFX10-PAL:       ; %bb.0: ; %bb
2167; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2168; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
2169; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x100
2170; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
2171; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
2172; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x100
2173; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
2174; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2175; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
2176; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
2177; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2178; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
2179; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
2180; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
2181;
2182; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
2183; GFX11-PAL:       ; %bb.0: ; %bb
2184; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2185; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
2186; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x100
2187; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
2188; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2189; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
2190; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
2191; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off dlc
2192; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2193; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
2194; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2195; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
2196;
2197; GFX12-PAL-LABEL: store_load_vindex_small_offset_foo:
2198; GFX12-PAL:       ; %bb.0: ; %bb
2199; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
2200; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
2201; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
2202; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
2203; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
2204; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
2205; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2206; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
2207; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
2208; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
2209; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
2210; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
2211; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
2212; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
2213; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
2214; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
2215bb:
  ; 64 x i32 = 256 bytes declared ahead of %i; the expectations above address
  ; %i at 0x100 / offset:256 from s32.
2216  %padding = alloca [64 x i32], align 4, addrspace(5)
  ; The array under test: 32 x float = 128 bytes.
2217  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile read of %padding at an undef index, so the element is unspecified.
  ; NOTE(review): presumably this exists only to keep %padding from being
  ; removed as unused -- confirm.
2218  %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2219  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; Volatile store of the constant 15 to %i[%idx].
2220  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
2221  store volatile i32 15, ptr addrspace(5) %i7, align 4
  ; The reload index is %idx masked to its low 4 bits, i.e. in the range 0..15.
2222  %i9 = and i32 %idx, 15
2223  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
  ; Volatile reload from %i[%idx & 15]; the loaded value is not used further.
2224  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
2225  ret void
2226}
2227
; Kernel that zero-fills a 64-byte private (addrspace(5)) alloca declared after
; a 16384-byte padding alloca ([4096 x i32]).
; In every expectation below the 64-byte memset appears as four 16-byte scratch
; stores (dwordx4 / b128) at consecutive 16-byte steps.
; Most variants use an SGPR base of 0x4004 (= 16384 + 4) with offsets 0..48 and
; read %padding at offset:4. The gfx12 variants instead use immediate offsets
; 16384..16432 with no SGPR base and read %padding with no offset.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2228define amdgpu_kernel void @zero_init_large_offset_kernel() {
2229; GFX9-LABEL: zero_init_large_offset_kernel:
2230; GFX9:       ; %bb.0:
2231; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
2232; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
2233; GFX9-NEXT:    s_mov_b32 s0, 0
2234; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
2235; GFX9-NEXT:    s_waitcnt vmcnt(0)
2236; GFX9-NEXT:    s_mov_b32 s1, s0
2237; GFX9-NEXT:    s_mov_b32 s2, s0
2238; GFX9-NEXT:    s_mov_b32 s3, s0
2239; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2240; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2241; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2242; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2243; GFX9-NEXT:    s_movk_i32 s0, 0x4004
2244; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2245; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2246; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2247; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2248; GFX9-NEXT:    s_endpgm
2249;
2250; GFX10-LABEL: zero_init_large_offset_kernel:
2251; GFX10:       ; %bb.0:
2252; GFX10-NEXT:    s_add_u32 s8, s8, s13
2253; GFX10-NEXT:    s_addc_u32 s9, s9, 0
2254; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
2255; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
2256; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2257; GFX10-NEXT:    s_waitcnt vmcnt(0)
2258; GFX10-NEXT:    s_mov_b32 s0, 0
2259; GFX10-NEXT:    s_mov_b32 s1, s0
2260; GFX10-NEXT:    s_mov_b32 s2, s0
2261; GFX10-NEXT:    s_mov_b32 s3, s0
2262; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2263; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2264; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2265; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2266; GFX10-NEXT:    s_movk_i32 s0, 0x4004
2267; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2268; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2269; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2270; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2271; GFX10-NEXT:    s_endpgm
2272;
2273; GFX11-LABEL: zero_init_large_offset_kernel:
2274; GFX11:       ; %bb.0:
2275; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2276; GFX11-NEXT:    s_waitcnt vmcnt(0)
2277; GFX11-NEXT:    s_mov_b32 s0, 0
2278; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2279; GFX11-NEXT:    s_mov_b32 s1, s0
2280; GFX11-NEXT:    s_mov_b32 s2, s0
2281; GFX11-NEXT:    s_mov_b32 s3, s0
2282; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2283; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2284; GFX11-NEXT:    s_movk_i32 s0, 0x4004
2285; GFX11-NEXT:    s_clause 0x3
2286; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
2287; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:16
2288; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
2289; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
2290; GFX11-NEXT:    s_endpgm
2291;
2292; GFX12-LABEL: zero_init_large_offset_kernel:
2293; GFX12:       ; %bb.0:
2294; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
2295; GFX12-NEXT:    s_wait_loadcnt 0x0
2296; GFX12-NEXT:    s_mov_b32 s0, 0
2297; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2298; GFX12-NEXT:    s_mov_b32 s1, s0
2299; GFX12-NEXT:    s_mov_b32 s2, s0
2300; GFX12-NEXT:    s_mov_b32 s3, s0
2301; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2302; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2303; GFX12-NEXT:    s_clause 0x3
2304; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:16384
2305; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:16400
2306; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:16416
2307; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:16432
2308; GFX12-NEXT:    s_endpgm
2309;
2310; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
2311; GFX9-PAL:       ; %bb.0:
2312; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
2313; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
2314; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
2315; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
2316; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
2317; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2318; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
2319; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2320; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
2321; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
2322; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
2323; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
2324; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2325; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
2326; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
2327; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
2328; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
2329; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x4004
2330; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2331; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2332; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2333; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2334; GFX9-PAL-NEXT:    s_endpgm
2335;
2336; GFX940-LABEL: zero_init_large_offset_kernel:
2337; GFX940:       ; %bb.0:
2338; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2339; GFX940-NEXT:    s_waitcnt vmcnt(0)
2340; GFX940-NEXT:    s_mov_b32 s0, 0
2341; GFX940-NEXT:    s_mov_b32 s1, s0
2342; GFX940-NEXT:    s_mov_b32 s2, s0
2343; GFX940-NEXT:    s_mov_b32 s3, s0
2344; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2345; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
2346; GFX940-NEXT:    s_movk_i32 s0, 0x4004
2347; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1
2348; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1
2349; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1
2350; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1
2351; GFX940-NEXT:    s_endpgm
2352;
2353; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
2354; GFX1010-PAL:       ; %bb.0:
2355; GFX1010-PAL-NEXT:    s_getpc_b64 s[12:13]
2356; GFX1010-PAL-NEXT:    s_mov_b32 s12, s0
2357; GFX1010-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
2358; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2359; GFX1010-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
2360; GFX1010-PAL-NEXT:    s_add_u32 s12, s12, s11
2361; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
2362; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
2363; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
2364; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
2365; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc dlc
2366; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2367; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
2368; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2369; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
2370; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
2371; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
2372; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
2373; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
2374; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x4004
2375; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2376; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2377; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2378; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2379; GFX1010-PAL-NEXT:    s_endpgm
2380;
2381; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
2382; GFX1030-PAL:       ; %bb.0:
2383; GFX1030-PAL-NEXT:    s_getpc_b64 s[12:13]
2384; GFX1030-PAL-NEXT:    s_mov_b32 s12, s0
2385; GFX1030-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
2386; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2387; GFX1030-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
2388; GFX1030-PAL-NEXT:    s_add_u32 s12, s12, s11
2389; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
2390; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
2391; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
2392; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2393; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2394; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
2395; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
2396; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2397; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
2398; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
2399; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
2400; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
2401; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
2402; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x4004
2403; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2404; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2405; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2406; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2407; GFX1030-PAL-NEXT:    s_endpgm
2408;
2409; GFX11-PAL-LABEL: zero_init_large_offset_kernel:
2410; GFX11-PAL:       ; %bb.0:
2411; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2412; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2413; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
2414; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2415; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
2416; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
2417; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
2418; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2419; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2420; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x4004
2421; GFX11-PAL-NEXT:    s_clause 0x3
2422; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0
2423; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:16
2424; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
2425; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
2426; GFX11-PAL-NEXT:    s_endpgm
2427;
2428; GFX12-PAL-LABEL: zero_init_large_offset_kernel:
2429; GFX12-PAL:       ; %bb.0:
2430; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
2431; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
2432; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
2433; GFX12-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2434; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
2435; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
2436; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
2437; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2438; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2439; GFX12-PAL-NEXT:    s_clause 0x3
2440; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16384
2441; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16400
2442; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16416
2443; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:16432
2444; GFX12-PAL-NEXT:    s_endpgm
  ; 4096 x i32 = 16384 bytes declared ahead of %alloca; this is what produces
  ; the 0x4004 / 16384 addresses in the expectations above.
2445  %padding = alloca [4096 x i32], align 4, addrspace(5)
  ; The buffer under test: 32 x i16 = 64 bytes, 2-byte aligned.
2446  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile read of %padding at an undef index, so the element is unspecified.
  ; NOTE(review): presumably this exists only to keep %padding from being
  ; removed as unused -- confirm.
2447  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2448  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; Non-volatile memset of all 64 bytes of %alloca to zero.
2449  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
2450  ret void
2451}
2452
; Callable (non-kernel) function that zero-fills a 64-byte private
; (addrspace(5)) alloca declared after a 16384-byte padding alloca
; ([4096 x i32]). All scratch addresses in the expectations are relative to s32
; and each variant returns with s_setpc_b64 s[30:31].
; Observations from the expectations below:
;  - Most variants recompute "s_add_i32 s0, s32, 0x4004" before each of the
;    four 16-byte stores; 0x4004 = 16384 + 4, and %padding is read at offset:4.
;  - The gfx12 variants use "s32 offset:16384" .. "offset:16432" inside one
;    s_clause and read %padding with no offset.
;  - The gfx1010 PAL variant has an s_waitcnt_depctr 0xffe3 after each of the
;    first three stores.
;  - The gfx9, gfx9 PAL and gfx940 variants wait on vmcnt(0) before returning.
; The assertion lines are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2453define void @zero_init_large_offset_foo() {
2454; GFX9-LABEL: zero_init_large_offset_foo:
2455; GFX9:       ; %bb.0:
2456; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2457; GFX9-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
2458; GFX9-NEXT:    s_waitcnt vmcnt(0)
2459; GFX9-NEXT:    s_mov_b32 s0, 0
2460; GFX9-NEXT:    s_mov_b32 s1, s0
2461; GFX9-NEXT:    s_mov_b32 s2, s0
2462; GFX9-NEXT:    s_mov_b32 s3, s0
2463; GFX9-NEXT:    v_mov_b32_e32 v0, s0
2464; GFX9-NEXT:    v_mov_b32_e32 v1, s1
2465; GFX9-NEXT:    v_mov_b32_e32 v2, s2
2466; GFX9-NEXT:    v_mov_b32_e32 v3, s3
2467; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
2468; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2469; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
2470; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2471; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
2472; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2473; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
2474; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2475; GFX9-NEXT:    s_waitcnt vmcnt(0)
2476; GFX9-NEXT:    s_setpc_b64 s[30:31]
2477;
2478; GFX10-LABEL: zero_init_large_offset_foo:
2479; GFX10:       ; %bb.0:
2480; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2481; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
2482; GFX10-NEXT:    s_waitcnt vmcnt(0)
2483; GFX10-NEXT:    s_mov_b32 s0, 0
2484; GFX10-NEXT:    s_mov_b32 s1, s0
2485; GFX10-NEXT:    s_mov_b32 s2, s0
2486; GFX10-NEXT:    s_mov_b32 s3, s0
2487; GFX10-NEXT:    v_mov_b32_e32 v0, s0
2488; GFX10-NEXT:    v_mov_b32_e32 v1, s1
2489; GFX10-NEXT:    v_mov_b32_e32 v2, s2
2490; GFX10-NEXT:    v_mov_b32_e32 v3, s3
2491; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
2492; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2493; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
2494; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2495; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
2496; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2497; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
2498; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2499; GFX10-NEXT:    s_setpc_b64 s[30:31]
2500;
2501; GFX11-LABEL: zero_init_large_offset_foo:
2502; GFX11:       ; %bb.0:
2503; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2504; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
2505; GFX11-NEXT:    s_waitcnt vmcnt(0)
2506; GFX11-NEXT:    s_mov_b32 s0, 0
2507; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2508; GFX11-NEXT:    s_mov_b32 s1, s0
2509; GFX11-NEXT:    s_mov_b32 s2, s0
2510; GFX11-NEXT:    s_mov_b32 s3, s0
2511; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2512; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2513; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
2514; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0
2515; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
2516; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:16
2517; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
2518; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
2519; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
2520; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
2521; GFX11-NEXT:    s_setpc_b64 s[30:31]
2522;
2523; GFX12-LABEL: zero_init_large_offset_foo:
2524; GFX12:       ; %bb.0:
2525; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
2526; GFX12-NEXT:    s_wait_expcnt 0x0
2527; GFX12-NEXT:    s_wait_samplecnt 0x0
2528; GFX12-NEXT:    s_wait_bvhcnt 0x0
2529; GFX12-NEXT:    s_wait_kmcnt 0x0
2530; GFX12-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
2531; GFX12-NEXT:    s_wait_loadcnt 0x0
2532; GFX12-NEXT:    s_mov_b32 s0, 0
2533; GFX12-NEXT:    s_wait_alu 0xfffe
2534; GFX12-NEXT:    s_mov_b32 s1, s0
2535; GFX12-NEXT:    s_mov_b32 s2, s0
2536; GFX12-NEXT:    s_mov_b32 s3, s0
2537; GFX12-NEXT:    s_wait_alu 0xfffe
2538; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2539; GFX12-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2540; GFX12-NEXT:    s_clause 0x3
2541; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16384
2542; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16400
2543; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16416
2544; GFX12-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16432
2545; GFX12-NEXT:    s_setpc_b64 s[30:31]
2546;
2547; GFX9-PAL-LABEL: zero_init_large_offset_foo:
2548; GFX9-PAL:       ; %bb.0:
2549; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2550; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc
2551; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2552; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
2553; GFX9-PAL-NEXT:    s_mov_b32 s1, s0
2554; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2555; GFX9-PAL-NEXT:    s_mov_b32 s3, s0
2556; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s0
2557; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
2558; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
2559; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
2560; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2561; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2562; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2563; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2564; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2565; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2566; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2567; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2568; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2569; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
2570;
2571; GFX940-LABEL: zero_init_large_offset_foo:
2572; GFX940:       ; %bb.0:
2573; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2574; GFX940-NEXT:    scratch_load_dword v0, off, s32 offset:4 sc0 sc1
2575; GFX940-NEXT:    s_waitcnt vmcnt(0)
2576; GFX940-NEXT:    s_mov_b32 s0, 0
2577; GFX940-NEXT:    s_mov_b32 s1, s0
2578; GFX940-NEXT:    s_mov_b32 s2, s0
2579; GFX940-NEXT:    s_mov_b32 s3, s0
2580; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
2581; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
2582; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
2583; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1
2584; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
2585; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1
2586; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
2587; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1
2588; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
2589; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1
2590; GFX940-NEXT:    s_waitcnt vmcnt(0)
2591; GFX940-NEXT:    s_setpc_b64 s[30:31]
2592;
2593; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
2594; GFX1010-PAL:       ; %bb.0:
2595; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2596; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
2597; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2598; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
2599; GFX1010-PAL-NEXT:    s_mov_b32 s1, s0
2600; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
2601; GFX1010-PAL-NEXT:    s_mov_b32 s3, s0
2602; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, s0
2603; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, s1
2604; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, s2
2605; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, s3
2606; GFX1010-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2607; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2608; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2609; GFX1010-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2610; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2611; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2612; GFX1010-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2613; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2614; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
2615; GFX1010-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2616; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2617; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
2618;
2619; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
2620; GFX1030-PAL:       ; %bb.0:
2621; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2622; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
2623; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2624; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
2625; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
2626; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
2627; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
2628; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, s0
2629; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
2630; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
2631; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
2632; GFX1030-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2633; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0
2634; GFX1030-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2635; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
2636; GFX1030-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2637; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
2638; GFX1030-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2639; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
2640; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
2641;
2642; GFX11-PAL-LABEL: zero_init_large_offset_foo:
2643; GFX11-PAL:       ; %bb.0:
2644; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2645; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
2646; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2647; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
2648; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
2649; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
2650; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
2651; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
2652; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2653; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2654; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2655; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0
2656; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2657; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:16
2658; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2659; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
2660; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
2661; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:48
2662; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
2663;
2664; GFX12-PAL-LABEL: zero_init_large_offset_foo:
2665; GFX12-PAL:       ; %bb.0:
2666; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
2667; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
2668; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
2669; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
2670; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
2671; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 scope:SCOPE_SYS
2672; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
2673; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
2674; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
2675; GFX12-PAL-NEXT:    s_mov_b32 s1, s0
2676; GFX12-PAL-NEXT:    s_mov_b32 s2, s0
2677; GFX12-PAL-NEXT:    s_mov_b32 s3, s0
2678; GFX12-PAL-NEXT:    s_wait_alu 0xfffe
2679; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2680; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2681; GFX12-PAL-NEXT:    s_clause 0x3
2682; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16384
2683; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16400
2684; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16416
2685; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16432
2686; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
  ; 4096 x i32 = 16384 bytes declared ahead of %alloca; this is what produces
  ; the 0x4004 / 16384 addresses in the expectations above.
2687  %padding = alloca [4096 x i32], align 4, addrspace(5)
  ; The buffer under test: 32 x i16 = 64 bytes, 2-byte aligned.
2688  %alloca = alloca [32 x i16], align 2, addrspace(5)
  ; Volatile read of %padding at an undef index, so the element is unspecified.
  ; NOTE(review): presumably this exists only to keep %padding from being
  ; removed as unused -- confirm.
2689  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2690  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; Non-volatile memset of all 64 bytes of %alloca to zero.
2691  call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
2692  ret void
2693}
2694
; Kernel test for a store and a load through a scalar index with a large stack
; offset. The [4096 x i32] %padding alloca (4096 * 4 = 0x4000 bytes) is declared
; ahead of the [32 x float] %i alloca. The expectations below add 0x4004
; (0x4000 in the gfx1200 runs) when forming the addresses used for %i;
; presumably that constant is the frame offset of %i past the padding - TODO
; confirm. The expectations do the shift, mask and add with scalar (s_*)
; instructions after an s_load of one dword.
2695define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
2696; GFX9-LABEL: store_load_sindex_large_offset_kernel:
2697; GFX9:       ; %bb.0: ; %bb
2698; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x24
2699; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
2700; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
2701; GFX9-NEXT:    s_mov_b32 s1, 0
2702; GFX9-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
2703; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
2704; GFX9-NEXT:    s_lshl_b32 s1, s0, 2
2705; GFX9-NEXT:    s_and_b32 s0, s0, 15
2706; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2707; GFX9-NEXT:    s_addk_i32 s1, 0x4004
2708; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
2709; GFX9-NEXT:    scratch_store_dword off, v0, s1
2710; GFX9-NEXT:    s_waitcnt vmcnt(0)
2711; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2712; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
2713; GFX9-NEXT:    s_waitcnt vmcnt(0)
2714; GFX9-NEXT:    s_endpgm
2715;
2716; GFX10-LABEL: store_load_sindex_large_offset_kernel:
2717; GFX10:       ; %bb.0: ; %bb
2718; GFX10-NEXT:    s_add_u32 s8, s8, s13
2719; GFX10-NEXT:    s_addc_u32 s9, s9, 0
2720; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
2721; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
2722; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x24
2723; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2724; GFX10-NEXT:    s_waitcnt vmcnt(0)
2725; GFX10-NEXT:    v_mov_b32_e32 v0, 15
2726; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
2727; GFX10-NEXT:    s_and_b32 s1, s0, 15
2728; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
2729; GFX10-NEXT:    s_lshl_b32 s1, s1, 2
2730; GFX10-NEXT:    s_addk_i32 s0, 0x4004
2731; GFX10-NEXT:    s_addk_i32 s1, 0x4004
2732; GFX10-NEXT:    scratch_store_dword off, v0, s0
2733; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2734; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2735; GFX10-NEXT:    s_waitcnt vmcnt(0)
2736; GFX10-NEXT:    s_endpgm
2737;
2738; GFX11-LABEL: store_load_sindex_large_offset_kernel:
2739; GFX11:       ; %bb.0: ; %bb
2740; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
2741; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2742; GFX11-NEXT:    s_waitcnt vmcnt(0)
2743; GFX11-NEXT:    v_mov_b32_e32 v0, 15
2744; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
2745; GFX11-NEXT:    s_and_b32 s1, s0, 15
2746; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
2747; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
2748; GFX11-NEXT:    s_addk_i32 s0, 0x4004
2749; GFX11-NEXT:    s_addk_i32 s1, 0x4004
2750; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
2751; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2752; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2753; GFX11-NEXT:    s_waitcnt vmcnt(0)
2754; GFX11-NEXT:    s_endpgm
2755;
2756; GFX12-LABEL: store_load_sindex_large_offset_kernel:
2757; GFX12:       ; %bb.0: ; %bb
2758; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x24
2759; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
2760; GFX12-NEXT:    s_wait_loadcnt 0x0
2761; GFX12-NEXT:    v_mov_b32_e32 v0, 15
2762; GFX12-NEXT:    s_wait_kmcnt 0x0
2763; GFX12-NEXT:    s_and_b32 s1, s0, 15
2764; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
2765; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
2766; GFX12-NEXT:    s_addk_co_i32 s0, 0x4000
2767; GFX12-NEXT:    s_addk_co_i32 s1, 0x4000
2768; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
2769; GFX12-NEXT:    s_wait_storecnt 0x0
2770; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
2771; GFX12-NEXT:    s_wait_loadcnt 0x0
2772; GFX12-NEXT:    s_endpgm
2773;
2774; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
2775; GFX9-PAL:       ; %bb.0: ; %bb
2776; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
2777; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
2778; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
2779; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
2780; GFX9-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
2781; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2782; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
2783; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
2784; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
2785; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
2786; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2787; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
2788; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
2789; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
2790; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
2791; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2792; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
2793; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2794; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
2795; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
2796; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
2797; GFX9-PAL-NEXT:    s_endpgm
2798;
2799; GFX940-LABEL: store_load_sindex_large_offset_kernel:
2800; GFX940:       ; %bb.0: ; %bb
2801; GFX940-NEXT:    s_load_dword s0, s[4:5], 0x24
2802; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
2803; GFX940-NEXT:    s_waitcnt vmcnt(0)
2804; GFX940-NEXT:    v_mov_b32_e32 v0, 15
2805; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
2806; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
2807; GFX940-NEXT:    s_and_b32 s0, s0, 15
2808; GFX940-NEXT:    s_addk_i32 s1, 0x4004
2809; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
2810; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
2811; GFX940-NEXT:    s_waitcnt vmcnt(0)
2812; GFX940-NEXT:    s_addk_i32 s0, 0x4004
2813; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
2814; GFX940-NEXT:    s_waitcnt vmcnt(0)
2815; GFX940-NEXT:    s_endpgm
2816;
2817; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
2818; GFX1010-PAL:       ; %bb.0: ; %bb
2819; GFX1010-PAL-NEXT:    s_getpc_b64 s[12:13]
2820; GFX1010-PAL-NEXT:    s_mov_b32 s12, s0
2821; GFX1010-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
2822; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2823; GFX1010-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
2824; GFX1010-PAL-NEXT:    s_add_u32 s12, s12, s11
2825; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
2826; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
2827; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
2828; GFX1010-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
2829; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
2830; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc dlc
2831; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2832; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
2833; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2834; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
2835; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2836; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2837; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
2838; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
2839; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
2840; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2841; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2842; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
2843; GFX1010-PAL-NEXT:    s_endpgm
2844;
2845; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
2846; GFX1030-PAL:       ; %bb.0: ; %bb
2847; GFX1030-PAL-NEXT:    s_getpc_b64 s[12:13]
2848; GFX1030-PAL-NEXT:    s_mov_b32 s12, s0
2849; GFX1030-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
2850; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2851; GFX1030-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
2852; GFX1030-PAL-NEXT:    s_add_u32 s12, s12, s11
2853; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
2854; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
2855; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
2856; GFX1030-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
2857; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2858; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2859; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
2860; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2861; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
2862; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2863; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2864; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
2865; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
2866; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
2867; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2868; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
2869; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
2870; GFX1030-PAL-NEXT:    s_endpgm
2871;
2872; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel:
2873; GFX11-PAL:       ; %bb.0: ; %bb
2874; GFX11-PAL-NEXT:    s_load_b32 s0, s[4:5], 0x0
2875; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2876; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2877; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
2878; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2879; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
2880; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2881; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2882; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x4004
2883; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x4004
2884; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
2885; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
2886; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2887; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
2888; GFX11-PAL-NEXT:    s_endpgm
2889;
2890; GFX12-PAL-LABEL: store_load_sindex_large_offset_kernel:
2891; GFX12-PAL:       ; %bb.0: ; %bb
2892; GFX12-PAL-NEXT:    s_load_b32 s0, s[4:5], 0x0
2893; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
2894; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
2895; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
2896; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
2897; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
2898; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
2899; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
2900; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x4000
2901; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x4000
2902; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
2903; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
2904; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
2905; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
2906; GFX12-PAL-NEXT:    s_endpgm
2907bb:
  ; Two addrspace(5) allocas, the 4096-dword padding first and then the
  ; 32-element array that the indexed accesses below target.
2908  %padding = alloca [4096 x i32], align 4, addrspace(5)
2909  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably this keeps
  ; %padding from being dropped - TODO confirm.
2910  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2911  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; Volatile store of 15 to %i[%idx].
2912  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
2913  store volatile i32 15, ptr addrspace(5) %i7, align 4
  ; Volatile load from %i[%idx & 15]; the loaded value is not used further.
2914  %i9 = and i32 %idx, 15
2915  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
2916  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
2917  ret void
2918}
2919
; amdgpu_ps test for a store and a load through a scalar index with a large
; stack offset. The IR body matches the kernel test of the same name; here %idx
; is an inreg argument, and the expectations apply the shift, mask and add
; directly to an incoming SGPR (s2 or s0, depending on the run). As in the
; kernel test, 0x4004 (0x4000 in the gfx1200 runs) is added when forming the
; addresses used for %i; presumably that is the frame offset of %i past the
; 0x4000-byte %padding alloca - TODO confirm.
2920define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
2921; GFX9-LABEL: store_load_sindex_large_offset_foo:
2922; GFX9:       ; %bb.0: ; %bb
2923; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s3
2924; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
2925; GFX9-NEXT:    s_mov_b32 s0, 0
2926; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:4 glc
2927; GFX9-NEXT:    s_waitcnt vmcnt(0)
2928; GFX9-NEXT:    s_lshl_b32 s0, s2, 2
2929; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2930; GFX9-NEXT:    v_mov_b32_e32 v0, 15
2931; GFX9-NEXT:    scratch_store_dword off, v0, s0
2932; GFX9-NEXT:    s_waitcnt vmcnt(0)
2933; GFX9-NEXT:    s_and_b32 s0, s2, 15
2934; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
2935; GFX9-NEXT:    s_addk_i32 s0, 0x4004
2936; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
2937; GFX9-NEXT:    s_waitcnt vmcnt(0)
2938; GFX9-NEXT:    s_endpgm
2939;
2940; GFX10-LABEL: store_load_sindex_large_offset_foo:
2941; GFX10:       ; %bb.0: ; %bb
2942; GFX10-NEXT:    s_add_u32 s0, s0, s3
2943; GFX10-NEXT:    s_addc_u32 s1, s1, 0
2944; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2945; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2946; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
2947; GFX10-NEXT:    s_waitcnt vmcnt(0)
2948; GFX10-NEXT:    v_mov_b32_e32 v0, 15
2949; GFX10-NEXT:    s_and_b32 s0, s2, 15
2950; GFX10-NEXT:    s_lshl_b32 s1, s2, 2
2951; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
2952; GFX10-NEXT:    s_addk_i32 s1, 0x4004
2953; GFX10-NEXT:    s_addk_i32 s0, 0x4004
2954; GFX10-NEXT:    scratch_store_dword off, v0, s1
2955; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
2956; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
2957; GFX10-NEXT:    s_waitcnt vmcnt(0)
2958; GFX10-NEXT:    s_endpgm
2959;
2960; GFX11-LABEL: store_load_sindex_large_offset_foo:
2961; GFX11:       ; %bb.0: ; %bb
2962; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
2963; GFX11-NEXT:    s_waitcnt vmcnt(0)
2964; GFX11-NEXT:    v_mov_b32_e32 v0, 15
2965; GFX11-NEXT:    s_and_b32 s1, s0, 15
2966; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
2967; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
2968; GFX11-NEXT:    s_addk_i32 s0, 0x4004
2969; GFX11-NEXT:    s_addk_i32 s1, 0x4004
2970; GFX11-NEXT:    scratch_store_b32 off, v0, s0 dlc
2971; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
2972; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
2973; GFX11-NEXT:    s_waitcnt vmcnt(0)
2974; GFX11-NEXT:    s_endpgm
2975;
2976; GFX12-LABEL: store_load_sindex_large_offset_foo:
2977; GFX12:       ; %bb.0: ; %bb
2978; GFX12-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
2979; GFX12-NEXT:    s_wait_loadcnt 0x0
2980; GFX12-NEXT:    v_mov_b32_e32 v0, 15
2981; GFX12-NEXT:    s_and_b32 s1, s0, 15
2982; GFX12-NEXT:    s_lshl_b32 s0, s0, 2
2983; GFX12-NEXT:    s_lshl_b32 s1, s1, 2
2984; GFX12-NEXT:    s_addk_co_i32 s0, 0x4000
2985; GFX12-NEXT:    s_addk_co_i32 s1, 0x4000
2986; GFX12-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
2987; GFX12-NEXT:    s_wait_storecnt 0x0
2988; GFX12-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
2989; GFX12-NEXT:    s_wait_loadcnt 0x0
2990; GFX12-NEXT:    s_endpgm
2991;
2992; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
2993; GFX9-PAL:       ; %bb.0: ; %bb
2994; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
2995; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
2996; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
2997; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
2998; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
2999; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s1
3000; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
3001; GFX9-PAL-NEXT:    s_mov_b32 s1, 0
3002; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc
3003; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3004; GFX9-PAL-NEXT:    s_lshl_b32 s1, s0, 2
3005; GFX9-PAL-NEXT:    s_and_b32 s0, s0, 15
3006; GFX9-PAL-NEXT:    s_addk_i32 s1, 0x4004
3007; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
3008; GFX9-PAL-NEXT:    s_lshl_b32 s0, s0, 2
3009; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s1
3010; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3011; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
3012; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
3013; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3014; GFX9-PAL-NEXT:    s_endpgm
3015;
3016; GFX940-LABEL: store_load_sindex_large_offset_foo:
3017; GFX940:       ; %bb.0: ; %bb
3018; GFX940-NEXT:    scratch_load_dword v0, off, off offset:4 sc0 sc1
3019; GFX940-NEXT:    s_waitcnt vmcnt(0)
3020; GFX940-NEXT:    s_lshl_b32 s1, s0, 2
3021; GFX940-NEXT:    s_and_b32 s0, s0, 15
3022; GFX940-NEXT:    s_addk_i32 s1, 0x4004
3023; GFX940-NEXT:    v_mov_b32_e32 v0, 15
3024; GFX940-NEXT:    s_lshl_b32 s0, s0, 2
3025; GFX940-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
3026; GFX940-NEXT:    s_waitcnt vmcnt(0)
3027; GFX940-NEXT:    s_addk_i32 s0, 0x4004
3028; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
3029; GFX940-NEXT:    s_waitcnt vmcnt(0)
3030; GFX940-NEXT:    s_endpgm
3031;
3032; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
3033; GFX1010-PAL:       ; %bb.0: ; %bb
3034; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
3035; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
3036; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3037; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3038; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3039; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s1
3040; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
3041; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3042; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3043; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
3044; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 offset:4 glc dlc
3045; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
3046; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
3047; GFX1010-PAL-NEXT:    s_and_b32 s1, s0, 15
3048; GFX1010-PAL-NEXT:    s_lshl_b32 s0, s0, 2
3049; GFX1010-PAL-NEXT:    s_lshl_b32 s1, s1, 2
3050; GFX1010-PAL-NEXT:    s_addk_i32 s0, 0x4004
3051; GFX1010-PAL-NEXT:    s_addk_i32 s1, 0x4004
3052; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s0
3053; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3054; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
3055; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
3056; GFX1010-PAL-NEXT:    s_endpgm
3057;
3058; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
3059; GFX1030-PAL:       ; %bb.0: ; %bb
3060; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
3061; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
3062; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
3063; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3064; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
3065; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s1
3066; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
3067; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3068; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3069; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
3070; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
3071; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
3072; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
3073; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
3074; GFX1030-PAL-NEXT:    s_lshl_b32 s1, s1, 2
3075; GFX1030-PAL-NEXT:    s_addk_i32 s0, 0x4004
3076; GFX1030-PAL-NEXT:    s_addk_i32 s1, 0x4004
3077; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, s0
3078; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3079; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
3080; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
3081; GFX1030-PAL-NEXT:    s_endpgm
3082;
3083; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo:
3084; GFX11-PAL:       ; %bb.0: ; %bb
3085; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
3086; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3087; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
3088; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
3089; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
3090; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
3091; GFX11-PAL-NEXT:    s_addk_i32 s0, 0x4004
3092; GFX11-PAL-NEXT:    s_addk_i32 s1, 0x4004
3093; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s0 dlc
3094; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3095; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
3096; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3097; GFX11-PAL-NEXT:    s_endpgm
3098;
3099; GFX12-PAL-LABEL: store_load_sindex_large_offset_foo:
3100; GFX12-PAL:       ; %bb.0: ; %bb
3101; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off scope:SCOPE_SYS
3102; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
3103; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 15
3104; GFX12-PAL-NEXT:    s_and_b32 s1, s0, 15
3105; GFX12-PAL-NEXT:    s_lshl_b32 s0, s0, 2
3106; GFX12-PAL-NEXT:    s_lshl_b32 s1, s1, 2
3107; GFX12-PAL-NEXT:    s_addk_co_i32 s0, 0x4000
3108; GFX12-PAL-NEXT:    s_addk_co_i32 s1, 0x4000
3109; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s0 scope:SCOPE_SYS
3110; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3111; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s1 scope:SCOPE_SYS
3112; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
3113; GFX12-PAL-NEXT:    s_endpgm
3114bb:
  ; Two addrspace(5) allocas, the 4096-dword padding first and then the
  ; 32-element array that the indexed accesses below target.
3115  %padding = alloca [4096 x i32], align 4, addrspace(5)
3116  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably this keeps
  ; %padding from being dropped - TODO confirm.
3117  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
3118  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; Volatile store of 15 to %i[%idx].
3119  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
3120  store volatile i32 15, ptr addrspace(5) %i7, align 4
  ; Volatile load from %i[%idx & 15]; the loaded value is not used further.
3121  %i9 = and i32 %idx, 15
3122  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
3123  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
3124  ret void
3125}
3126
; Kernel test for a store and a load through a per-workitem index with a large
; stack offset. The index is the result of llvm.amdgcn.workitem.id.x; 15 is
; stored to %i[id] and a value is loaded from %i[31 - id]. The expectations do
; the address arithmetic with vector (v_*) instructions, and the final load
; carries offset:124 (31 * 4) together with a subtraction of the scaled id.
; As in the scalar-index tests, 0x4004 (0x4000 or offset:16384 in the gfx1200
; runs) shows up in the addresses used for %i; presumably that is the frame
; offset of %i past the 0x4000-byte %padding alloca - TODO confirm.
3127define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
3128; GFX9-LABEL: store_load_vindex_large_offset_kernel:
3129; GFX9:       ; %bb.0: ; %bb
3130; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
3131; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
3132; GFX9-NEXT:    s_mov_b32 s0, 0
3133; GFX9-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
3134; GFX9-NEXT:    s_waitcnt vmcnt(0)
3135; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3136; GFX9-NEXT:    v_add_u32_e32 v1, 0x4004, v0
3137; GFX9-NEXT:    v_mov_b32_e32 v2, 15
3138; GFX9-NEXT:    scratch_store_dword v1, v2, off
3139; GFX9-NEXT:    s_waitcnt vmcnt(0)
3140; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
3141; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
3142; GFX9-NEXT:    s_waitcnt vmcnt(0)
3143; GFX9-NEXT:    s_endpgm
3144;
3145; GFX10-LABEL: store_load_vindex_large_offset_kernel:
3146; GFX10:       ; %bb.0: ; %bb
3147; GFX10-NEXT:    s_add_u32 s8, s8, s13
3148; GFX10-NEXT:    s_addc_u32 s9, s9, 0
3149; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
3150; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
3151; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3152; GFX10-NEXT:    v_mov_b32_e32 v2, 15
3153; GFX10-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
3154; GFX10-NEXT:    s_waitcnt vmcnt(0)
3155; GFX10-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
3156; GFX10-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
3157; GFX10-NEXT:    scratch_store_dword v1, v2, off
3158; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3159; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
3160; GFX10-NEXT:    s_waitcnt vmcnt(0)
3161; GFX10-NEXT:    s_endpgm
3162;
3163; GFX11-LABEL: store_load_vindex_large_offset_kernel:
3164; GFX11:       ; %bb.0: ; %bb
3165; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
3166; GFX11-NEXT:    s_movk_i32 s0, 0x4004
3167; GFX11-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
3168; GFX11-NEXT:    s_waitcnt vmcnt(0)
3169; GFX11-NEXT:    v_and_b32_e32 v0, 0xffc, v0
3170; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3171; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
3172; GFX11-NEXT:    scratch_store_b32 v0, v1, s0 dlc
3173; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3174; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
3175; GFX11-NEXT:    s_waitcnt vmcnt(0)
3176; GFX11-NEXT:    s_endpgm
3177;
3178; GFX12-LABEL: store_load_vindex_large_offset_kernel:
3179; GFX12:       ; %bb.0: ; %bb
3180; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
3181; GFX12-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
3182; GFX12-NEXT:    s_wait_loadcnt 0x0
3183; GFX12-NEXT:    v_and_b32_e32 v0, 0xffc, v0
3184; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3185; GFX12-NEXT:    v_sub_nc_u32_e32 v2, 0x4000, v0
3186; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
3187; GFX12-NEXT:    s_wait_storecnt 0x0
3188; GFX12-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
3189; GFX12-NEXT:    s_wait_loadcnt 0x0
3190; GFX12-NEXT:    s_endpgm
3191;
3192; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
3193; GFX9-PAL:       ; %bb.0: ; %bb
3194; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
3195; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
3196; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
3197; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
3198; GFX9-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3199; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
3200; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3201; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
3202; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
3203; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
3204; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s0 offset:4 glc
3205; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3206; GFX9-PAL-NEXT:    v_add_u32_e32 v1, 0x4004, v0
3207; GFX9-PAL-NEXT:    scratch_store_dword v1, v2, off
3208; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3209; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
3210; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
3211; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3212; GFX9-PAL-NEXT:    s_endpgm
3213;
3214; GFX940-LABEL: store_load_vindex_large_offset_kernel:
3215; GFX940:       ; %bb.0: ; %bb
3216; GFX940-NEXT:    scratch_load_dword v1, off, off offset:4 sc0 sc1
3217; GFX940-NEXT:    s_waitcnt vmcnt(0)
3218; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3219; GFX940-NEXT:    v_and_b32_e32 v0, 0xffc, v0
3220; GFX940-NEXT:    v_mov_b32_e32 v1, 15
3221; GFX940-NEXT:    s_movk_i32 s0, 0x4004
3222; GFX940-NEXT:    scratch_store_dword v0, v1, s0 sc0 sc1
3223; GFX940-NEXT:    s_waitcnt vmcnt(0)
3224; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
3225; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
3226; GFX940-NEXT:    s_waitcnt vmcnt(0)
3227; GFX940-NEXT:    s_endpgm
3228;
3229; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
3230; GFX1010-PAL:       ; %bb.0: ; %bb
3231; GFX1010-PAL-NEXT:    s_getpc_b64 s[12:13]
3232; GFX1010-PAL-NEXT:    s_mov_b32 s12, s0
3233; GFX1010-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
3234; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3235; GFX1010-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
3236; GFX1010-PAL-NEXT:    s_add_u32 s12, s12, s11
3237; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
3238; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
3239; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
3240; GFX1010-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3241; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, 15
3242; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
3243; GFX1010-PAL-NEXT:    scratch_load_dword v3, off, s0 offset:4 glc dlc
3244; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
3245; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
3246; GFX1010-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
3247; GFX1010-PAL-NEXT:    scratch_store_dword v1, v2, off
3248; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3249; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
3250; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
3251; GFX1010-PAL-NEXT:    s_endpgm
3252;
3253; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
3254; GFX1030-PAL:       ; %bb.0: ; %bb
3255; GFX1030-PAL-NEXT:    s_getpc_b64 s[12:13]
3256; GFX1030-PAL-NEXT:    s_mov_b32 s12, s0
3257; GFX1030-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
3258; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3259; GFX1030-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
3260; GFX1030-PAL-NEXT:    s_add_u32 s12, s12, s11
3261; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
3262; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
3263; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
3264; GFX1030-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3265; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, 15
3266; GFX1030-PAL-NEXT:    scratch_load_dword v3, off, off offset:4 glc dlc
3267; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
3268; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v1, 0x4004, v0
3269; GFX1030-PAL-NEXT:    v_sub_nc_u32_e32 v0, 0x4004, v0
3270; GFX1030-PAL-NEXT:    scratch_store_dword v1, v2, off
3271; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3272; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
3273; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
3274; GFX1030-PAL-NEXT:    s_endpgm
3275;
3276; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
3277; GFX11-PAL:       ; %bb.0: ; %bb
3278; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
3279; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x4004
3280; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, off offset:4 glc dlc
3281; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3282; GFX11-PAL-NEXT:    v_and_b32_e32 v0, 0xffc, v0
3283; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3284; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
3285; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, s0 dlc
3286; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3287; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
3288; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3289; GFX11-PAL-NEXT:    s_endpgm
3290;
3291; GFX12-PAL-LABEL: store_load_vindex_large_offset_kernel:
3292; GFX12-PAL:       ; %bb.0: ; %bb
3293; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0
3294; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, off scope:SCOPE_SYS
3295; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
3296; GFX12-PAL-NEXT:    v_and_b32_e32 v0, 0xffc, v0
3297; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3298; GFX12-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4000, v0
3299; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:16384 scope:SCOPE_SYS
3300; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3301; GFX12-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 scope:SCOPE_SYS
3302; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
3303; GFX12-PAL-NEXT:    s_endpgm
3304bb:
  ; Two addrspace(5) allocas, the 4096-dword padding first and then the
  ; 32-element array that the indexed accesses below target.
3305  %padding = alloca [4096 x i32], align 4, addrspace(5)
3306  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; presumably this keeps
  ; %padding from being dropped - TODO confirm.
3307  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
3308  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; %i2 is the workitem id used as the element index below.
3309  %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
  ; %i3 is not referenced by any other instruction in this function.
3310  %i3 = zext i32 %i2 to i64
  ; Volatile store of 15 to %i[%i2].
3311  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
3312  store volatile i32 15, ptr addrspace(5) %i7, align 4
  ; Volatile load from %i[31 - %i2]; the loaded value is not used further.
3313  %i9 = sub nsw i32 31, %i2
3314  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
3315  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
3316  ret void
3317}
3318
; Callable-function test: volatile store to, and volatile load from, a
; [32 x float] addrspace(5) array at a variable index taken from the %idx
; argument, with a 4096 x i32 padding alloca declared before the array so that
; the array is addressed at a large frame offset (see the assertions below).
3319define void @store_load_vindex_large_offset_foo(i32 %idx) {
3320; GFX9-LABEL: store_load_vindex_large_offset_foo:
3321; GFX9:       ; %bb.0: ; %bb
3322; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3323; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
3324; GFX9-NEXT:    s_waitcnt vmcnt(0)
3325; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
3326; GFX9-NEXT:    v_mov_b32_e32 v1, s0
3327; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
3328; GFX9-NEXT:    v_mov_b32_e32 v3, 15
3329; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
3330; GFX9-NEXT:    scratch_store_dword v2, v3, off
3331; GFX9-NEXT:    s_waitcnt vmcnt(0)
3332; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
3333; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
3334; GFX9-NEXT:    s_waitcnt vmcnt(0)
3335; GFX9-NEXT:    s_setpc_b64 s[30:31]
3336;
3337; GFX10-LABEL: store_load_vindex_large_offset_foo:
3338; GFX10:       ; %bb.0: ; %bb
3339; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3340; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
3341; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
3342; GFX10-NEXT:    v_mov_b32_e32 v2, 15
3343; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
3344; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
3345; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
3346; GFX10-NEXT:    s_waitcnt vmcnt(0)
3347; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
3348; GFX10-NEXT:    scratch_store_dword v0, v2, off
3349; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3350; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
3351; GFX10-NEXT:    s_waitcnt vmcnt(0)
3352; GFX10-NEXT:    s_setpc_b64 s[30:31]
3353;
3354; GFX11-LABEL: store_load_vindex_large_offset_foo:
3355; GFX11:       ; %bb.0: ; %bb
3356; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3357; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
3358; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
3359; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
3360; GFX11-NEXT:    s_waitcnt vmcnt(0)
3361; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
3362; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
3363; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
3364; GFX11-NEXT:    scratch_store_b32 v0, v2, off dlc
3365; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3366; GFX11-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
3367; GFX11-NEXT:    s_waitcnt vmcnt(0)
3368; GFX11-NEXT:    s_setpc_b64 s[30:31]
3369;
3370; GFX12-LABEL: store_load_vindex_large_offset_foo:
3371; GFX12:       ; %bb.0: ; %bb
3372; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3373; GFX12-NEXT:    s_wait_expcnt 0x0
3374; GFX12-NEXT:    s_wait_samplecnt 0x0
3375; GFX12-NEXT:    s_wait_bvhcnt 0x0
3376; GFX12-NEXT:    s_wait_kmcnt 0x0
3377; GFX12-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
3378; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3379; GFX12-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
3380; GFX12-NEXT:    s_wait_loadcnt 0x0
3381; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
3382; GFX12-NEXT:    s_wait_storecnt 0x0
3383; GFX12-NEXT:    scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
3384; GFX12-NEXT:    s_wait_storecnt 0x0
3385; GFX12-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
3386; GFX12-NEXT:    s_wait_loadcnt 0x0
3387; GFX12-NEXT:    s_setpc_b64 s[30:31]
3388;
3389; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
3390; GFX9-PAL:       ; %bb.0: ; %bb
3391; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3392; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
3393; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3394; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
3395; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s0
3396; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
3397; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
3398; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
3399; GFX9-PAL-NEXT:    scratch_store_dword v2, v3, off
3400; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3401; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
3402; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
3403; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3404; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3405;
3406; GFX940-LABEL: store_load_vindex_large_offset_foo:
3407; GFX940:       ; %bb.0: ; %bb
3408; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3409; GFX940-NEXT:    scratch_load_dword v1, off, s32 offset:4 sc0 sc1
3410; GFX940-NEXT:    s_waitcnt vmcnt(0)
3411; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
3412; GFX940-NEXT:    v_mov_b32_e32 v1, s0
3413; GFX940-NEXT:    v_lshl_add_u32 v1, v0, 2, v1
3414; GFX940-NEXT:    v_mov_b32_e32 v2, 15
3415; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
3416; GFX940-NEXT:    scratch_store_dword v1, v2, off sc0 sc1
3417; GFX940-NEXT:    s_waitcnt vmcnt(0)
3418; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3419; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
3420; GFX940-NEXT:    scratch_load_dword v0, v0, s0 sc0 sc1
3421; GFX940-NEXT:    s_waitcnt vmcnt(0)
3422; GFX940-NEXT:    s_setpc_b64 s[30:31]
3423;
3424; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
3425; GFX10-PAL:       ; %bb.0: ; %bb
3426; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3427; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
3428; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
3429; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
3430; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
3431; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
3432; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
3433; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3434; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
3435; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
3436; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3437; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
3438; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3439; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3440;
3441; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo:
3442; GFX11-PAL:       ; %bb.0: ; %bb
3443; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3444; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
3445; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
3446; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
3447; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3448; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
3449; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
3450; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
3451; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, off dlc
3452; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3453; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s0 glc dlc
3454; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3455; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3456;
3457; GFX12-PAL-LABEL: store_load_vindex_large_offset_foo:
3458; GFX12-PAL:       ; %bb.0: ; %bb
3459; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
3460; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
3461; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
3462; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
3463; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
3464; GFX12-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
3465; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
3466; GFX12-PAL-NEXT:    scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
3467; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
3468; GFX12-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
3469; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3470; GFX12-PAL-NEXT:    scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
3471; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3472; GFX12-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
3473; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
3474; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
3475bb:
  ; 4096 x i32 = 16384 (0x4000) bytes of padding. The assertions above address
  ; %i at s32 + 0x4004 (gfx9, gfx10, gfx11, gfx940) or at s32 with offset 16384
  ; (gfx12).
3476  %padding = alloca [4096 x i32], align 4, addrspace(5)
3477  %i = alloca [32 x float], align 4, addrspace(5)
  ; Volatile load from %padding at an undef index; in the assertions it is the
  ; scratch load based directly on s32.
3478  %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
3479  %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
  ; Store the constant 15 to %i[%idx].
3480  %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
3481  store volatile i32 15, ptr addrspace(5) %i7, align 4
  ; Reload from %i[%idx & 15]; the result is unused, the access is kept only
  ; because it is volatile.
3482  %i9 = and i32 %idx, 15
3483  %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
3484  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
3485  ret void
3486}
3487
; Kernel test: volatile store and load at constant element index 4000 of a
; 4096 x i32 addrspace(5) array, i.e. byte offset 4000 * 4 = 16000 from the
; array start. The assertions record how each target forms that address:
;   - gfx12 encodes 16000 directly as the instruction offset;
;   - gfx9 uses an SGPR base of 0x3000 + 4 plus offset 3712;
;   - gfx10 uses an SGPR base of 0x3800 + 4 plus offset 1664;
;   - gfx11 and gfx940 put 0x3000 in a VGPR and use offset 3716.
; Each split form sums to 16004, the array being addressed at offset 4 there.
3488define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
3489; GFX9-LABEL: store_load_large_imm_offset_kernel:
3490; GFX9:       ; %bb.0: ; %bb
3491; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
3492; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
3493; GFX9-NEXT:    v_mov_b32_e32 v0, 13
3494; GFX9-NEXT:    s_mov_b32 s0, 0
3495; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:4
3496; GFX9-NEXT:    s_waitcnt vmcnt(0)
3497; GFX9-NEXT:    s_movk_i32 s0, 0x3000
3498; GFX9-NEXT:    s_add_i32 s0, s0, 4
3499; GFX9-NEXT:    v_mov_b32_e32 v0, 15
3500; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3501; GFX9-NEXT:    s_waitcnt vmcnt(0)
3502; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3503; GFX9-NEXT:    s_waitcnt vmcnt(0)
3504; GFX9-NEXT:    s_endpgm
3505;
3506; GFX10-LABEL: store_load_large_imm_offset_kernel:
3507; GFX10:       ; %bb.0: ; %bb
3508; GFX10-NEXT:    s_add_u32 s8, s8, s13
3509; GFX10-NEXT:    s_addc_u32 s9, s9, 0
3510; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
3511; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
3512; GFX10-NEXT:    v_mov_b32_e32 v0, 13
3513; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3514; GFX10-NEXT:    s_movk_i32 s0, 0x3800
3515; GFX10-NEXT:    s_add_i32 s0, s0, 4
3516; GFX10-NEXT:    scratch_store_dword off, v0, off offset:4
3517; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3518; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3519; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3520; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3521; GFX10-NEXT:    s_waitcnt vmcnt(0)
3522; GFX10-NEXT:    s_endpgm
3523;
3524; GFX11-LABEL: store_load_large_imm_offset_kernel:
3525; GFX11:       ; %bb.0: ; %bb
3526; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
3527; GFX11-NEXT:    v_mov_b32_e32 v2, 15
3528; GFX11-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
3529; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3530; GFX11-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
3531; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3532; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
3533; GFX11-NEXT:    s_waitcnt vmcnt(0)
3534; GFX11-NEXT:    s_endpgm
3535;
3536; GFX12-LABEL: store_load_large_imm_offset_kernel:
3537; GFX12:       ; %bb.0: ; %bb
3538; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3539; GFX12-NEXT:    scratch_store_b32 off, v0, off scope:SCOPE_SYS
3540; GFX12-NEXT:    s_wait_storecnt 0x0
3541; GFX12-NEXT:    scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
3542; GFX12-NEXT:    s_wait_storecnt 0x0
3543; GFX12-NEXT:    scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
3544; GFX12-NEXT:    s_wait_loadcnt 0x0
3545; GFX12-NEXT:    s_endpgm
3546;
3547; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
3548; GFX9-PAL:       ; %bb.0: ; %bb
3549; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
3550; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
3551; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
3552; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
3553; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
3554; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3555; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
3556; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
3557; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
3558; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:4
3559; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3560; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
3561; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
3562; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
3563; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3564; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3565; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3566; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3567; GFX9-PAL-NEXT:    s_endpgm
3568;
3569; GFX940-LABEL: store_load_large_imm_offset_kernel:
3570; GFX940:       ; %bb.0: ; %bb
3571; GFX940-NEXT:    v_mov_b32_e32 v0, 13
3572; GFX940-NEXT:    scratch_store_dword off, v0, off offset:4 sc0 sc1
3573; GFX940-NEXT:    s_waitcnt vmcnt(0)
3574; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
3575; GFX940-NEXT:    v_mov_b32_e32 v1, 15
3576; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:3716 sc0 sc1
3577; GFX940-NEXT:    s_waitcnt vmcnt(0)
3578; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
3579; GFX940-NEXT:    s_waitcnt vmcnt(0)
3580; GFX940-NEXT:    s_endpgm
3581;
3582; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
3583; GFX1010-PAL:       ; %bb.0: ; %bb
3584; GFX1010-PAL-NEXT:    s_getpc_b64 s[12:13]
3585; GFX1010-PAL-NEXT:    s_mov_b32 s12, s0
3586; GFX1010-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
3587; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3588; GFX1010-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
3589; GFX1010-PAL-NEXT:    s_add_u32 s12, s12, s11
3590; GFX1010-PAL-NEXT:    s_addc_u32 s13, s13, 0
3591; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
3592; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
3593; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 13
3594; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 15
3595; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x3800
3596; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
3597; GFX1010-PAL-NEXT:    s_add_i32 s0, s0, 4
3598; GFX1010-PAL-NEXT:    scratch_store_dword off, v0, s1 offset:4
3599; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3600; GFX1010-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3601; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3602; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3603; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
3604; GFX1010-PAL-NEXT:    s_endpgm
3605;
3606; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
3607; GFX1030-PAL:       ; %bb.0: ; %bb
3608; GFX1030-PAL-NEXT:    s_getpc_b64 s[12:13]
3609; GFX1030-PAL-NEXT:    s_mov_b32 s12, s0
3610; GFX1030-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
3611; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3612; GFX1030-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
3613; GFX1030-PAL-NEXT:    s_add_u32 s12, s12, s11
3614; GFX1030-PAL-NEXT:    s_addc_u32 s13, s13, 0
3615; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
3616; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
3617; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 13
3618; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 15
3619; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x3800
3620; GFX1030-PAL-NEXT:    s_add_i32 s0, s0, 4
3621; GFX1030-PAL-NEXT:    scratch_store_dword off, v0, off offset:4
3622; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3623; GFX1030-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3624; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3625; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3626; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
3627; GFX1030-PAL-NEXT:    s_endpgm
3628;
3629; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
3630; GFX11-PAL:       ; %bb.0: ; %bb
3631; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
3632; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
3633; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, off offset:4 dlc
3634; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3635; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, off offset:3716 dlc
3636; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3637; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
3638; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3639; GFX11-PAL-NEXT:    s_endpgm
3640;
3641; GFX12-PAL-LABEL: store_load_large_imm_offset_kernel:
3642; GFX12-PAL:       ; %bb.0: ; %bb
3643; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3644; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, off scope:SCOPE_SYS
3645; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3646; GFX12-PAL-NEXT:    scratch_store_b32 off, v1, off offset:16000 scope:SCOPE_SYS
3647; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3648; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, off offset:16000 scope:SCOPE_SYS
3649; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
3650; GFX12-PAL-NEXT:    s_endpgm
3651bb:
3652  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 at an undef index; the assertions show it emitted at
  ; the start of the array.
3653  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
3654  store volatile i32 13, ptr addrspace(5) %i1, align 4
  ; Volatile store of 15 to element 4000, followed by a volatile reload of the
  ; same element through a second, identical GEP.
3655  %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
3656  store volatile i32 15, ptr addrspace(5) %i7, align 4
3657  %i10 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
3658  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
3659  ret void
3660}
3661
; Callable-function counterpart of the large-immediate-offset test: volatile
; store and load at constant element index 4000 (byte offset 16000) of a
; 4096 x i32 addrspace(5) array. Because this is not an amdgpu_kernel, the
; assertions address the array relative to s32 and return with s_setpc_b64.
; gfx12 encodes 16000 directly as the instruction offset; the other targets
; split the constant between a register and a smaller immediate offset.
3662define void @store_load_large_imm_offset_foo() {
3663; GFX9-LABEL: store_load_large_imm_offset_foo:
3664; GFX9:       ; %bb.0: ; %bb
3665; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3666; GFX9-NEXT:    s_movk_i32 s0, 0x3000
3667; GFX9-NEXT:    v_mov_b32_e32 v0, 13
3668; GFX9-NEXT:    s_add_i32 s1, s32, s0
3669; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
3670; GFX9-NEXT:    s_waitcnt vmcnt(0)
3671; GFX9-NEXT:    s_add_i32 s0, s1, 4
3672; GFX9-NEXT:    v_mov_b32_e32 v0, 15
3673; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3674; GFX9-NEXT:    s_waitcnt vmcnt(0)
3675; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3676; GFX9-NEXT:    s_waitcnt vmcnt(0)
3677; GFX9-NEXT:    s_setpc_b64 s[30:31]
3678;
3679; GFX10-LABEL: store_load_large_imm_offset_foo:
3680; GFX10:       ; %bb.0: ; %bb
3681; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3682; GFX10-NEXT:    v_mov_b32_e32 v0, 13
3683; GFX10-NEXT:    s_movk_i32 s0, 0x3800
3684; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3685; GFX10-NEXT:    s_add_i32 s1, s32, s0
3686; GFX10-NEXT:    s_add_i32 s0, s1, 4
3687; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
3688; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3689; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3690; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3691; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3692; GFX10-NEXT:    s_waitcnt vmcnt(0)
3693; GFX10-NEXT:    s_setpc_b64 s[30:31]
3694;
3695; GFX11-LABEL: store_load_large_imm_offset_foo:
3696; GFX11:       ; %bb.0: ; %bb
3697; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3698; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
3699; GFX11-NEXT:    v_mov_b32_e32 v2, 15
3700; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
3701; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3702; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
3703; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3704; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
3705; GFX11-NEXT:    s_waitcnt vmcnt(0)
3706; GFX11-NEXT:    s_setpc_b64 s[30:31]
3707;
3708; GFX12-LABEL: store_load_large_imm_offset_foo:
3709; GFX12:       ; %bb.0: ; %bb
3710; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
3711; GFX12-NEXT:    s_wait_expcnt 0x0
3712; GFX12-NEXT:    s_wait_samplecnt 0x0
3713; GFX12-NEXT:    s_wait_bvhcnt 0x0
3714; GFX12-NEXT:    s_wait_kmcnt 0x0
3715; GFX12-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3716; GFX12-NEXT:    s_wait_storecnt 0x0
3717; GFX12-NEXT:    scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
3718; GFX12-NEXT:    s_wait_storecnt 0x0
3719; GFX12-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
3720; GFX12-NEXT:    s_wait_storecnt 0x0
3721; GFX12-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
3722; GFX12-NEXT:    s_wait_loadcnt 0x0
3723; GFX12-NEXT:    s_setpc_b64 s[30:31]
3724;
3725; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
3726; GFX9-PAL:       ; %bb.0: ; %bb
3727; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3728; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
3729; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
3730; GFX9-PAL-NEXT:    s_add_i32 s1, s32, s0
3731; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
3732; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3733; GFX9-PAL-NEXT:    s_add_i32 s0, s1, 4
3734; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
3735; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
3736; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3737; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
3738; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3739; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
3740;
3741; GFX940-LABEL: store_load_large_imm_offset_foo:
3742; GFX940:       ; %bb.0: ; %bb
3743; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3744; GFX940-NEXT:    v_mov_b32_e32 v0, 13
3745; GFX940-NEXT:    scratch_store_dword off, v0, s32 offset:4 sc0 sc1
3746; GFX940-NEXT:    s_waitcnt vmcnt(0)
3747; GFX940-NEXT:    v_mov_b32_e32 v0, 0x3000
3748; GFX940-NEXT:    v_mov_b32_e32 v1, 15
3749; GFX940-NEXT:    scratch_store_dword v0, v1, s32 offset:3716 sc0 sc1
3750; GFX940-NEXT:    s_waitcnt vmcnt(0)
3751; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
3752; GFX940-NEXT:    s_waitcnt vmcnt(0)
3753; GFX940-NEXT:    s_setpc_b64 s[30:31]
3754;
3755; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
3756; GFX10-PAL:       ; %bb.0: ; %bb
3757; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3758; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
3759; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
3760; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
3761; GFX10-PAL-NEXT:    s_add_i32 s1, s32, s0
3762; GFX10-PAL-NEXT:    s_add_i32 s0, s1, 4
3763; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
3764; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3765; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
3766; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3767; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
3768; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3769; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
3770;
3771; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
3772; GFX11-PAL:       ; %bb.0: ; %bb
3773; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3774; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
3775; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
3776; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
3777; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3778; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
3779; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3780; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
3781; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3782; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
3783;
3784; GFX12-PAL-LABEL: store_load_large_imm_offset_foo:
3785; GFX12-PAL:       ; %bb.0: ; %bb
3786; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
3787; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
3788; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
3789; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
3790; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
3791; GFX12-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3792; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3793; GFX12-PAL-NEXT:    scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
3794; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3795; GFX12-PAL-NEXT:    scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
3796; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3797; GFX12-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:16000 scope:SCOPE_SYS
3798; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
3799; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
3800bb:
3801  %i = alloca [4096 x i32], align 4, addrspace(5)
  ; Volatile store of 13 at an undef index; the assertions show it emitted at
  ; the start of the array (based directly on s32).
3802  %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
3803  store volatile i32 13, ptr addrspace(5) %i1, align 4
  ; Volatile store of 15 to element 4000, followed by a volatile reload of the
  ; same element through a second, identical GEP.
3804  %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
3805  store volatile i32 15, ptr addrspace(5) %i7, align 4
3806  %i10 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
3807  %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
3808  ret void
3809}
3810
; Kernel test: index a [32 x i32] addrspace(5) array with
; (%sidx + workitem.id.x + 256), then volatile-store 15 and volatile-reload
; through the same pointer. In the assertions the constant 256-element term
; appears on every target as the immediate offset 1024 (256 * 4 bytes), while
; the variable part (%sidx plus the work-item id) is computed into v0.
; NOTE(review): an index of 256 or more is beyond the 32 declared elements
; whenever %sidx + id is non-negative; presumably intentional, to force a
; non-zero immediate offset - confirm.
3811define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
3812; GFX9-LABEL: store_load_vidx_sidx_offset:
3813; GFX9:       ; %bb.0: ; %bb
3814; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x24
3815; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s8, s13
3816; GFX9-NEXT:    v_mov_b32_e32 v1, 0
3817; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s9, 0
3818; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
3819; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
3820; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
3821; GFX9-NEXT:    v_mov_b32_e32 v1, 15
3822; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
3823; GFX9-NEXT:    s_waitcnt vmcnt(0)
3824; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
3825; GFX9-NEXT:    s_waitcnt vmcnt(0)
3826; GFX9-NEXT:    s_endpgm
3827;
3828; GFX10-LABEL: store_load_vidx_sidx_offset:
3829; GFX10:       ; %bb.0: ; %bb
3830; GFX10-NEXT:    s_add_u32 s8, s8, s13
3831; GFX10-NEXT:    s_addc_u32 s9, s9, 0
3832; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s8
3833; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s9
3834; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x24
3835; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3836; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
3837; GFX10-NEXT:    v_add_nc_u32_e32 v0, s0, v0
3838; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, 0
3839; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:1024
3840; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3841; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
3842; GFX10-NEXT:    s_waitcnt vmcnt(0)
3843; GFX10-NEXT:    s_endpgm
3844;
3845; GFX11-LABEL: store_load_vidx_sidx_offset:
3846; GFX11:       ; %bb.0: ; %bb
3847; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
3848; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
3849; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
3850; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3851; GFX11-NEXT:    v_add_nc_u32_e32 v0, s0, v0
3852; GFX11-NEXT:    v_lshl_add_u32 v0, v0, 2, 0
3853; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:1024 dlc
3854; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3855; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
3856; GFX11-NEXT:    s_waitcnt vmcnt(0)
3857; GFX11-NEXT:    s_endpgm
3858;
3859; GFX12-LABEL: store_load_vidx_sidx_offset:
3860; GFX12:       ; %bb.0: ; %bb
3861; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x24
3862; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
3863; GFX12-NEXT:    s_wait_kmcnt 0x0
3864; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3865; GFX12-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
3866; GFX12-NEXT:    scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
3867; GFX12-NEXT:    s_wait_storecnt 0x0
3868; GFX12-NEXT:    scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
3869; GFX12-NEXT:    s_wait_loadcnt 0x0
3870; GFX12-NEXT:    s_endpgm
3871;
3872; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
3873; GFX9-PAL:       ; %bb.0: ; %bb
3874; GFX9-PAL-NEXT:    s_getpc_b64 s[12:13]
3875; GFX9-PAL-NEXT:    s_mov_b32 s12, s0
3876; GFX9-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
3877; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 0
3878; GFX9-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
3879; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3880; GFX9-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
3881; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s12, s11
3882; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
3883; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
3884; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
3885; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
3886; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
3887; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3888; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
3889; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
3890; GFX9-PAL-NEXT:    s_endpgm
3891;
3892; GFX940-LABEL: store_load_vidx_sidx_offset:
3893; GFX940:       ; %bb.0: ; %bb
3894; GFX940-NEXT:    s_load_dword s0, s[4:5], 0x24
3895; GFX940-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
3896; GFX940-NEXT:    v_mov_b32_e32 v1, 0
3897; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
3898; GFX940-NEXT:    v_add_u32_e32 v0, s0, v0
3899; GFX940-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
3900; GFX940-NEXT:    v_mov_b32_e32 v1, 15
3901; GFX940-NEXT:    scratch_store_dword v0, v1, off offset:1024 sc0 sc1
3902; GFX940-NEXT:    s_waitcnt vmcnt(0)
3903; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1024 sc0 sc1
3904; GFX940-NEXT:    s_waitcnt vmcnt(0)
3905; GFX940-NEXT:    s_endpgm
3906;
3907; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
3908; GFX10-PAL:       ; %bb.0: ; %bb
3909; GFX10-PAL-NEXT:    s_getpc_b64 s[12:13]
3910; GFX10-PAL-NEXT:    s_mov_b32 s12, s0
3911; GFX10-PAL-NEXT:    s_load_dwordx2 s[12:13], s[12:13], 0x0
3912; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3913; GFX10-PAL-NEXT:    s_and_b32 s13, s13, 0xffff
3914; GFX10-PAL-NEXT:    s_add_u32 s12, s12, s11
3915; GFX10-PAL-NEXT:    s_addc_u32 s13, s13, 0
3916; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
3917; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
3918; GFX10-PAL-NEXT:    s_load_dword s0, s[4:5], 0x0
3919; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
3920; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3921; GFX10-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
3922; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 0
3923; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
3924; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3925; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
3926; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
3927; GFX10-PAL-NEXT:    s_endpgm
3928;
3929; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
3930; GFX11-PAL:       ; %bb.0: ; %bb
3931; GFX11-PAL-NEXT:    s_load_b32 s0, s[4:5], 0x0
3932; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
3933; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
3934; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3935; GFX11-PAL-NEXT:    v_add_nc_u32_e32 v0, s0, v0
3936; GFX11-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, 0
3937; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1024 dlc
3938; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
3939; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1024 glc dlc
3940; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
3941; GFX11-PAL-NEXT:    s_endpgm
3942;
3943; GFX12-PAL-LABEL: store_load_vidx_sidx_offset:
3944; GFX12-PAL:       ; %bb.0: ; %bb
3945; GFX12-PAL-NEXT:    s_load_b32 s0, s[4:5], 0x0
3946; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_and_b32 v0, 0x3ff, v0
3947; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
3948; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
3949; GFX12-PAL-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
3950; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:1024 scope:SCOPE_SYS
3951; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
3952; GFX12-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1024 scope:SCOPE_SYS
3953; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
3954; GFX12-PAL-NEXT:    s_endpgm
3955bb:
3956  %alloca = alloca [32 x i32], align 4, addrspace(5)
  ; Element index = kernel argument %sidx + work-item id x + constant 256.
3957  %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
3958  %add1 = add nsw i32 %sidx, %vidx
3959  %add2 = add nsw i32 %add1, 256
3960  %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %add2
  ; Volatile store then volatile load through the same pointer; the loaded
  ; value is unused.
3961  store volatile i32 15, ptr addrspace(5) %gep, align 4
3962  %load = load volatile i32, ptr addrspace(5) %gep, align 4
3963  ret void
3964}
3965
3966define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
3967; GFX9-LABEL: store_load_i64_aligned:
3968; GFX9:       ; %bb.0: ; %bb
3969; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3970; GFX9-NEXT:    v_mov_b32_e32 v1, 15
3971; GFX9-NEXT:    v_mov_b32_e32 v2, 0
3972; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3973; GFX9-NEXT:    s_waitcnt vmcnt(0)
3974; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
3975; GFX9-NEXT:    s_waitcnt vmcnt(0)
3976; GFX9-NEXT:    s_setpc_b64 s[30:31]
3977;
3978; GFX10-LABEL: store_load_i64_aligned:
3979; GFX10:       ; %bb.0: ; %bb
3980; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3981; GFX10-NEXT:    v_mov_b32_e32 v1, 15
3982; GFX10-NEXT:    v_mov_b32_e32 v2, 0
3983; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
3984; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
3985; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
3986; GFX10-NEXT:    s_waitcnt vmcnt(0)
3987; GFX10-NEXT:    s_setpc_b64 s[30:31]
3988;
3989; GFX11-LABEL: store_load_i64_aligned:
3990; GFX11:       ; %bb.0: ; %bb
3991; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3992; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
3993; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
3994; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
3995; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
3996; GFX11-NEXT:    s_waitcnt vmcnt(0)
3997; GFX11-NEXT:    s_setpc_b64 s[30:31]
3998;
3999; GFX12-LABEL: store_load_i64_aligned:
4000; GFX12:       ; %bb.0: ; %bb
4001; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4002; GFX12-NEXT:    s_wait_expcnt 0x0
4003; GFX12-NEXT:    s_wait_samplecnt 0x0
4004; GFX12-NEXT:    s_wait_bvhcnt 0x0
4005; GFX12-NEXT:    s_wait_kmcnt 0x0
4006; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4007; GFX12-NEXT:    s_wait_storecnt 0x0
4008; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
4009; GFX12-NEXT:    s_wait_storecnt 0x0
4010; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
4011; GFX12-NEXT:    s_wait_loadcnt 0x0
4012; GFX12-NEXT:    s_setpc_b64 s[30:31]
4013;
4014; GFX9-PAL-LABEL: store_load_i64_aligned:
4015; GFX9-PAL:       ; %bb.0: ; %bb
4016; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4017; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
4018; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
4019; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
4020; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4021; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
4022; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4023; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4024;
4025; GFX940-LABEL: store_load_i64_aligned:
4026; GFX940:       ; %bb.0: ; %bb
4027; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4028; GFX940-NEXT:    v_mov_b32_e32 v2, 15
4029; GFX940-NEXT:    v_mov_b32_e32 v3, 0
4030; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
4031; GFX940-NEXT:    s_waitcnt vmcnt(0)
4032; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
4033; GFX940-NEXT:    s_waitcnt vmcnt(0)
4034; GFX940-NEXT:    s_setpc_b64 s[30:31]
4035;
4036; GFX10-PAL-LABEL: store_load_i64_aligned:
4037; GFX10-PAL:       ; %bb.0: ; %bb
4038; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4039; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
4040; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
4041; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
4042; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4043; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
4044; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
4045; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
4046;
4047; GFX11-PAL-LABEL: store_load_i64_aligned:
4048; GFX11-PAL:       ; %bb.0: ; %bb
4049; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4050; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4051; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
4052; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4053; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
4054; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4055; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4056;
4057; GFX12-PAL-LABEL: store_load_i64_aligned:
4058; GFX12-PAL:       ; %bb.0: ; %bb
4059; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
4060; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
4061; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
4062; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
4063; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
4064; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4065; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4066; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
4067; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4068; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
4069; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
4070; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
4071bb:
4072  store volatile i64 15, ptr addrspace(5) %arg, align 8
4073  %load = load volatile i64, ptr addrspace(5) %arg, align 8
4074  ret void
4075}
4076
; Volatile i64 store of the constant 15 followed by a volatile i64 load through
; the private (addrspace 5) pointer %arg, both marked align 1.
; The checks below expect every tested subtarget to keep the access as one
; 64-bit scratch store and one 64-bit scratch load (dwordx2 / b64) even though
; only byte alignment is promised; the expected sequences are the same as the
; ones checked for store_load_i64_aligned.
4077define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
4078; GFX9-LABEL: store_load_i64_unaligned:
4079; GFX9:       ; %bb.0: ; %bb
4080; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4081; GFX9-NEXT:    v_mov_b32_e32 v1, 15
4082; GFX9-NEXT:    v_mov_b32_e32 v2, 0
4083; GFX9-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
4084; GFX9-NEXT:    s_waitcnt vmcnt(0)
4085; GFX9-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
4086; GFX9-NEXT:    s_waitcnt vmcnt(0)
4087; GFX9-NEXT:    s_setpc_b64 s[30:31]
4088;
4089; GFX10-LABEL: store_load_i64_unaligned:
4090; GFX10:       ; %bb.0: ; %bb
4091; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4092; GFX10-NEXT:    v_mov_b32_e32 v1, 15
4093; GFX10-NEXT:    v_mov_b32_e32 v2, 0
4094; GFX10-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
4095; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4096; GFX10-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
4097; GFX10-NEXT:    s_waitcnt vmcnt(0)
4098; GFX10-NEXT:    s_setpc_b64 s[30:31]
4099;
4100; GFX11-LABEL: store_load_i64_unaligned:
4101; GFX11:       ; %bb.0: ; %bb
4102; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4103; GFX11-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4104; GFX11-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
4105; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4106; GFX11-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
4107; GFX11-NEXT:    s_waitcnt vmcnt(0)
4108; GFX11-NEXT:    s_setpc_b64 s[30:31]
4109;
4110; GFX12-LABEL: store_load_i64_unaligned:
4111; GFX12:       ; %bb.0: ; %bb
4112; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4113; GFX12-NEXT:    s_wait_expcnt 0x0
4114; GFX12-NEXT:    s_wait_samplecnt 0x0
4115; GFX12-NEXT:    s_wait_bvhcnt 0x0
4116; GFX12-NEXT:    s_wait_kmcnt 0x0
4117; GFX12-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4118; GFX12-NEXT:    s_wait_storecnt 0x0
4119; GFX12-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
4120; GFX12-NEXT:    s_wait_storecnt 0x0
4121; GFX12-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
4122; GFX12-NEXT:    s_wait_loadcnt 0x0
4123; GFX12-NEXT:    s_setpc_b64 s[30:31]
4124;
4125; GFX9-PAL-LABEL: store_load_i64_unaligned:
4126; GFX9-PAL:       ; %bb.0: ; %bb
4127; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4128; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
4129; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 0
4130; GFX9-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
4131; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4132; GFX9-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc
4133; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4134; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4135;
4136; GFX940-LABEL: store_load_i64_unaligned:
4137; GFX940:       ; %bb.0: ; %bb
4138; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4139; GFX940-NEXT:    v_mov_b32_e32 v2, 15
4140; GFX940-NEXT:    v_mov_b32_e32 v3, 0
4141; GFX940-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
4142; GFX940-NEXT:    s_waitcnt vmcnt(0)
4143; GFX940-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
4144; GFX940-NEXT:    s_waitcnt vmcnt(0)
4145; GFX940-NEXT:    s_setpc_b64 s[30:31]
4146;
4147; GFX10-PAL-LABEL: store_load_i64_unaligned:
4148; GFX10-PAL:       ; %bb.0: ; %bb
4149; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4150; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
4151; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 0
4152; GFX10-PAL-NEXT:    scratch_store_dwordx2 v0, v[1:2], off
4153; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4154; GFX10-PAL-NEXT:    scratch_load_dwordx2 v[0:1], v0, off glc dlc
4155; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
4156; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
4157;
4158; GFX11-PAL-LABEL: store_load_i64_unaligned:
4159; GFX11-PAL:       ; %bb.0: ; %bb
4160; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4161; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4162; GFX11-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off dlc
4163; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4164; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
4165; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4166; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4167;
4168; GFX12-PAL-LABEL: store_load_i64_unaligned:
4169; GFX12-PAL:       ; %bb.0: ; %bb
4170; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
4171; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
4172; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
4173; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
4174; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
4175; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4176; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4177; GFX12-PAL-NEXT:    scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
4178; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4179; GFX12-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
4180; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
4181; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
4182bb:
; Both accesses are volatile, so the store and the otherwise unused %load must
; each appear in the output.
4183  store volatile i64 15, ptr addrspace(5) %arg, align 1
4184  %load = load volatile i64, ptr addrspace(5) %arg, align 1
4185  ret void
4186}
4187
; Volatile store of <3 x i32> <1, 2, 3> followed by a volatile <3 x i32> load
; through the private (addrspace 5) pointer %arg, both marked align 1.
; The checks below expect one 96-bit scratch store and one 96-bit scratch load
; (dwordx3 / b96) on every tested subtarget, i.e. the byte-aligned vector
; access is not broken into smaller pieces.
4188define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
4189; GFX9-LABEL: store_load_v3i32_unaligned:
4190; GFX9:       ; %bb.0: ; %bb
4191; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4192; GFX9-NEXT:    v_mov_b32_e32 v1, 1
4193; GFX9-NEXT:    v_mov_b32_e32 v2, 2
4194; GFX9-NEXT:    v_mov_b32_e32 v3, 3
4195; GFX9-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
4196; GFX9-NEXT:    s_waitcnt vmcnt(0)
4197; GFX9-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
4198; GFX9-NEXT:    s_waitcnt vmcnt(0)
4199; GFX9-NEXT:    s_setpc_b64 s[30:31]
4200;
4201; GFX10-LABEL: store_load_v3i32_unaligned:
4202; GFX10:       ; %bb.0: ; %bb
4203; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4204; GFX10-NEXT:    v_mov_b32_e32 v1, 1
4205; GFX10-NEXT:    v_mov_b32_e32 v2, 2
4206; GFX10-NEXT:    v_mov_b32_e32 v3, 3
4207; GFX10-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
4208; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4209; GFX10-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
4210; GFX10-NEXT:    s_waitcnt vmcnt(0)
4211; GFX10-NEXT:    s_setpc_b64 s[30:31]
4212;
4213; GFX11-LABEL: store_load_v3i32_unaligned:
4214; GFX11:       ; %bb.0: ; %bb
4215; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4216; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4217; GFX11-NEXT:    v_mov_b32_e32 v3, 3
4218; GFX11-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
4219; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4220; GFX11-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
4221; GFX11-NEXT:    s_waitcnt vmcnt(0)
4222; GFX11-NEXT:    s_setpc_b64 s[30:31]
4223;
4224; GFX12-LABEL: store_load_v3i32_unaligned:
4225; GFX12:       ; %bb.0: ; %bb
4226; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4227; GFX12-NEXT:    s_wait_expcnt 0x0
4228; GFX12-NEXT:    s_wait_samplecnt 0x0
4229; GFX12-NEXT:    s_wait_bvhcnt 0x0
4230; GFX12-NEXT:    s_wait_kmcnt 0x0
4231; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4232; GFX12-NEXT:    v_mov_b32_e32 v3, 3
4233; GFX12-NEXT:    s_wait_storecnt 0x0
4234; GFX12-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
4235; GFX12-NEXT:    s_wait_storecnt 0x0
4236; GFX12-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
4237; GFX12-NEXT:    s_wait_loadcnt 0x0
4238; GFX12-NEXT:    s_setpc_b64 s[30:31]
4239;
4240; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
4241; GFX9-PAL:       ; %bb.0: ; %bb
4242; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4243; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
4244; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
4245; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
4246; GFX9-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
4247; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4248; GFX9-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc
4249; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4250; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4251;
4252; GFX940-LABEL: store_load_v3i32_unaligned:
4253; GFX940:       ; %bb.0: ; %bb
4254; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4255; GFX940-NEXT:    v_mov_b32_e32 v2, 1
4256; GFX940-NEXT:    v_mov_b32_e32 v3, 2
4257; GFX940-NEXT:    v_mov_b32_e32 v4, 3
4258; GFX940-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
4259; GFX940-NEXT:    s_waitcnt vmcnt(0)
4260; GFX940-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
4261; GFX940-NEXT:    s_waitcnt vmcnt(0)
4262; GFX940-NEXT:    s_setpc_b64 s[30:31]
4263;
4264; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
4265; GFX10-PAL:       ; %bb.0: ; %bb
4266; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4267; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
4268; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
4269; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
4270; GFX10-PAL-NEXT:    scratch_store_dwordx3 v0, v[1:3], off
4271; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4272; GFX10-PAL-NEXT:    scratch_load_dwordx3 v[0:2], v0, off glc dlc
4273; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
4274; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
4275;
4276; GFX11-PAL-LABEL: store_load_v3i32_unaligned:
4277; GFX11-PAL:       ; %bb.0: ; %bb
4278; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4279; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4280; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, 3
4281; GFX11-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off dlc
4282; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4283; GFX11-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
4284; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4285; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4286;
4287; GFX12-PAL-LABEL: store_load_v3i32_unaligned:
4288; GFX12-PAL:       ; %bb.0: ; %bb
4289; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
4290; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
4291; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
4292; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
4293; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
4294; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4295; GFX12-PAL-NEXT:    v_mov_b32_e32 v3, 3
4296; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4297; GFX12-PAL-NEXT:    scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
4298; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4299; GFX12-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
4300; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
4301; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
4302bb:
; Both accesses are volatile, so the store and the otherwise unused %load must
; each appear in the output.
4303  store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
4304  %load = load volatile <3 x i32>, ptr addrspace(5) %arg, align 1
4305  ret void
4306}
4307
; Volatile store of <4 x i32> <1, 2, 3, 4> followed by a volatile <4 x i32>
; load through the private (addrspace 5) pointer %arg, both marked align 1.
; The checks below expect one 128-bit scratch store and one 128-bit scratch
; load (dwordx4 / b128) on every tested subtarget, i.e. the byte-aligned vector
; access is not broken into smaller pieces.
4308define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
4309; GFX9-LABEL: store_load_v4i32_unaligned:
4310; GFX9:       ; %bb.0: ; %bb
4311; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4312; GFX9-NEXT:    v_mov_b32_e32 v1, 1
4313; GFX9-NEXT:    v_mov_b32_e32 v2, 2
4314; GFX9-NEXT:    v_mov_b32_e32 v3, 3
4315; GFX9-NEXT:    v_mov_b32_e32 v4, 4
4316; GFX9-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
4317; GFX9-NEXT:    s_waitcnt vmcnt(0)
4318; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
4319; GFX9-NEXT:    s_waitcnt vmcnt(0)
4320; GFX9-NEXT:    s_setpc_b64 s[30:31]
4321;
4322; GFX10-LABEL: store_load_v4i32_unaligned:
4323; GFX10:       ; %bb.0: ; %bb
4324; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4325; GFX10-NEXT:    v_mov_b32_e32 v1, 1
4326; GFX10-NEXT:    v_mov_b32_e32 v2, 2
4327; GFX10-NEXT:    v_mov_b32_e32 v3, 3
4328; GFX10-NEXT:    v_mov_b32_e32 v4, 4
4329; GFX10-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
4330; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4331; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
4332; GFX10-NEXT:    s_waitcnt vmcnt(0)
4333; GFX10-NEXT:    s_setpc_b64 s[30:31]
4334;
4335; GFX11-LABEL: store_load_v4i32_unaligned:
4336; GFX11:       ; %bb.0: ; %bb
4337; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4338; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4339; GFX11-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
4340; GFX11-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
4341; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4342; GFX11-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
4343; GFX11-NEXT:    s_waitcnt vmcnt(0)
4344; GFX11-NEXT:    s_setpc_b64 s[30:31]
4345;
4346; GFX12-LABEL: store_load_v4i32_unaligned:
4347; GFX12:       ; %bb.0: ; %bb
4348; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4349; GFX12-NEXT:    s_wait_expcnt 0x0
4350; GFX12-NEXT:    s_wait_samplecnt 0x0
4351; GFX12-NEXT:    s_wait_bvhcnt 0x0
4352; GFX12-NEXT:    s_wait_kmcnt 0x0
4353; GFX12-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4354; GFX12-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
4355; GFX12-NEXT:    s_wait_storecnt 0x0
4356; GFX12-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
4357; GFX12-NEXT:    s_wait_storecnt 0x0
4358; GFX12-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
4359; GFX12-NEXT:    s_wait_loadcnt 0x0
4360; GFX12-NEXT:    s_setpc_b64 s[30:31]
4361;
4362; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
4363; GFX9-PAL:       ; %bb.0: ; %bb
4364; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4365; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
4366; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 2
4367; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 3
4368; GFX9-PAL-NEXT:    v_mov_b32_e32 v4, 4
4369; GFX9-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
4370; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4371; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc
4372; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4373; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4374;
4375; GFX940-LABEL: store_load_v4i32_unaligned:
4376; GFX940:       ; %bb.0: ; %bb
4377; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4378; GFX940-NEXT:    v_mov_b32_e32 v2, 1
4379; GFX940-NEXT:    v_mov_b32_e32 v3, 2
4380; GFX940-NEXT:    v_mov_b32_e32 v4, 3
4381; GFX940-NEXT:    v_mov_b32_e32 v5, 4
4382; GFX940-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
4383; GFX940-NEXT:    s_waitcnt vmcnt(0)
4384; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
4385; GFX940-NEXT:    s_waitcnt vmcnt(0)
4386; GFX940-NEXT:    s_setpc_b64 s[30:31]
4387;
4388; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
4389; GFX10-PAL:       ; %bb.0: ; %bb
4390; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4391; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 1
4392; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 2
4393; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, 3
4394; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, 4
4395; GFX10-PAL-NEXT:    scratch_store_dwordx4 v0, v[1:4], off
4396; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4397; GFX10-PAL-NEXT:    scratch_load_dwordx4 v[0:3], v0, off glc dlc
4398; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
4399; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
4400;
4401; GFX11-PAL-LABEL: store_load_v4i32_unaligned:
4402; GFX11-PAL:       ; %bb.0: ; %bb
4403; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4404; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4405; GFX11-PAL-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
4406; GFX11-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off dlc
4407; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4408; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
4409; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4410; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4411;
4412; GFX12-PAL-LABEL: store_load_v4i32_unaligned:
4413; GFX12-PAL:       ; %bb.0: ; %bb
4414; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
4415; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
4416; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
4417; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
4418; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
4419; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
4420; GFX12-PAL-NEXT:    v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
4421; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4422; GFX12-PAL-NEXT:    scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
4423; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4424; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
4425; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
4426; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
4427bb:
; Both accesses are volatile, so the store and the otherwise unused %load must
; each appear in the output.
4428  store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
4429  %load = load volatile <4 x i32>, ptr addrspace(5) %arg, align 1
4430  ret void
4431}
4432
; Volatile byte store of 1 followed by a volatile byte load at %arg - 1 in the
; private (addrspace 5) address space. Despite the "i32" in the function name,
; the accessed type is i8.
; The checks below show two different ways of forming the address:
;   - gfx900 (both triples), gfx940 and amdpal gfx1010 subtract 1 from v0 with
;     a separate VALU add and use no immediate offset on the scratch access;
;   - gfx1030, gfx1100 and gfx1200 (both triples) keep v0 untouched and encode
;     the -1 as the instruction's immediate offset.
; This is why the amdpal gfx1010 and gfx1030 runs need their own check prefixes
; here instead of the shared one.
; NOTE(review): presumably the split reflects which subtargets accept a
; negative immediate offset on scratch instructions - confirm against the ISA
; documentation.
4433define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) {
4434; GFX9-LABEL: store_load_i32_negative_unaligned:
4435; GFX9:       ; %bb.0: ; %bb
4436; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4437; GFX9-NEXT:    v_add_u32_e32 v0, -1, v0
4438; GFX9-NEXT:    v_mov_b32_e32 v1, 1
4439; GFX9-NEXT:    scratch_store_byte v0, v1, off
4440; GFX9-NEXT:    s_waitcnt vmcnt(0)
4441; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
4442; GFX9-NEXT:    s_waitcnt vmcnt(0)
4443; GFX9-NEXT:    s_setpc_b64 s[30:31]
4444;
4445; GFX10-LABEL: store_load_i32_negative_unaligned:
4446; GFX10:       ; %bb.0: ; %bb
4447; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4448; GFX10-NEXT:    v_mov_b32_e32 v1, 1
4449; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-1
4450; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4451; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
4452; GFX10-NEXT:    s_waitcnt vmcnt(0)
4453; GFX10-NEXT:    s_setpc_b64 s[30:31]
4454;
4455; GFX11-LABEL: store_load_i32_negative_unaligned:
4456; GFX11:       ; %bb.0: ; %bb
4457; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4458; GFX11-NEXT:    v_mov_b32_e32 v1, 1
4459; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
4460; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4461; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
4462; GFX11-NEXT:    s_waitcnt vmcnt(0)
4463; GFX11-NEXT:    s_setpc_b64 s[30:31]
4464;
4465; GFX12-LABEL: store_load_i32_negative_unaligned:
4466; GFX12:       ; %bb.0: ; %bb
4467; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4468; GFX12-NEXT:    s_wait_expcnt 0x0
4469; GFX12-NEXT:    s_wait_samplecnt 0x0
4470; GFX12-NEXT:    s_wait_bvhcnt 0x0
4471; GFX12-NEXT:    s_wait_kmcnt 0x0
4472; GFX12-NEXT:    v_mov_b32_e32 v1, 1
4473; GFX12-NEXT:    s_wait_storecnt 0x0
4474; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
4475; GFX12-NEXT:    s_wait_storecnt 0x0
4476; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
4477; GFX12-NEXT:    s_wait_loadcnt 0x0
4478; GFX12-NEXT:    s_setpc_b64 s[30:31]
4479;
4480; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
4481; GFX9-PAL:       ; %bb.0: ; %bb
4482; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4483; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -1, v0
4484; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
4485; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
4486; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4487; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
4488; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4489; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4490;
4491; GFX940-LABEL: store_load_i32_negative_unaligned:
4492; GFX940:       ; %bb.0: ; %bb
4493; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4494; GFX940-NEXT:    v_add_u32_e32 v0, -1, v0
4495; GFX940-NEXT:    v_mov_b32_e32 v1, 1
4496; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
4497; GFX940-NEXT:    s_waitcnt vmcnt(0)
4498; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
4499; GFX940-NEXT:    s_waitcnt vmcnt(0)
4500; GFX940-NEXT:    s_setpc_b64 s[30:31]
4501;
4502; GFX1010-PAL-LABEL: store_load_i32_negative_unaligned:
4503; GFX1010-PAL:       ; %bb.0: ; %bb
4504; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4505; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, -1, v0
4506; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
4507; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off
4508; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4509; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc dlc
4510; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
4511; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
4512;
4513; GFX1030-PAL-LABEL: store_load_i32_negative_unaligned:
4514; GFX1030-PAL:       ; %bb.0: ; %bb
4515; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4516; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
4517; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-1
4518; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4519; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-1 glc dlc
4520; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
4521; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
4522;
4523; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
4524; GFX11-PAL:       ; %bb.0: ; %bb
4525; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4526; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 1
4527; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 dlc
4528; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4529; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 glc dlc
4530; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4531; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4532;
4533; GFX12-PAL-LABEL: store_load_i32_negative_unaligned:
4534; GFX12-PAL:       ; %bb.0: ; %bb
4535; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
4536; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
4537; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
4538; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
4539; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
4540; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 1
4541; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4542; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
4543; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4544; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
4545; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
4546; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
4547bb:
; Address one byte below the incoming pointer; the store and load both use it.
4548  %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1
4549  store volatile i8 1, ptr addrspace(5) %ptr, align 1
4550  %load = load volatile i8, ptr addrspace(5) %ptr, align 1
4551  ret void
4552}
4553
; Volatile byte store of 1 followed by a volatile byte load at %arg - 4225 in
; the private (addrspace 5) address space. Despite the "i32" in the function
; name, the accessed type is i8.
; The checks below show how each subtarget splits the -4225 byte displacement
; between a VALU add on v0 and the instruction's immediate offset:
;   - gfx900 (both triples) and gfx940 add 0xffffef7f (-4225) and use no
;     immediate offset;
;   - amdpal gfx1010 adds 0xffffefff (-4097) and uses offset -128;
;   - gfx1030 and gfx1100 (both triples) add 0xfffff000 (-4096) and use
;     offset -129;
;   - gfx1200 (both triples) needs no add and encodes the whole -4225 as the
;     immediate offset.
; In every case the add constant plus the immediate offset sums to -4225.
; NOTE(review): presumably the different splits follow each subtarget's legal
; scratch immediate-offset range - confirm against the ISA documentation.
4554define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture %arg) {
4555; GFX9-LABEL: store_load_i32_large_negative_unaligned:
4556; GFX9:       ; %bb.0: ; %bb
4557; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4558; GFX9-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
4559; GFX9-NEXT:    v_mov_b32_e32 v1, 1
4560; GFX9-NEXT:    scratch_store_byte v0, v1, off
4561; GFX9-NEXT:    s_waitcnt vmcnt(0)
4562; GFX9-NEXT:    scratch_load_ubyte v0, v0, off glc
4563; GFX9-NEXT:    s_waitcnt vmcnt(0)
4564; GFX9-NEXT:    s_setpc_b64 s[30:31]
4565;
4566; GFX10-LABEL: store_load_i32_large_negative_unaligned:
4567; GFX10:       ; %bb.0: ; %bb
4568; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4569; GFX10-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4570; GFX10-NEXT:    v_mov_b32_e32 v1, 1
4571; GFX10-NEXT:    scratch_store_byte v0, v1, off offset:-129
4572; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4573; GFX10-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
4574; GFX10-NEXT:    s_waitcnt vmcnt(0)
4575; GFX10-NEXT:    s_setpc_b64 s[30:31]
4576;
4577; GFX11-LABEL: store_load_i32_large_negative_unaligned:
4578; GFX11:       ; %bb.0: ; %bb
4579; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4580; GFX11-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
4581; GFX11-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
4582; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4583; GFX11-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
4584; GFX11-NEXT:    s_waitcnt vmcnt(0)
4585; GFX11-NEXT:    s_setpc_b64 s[30:31]
4586;
4587; GFX12-LABEL: store_load_i32_large_negative_unaligned:
4588; GFX12:       ; %bb.0: ; %bb
4589; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
4590; GFX12-NEXT:    s_wait_expcnt 0x0
4591; GFX12-NEXT:    s_wait_samplecnt 0x0
4592; GFX12-NEXT:    s_wait_bvhcnt 0x0
4593; GFX12-NEXT:    s_wait_kmcnt 0x0
4594; GFX12-NEXT:    v_mov_b32_e32 v1, 1
4595; GFX12-NEXT:    s_wait_storecnt 0x0
4596; GFX12-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
4597; GFX12-NEXT:    s_wait_storecnt 0x0
4598; GFX12-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
4599; GFX12-NEXT:    s_wait_loadcnt 0x0
4600; GFX12-NEXT:    s_setpc_b64 s[30:31]
4601;
4602; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
4603; GFX9-PAL:       ; %bb.0: ; %bb
4604; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4605; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
4606; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 1
4607; GFX9-PAL-NEXT:    scratch_store_byte v0, v1, off
4608; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4609; GFX9-PAL-NEXT:    scratch_load_ubyte v0, v0, off glc
4610; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4611; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
4612;
4613; GFX940-LABEL: store_load_i32_large_negative_unaligned:
4614; GFX940:       ; %bb.0: ; %bb
4615; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4616; GFX940-NEXT:    v_add_u32_e32 v0, 0xffffef7f, v0
4617; GFX940-NEXT:    v_mov_b32_e32 v1, 1
4618; GFX940-NEXT:    scratch_store_byte v0, v1, off sc0 sc1
4619; GFX940-NEXT:    s_waitcnt vmcnt(0)
4620; GFX940-NEXT:    scratch_load_ubyte v0, v0, off sc0 sc1
4621; GFX940-NEXT:    s_waitcnt vmcnt(0)
4622; GFX940-NEXT:    s_setpc_b64 s[30:31]
4623;
4624; GFX1010-PAL-LABEL: store_load_i32_large_negative_unaligned:
4625; GFX1010-PAL:       ; %bb.0: ; %bb
4626; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4627; GFX1010-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xffffefff, v0
4628; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 1
4629; GFX1010-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-128
4630; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4631; GFX1010-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-128 glc dlc
4632; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
4633; GFX1010-PAL-NEXT:    s_setpc_b64 s[30:31]
4634;
4635; GFX1030-PAL-LABEL: store_load_i32_large_negative_unaligned:
4636; GFX1030-PAL:       ; %bb.0: ; %bb
4637; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4638; GFX1030-PAL-NEXT:    v_add_nc_u32_e32 v0, 0xfffff000, v0
4639; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 1
4640; GFX1030-PAL-NEXT:    scratch_store_byte v0, v1, off offset:-129
4641; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4642; GFX1030-PAL-NEXT:    scratch_load_ubyte v0, v0, off offset:-129 glc dlc
4643; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
4644; GFX1030-PAL-NEXT:    s_setpc_b64 s[30:31]
4645;
4646; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
4647; GFX11-PAL:       ; %bb.0: ; %bb
4648; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4649; GFX11-PAL-NEXT:    v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0
4650; GFX11-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-129 dlc
4651; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4652; GFX11-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-129 glc dlc
4653; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4654; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
4655;
4656; GFX12-PAL-LABEL: store_load_i32_large_negative_unaligned:
4657; GFX12-PAL:       ; %bb.0: ; %bb
4658; GFX12-PAL-NEXT:    s_wait_loadcnt_dscnt 0x0
4659; GFX12-PAL-NEXT:    s_wait_expcnt 0x0
4660; GFX12-PAL-NEXT:    s_wait_samplecnt 0x0
4661; GFX12-PAL-NEXT:    s_wait_bvhcnt 0x0
4662; GFX12-PAL-NEXT:    s_wait_kmcnt 0x0
4663; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 1
4664; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4665; GFX12-PAL-NEXT:    scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
4666; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4667; GFX12-PAL-NEXT:    scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
4668; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
4669; GFX12-PAL-NEXT:    s_setpc_b64 s[30:31]
4670bb:
; Address 4225 bytes below the incoming pointer; the store and load both use it.
4671  %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225
4672  store volatile i8 1, ptr addrspace(5) %ptr, align 1
4673  %load = load volatile i8, ptr addrspace(5) %ptr, align 1
4674  ret void
4675}
4676
; large_offset: allocates two private (addrspace(5)) arrays of 128 x <4 x i32>
; (128 * 16 = 2048 bytes each) and performs a volatile <4 x i32> store/load
; pair on element 60 of the second array, i.e. 60 * 16 = 960 (0x3c0) bytes
; past the start of %alloca2. Both alloca addresses are then passed to inline
; asm through an "s" constraint, so each one is materialized in a scalar
; register (the "; use sN" lines in the expected output).
; In the expected output the access appears either with a folded immediate
; (offset:3024 = 0x810 + 0x3c0, or offset:3008 = 0x800 + 0x3c0) or, for the
; gfx1010/gfx1030 runs, as an explicit s_add_i32 of 0x3c0 onto the 0x810
; address of %alloca2.
; The assertions are autogenerated (see the first line of this file); they
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
4677define amdgpu_ps void @large_offset() {
4678; GFX9-LABEL: large_offset:
4679; GFX9:       ; %bb.0: ; %bb
4680; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s2
4681; GFX9-NEXT:    v_mov_b32_e32 v0, 0
4682; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
4683; GFX9-NEXT:    v_mov_b32_e32 v1, v0
4684; GFX9-NEXT:    v_mov_b32_e32 v2, v0
4685; GFX9-NEXT:    v_mov_b32_e32 v3, v0
4686; GFX9-NEXT:    s_mov_b32 s0, 0
4687; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:3024
4688; GFX9-NEXT:    s_waitcnt vmcnt(0)
4689; GFX9-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc
4690; GFX9-NEXT:    s_waitcnt vmcnt(0)
4691; GFX9-NEXT:    s_mov_b32 s0, 16
4692; GFX9-NEXT:    ;;#ASMSTART
4693; GFX9-NEXT:    ; use s0
4694; GFX9-NEXT:    ;;#ASMEND
4695; GFX9-NEXT:    s_movk_i32 s0, 0x810
4696; GFX9-NEXT:    ;;#ASMSTART
4697; GFX9-NEXT:    ; use s0
4698; GFX9-NEXT:    ;;#ASMEND
4699; GFX9-NEXT:    s_endpgm
4700;
4701; GFX10-LABEL: large_offset:
4702; GFX10:       ; %bb.0: ; %bb
4703; GFX10-NEXT:    s_add_u32 s0, s0, s2
4704; GFX10-NEXT:    s_addc_u32 s1, s1, 0
4705; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
4706; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
4707; GFX10-NEXT:    v_mov_b32_e32 v0, 0
4708; GFX10-NEXT:    s_movk_i32 s0, 0x810
4709; GFX10-NEXT:    s_add_i32 s1, s0, 0x3c0
4710; GFX10-NEXT:    v_mov_b32_e32 v1, v0
4711; GFX10-NEXT:    v_mov_b32_e32 v2, v0
4712; GFX10-NEXT:    v_mov_b32_e32 v3, v0
4713; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s1
4714; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
4715; GFX10-NEXT:    scratch_load_dwordx4 v[0:3], off, s1 glc dlc
4716; GFX10-NEXT:    s_waitcnt vmcnt(0)
4717; GFX10-NEXT:    s_mov_b32 s1, 16
4718; GFX10-NEXT:    ;;#ASMSTART
4719; GFX10-NEXT:    ; use s1
4720; GFX10-NEXT:    ;;#ASMEND
4721; GFX10-NEXT:    ;;#ASMSTART
4722; GFX10-NEXT:    ; use s0
4723; GFX10-NEXT:    ;;#ASMEND
4724; GFX10-NEXT:    s_endpgm
4725;
4726; GFX11-LABEL: large_offset:
4727; GFX11:       ; %bb.0: ; %bb
4728; GFX11-NEXT:    v_mov_b32_e32 v0, 0
4729; GFX11-NEXT:    s_mov_b32 s0, 16
4730; GFX11-NEXT:    s_movk_i32 s1, 0x810
4731; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4732; GFX11-NEXT:    v_mov_b32_e32 v1, v0
4733; GFX11-NEXT:    v_mov_b32_e32 v2, v0
4734; GFX11-NEXT:    v_mov_b32_e32 v3, v0
4735; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
4736; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
4737; GFX11-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
4738; GFX11-NEXT:    s_waitcnt vmcnt(0)
4739; GFX11-NEXT:    ;;#ASMSTART
4740; GFX11-NEXT:    ; use s0
4741; GFX11-NEXT:    ;;#ASMEND
4742; GFX11-NEXT:    ;;#ASMSTART
4743; GFX11-NEXT:    ; use s1
4744; GFX11-NEXT:    ;;#ASMEND
4745; GFX11-NEXT:    s_endpgm
4746;
4747; GFX12-LABEL: large_offset:
4748; GFX12:       ; %bb.0: ; %bb
4749; GFX12-NEXT:    v_mov_b32_e32 v0, 0
4750; GFX12-NEXT:    s_mov_b32 s0, 0
4751; GFX12-NEXT:    s_movk_i32 s1, 0x800
4752; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4753; GFX12-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
4754; GFX12-NEXT:    v_mov_b32_e32 v3, v0
4755; GFX12-NEXT:    scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
4756; GFX12-NEXT:    s_wait_storecnt 0x0
4757; GFX12-NEXT:    scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
4758; GFX12-NEXT:    s_wait_loadcnt 0x0
4759; GFX12-NEXT:    ;;#ASMSTART
4760; GFX12-NEXT:    ; use s0
4761; GFX12-NEXT:    ;;#ASMEND
4762; GFX12-NEXT:    ;;#ASMSTART
4763; GFX12-NEXT:    ; use s1
4764; GFX12-NEXT:    ;;#ASMEND
4765; GFX12-NEXT:    s_endpgm
4766;
4767; GFX9-PAL-LABEL: large_offset:
4768; GFX9-PAL:       ; %bb.0: ; %bb
4769; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
4770; GFX9-PAL-NEXT:    s_mov_b32 s2, s0
4771; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4772; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0
4773; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, v0
4774; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, v0
4775; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, v0
4776; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4777; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4778; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s0
4779; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
4780; GFX9-PAL-NEXT:    s_mov_b32 s0, 0
4781; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:3024
4782; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4783; GFX9-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc
4784; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4785; GFX9-PAL-NEXT:    s_mov_b32 s0, 16
4786; GFX9-PAL-NEXT:    ;;#ASMSTART
4787; GFX9-PAL-NEXT:    ; use s0
4788; GFX9-PAL-NEXT:    ;;#ASMEND
4789; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x810
4790; GFX9-PAL-NEXT:    ;;#ASMSTART
4791; GFX9-PAL-NEXT:    ; use s0
4792; GFX9-PAL-NEXT:    ;;#ASMEND
4793; GFX9-PAL-NEXT:    s_endpgm
4794;
4795; GFX940-LABEL: large_offset:
4796; GFX940:       ; %bb.0: ; %bb
4797; GFX940-NEXT:    v_mov_b32_e32 v0, 0
4798; GFX940-NEXT:    v_mov_b32_e32 v1, v0
4799; GFX940-NEXT:    v_mov_b32_e32 v2, v0
4800; GFX940-NEXT:    v_mov_b32_e32 v3, v0
4801; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
4802; GFX940-NEXT:    s_waitcnt vmcnt(0)
4803; GFX940-NEXT:    scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
4804; GFX940-NEXT:    s_waitcnt vmcnt(0)
4805; GFX940-NEXT:    s_mov_b32 s0, 16
4806; GFX940-NEXT:    ;;#ASMSTART
4807; GFX940-NEXT:    ; use s0
4808; GFX940-NEXT:    ;;#ASMEND
4809; GFX940-NEXT:    s_movk_i32 s0, 0x810
4810; GFX940-NEXT:    ;;#ASMSTART
4811; GFX940-NEXT:    ; use s0
4812; GFX940-NEXT:    ;;#ASMEND
4813; GFX940-NEXT:    s_endpgm
4814;
4815; GFX1010-PAL-LABEL: large_offset:
4816; GFX1010-PAL:       ; %bb.0: ; %bb
4817; GFX1010-PAL-NEXT:    s_getpc_b64 s[2:3]
4818; GFX1010-PAL-NEXT:    s_mov_b32 s2, s0
4819; GFX1010-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4820; GFX1010-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4821; GFX1010-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4822; GFX1010-PAL-NEXT:    s_add_u32 s2, s2, s0
4823; GFX1010-PAL-NEXT:    s_addc_u32 s3, s3, 0
4824; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
4825; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
4826; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 0
4827; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x810
4828; GFX1010-PAL-NEXT:    s_add_i32 s1, s0, 0x3c0
4829; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, v0
4830; GFX1010-PAL-NEXT:    v_mov_b32_e32 v2, v0
4831; GFX1010-PAL-NEXT:    v_mov_b32_e32 v3, v0
4832; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1
4833; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4834; GFX1010-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s1 glc dlc
4835; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
4836; GFX1010-PAL-NEXT:    s_waitcnt_depctr 0xffe3
4837; GFX1010-PAL-NEXT:    s_mov_b32 s1, 16
4838; GFX1010-PAL-NEXT:    ;;#ASMSTART
4839; GFX1010-PAL-NEXT:    ; use s1
4840; GFX1010-PAL-NEXT:    ;;#ASMEND
4841; GFX1010-PAL-NEXT:    ;;#ASMSTART
4842; GFX1010-PAL-NEXT:    ; use s0
4843; GFX1010-PAL-NEXT:    ;;#ASMEND
4844; GFX1010-PAL-NEXT:    s_endpgm
4845;
4846; GFX1030-PAL-LABEL: large_offset:
4847; GFX1030-PAL:       ; %bb.0: ; %bb
4848; GFX1030-PAL-NEXT:    s_getpc_b64 s[2:3]
4849; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
4850; GFX1030-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4851; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4852; GFX1030-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4853; GFX1030-PAL-NEXT:    s_add_u32 s2, s2, s0
4854; GFX1030-PAL-NEXT:    s_addc_u32 s3, s3, 0
4855; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
4856; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
4857; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 0
4858; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x810
4859; GFX1030-PAL-NEXT:    s_add_i32 s1, s0, 0x3c0
4860; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, v0
4861; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, v0
4862; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, v0
4863; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1
4864; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4865; GFX1030-PAL-NEXT:    scratch_load_dwordx4 v[0:3], off, s1 glc dlc
4866; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
4867; GFX1030-PAL-NEXT:    s_mov_b32 s1, 16
4868; GFX1030-PAL-NEXT:    ;;#ASMSTART
4869; GFX1030-PAL-NEXT:    ; use s1
4870; GFX1030-PAL-NEXT:    ;;#ASMEND
4871; GFX1030-PAL-NEXT:    ;;#ASMSTART
4872; GFX1030-PAL-NEXT:    ; use s0
4873; GFX1030-PAL-NEXT:    ;;#ASMEND
4874; GFX1030-PAL-NEXT:    s_endpgm
4875;
4876; GFX11-PAL-LABEL: large_offset:
4877; GFX11-PAL:       ; %bb.0: ; %bb
4878; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 0
4879; GFX11-PAL-NEXT:    s_mov_b32 s0, 16
4880; GFX11-PAL-NEXT:    s_movk_i32 s1, 0x810
4881; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4882; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, v0
4883; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, v0
4884; GFX11-PAL-NEXT:    v_mov_b32_e32 v3, v0
4885; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3024 dlc
4886; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
4887; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
4888; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
4889; GFX11-PAL-NEXT:    ;;#ASMSTART
4890; GFX11-PAL-NEXT:    ; use s0
4891; GFX11-PAL-NEXT:    ;;#ASMEND
4892; GFX11-PAL-NEXT:    ;;#ASMSTART
4893; GFX11-PAL-NEXT:    ; use s1
4894; GFX11-PAL-NEXT:    ;;#ASMEND
4895; GFX11-PAL-NEXT:    s_endpgm
4896;
4897; GFX12-PAL-LABEL: large_offset:
4898; GFX12-PAL:       ; %bb.0: ; %bb
4899; GFX12-PAL-NEXT:    v_mov_b32_e32 v0, 0
4900; GFX12-PAL-NEXT:    s_mov_b32 s0, 0
4901; GFX12-PAL-NEXT:    s_movk_i32 s1, 0x800
4902; GFX12-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
4903; GFX12-PAL-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, v0
4904; GFX12-PAL-NEXT:    v_mov_b32_e32 v3, v0
4905; GFX12-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:3008 scope:SCOPE_SYS
4906; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
4907; GFX12-PAL-NEXT:    scratch_load_b128 v[0:3], off, off offset:3008 scope:SCOPE_SYS
4908; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
4909; GFX12-PAL-NEXT:    ;;#ASMSTART
4910; GFX12-PAL-NEXT:    ; use s0
4911; GFX12-PAL-NEXT:    ;;#ASMEND
4912; GFX12-PAL-NEXT:    ;;#ASMSTART
4913; GFX12-PAL-NEXT:    ; use s1
4914; GFX12-PAL-NEXT:    ;;#ASMEND
4915; GFX12-PAL-NEXT:    s_endpgm
4916bb:
4917  %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
4918  %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
4919  %gep = getelementptr inbounds [128 x <4 x i32>], ptr addrspace(5) %alloca2, i32 0, i32 60
4920  store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %gep, align 16
4921  %load = load volatile <4 x i32>, ptr addrspace(5) %gep, align 16
4922  call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %alloca) #0
4923  call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %alloca2) #0
4924  ret void
4925}
4926
; sgpr_base_large_offset: loads an i32 from a private (addrspace(5)) pointer
; that arrives in a scalar register (inreg %sgpr_base) plus a constant byte
; offset of 65512 (0xffe8), and stores the loaded value to the global pointer
; %out. The load is not volatile.
; In the expected output the gfx12 runs carry the constant directly on the
; scratch load (offset:65512); every other run first adds 0xffe8 to the base
; SGPR with s_add_i32 and then loads with no immediate offset.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
4927define amdgpu_gs void @sgpr_base_large_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
4928; GFX9-LABEL: sgpr_base_large_offset:
4929; GFX9:       ; %bb.0: ; %entry
4930; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
4931; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
4932; GFX9-NEXT:    s_add_i32 s2, s2, 0xffe8
4933; GFX9-NEXT:    scratch_load_dword v2, off, s2
4934; GFX9-NEXT:    s_waitcnt vmcnt(0)
4935; GFX9-NEXT:    global_store_dword v[0:1], v2, off
4936; GFX9-NEXT:    s_endpgm
4937;
4938; GFX10-LABEL: sgpr_base_large_offset:
4939; GFX10:       ; %bb.0: ; %entry
4940; GFX10-NEXT:    s_add_u32 s0, s0, s5
4941; GFX10-NEXT:    s_addc_u32 s1, s1, 0
4942; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
4943; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
4944; GFX10-NEXT:    s_add_i32 s2, s2, 0xffe8
4945; GFX10-NEXT:    scratch_load_dword v2, off, s2
4946; GFX10-NEXT:    s_waitcnt vmcnt(0)
4947; GFX10-NEXT:    global_store_dword v[0:1], v2, off
4948; GFX10-NEXT:    s_endpgm
4949;
4950; GFX11-LABEL: sgpr_base_large_offset:
4951; GFX11:       ; %bb.0: ; %entry
4952; GFX11-NEXT:    s_add_i32 s0, s0, 0xffe8
4953; GFX11-NEXT:    scratch_load_b32 v2, off, s0
4954; GFX11-NEXT:    s_waitcnt vmcnt(0)
4955; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
4956; GFX11-NEXT:    s_endpgm
4957;
4958; GFX12-LABEL: sgpr_base_large_offset:
4959; GFX12:       ; %bb.0: ; %entry
4960; GFX12-NEXT:    scratch_load_b32 v2, off, s0 offset:65512
4961; GFX12-NEXT:    s_wait_loadcnt 0x0
4962; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
4963; GFX12-NEXT:    s_endpgm
4964;
4965; GFX9-PAL-LABEL: sgpr_base_large_offset:
4966; GFX9-PAL:       ; %bb.0: ; %entry
4967; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
4968; GFX9-PAL-NEXT:    s_mov_b32 s2, s8
4969; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4970; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4971; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4972; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
4973; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
4974; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 0xffe8
4975; GFX9-PAL-NEXT:    scratch_load_dword v2, off, s0
4976; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
4977; GFX9-PAL-NEXT:    global_store_dword v[0:1], v2, off
4978; GFX9-PAL-NEXT:    s_endpgm
4979;
4980; GFX940-LABEL: sgpr_base_large_offset:
4981; GFX940:       ; %bb.0: ; %entry
4982; GFX940-NEXT:    s_add_i32 s0, s0, 0xffe8
4983; GFX940-NEXT:    scratch_load_dword v2, off, s0
4984; GFX940-NEXT:    s_waitcnt vmcnt(0)
4985; GFX940-NEXT:    global_store_dword v[0:1], v2, off sc0 sc1
4986; GFX940-NEXT:    s_endpgm
4987;
4988; GFX10-PAL-LABEL: sgpr_base_large_offset:
4989; GFX10-PAL:       ; %bb.0: ; %entry
4990; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
4991; GFX10-PAL-NEXT:    s_mov_b32 s2, s8
4992; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
4993; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
4994; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
4995; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s5
4996; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
4997; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
4998; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
4999; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 0xffe8
5000; GFX10-PAL-NEXT:    scratch_load_dword v2, off, s0
5001; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
5002; GFX10-PAL-NEXT:    global_store_dword v[0:1], v2, off
5003; GFX10-PAL-NEXT:    s_endpgm
5004;
5005; GFX11-PAL-LABEL: sgpr_base_large_offset:
5006; GFX11-PAL:       ; %bb.0: ; %entry
5007; GFX11-PAL-NEXT:    s_add_i32 s0, s0, 0xffe8
5008; GFX11-PAL-NEXT:    scratch_load_b32 v2, off, s0
5009; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
5010; GFX11-PAL-NEXT:    global_store_b32 v[0:1], v2, off
5011; GFX11-PAL-NEXT:    s_endpgm
5012;
5013; GFX12-PAL-LABEL: sgpr_base_large_offset:
5014; GFX12-PAL:       ; %bb.0: ; %entry
5015; GFX12-PAL-NEXT:    scratch_load_b32 v2, off, s0 offset:65512
5016; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
5017; GFX12-PAL-NEXT:    global_store_b32 v[0:1], v2, off
5018; GFX12-PAL-NEXT:    s_endpgm
5019entry:
5020  %large_offset = getelementptr i8, ptr addrspace(5) %sgpr_base, i32 65512
5021  %load = load i32, ptr addrspace(5) %large_offset, align 4
5022  store i32 %load, ptr addrspace(1) %out
5023  ret void
5024}
5025
; sgpr_base_large_offset_split: rounds the inreg private pointer %sgpr_base
; down to a multiple of 4 (and with 4294967292 = 0xfffffffc, printed as -4 in
; the expected s_and_b32), then does a volatile i32 load at an inbounds byte
; offset of 16842728 (0x100ffe8) from it and stores the result to %out.
; The expected output shows the constant being split into a materialized part
; plus an immediate on the scratch load, with a different split per target
;   gfx9 / gfx940 / gfx11 runs   0x100f000 + offset:4072
;   gfx1010 / gfx1030 runs       0x100f800 + offset:2024
;   gfx12 runs                   0x1000000 + offset:65512
; each of which sums to 16842728.
; The alloca at the top of the body is commented out in the test as written;
; the base address comes only from the %sgpr_base argument.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
5026define amdgpu_gs void @sgpr_base_large_offset_split(ptr addrspace(1) %out, ptr addrspace(5) inreg %sgpr_base) {
5027; GFX9-LABEL: sgpr_base_large_offset_split:
5028; GFX9:       ; %bb.0: ; %entry
5029; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
5030; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
5031; GFX9-NEXT:    s_and_b32 s0, s2, -4
5032; GFX9-NEXT:    s_add_i32 s0, s0, 0x100f000
5033; GFX9-NEXT:    scratch_load_dword v2, off, s0 offset:4072 glc
5034; GFX9-NEXT:    s_waitcnt vmcnt(0)
5035; GFX9-NEXT:    global_store_dword v[0:1], v2, off
5036; GFX9-NEXT:    s_endpgm
5037;
5038; GFX10-LABEL: sgpr_base_large_offset_split:
5039; GFX10:       ; %bb.0: ; %entry
5040; GFX10-NEXT:    s_add_u32 s0, s0, s5
5041; GFX10-NEXT:    s_addc_u32 s1, s1, 0
5042; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
5043; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
5044; GFX10-NEXT:    s_and_b32 s0, s2, -4
5045; GFX10-NEXT:    s_add_i32 s0, s0, 0x100f800
5046; GFX10-NEXT:    scratch_load_dword v2, off, s0 offset:2024 glc dlc
5047; GFX10-NEXT:    s_waitcnt vmcnt(0)
5048; GFX10-NEXT:    global_store_dword v[0:1], v2, off
5049; GFX10-NEXT:    s_endpgm
5050;
5051; GFX11-LABEL: sgpr_base_large_offset_split:
5052; GFX11:       ; %bb.0: ; %entry
5053; GFX11-NEXT:    v_mov_b32_e32 v2, 0x100f000
5054; GFX11-NEXT:    s_and_b32 s0, s0, -4
5055; GFX11-NEXT:    scratch_load_b32 v2, v2, s0 offset:4072 glc dlc
5056; GFX11-NEXT:    s_waitcnt vmcnt(0)
5057; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
5058; GFX11-NEXT:    s_endpgm
5059;
5060; GFX12-LABEL: sgpr_base_large_offset_split:
5061; GFX12:       ; %bb.0: ; %entry
5062; GFX12-NEXT:    v_mov_b32_e32 v2, 0x1000000
5063; GFX12-NEXT:    s_and_b32 s0, s0, -4
5064; GFX12-NEXT:    scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS
5065; GFX12-NEXT:    s_wait_loadcnt 0x0
5066; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
5067; GFX12-NEXT:    s_endpgm
5068;
5069; GFX9-PAL-LABEL: sgpr_base_large_offset_split:
5070; GFX9-PAL:       ; %bb.0: ; %entry
5071; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
5072; GFX9-PAL-NEXT:    s_mov_b32 s2, s8
5073; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
5074; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
5075; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
5076; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
5077; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
5078; GFX9-PAL-NEXT:    s_and_b32 s0, s0, -4
5079; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 0x100f000
5080; GFX9-PAL-NEXT:    scratch_load_dword v2, off, s0 offset:4072 glc
5081; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
5082; GFX9-PAL-NEXT:    global_store_dword v[0:1], v2, off
5083; GFX9-PAL-NEXT:    s_endpgm
5084;
5085; GFX940-LABEL: sgpr_base_large_offset_split:
5086; GFX940:       ; %bb.0: ; %entry
5087; GFX940-NEXT:    s_and_b32 s0, s0, -4
5088; GFX940-NEXT:    v_mov_b32_e32 v2, 0x100f000
5089; GFX940-NEXT:    scratch_load_dword v2, v2, s0 offset:4072 sc0 sc1
5090; GFX940-NEXT:    s_waitcnt vmcnt(0)
5091; GFX940-NEXT:    global_store_dword v[0:1], v2, off sc0 sc1
5092; GFX940-NEXT:    s_endpgm
5093;
5094; GFX10-PAL-LABEL: sgpr_base_large_offset_split:
5095; GFX10-PAL:       ; %bb.0: ; %entry
5096; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
5097; GFX10-PAL-NEXT:    s_mov_b32 s2, s8
5098; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
5099; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
5100; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
5101; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s5
5102; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
5103; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
5104; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
5105; GFX10-PAL-NEXT:    s_and_b32 s0, s0, -4
5106; GFX10-PAL-NEXT:    s_add_i32 s0, s0, 0x100f800
5107; GFX10-PAL-NEXT:    scratch_load_dword v2, off, s0 offset:2024 glc dlc
5108; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
5109; GFX10-PAL-NEXT:    global_store_dword v[0:1], v2, off
5110; GFX10-PAL-NEXT:    s_endpgm
5111;
5112; GFX11-PAL-LABEL: sgpr_base_large_offset_split:
5113; GFX11-PAL:       ; %bb.0: ; %entry
5114; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 0x100f000
5115; GFX11-PAL-NEXT:    s_and_b32 s0, s0, -4
5116; GFX11-PAL-NEXT:    scratch_load_b32 v2, v2, s0 offset:4072 glc dlc
5117; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
5118; GFX11-PAL-NEXT:    global_store_b32 v[0:1], v2, off
5119; GFX11-PAL-NEXT:    s_endpgm
5120;
5121; GFX12-PAL-LABEL: sgpr_base_large_offset_split:
5122; GFX12-PAL:       ; %bb.0: ; %entry
5123; GFX12-PAL-NEXT:    v_mov_b32_e32 v2, 0x1000000
5124; GFX12-PAL-NEXT:    s_and_b32 s0, s0, -4
5125; GFX12-PAL-NEXT:    scratch_load_b32 v2, v2, s0 offset:65512 scope:SCOPE_SYS
5126; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
5127; GFX12-PAL-NEXT:    global_store_b32 v[0:1], v2, off
5128; GFX12-PAL-NEXT:    s_endpgm
5129entry:
5130  ;%allignedBase = alloca [33554432 x i8], align 4, addrspace(5)
5131  %sgpr_base_i32 = ptrtoint ptr addrspace(5) %sgpr_base to i32
5132  %sgpr_base_i32_align4 = and i32 %sgpr_base_i32, 4294967292
5133  %sgpr_base_align4 = inttoptr i32 %sgpr_base_i32_align4 to ptr addrspace(5)
5134  %split_offset = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base_align4, i32 0, i32 16842728
5135  %load = load volatile i32, ptr addrspace(5) %split_offset, align 4
5136  store i32 %load, ptr addrspace(1) %out
5137  ret void
5138}
5139
; sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset: volatile store of the
; i32 constant 15 to a private (addrspace(5)) address built from four parts
;   inreg pointer %sgpr_base + inreg index %sidx + index %vidx + 65512 (0xffe8)
; where both adds are nsw and the getelementptr is inbounds.
; In the expected output the gfx12 runs keep 65512 as the immediate offset of
; the scratch store and only add the two scalar inputs; all other runs fold
; 0xffe8 into the vector address computation (v_add_u32 or v_add3_u32) and
; store with no immediate offset.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
5140define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
5141; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
5142; GFX9:       ; %bb.0: ; %bb
5143; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
5144; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
5145; GFX9-NEXT:    s_add_i32 s2, s2, s3
5146; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
5147; GFX9-NEXT:    v_add_u32_e32 v0, 0xffe8, v0
5148; GFX9-NEXT:    v_mov_b32_e32 v1, 15
5149; GFX9-NEXT:    scratch_store_dword v0, v1, off
5150; GFX9-NEXT:    s_waitcnt vmcnt(0)
5151; GFX9-NEXT:    s_endpgm
5152;
5153; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
5154; GFX10:       ; %bb.0: ; %bb
5155; GFX10-NEXT:    s_add_u32 s0, s0, s5
5156; GFX10-NEXT:    s_addc_u32 s1, s1, 0
5157; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
5158; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
5159; GFX10-NEXT:    s_add_i32 s2, s2, s3
5160; GFX10-NEXT:    v_mov_b32_e32 v1, 15
5161; GFX10-NEXT:    v_add3_u32 v0, s2, v0, 0xffe8
5162; GFX10-NEXT:    scratch_store_dword v0, v1, off
5163; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5164; GFX10-NEXT:    s_endpgm
5165;
5166; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
5167; GFX11:       ; %bb.0: ; %bb
5168; GFX11-NEXT:    s_add_i32 s0, s0, s1
5169; GFX11-NEXT:    v_mov_b32_e32 v1, 15
5170; GFX11-NEXT:    v_add3_u32 v0, s0, v0, 0xffe8
5171; GFX11-NEXT:    scratch_store_b32 v0, v1, off dlc
5172; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5173; GFX11-NEXT:    s_endpgm
5174;
5175; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
5176; GFX12:       ; %bb.0: ; %bb
5177; GFX12-NEXT:    v_mov_b32_e32 v1, 15
5178; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
5179; GFX12-NEXT:    scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
5180; GFX12-NEXT:    s_wait_storecnt 0x0
5181; GFX12-NEXT:    s_endpgm
5182;
5183; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
5184; GFX9-PAL:       ; %bb.0: ; %bb
5185; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
5186; GFX9-PAL-NEXT:    s_mov_b32 s2, s8
5187; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
5188; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
5189; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
5190; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
5191; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
5192; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
5193; GFX9-PAL-NEXT:    s_add_i32 s0, s0, s1
5194; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
5195; GFX9-PAL-NEXT:    v_add_u32_e32 v0, 0xffe8, v0
5196; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
5197; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
5198; GFX9-PAL-NEXT:    s_endpgm
5199;
5200; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
5201; GFX940:       ; %bb.0: ; %bb
5202; GFX940-NEXT:    s_add_i32 s0, s0, s1
5203; GFX940-NEXT:    v_add_u32_e32 v0, s0, v0
5204; GFX940-NEXT:    v_add_u32_e32 v0, 0xffe8, v0
5205; GFX940-NEXT:    v_mov_b32_e32 v1, 15
5206; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
5207; GFX940-NEXT:    s_waitcnt vmcnt(0)
5208; GFX940-NEXT:    s_endpgm
5209;
5210; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
5211; GFX10-PAL:       ; %bb.0: ; %bb
5212; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
5213; GFX10-PAL-NEXT:    s_mov_b32 s2, s8
5214; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
5215; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
5216; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
5217; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s5
5218; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
5219; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
5220; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
5221; GFX10-PAL-NEXT:    s_add_i32 s0, s0, s1
5222; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
5223; GFX10-PAL-NEXT:    v_add3_u32 v0, s0, v0, 0xffe8
5224; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off
5225; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
5226; GFX10-PAL-NEXT:    s_endpgm
5227;
5228; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
5229; GFX11-PAL:       ; %bb.0: ; %bb
5230; GFX11-PAL-NEXT:    s_add_i32 s0, s0, s1
5231; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
5232; GFX11-PAL-NEXT:    v_add3_u32 v0, s0, v0, 0xffe8
5233; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off dlc
5234; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
5235; GFX11-PAL-NEXT:    s_endpgm
5236;
5237; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_large_imm_offset:
5238; GFX12-PAL:       ; %bb.0: ; %bb
5239; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 15
5240; GFX12-PAL-NEXT:    s_add_co_i32 s0, s0, s1
5241; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, s0 offset:65512 scope:SCOPE_SYS
5242; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
5243; GFX12-PAL-NEXT:    s_endpgm
5244bb:
5245  %add1 = add nsw i32 %sidx, %vidx
5246  %add2 = add nsw i32 %add1, 65512
5247  %gep = getelementptr inbounds [33554432 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
5248  store volatile i32 15, ptr addrspace(5) %gep, align 4
5249  ret void
5250}
5251
; sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset: same address shape
; as the large-immediate variant, but with a constant of -16. It does a
; volatile store of the i32 constant 15 to
;   inreg pointer %sgpr_base + inreg index %sidx + index %vidx + (-16)
; using nsw adds and an inbounds getelementptr over a [16 x i8] type.
; In the expected output the gfx9 and gfx940 runs add -16 into the vector
; address (v_add_u32 with -16) and store with no immediate, while the
; gfx10/gfx11/gfx12 runs keep it as offset:-16 on the scratch store.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
5252define amdgpu_gs void @sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset(ptr addrspace(5) inreg %sgpr_base, i32 inreg %sidx, i32 %vidx) {
5253; GFX9-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
5254; GFX9:       ; %bb.0: ; %bb
5255; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
5256; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
5257; GFX9-NEXT:    s_add_i32 s2, s2, s3
5258; GFX9-NEXT:    v_add_u32_e32 v0, s2, v0
5259; GFX9-NEXT:    v_add_u32_e32 v0, -16, v0
5260; GFX9-NEXT:    v_mov_b32_e32 v1, 15
5261; GFX9-NEXT:    scratch_store_dword v0, v1, off
5262; GFX9-NEXT:    s_waitcnt vmcnt(0)
5263; GFX9-NEXT:    s_endpgm
5264;
5265; GFX10-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
5266; GFX10:       ; %bb.0: ; %bb
5267; GFX10-NEXT:    s_add_u32 s0, s0, s5
5268; GFX10-NEXT:    s_addc_u32 s1, s1, 0
5269; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
5270; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
5271; GFX10-NEXT:    v_add3_u32 v0, s2, s3, v0
5272; GFX10-NEXT:    v_mov_b32_e32 v1, 15
5273; GFX10-NEXT:    scratch_store_dword v0, v1, off offset:-16
5274; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
5275; GFX10-NEXT:    s_endpgm
5276;
5277; GFX11-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
5278; GFX11:       ; %bb.0: ; %bb
5279; GFX11-NEXT:    v_add3_u32 v0, s0, s1, v0
5280; GFX11-NEXT:    v_mov_b32_e32 v1, 15
5281; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
5282; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
5283; GFX11-NEXT:    s_endpgm
5284;
5285; GFX12-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
5286; GFX12:       ; %bb.0: ; %bb
5287; GFX12-NEXT:    v_mov_b32_e32 v1, 15
5288; GFX12-NEXT:    s_add_co_i32 s0, s0, s1
5289; GFX12-NEXT:    scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
5290; GFX12-NEXT:    s_wait_storecnt 0x0
5291; GFX12-NEXT:    s_endpgm
5292;
5293; GFX9-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
5294; GFX9-PAL:       ; %bb.0: ; %bb
5295; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
5296; GFX9-PAL-NEXT:    s_mov_b32 s2, s8
5297; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
5298; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
5299; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
5300; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
5301; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
5302; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
5303; GFX9-PAL-NEXT:    s_add_i32 s0, s0, s1
5304; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
5305; GFX9-PAL-NEXT:    v_add_u32_e32 v0, -16, v0
5306; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off
5307; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
5308; GFX9-PAL-NEXT:    s_endpgm
5309;
5310; GFX940-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
5311; GFX940:       ; %bb.0: ; %bb
5312; GFX940-NEXT:    s_add_i32 s0, s0, s1
5313; GFX940-NEXT:    v_add_u32_e32 v0, s0, v0
5314; GFX940-NEXT:    v_add_u32_e32 v0, -16, v0
5315; GFX940-NEXT:    v_mov_b32_e32 v1, 15
5316; GFX940-NEXT:    scratch_store_dword v0, v1, off sc0 sc1
5317; GFX940-NEXT:    s_waitcnt vmcnt(0)
5318; GFX940-NEXT:    s_endpgm
5319;
5320; GFX10-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
5321; GFX10-PAL:       ; %bb.0: ; %bb
5322; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
5323; GFX10-PAL-NEXT:    s_mov_b32 s2, s8
5324; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
5325; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
5326; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
5327; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s5
5328; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
5329; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
5330; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
5331; GFX10-PAL-NEXT:    v_add3_u32 v0, s0, s1, v0
5332; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
5333; GFX10-PAL-NEXT:    scratch_store_dword v0, v1, off offset:-16
5334; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
5335; GFX10-PAL-NEXT:    s_endpgm
5336;
5337; GFX11-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
5338; GFX11-PAL:       ; %bb.0: ; %bb
5339; GFX11-PAL-NEXT:    v_add3_u32 v0, s0, s1, v0
5340; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 15
5341; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:-16 dlc
5342; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
5343; GFX11-PAL-NEXT:    s_endpgm
5344;
5345; GFX12-PAL-LABEL: sgpr_base_plus_sgpr_plus_vgpr_plus_negative_imm_offset:
5346; GFX12-PAL:       ; %bb.0: ; %bb
5347; GFX12-PAL-NEXT:    v_mov_b32_e32 v1, 15
5348; GFX12-PAL-NEXT:    s_add_co_i32 s0, s0, s1
5349; GFX12-PAL-NEXT:    scratch_store_b32 v0, v1, s0 offset:-16 scope:SCOPE_SYS
5350; GFX12-PAL-NEXT:    s_wait_storecnt 0x0
5351; GFX12-PAL-NEXT:    s_endpgm
5352bb:
5353  %add1 = add nsw i32 %sidx, %vidx
5354  %add2 = add nsw i32 %add1, -16
5355  %gep = getelementptr inbounds [16 x i8], ptr addrspace(5) %sgpr_base, i32 0, i32 %add2
5356  store volatile i32 15, ptr addrspace(5) %gep, align 4
5357  ret void
5358}
5359
; sgpr_base_negative_offset: loads an i32 from the inreg private
; (addrspace(5)) pointer %scevgep minus 24 bytes and stores it to the global
; pointer %out. Neither the getelementptr nor the load carries inbounds or
; volatile.
; In the expected output the gfx9 and gfx940 runs apply the offset with
; s_addk_i32 and the immediate 0xffe8 (the 16-bit two's-complement encoding
; of -24) before loading; every other run keeps it as offset:-24 on the
; scratch load itself.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing by hand.
5360define amdgpu_gs void @sgpr_base_negative_offset(ptr addrspace(1) %out, ptr addrspace(5) inreg %scevgep) {
5361; GFX9-LABEL: sgpr_base_negative_offset:
5362; GFX9:       ; %bb.0: ; %entry
5363; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s0, s5
5364; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s1, 0
5365; GFX9-NEXT:    s_addk_i32 s2, 0xffe8
5366; GFX9-NEXT:    scratch_load_dword v2, off, s2
5367; GFX9-NEXT:    s_waitcnt vmcnt(0)
5368; GFX9-NEXT:    global_store_dword v[0:1], v2, off
5369; GFX9-NEXT:    s_endpgm
5370;
5371; GFX10-LABEL: sgpr_base_negative_offset:
5372; GFX10:       ; %bb.0: ; %entry
5373; GFX10-NEXT:    s_add_u32 s0, s0, s5
5374; GFX10-NEXT:    s_addc_u32 s1, s1, 0
5375; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
5376; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
5377; GFX10-NEXT:    scratch_load_dword v2, off, s2 offset:-24
5378; GFX10-NEXT:    s_waitcnt vmcnt(0)
5379; GFX10-NEXT:    global_store_dword v[0:1], v2, off
5380; GFX10-NEXT:    s_endpgm
5381;
5382; GFX11-LABEL: sgpr_base_negative_offset:
5383; GFX11:       ; %bb.0: ; %entry
5384; GFX11-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
5385; GFX11-NEXT:    s_waitcnt vmcnt(0)
5386; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
5387; GFX11-NEXT:    s_endpgm
5388;
5389; GFX12-LABEL: sgpr_base_negative_offset:
5390; GFX12:       ; %bb.0: ; %entry
5391; GFX12-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
5392; GFX12-NEXT:    s_wait_loadcnt 0x0
5393; GFX12-NEXT:    global_store_b32 v[0:1], v2, off
5394; GFX12-NEXT:    s_endpgm
5395;
5396; GFX9-PAL-LABEL: sgpr_base_negative_offset:
5397; GFX9-PAL:       ; %bb.0: ; %entry
5398; GFX9-PAL-NEXT:    s_getpc_b64 s[2:3]
5399; GFX9-PAL-NEXT:    s_mov_b32 s2, s8
5400; GFX9-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
5401; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
5402; GFX9-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
5403; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
5404; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
5405; GFX9-PAL-NEXT:    s_addk_i32 s0, 0xffe8
5406; GFX9-PAL-NEXT:    scratch_load_dword v2, off, s0
5407; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
5408; GFX9-PAL-NEXT:    global_store_dword v[0:1], v2, off
5409; GFX9-PAL-NEXT:    s_endpgm
5410;
5411; GFX940-LABEL: sgpr_base_negative_offset:
5412; GFX940:       ; %bb.0: ; %entry
5413; GFX940-NEXT:    s_addk_i32 s0, 0xffe8
5414; GFX940-NEXT:    scratch_load_dword v2, off, s0
5415; GFX940-NEXT:    s_waitcnt vmcnt(0)
5416; GFX940-NEXT:    global_store_dword v[0:1], v2, off sc0 sc1
5417; GFX940-NEXT:    s_endpgm
5418;
5419; GFX10-PAL-LABEL: sgpr_base_negative_offset:
5420; GFX10-PAL:       ; %bb.0: ; %entry
5421; GFX10-PAL-NEXT:    s_getpc_b64 s[2:3]
5422; GFX10-PAL-NEXT:    s_mov_b32 s2, s8
5423; GFX10-PAL-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
5424; GFX10-PAL-NEXT:    s_waitcnt lgkmcnt(0)
5425; GFX10-PAL-NEXT:    s_and_b32 s3, s3, 0xffff
5426; GFX10-PAL-NEXT:    s_add_u32 s2, s2, s5
5427; GFX10-PAL-NEXT:    s_addc_u32 s3, s3, 0
5428; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
5429; GFX10-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
5430; GFX10-PAL-NEXT:    scratch_load_dword v2, off, s0 offset:-24
5431; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
5432; GFX10-PAL-NEXT:    global_store_dword v[0:1], v2, off
5433; GFX10-PAL-NEXT:    s_endpgm
5434;
5435; GFX11-PAL-LABEL: sgpr_base_negative_offset:
5436; GFX11-PAL:       ; %bb.0: ; %entry
5437; GFX11-PAL-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
5438; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
5439; GFX11-PAL-NEXT:    global_store_b32 v[0:1], v2, off
5440; GFX11-PAL-NEXT:    s_endpgm
5441;
5442; GFX12-PAL-LABEL: sgpr_base_negative_offset:
5443; GFX12-PAL:       ; %bb.0: ; %entry
5444; GFX12-PAL-NEXT:    scratch_load_b32 v2, off, s0 offset:-24
5445; GFX12-PAL-NEXT:    s_wait_loadcnt 0x0
5446; GFX12-PAL-NEXT:    global_store_b32 v[0:1], v2, off
5447; GFX12-PAL-NEXT:    s_endpgm
5448entry:
5449  %scevgep28 = getelementptr i8, ptr addrspace(5) %scevgep, i32 -24
5450  %0 = load i32, ptr addrspace(5) %scevgep28, align 4
5451  store i32 %0, ptr addrspace(1) %out
5452  ret void
5453}
5454
; Intrinsic declarations. llvm.memset.p5.i64 is the memset overload taking an
; addrspace(5) destination pointer and an i64 length; llvm.amdgcn.workitem.id.x
; returns an i32 and takes no arguments. Neither is called by the functions
; immediately above; presumably they are used by tests earlier in this file.
5455declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)
5456declare i32 @llvm.amdgcn.workitem.id.x()
5457