xref: /llvm-project/llvm/test/CodeGen/AMDGPU/ds_read2.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn--amdpal -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope --check-prefix=CI %s
3; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,-unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-ALIGNED %s
4; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+unaligned-access-mode < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9-UNALIGNED %s
5
6; FIXME: We don't get cases where the address was an SGPR because we
7; get a copy to the address register for each one.
8
; Module-level addrspace(3) arrays with undef initializers. @lds (512 floats,
; align 4) is the load source for the f32 tests below; @lds.f64 is the
; 512-element double counterpart (align 8).
9@lds = addrspace(3) global [512 x float] undef, align 4
10@lds.f64 = addrspace(3) global [512 x double] undef, align 8
11
; Baseline case: loads two floats from @lds at element indices %x.i and
; %x.i + 8 (both align 4), adds them, and stores the sum to %out[%x.i].
; The assertions for every run line expect the two loads to be emitted as a
; single ds_read2_b32 with offset1:8.
12define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 {
13; CI-LABEL: simple_read2_f32:
14; CI:       ; %bb.0:
15; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
16; CI-NEXT:    s_mov_b32 m0, -1
17; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:8
18; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
19; CI-NEXT:    s_mov_b32 s3, 0xf000
20; CI-NEXT:    s_mov_b32 s2, 0
21; CI-NEXT:    s_waitcnt lgkmcnt(0)
22; CI-NEXT:    v_add_f32_e32 v2, v1, v2
23; CI-NEXT:    v_mov_b32_e32 v1, 0
24; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
25; CI-NEXT:    s_endpgm
26;
27; GFX9-LABEL: simple_read2_f32:
28; GFX9:       ; %bb.0:
29; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
30; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:8
31; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
32; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
33; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
34; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
35; GFX9-NEXT:    s_endpgm
36  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; First element: @lds[%x.i].
37  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
38  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  ; Second element: 8 floats past the first.
39  %add.x = add nsw i32 %x.i, 8
40  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
41  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
42  %sum = fadd float %val0, %val1
43  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
44  store float %sum, ptr addrspace(1) %out.gep, align 4
45  ret void
46}
47
; Same shape as the baseline test, but the second load is 255 floats past
; the first. The assertions still expect one ds_read2_b32, now with
; offset1:255.
48define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 {
49; CI-LABEL: simple_read2_f32_max_offset:
50; CI:       ; %bb.0:
51; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
52; CI-NEXT:    s_mov_b32 m0, -1
53; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:255
54; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
55; CI-NEXT:    s_mov_b32 s3, 0xf000
56; CI-NEXT:    s_mov_b32 s2, 0
57; CI-NEXT:    s_waitcnt lgkmcnt(0)
58; CI-NEXT:    v_add_f32_e32 v2, v1, v2
59; CI-NEXT:    v_mov_b32_e32 v1, 0
60; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
61; CI-NEXT:    s_endpgm
62;
63; GFX9-LABEL: simple_read2_f32_max_offset:
64; GFX9:       ; %bb.0:
65; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
66; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:255
67; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
68; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
70; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
71; GFX9-NEXT:    s_endpgm
72  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
73  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
74  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  ; Element distance of 255 between the two loads.
75  %add.x = add nsw i32 %x.i, 255
76  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
77  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
78  %sum = fadd float %val0, %val1
79  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
80  store float %sum, ptr addrspace(1) %out.gep, align 4
81  ret void
82}
83
; Negative case: the second load is 257 floats past the first. The
; assertions expect the loads to stay separate: two ds_read_b32
; instructions, the second with byte offset 1028 (257 * 4).
84define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 {
85; CI-LABEL: simple_read2_f32_too_far:
86; CI:       ; %bb.0:
87; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
88; CI-NEXT:    s_mov_b32 m0, -1
89; CI-NEXT:    ds_read_b32 v1, v0
90; CI-NEXT:    ds_read_b32 v2, v0 offset:1028
91; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
92; CI-NEXT:    s_mov_b32 s3, 0xf000
93; CI-NEXT:    s_mov_b32 s2, 0
94; CI-NEXT:    s_waitcnt lgkmcnt(0)
95; CI-NEXT:    v_add_f32_e32 v2, v1, v2
96; CI-NEXT:    v_mov_b32_e32 v1, 0
97; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
98; CI-NEXT:    s_endpgm
99;
100; GFX9-LABEL: simple_read2_f32_too_far:
101; GFX9:       ; %bb.0:
102; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
103; GFX9-NEXT:    ds_read_b32 v1, v0
104; GFX9-NEXT:    ds_read_b32 v2, v0 offset:1028
105; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
106; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
107; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
108; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
109; GFX9-NEXT:    s_endpgm
110  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
111  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
112  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
  ; Element distance of 257; the assertions show no ds_read2_b32 for this case.
113  %add.x = add nsw i32 %x.i, 257
114  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
115  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
116  %sum = fadd float %val0, %val1
117  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
118  store float %sum, ptr addrspace(1) %out.gep, align 4
119  ret void
120}
121
; Four float loads from @lds at element offsets +0, +8, +11 and +27 from the
; workitem id, summed pairwise and then together. The assertions expect two
; ds_read2_b32 instructions: (offset1:8) and (offset0:11 offset1:27).
122define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 {
123; CI-LABEL: simple_read2_f32_x2:
124; CI:       ; %bb.0:
125; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
126; CI-NEXT:    s_mov_b32 m0, -1
127; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:8
128; CI-NEXT:    ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
129; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
130; CI-NEXT:    s_mov_b32 s3, 0xf000
131; CI-NEXT:    s_mov_b32 s2, 0
132; CI-NEXT:    s_waitcnt lgkmcnt(0)
133; CI-NEXT:    v_add_f32_e32 v1, v1, v2
134; CI-NEXT:    v_add_f32_e32 v2, v3, v4
135; CI-NEXT:    v_add_f32_e32 v2, v1, v2
136; CI-NEXT:    v_mov_b32_e32 v1, 0
137; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
138; CI-NEXT:    s_endpgm
139;
140; GFX9-LABEL: simple_read2_f32_x2:
141; GFX9:       ; %bb.0:
142; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
143; GFX9-NEXT:    ds_read2_b32 v[0:1], v4 offset1:8
144; GFX9-NEXT:    ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
145; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
146; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
148; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
149; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
150; GFX9-NEXT:    global_store_dword v4, v0, s[0:1]
151; GFX9-NEXT:    s_endpgm
152  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; First pair: elements +0 and +8.
153  %idx.0 = add nsw i32 %tid.x, 0
154  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
155  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
156
157  %idx.1 = add nsw i32 %tid.x, 8
158  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
159  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
160  %sum.0 = fadd float %val0, %val1
161
  ; Second pair: elements +11 and +27.
162  %idx.2 = add nsw i32 %tid.x, 11
163  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
164  %val2 = load float, ptr addrspace(3) %arrayidx2, align 4
165
166  %idx.3 = add nsw i32 %tid.x, 27
167  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
168  %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
169  %sum.1 = fadd float %val2, %val3
170
171  %sum = fadd float %sum.0, %sum.1
  ; The result is stored at %out[%idx.0], i.e. indexed by the first load's index.
172  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
173  store float %sum, ptr addrspace(1) %out.gep, align 4
174  ret void
175}
176
177; Make sure there is an instruction between the two sets of reads.
; Same four loads as simple_read2_f32_x2, but with a call to
; @llvm.amdgcn.s.barrier between the first and second pair. The assertions
; expect s_waitcnt + s_barrier to sit between the two ds_read2_b32
; instructions.
178define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 {
179; CI-LABEL: simple_read2_f32_x2_barrier:
180; CI:       ; %bb.0:
181; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
182; CI-NEXT:    s_mov_b32 m0, -1
183; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:8
184; CI-NEXT:    s_waitcnt lgkmcnt(0)
185; CI-NEXT:    s_barrier
186; CI-NEXT:    ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
187; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
188; CI-NEXT:    s_mov_b32 s3, 0xf000
189; CI-NEXT:    v_add_f32_e32 v1, v1, v2
190; CI-NEXT:    s_mov_b32 s2, 0
191; CI-NEXT:    s_waitcnt lgkmcnt(0)
192; CI-NEXT:    v_add_f32_e32 v2, v3, v4
193; CI-NEXT:    v_add_f32_e32 v2, v1, v2
194; CI-NEXT:    v_mov_b32_e32 v1, 0
195; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
196; CI-NEXT:    s_endpgm
197;
198; GFX9-LABEL: simple_read2_f32_x2_barrier:
199; GFX9:       ; %bb.0:
200; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
201; GFX9-NEXT:    ds_read2_b32 v[0:1], v4 offset1:8
202; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
203; GFX9-NEXT:    s_barrier
204; GFX9-NEXT:    ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
205; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
206; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
207; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
208; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
209; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
210; GFX9-NEXT:    global_store_dword v4, v0, s[0:1]
211; GFX9-NEXT:    s_endpgm
212  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; First pair (elements +0 and +8), loaded before the barrier.
213  %idx.0 = add nsw i32 %tid.x, 0
214  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
215  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
216
217  %idx.1 = add nsw i32 %tid.x, 8
218  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
219  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
220  %sum.0 = fadd float %val0, %val1
221
  ; Separates the two pairs of loads.
222  call void @llvm.amdgcn.s.barrier() #2
223
  ; Second pair (elements +11 and +27), loaded after the barrier.
224  %idx.2 = add nsw i32 %tid.x, 11
225  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
226  %val2 = load float, ptr addrspace(3) %arrayidx2, align 4
227
228  %idx.3 = add nsw i32 %tid.x, 27
229  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
230  %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
231  %sum.1 = fadd float %val2, %val3
232
233  %sum = fadd float %sum.0, %sum.1
234  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
235  store float %sum, ptr addrspace(1) %out.gep, align 4
236  ret void
237}
238
239; For some reason adding something to the base address for the first
240; element results in only folding the inner pair.
; NOTE(review): the assertions below no longer match that description. They
; expect both pairs to be merged (offset0:2 offset1:8 and offset0:11
; offset1:27), so the remark above looks stale - confirm and update.
;
; Four float loads from @lds at element offsets +2, +8, +11 and +27. The
; result is stored at %out[%tid.x + 2], which appears as offset:8 on the
; store in the assertions.
241define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %out) #0 {
242; CI-LABEL: simple_read2_f32_x2_nonzero_base:
243; CI:       ; %bb.0:
244; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
245; CI-NEXT:    s_mov_b32 m0, -1
246; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset0:2 offset1:8
247; CI-NEXT:    ds_read2_b32 v[3:4], v0 offset0:11 offset1:27
248; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
249; CI-NEXT:    s_mov_b32 s3, 0xf000
250; CI-NEXT:    s_mov_b32 s2, 0
251; CI-NEXT:    s_waitcnt lgkmcnt(0)
252; CI-NEXT:    v_add_f32_e32 v1, v1, v2
253; CI-NEXT:    v_add_f32_e32 v2, v3, v4
254; CI-NEXT:    v_add_f32_e32 v2, v1, v2
255; CI-NEXT:    v_mov_b32_e32 v1, 0
256; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:8
257; CI-NEXT:    s_endpgm
258;
259; GFX9-LABEL: simple_read2_f32_x2_nonzero_base:
260; GFX9:       ; %bb.0:
261; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
262; GFX9-NEXT:    ds_read2_b32 v[0:1], v4 offset0:2 offset1:8
263; GFX9-NEXT:    ds_read2_b32 v[2:3], v4 offset0:11 offset1:27
264; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
265; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
267; GFX9-NEXT:    v_add_f32_e32 v1, v2, v3
268; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
269; GFX9-NEXT:    global_store_dword v4, v0, s[0:1] offset:8
270; GFX9-NEXT:    s_endpgm
271  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; The first element starts at a nonzero offset (+2) from the workitem id.
272  %idx.0 = add nsw i32 %tid.x, 2
273  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.0
274  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
275
276  %idx.1 = add nsw i32 %tid.x, 8
277  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.1
278  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
279  %sum.0 = fadd float %val0, %val1
280
281  %idx.2 = add nsw i32 %tid.x, 11
282  %arrayidx2 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.2
283  %val2 = load float, ptr addrspace(3) %arrayidx2, align 4
284
285  %idx.3 = add nsw i32 %tid.x, 27
286  %arrayidx3 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %idx.3
287  %val3 = load float, ptr addrspace(3) %arrayidx3, align 4
288  %sum.1 = fadd float %val2, %val3
289
290  %sum = fadd float %sum.0, %sum.1
291  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %idx.0
292  store float %sum, ptr addrspace(1) %out.gep, align 4
293  ret void
294}
295
296; Be careful of vectors of pointers. We don't know if the 2 pointers
297; in the vectors are really the same base, so this is not safe to
298; merge.
299; Base pointers come from different subregister of same super
300; register. We can't safely merge this.
; The two loaded addresses are lanes 0 and 1 of a vector GEP over the
; <2 x ptr addrspace(3)> kernel argument %lds.ptr. The assertions expect two
; separate ds_read_b32 instructions, one per base pointer (s2 and s3).
301define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
302; CI-LABEL: read2_ptr_is_subreg_arg_f32:
303; CI:       ; %bb.0:
304; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
305; CI-NEXT:    s_mov_b32 m0, -1
306; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
307; CI-NEXT:    s_waitcnt lgkmcnt(0)
308; CI-NEXT:    v_mov_b32_e32 v1, s2
309; CI-NEXT:    v_mov_b32_e32 v2, s3
310; CI-NEXT:    ds_read_b32 v1, v1 offset:32
311; CI-NEXT:    ds_read_b32 v2, v2
312; CI-NEXT:    s_mov_b32 s3, 0xf000
313; CI-NEXT:    s_mov_b32 s2, 0
314; CI-NEXT:    s_waitcnt lgkmcnt(0)
315; CI-NEXT:    v_add_f32_e32 v2, v1, v2
316; CI-NEXT:    v_mov_b32_e32 v1, 0
317; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
318; CI-NEXT:    s_endpgm
319;
320; GFX9-LABEL: read2_ptr_is_subreg_arg_f32:
321; GFX9:       ; %bb.0:
322; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
323; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
324; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
325; GFX9-NEXT:    v_mov_b32_e32 v1, s2
326; GFX9-NEXT:    v_mov_b32_e32 v2, s3
327; GFX9-NEXT:    ds_read_b32 v1, v1 offset:32
328; GFX9-NEXT:    ds_read_b32 v2, v2
329; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
330; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
331; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
332; GFX9-NEXT:    s_endpgm
333  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; NOTE(review): both insertelements write lane 0, so lane 0 ends up as the
  ; constant 8 and lane 1 stays undef. This matches the assertions (offset:32
  ; on the first read, no offset on the second). Lane 1 may have been the
  ; intended target of the second insert - confirm before changing, since the
  ; assertions would need regenerating.
334  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
335  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
336  %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
337  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
338  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
339  %val0 = load float, ptr addrspace(3) %gep.0, align 4
340  %val1 = load float, ptr addrspace(3) %gep.1, align 4
  ; %add.x has no users in this function.
341  %add.x = add nsw i32 %x.i, 8
342  %sum = fadd float %val0, %val1
343  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
344  store float %sum, ptr addrspace(1) %out.gep, align 4
345  ret void
346}
347
348; Apply a constant scalar offset after the pointer vector extract.  We
349; are rejecting merges that have the same, constant 0 offset, so make
350; sure we are really rejecting it because of the different
351; subregisters.
; The assertions expect two separate ds_read_b32 instructions, each with
; offset:32, reading from distinct base registers (copies of s2 and s3).
352define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
353; CI-LABEL: read2_ptr_is_subreg_arg_offset_f32:
354; CI:       ; %bb.0:
355; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
356; CI-NEXT:    s_mov_b32 m0, -1
357; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
358; CI-NEXT:    s_waitcnt lgkmcnt(0)
359; CI-NEXT:    v_mov_b32_e32 v1, s2
360; CI-NEXT:    v_mov_b32_e32 v2, s3
361; CI-NEXT:    ds_read_b32 v1, v1 offset:32
362; CI-NEXT:    ds_read_b32 v2, v2 offset:32
363; CI-NEXT:    s_mov_b32 s3, 0xf000
364; CI-NEXT:    s_mov_b32 s2, 0
365; CI-NEXT:    s_waitcnt lgkmcnt(0)
366; CI-NEXT:    v_add_f32_e32 v2, v1, v2
367; CI-NEXT:    v_mov_b32_e32 v1, 0
368; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
369; CI-NEXT:    s_endpgm
370;
371; GFX9-LABEL: read2_ptr_is_subreg_arg_offset_f32:
372; GFX9:       ; %bb.0:
373; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
374; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
375; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
376; GFX9-NEXT:    v_mov_b32_e32 v1, s2
377; GFX9-NEXT:    v_mov_b32_e32 v2, s3
378; GFX9-NEXT:    ds_read_b32 v1, v1 offset:32
379; GFX9-NEXT:    ds_read_b32 v2, v2 offset:32
380; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
381; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
382; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
383; GFX9-NEXT:    s_endpgm
384  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; NOTE(review): both insertelements write lane 0, so lane 0 is the constant
  ; 8 and lane 1 stays undef. Confirm whether lane 1 was intended; changing
  ; it requires regenerating the assertions.
385  %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
386  %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
387  %gep = getelementptr inbounds float, <2 x ptr addrspace(3)> %lds.ptr, <2 x i32> %index.1
388  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
389  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
390
391  ; Apply an additional offset after the vector that will be more obviously folded.
392  %gep.1.offset = getelementptr float, ptr addrspace(3) %gep.1, i32 8
393
394  %val0 = load float, ptr addrspace(3) %gep.0, align 4
395  %val1 = load float, ptr addrspace(3) %gep.1.offset, align 4
  ; %add.x has no users in this function.
396  %add.x = add nsw i32 %x.i, 8
397  %sum = fadd float %val0, %val1
398  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
399  store float %sum, ptr addrspace(1) %out.gep, align 4
400  ret void
401}
402
; Pointer-vector variant where both lanes are built from the same global
; (@lds) and the indices are %x.i + <0, 8>. Here the assertions do expect a
; merge: one ds_read2_b32 with offset1:8.
403define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 {
404; CI-LABEL: read2_ptr_is_subreg_f32:
405; CI:       ; %bb.0:
406; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
407; CI-NEXT:    s_mov_b32 m0, -1
408; CI-NEXT:    ds_read2_b32 v[1:2], v0 offset1:8
409; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
410; CI-NEXT:    s_mov_b32 s3, 0xf000
411; CI-NEXT:    s_mov_b32 s2, 0
412; CI-NEXT:    s_waitcnt lgkmcnt(0)
413; CI-NEXT:    v_add_f32_e32 v2, v1, v2
414; CI-NEXT:    v_mov_b32_e32 v1, 0
415; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
416; CI-NEXT:    s_endpgm
417;
418; GFX9-LABEL: read2_ptr_is_subreg_f32:
419; GFX9:       ; %bb.0:
420; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
421; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:8
422; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
423; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
424; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
425; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
426; GFX9-NEXT:    s_endpgm
427  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  ; Both pointer lanes are @lds.
428  %ptr.0 = insertelement <2 x ptr addrspace(3)> undef, ptr addrspace(3) @lds, i32 0
429  %ptr.1 = insertelement <2 x ptr addrspace(3)> %ptr.0, ptr addrspace(3) @lds, i32 1
  ; Splat %x.i into both lanes, then add <0, 8> to form the element indices.
430  %x.i.v.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
431  %x.i.v.1 = insertelement <2 x i32> %x.i.v.0, i32 %x.i, i32 1
432  %idx = add <2 x i32> %x.i.v.1, <i32 0, i32 8>
433  %gep = getelementptr inbounds [512 x float], <2 x ptr addrspace(3)> %ptr.1, <2 x i32> <i32 0, i32 0>, <2 x i32> %idx
434  %gep.0 = extractelement <2 x ptr addrspace(3)> %gep, i32 0
435  %gep.1 = extractelement <2 x ptr addrspace(3)> %gep, i32 1
436  %val0 = load float, ptr addrspace(3) %gep.0, align 4
437  %val1 = load float, ptr addrspace(3) %gep.1, align 4
  ; %add.x has no users in this function.
438  %add.x = add nsw i32 %x.i, 8
439  %sum = fadd float %val0, %val1
440  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
441  store float %sum, ptr addrspace(1) %out.gep, align 4
442  ret void
443}
444
; Same addresses as simple_read2_f32 (elements %x.i and %x.i + 8), but the
; first load is volatile. The assertions expect the loads to stay as two
; separate ds_read_b32 instructions (second at offset:32).
445define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 {
446; CI-LABEL: simple_read2_f32_volatile_0:
447; CI:       ; %bb.0:
448; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
449; CI-NEXT:    s_mov_b32 m0, -1
450; CI-NEXT:    ds_read_b32 v1, v0
451; CI-NEXT:    ds_read_b32 v2, v0 offset:32
452; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
453; CI-NEXT:    s_mov_b32 s3, 0xf000
454; CI-NEXT:    s_mov_b32 s2, 0
455; CI-NEXT:    s_waitcnt lgkmcnt(0)
456; CI-NEXT:    v_add_f32_e32 v2, v1, v2
457; CI-NEXT:    v_mov_b32_e32 v1, 0
458; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
459; CI-NEXT:    s_endpgm
460;
461; GFX9-LABEL: simple_read2_f32_volatile_0:
462; GFX9:       ; %bb.0:
463; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
464; GFX9-NEXT:    ds_read_b32 v1, v0
465; GFX9-NEXT:    ds_read_b32 v2, v0 offset:32
466; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
467; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
468; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
469; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
470; GFX9-NEXT:    s_endpgm
471  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
472  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
  ; Volatile load: this is the access under test.
473  %val0 = load volatile float, ptr addrspace(3) %arrayidx0, align 4
474  %add.x = add nsw i32 %x.i, 8
475  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
476  %val1 = load float, ptr addrspace(3) %arrayidx1, align 4
477  %sum = fadd float %val0, %val1
478  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
479  store float %sum, ptr addrspace(1) %out.gep, align 4
480  ret void
481}
482
; Same addresses as simple_read2_f32 (elements %x.i and %x.i + 8), but the
; second load is volatile. The assertions expect the loads to stay as two
; separate ds_read_b32 instructions (second at offset:32).
483define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 {
484; CI-LABEL: simple_read2_f32_volatile_1:
485; CI:       ; %bb.0:
486; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
487; CI-NEXT:    s_mov_b32 m0, -1
488; CI-NEXT:    ds_read_b32 v1, v0
489; CI-NEXT:    ds_read_b32 v2, v0 offset:32
490; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
491; CI-NEXT:    s_mov_b32 s3, 0xf000
492; CI-NEXT:    s_mov_b32 s2, 0
493; CI-NEXT:    s_waitcnt lgkmcnt(0)
494; CI-NEXT:    v_add_f32_e32 v2, v1, v2
495; CI-NEXT:    v_mov_b32_e32 v1, 0
496; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
497; CI-NEXT:    s_endpgm
498;
499; GFX9-LABEL: simple_read2_f32_volatile_1:
500; GFX9:       ; %bb.0:
501; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
502; GFX9-NEXT:    ds_read_b32 v1, v0
503; GFX9-NEXT:    ds_read_b32 v2, v0 offset:32
504; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
505; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
506; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
507; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
508; GFX9-NEXT:    s_endpgm
509  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
510  %arrayidx0 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %x.i
511  %val0 = load float, ptr addrspace(3) %arrayidx0, align 4
512  %add.x = add nsw i32 %x.i, 8
513  %arrayidx1 = getelementptr inbounds [512 x float], ptr addrspace(3) @lds, i32 0, i32 %add.x
  ; Volatile load: this is the access under test.
514  %val1 = load volatile float, ptr addrspace(3) %arrayidx1, align 4
515  %sum = fadd float %val0, %val1
516  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
517  store float %sum, ptr addrspace(1) %out.gep, align 4
518  ret void
519}
520
521; Can't fold when unaligned access is unavailable, since both loads are only align 1.
; Expected output differs per run line:
;  - bonaire, and gfx900 with -unaligned-access-mode: each float is assembled
;    from four ds_read_u8 byte reads (eight in total).
;  - gfx900 with +unaligned-access-mode: the loads are still merged into one
;    ds_read2_b32 with offset1:8.
; The LDS base pointer is the kernel argument %lds rather than a global.
522define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
523; CI-LABEL: unaligned_read2_f32:
524; CI:       ; %bb.0:
525; CI-NEXT:    s_load_dword s0, s[4:5], 0x2
526; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
527; CI-NEXT:    s_mov_b32 m0, -1
528; CI-NEXT:    s_mov_b32 s3, 0xf000
529; CI-NEXT:    s_mov_b32 s2, 0
530; CI-NEXT:    s_waitcnt lgkmcnt(0)
531; CI-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
532; CI-NEXT:    ds_read_u8 v2, v1 offset:34
533; CI-NEXT:    ds_read_u8 v3, v1 offset:32
534; CI-NEXT:    ds_read_u8 v4, v1 offset:3
535; CI-NEXT:    ds_read_u8 v5, v1 offset:2
536; CI-NEXT:    ds_read_u8 v6, v1 offset:1
537; CI-NEXT:    ds_read_u8 v7, v1
538; CI-NEXT:    ds_read_u8 v8, v1 offset:33
539; CI-NEXT:    ds_read_u8 v1, v1 offset:35
540; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
541; CI-NEXT:    s_waitcnt lgkmcnt(0)
542; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
543; CI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
544; CI-NEXT:    v_or_b32_e32 v4, v4, v5
545; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
546; CI-NEXT:    v_lshlrev_b32_e32 v5, 8, v8
547; CI-NEXT:    v_or_b32_e32 v1, v1, v2
548; CI-NEXT:    v_or_b32_e32 v6, v6, v7
549; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
550; CI-NEXT:    v_or_b32_e32 v3, v5, v3
551; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
552; CI-NEXT:    v_or_b32_e32 v4, v4, v6
553; CI-NEXT:    v_or_b32_e32 v1, v1, v3
554; CI-NEXT:    v_add_f32_e32 v2, v4, v1
555; CI-NEXT:    v_mov_b32_e32 v1, 0
556; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
557; CI-NEXT:    s_endpgm
558;
559; GFX9-ALIGNED-LABEL: unaligned_read2_f32:
560; GFX9-ALIGNED:       ; %bb.0:
561; GFX9-ALIGNED-NEXT:    s_load_dword s2, s[4:5], 0x8
562; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
563; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
564; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
565; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v1, s2, v0
566; GFX9-ALIGNED-NEXT:    ds_read_u8 v2, v1
567; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v1 offset:1
568; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v1 offset:2
569; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v1 offset:3
570; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v1 offset:32
571; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v1 offset:33
572; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v1 offset:34
573; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v1 offset:35
574; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(6)
575; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
576; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(4)
577; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
578; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
579; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(2)
580; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
581; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
582; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 8, v8
583; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
584; GFX9-ALIGNED-NEXT:    v_add_f32_e32 v1, v2, v1
585; GFX9-ALIGNED-NEXT:    global_store_dword v0, v1, s[0:1]
586; GFX9-ALIGNED-NEXT:    s_endpgm
587;
588; GFX9-UNALIGNED-LABEL: unaligned_read2_f32:
589; GFX9-UNALIGNED:       ; %bb.0:
590; GFX9-UNALIGNED-NEXT:    s_load_dword s0, s[4:5], 0x8
591; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
592; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
593; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v0, s0, v2
594; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:8
595; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
596; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
597; GFX9-UNALIGNED-NEXT:    v_add_f32_e32 v0, v0, v1
598; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
599; GFX9-UNALIGNED-NEXT:    s_endpgm
600  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
601  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
  ; Both loads are declared align 1; that under-alignment is what is being tested.
602  %val0 = load float, ptr addrspace(3) %arrayidx0, align 1
603  %add.x = add nsw i32 %x.i, 8
604  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x
605  %val1 = load float, ptr addrspace(3) %arrayidx1, align 1
606  %sum = fadd float %val0, %val1
607  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
608  store float %sum, ptr addrspace(1) %out.gep, align 4
609  ret void
610}
611
; Two align-1 float loads at byte offsets 5 and 9 from %lds + 4 * %x.i, i.e.
; adjacent floats starting at an odd byte offset. Expected output per run
; line:
;  - bonaire, and gfx900 with -unaligned-access-mode: eight ds_read_u8 byte
;    reads covering offsets 5..12.
;  - gfx900 with +unaligned-access-mode: a single ds_read_b64 at offset:5.
612define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
613; CI-LABEL: unaligned_offset_read2_f32:
614; CI:       ; %bb.0:
615; CI-NEXT:    s_load_dword s0, s[4:5], 0x2
616; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
617; CI-NEXT:    s_mov_b32 m0, -1
618; CI-NEXT:    s_mov_b32 s3, 0xf000
619; CI-NEXT:    s_mov_b32 s2, 0
620; CI-NEXT:    s_waitcnt lgkmcnt(0)
621; CI-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
622; CI-NEXT:    ds_read_u8 v2, v1 offset:11
623; CI-NEXT:    ds_read_u8 v3, v1 offset:9
624; CI-NEXT:    ds_read_u8 v4, v1 offset:8
625; CI-NEXT:    ds_read_u8 v5, v1 offset:7
626; CI-NEXT:    ds_read_u8 v6, v1 offset:6
627; CI-NEXT:    ds_read_u8 v7, v1 offset:5
628; CI-NEXT:    ds_read_u8 v8, v1 offset:10
629; CI-NEXT:    ds_read_u8 v1, v1 offset:12
630; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
631; CI-NEXT:    s_waitcnt lgkmcnt(0)
632; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
633; CI-NEXT:    v_lshlrev_b32_e32 v6, 8, v6
634; CI-NEXT:    v_or_b32_e32 v4, v4, v5
635; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
636; CI-NEXT:    v_lshlrev_b32_e32 v5, 8, v8
637; CI-NEXT:    v_or_b32_e32 v1, v1, v2
638; CI-NEXT:    v_or_b32_e32 v6, v6, v7
639; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
640; CI-NEXT:    v_or_b32_e32 v3, v5, v3
641; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
642; CI-NEXT:    v_or_b32_e32 v4, v4, v6
643; CI-NEXT:    v_or_b32_e32 v1, v1, v3
644; CI-NEXT:    v_add_f32_e32 v2, v4, v1
645; CI-NEXT:    v_mov_b32_e32 v1, 0
646; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
647; CI-NEXT:    s_endpgm
648;
649; GFX9-ALIGNED-LABEL: unaligned_offset_read2_f32:
650; GFX9-ALIGNED:       ; %bb.0:
651; GFX9-ALIGNED-NEXT:    s_load_dword s2, s[4:5], 0x8
652; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
653; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
654; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
655; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v1, s2, v0
656; GFX9-ALIGNED-NEXT:    ds_read_u8 v2, v1 offset:5
657; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v1 offset:6
658; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v1 offset:7
659; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v1 offset:8
660; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v1 offset:9
661; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v1 offset:10
662; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v1 offset:11
663; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v1 offset:12
664; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(6)
665; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 8, v2
666; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(4)
667; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v5, 8, v4
668; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
669; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(2)
670; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v3, v7, 8, v6
671; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
672; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 8, v8
673; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 16, v3
674; GFX9-ALIGNED-NEXT:    v_add_f32_e32 v1, v2, v1
675; GFX9-ALIGNED-NEXT:    global_store_dword v0, v1, s[0:1]
676; GFX9-ALIGNED-NEXT:    s_endpgm
677;
678; GFX9-UNALIGNED-LABEL: unaligned_offset_read2_f32:
679; GFX9-UNALIGNED:       ; %bb.0:
680; GFX9-UNALIGNED-NEXT:    s_load_dword s0, s[4:5], 0x8
681; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
682; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
683; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v0, s0, v2
684; GFX9-UNALIGNED-NEXT:    ds_read_b64 v[0:1], v0 offset:5
685; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
686; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
687; GFX9-UNALIGNED-NEXT:    v_add_f32_e32 v0, v0, v1
688; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
689; GFX9-UNALIGNED-NEXT:    s_endpgm
690  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
691  %base = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
  ; Byte-granular GEPs: the two floats start 5 and 9 bytes past %base.
692  %addr0.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 5
693  %val0 = load float, ptr addrspace(3) %addr0.i8, align 1
694  %addr1.i8 = getelementptr inbounds i8, ptr addrspace(3) %base, i32 9
695  %val1 = load float, ptr addrspace(3) %addr1.i8, align 1
696  %sum = fadd float %val0, %val1
697  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
698  store float %sum, ptr addrspace(1) %out.gep, align 4
699  ret void
700}
701
; Two float loads from %lds at element indices x and x+8, both marked align 2.
; Expected output per the checks below: the CI run and the GFX9 run with
; -unaligned-access-mode split each float into two ds_read_u16 and reassemble
; it, while the GFX9 run with +unaligned-access-mode merges both loads into a
; single ds_read2_b32 with offset1:8.
702define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
703; CI-LABEL: misaligned_2_simple_read2_f32:
704; CI:       ; %bb.0:
705; CI-NEXT:    s_load_dword s0, s[4:5], 0x2
706; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
707; CI-NEXT:    s_mov_b32 m0, -1
708; CI-NEXT:    s_mov_b32 s3, 0xf000
709; CI-NEXT:    s_mov_b32 s2, 0
710; CI-NEXT:    s_waitcnt lgkmcnt(0)
711; CI-NEXT:    v_add_i32_e32 v1, vcc, s0, v0
712; CI-NEXT:    ds_read_u16 v2, v1 offset:32
713; CI-NEXT:    ds_read_u16 v3, v1 offset:2
714; CI-NEXT:    ds_read_u16 v4, v1
715; CI-NEXT:    ds_read_u16 v1, v1 offset:34
716; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
717; CI-NEXT:    s_waitcnt lgkmcnt(0)
718; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
719; CI-NEXT:    v_or_b32_e32 v3, v3, v4
720; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
721; CI-NEXT:    v_or_b32_e32 v1, v1, v2
722; CI-NEXT:    v_add_f32_e32 v2, v3, v1
723; CI-NEXT:    v_mov_b32_e32 v1, 0
724; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
725; CI-NEXT:    s_endpgm
726;
727; GFX9-ALIGNED-LABEL: misaligned_2_simple_read2_f32:
728; GFX9-ALIGNED:       ; %bb.0:
729; GFX9-ALIGNED-NEXT:    s_load_dword s0, s[4:5], 0x8
730; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
731; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
732; GFX9-ALIGNED-NEXT:    v_add_u32_e32 v1, s0, v0
733; GFX9-ALIGNED-NEXT:    ds_read_u16 v2, v1
734; GFX9-ALIGNED-NEXT:    ds_read_u16 v3, v1 offset:2
735; GFX9-ALIGNED-NEXT:    ds_read_u16 v4, v1 offset:32
736; GFX9-ALIGNED-NEXT:    ds_read_u16 v1, v1 offset:34
737; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
738; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
739; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
740; GFX9-ALIGNED-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
741; GFX9-ALIGNED-NEXT:    v_add_f32_e32 v1, v2, v1
742; GFX9-ALIGNED-NEXT:    global_store_dword v0, v1, s[0:1]
743; GFX9-ALIGNED-NEXT:    s_endpgm
744;
745; GFX9-UNALIGNED-LABEL: misaligned_2_simple_read2_f32:
746; GFX9-UNALIGNED:       ; %bb.0:
747; GFX9-UNALIGNED-NEXT:    s_load_dword s0, s[4:5], 0x8
748; GFX9-UNALIGNED-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
749; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
750; GFX9-UNALIGNED-NEXT:    v_add_u32_e32 v0, s0, v2
751; GFX9-UNALIGNED-NEXT:    ds_read2_b32 v[0:1], v0 offset1:8
752; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
753; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
754; GFX9-UNALIGNED-NEXT:    v_add_f32_e32 v0, v0, v1
755; GFX9-UNALIGNED-NEXT:    global_store_dword v2, v0, s[0:1]
756; GFX9-UNALIGNED-NEXT:    s_endpgm
757  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
758  %arrayidx0 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %x.i
759  %val0 = load float, ptr addrspace(3) %arrayidx0, align 2
760  %add.x = add nsw i32 %x.i, 8
761  %arrayidx1 = getelementptr inbounds float, ptr addrspace(3) %lds, i32 %add.x
762  %val1 = load float, ptr addrspace(3) %arrayidx1, align 2
763  %sum = fadd float %val0, %val1
764  %out.gep = getelementptr inbounds float, ptr addrspace(1) %out, i32 %x.i
765  store float %sum, ptr addrspace(1) %out.gep, align 4
766  ret void
767}
768
; Two naturally aligned (align 8) double loads from @lds.f64 at element
; indices x and x+8. The checks expect both runs to merge them into a single
; ds_read2_b64 with offset1:8, then add the values and store the sum to
; %out[x].
769define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 {
770; CI-LABEL: simple_read2_f64:
771; CI:       ; %bb.0:
772; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
773; CI-NEXT:    s_mov_b32 m0, -1
774; CI-NEXT:    ds_read2_b64 v[0:3], v4 offset1:8
775; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
776; CI-NEXT:    s_mov_b32 s3, 0xf000
777; CI-NEXT:    s_mov_b32 s2, 0
778; CI-NEXT:    v_mov_b32_e32 v5, 0
779; CI-NEXT:    s_waitcnt lgkmcnt(0)
780; CI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
781; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
782; CI-NEXT:    s_endpgm
783;
784; GFX9-LABEL: simple_read2_f64:
785; GFX9:       ; %bb.0:
786; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
787; GFX9-NEXT:    ds_read2_b64 v[0:3], v4 offset1:8
788; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
789; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
790; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
791; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
792; GFX9-NEXT:    s_endpgm
793  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
794  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
795  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
796  %add.x = add nsw i32 %x.i, 8
797  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
798  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
799  %sum = fadd double %val0, %val1
800  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
801  store double %sum, ptr addrspace(1) %out.gep, align 8
802  ret void
803}
804
; Same shape as simple_read2_f64, but the second double is 255 elements past
; the first. The checks expect the pair to still merge into one ds_read2_b64,
; now with offset1:255 (the "max offset" case named by this test; compare
; with simple_read2_f64_too_far, which uses a distance of 257).
805define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 {
806; CI-LABEL: simple_read2_f64_max_offset:
807; CI:       ; %bb.0:
808; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
809; CI-NEXT:    s_mov_b32 m0, -1
810; CI-NEXT:    ds_read2_b64 v[0:3], v4 offset1:255
811; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
812; CI-NEXT:    s_mov_b32 s3, 0xf000
813; CI-NEXT:    s_mov_b32 s2, 0
814; CI-NEXT:    v_mov_b32_e32 v5, 0
815; CI-NEXT:    s_waitcnt lgkmcnt(0)
816; CI-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
817; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
818; CI-NEXT:    s_endpgm
819;
820; GFX9-LABEL: simple_read2_f64_max_offset:
821; GFX9:       ; %bb.0:
822; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
823; GFX9-NEXT:    ds_read2_b64 v[0:3], v4 offset1:255
824; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
825; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
826; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
827; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
828; GFX9-NEXT:    s_endpgm
829  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
830  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
831  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
832  %add.x = add nsw i32 %x.i, 255
833  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
834  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
835  %sum = fadd double %val0, %val1
836  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
837  store double %sum, ptr addrspace(1) %out.gep, align 8
838  ret void
839}
840
; Negative case: the second double is 257 elements (257 * 8 = 2056 bytes) past
; the first. The checks expect no read2 merge here; both runs emit two
; separate ds_read_b64 instructions, the second with offset:2056.
841define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 {
842; CI-LABEL: simple_read2_f64_too_far:
843; CI:       ; %bb.0:
844; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
845; CI-NEXT:    s_mov_b32 m0, -1
846; CI-NEXT:    ds_read_b64 v[1:2], v0
847; CI-NEXT:    ds_read_b64 v[3:4], v0 offset:2056
848; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
849; CI-NEXT:    s_mov_b32 s3, 0xf000
850; CI-NEXT:    s_mov_b32 s2, 0
851; CI-NEXT:    s_waitcnt lgkmcnt(0)
852; CI-NEXT:    v_add_f64 v[2:3], v[1:2], v[3:4]
853; CI-NEXT:    v_mov_b32_e32 v1, 0
854; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
855; CI-NEXT:    s_endpgm
856;
857; GFX9-LABEL: simple_read2_f64_too_far:
858; GFX9:       ; %bb.0:
859; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
860; GFX9-NEXT:    ds_read_b64 v[0:1], v4
861; GFX9-NEXT:    ds_read_b64 v[2:3], v4 offset:2056
862; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
863; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
864; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
865; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
866; GFX9-NEXT:    s_endpgm
867  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
868  %arrayidx0 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %x.i
869  %val0 = load double, ptr addrspace(3) %arrayidx0, align 8
870  %add.x = add nsw i32 %x.i, 257
871  %arrayidx1 = getelementptr inbounds [512 x double], ptr addrspace(3) @lds.f64, i32 0, i32 %add.x
872  %val1 = load double, ptr addrspace(3) %arrayidx1, align 8
873  %sum = fadd double %val0, %val1
874  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
875  store double %sum, ptr addrspace(1) %out.gep, align 8
876  ret void
877}
878
879; Alignment only 4
; Two double loads from %lds at element indices x and x+7, each marked
; align 4. The checks expect every double to be read as a pair of dwords:
; one ds_read2_b32 with offset1:1 for the first value and one with
; offset0:14 offset1:15 (7 * 8 bytes = 14 dwords) for the second.
880define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
881; CI-LABEL: misaligned_read2_f64:
882; CI:       ; %bb.0:
883; CI-NEXT:    s_load_dword s0, s[4:5], 0x2
884; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
885; CI-NEXT:    s_mov_b32 m0, -1
886; CI-NEXT:    s_mov_b32 s3, 0xf000
887; CI-NEXT:    s_mov_b32 s2, 0
888; CI-NEXT:    s_waitcnt lgkmcnt(0)
889; CI-NEXT:    v_add_i32_e32 v3, vcc, s0, v0
890; CI-NEXT:    ds_read2_b32 v[1:2], v3 offset1:1
891; CI-NEXT:    ds_read2_b32 v[3:4], v3 offset0:14 offset1:15
892; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
893; CI-NEXT:    s_waitcnt lgkmcnt(0)
894; CI-NEXT:    v_add_f64 v[2:3], v[1:2], v[3:4]
895; CI-NEXT:    v_mov_b32_e32 v1, 0
896; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
897; CI-NEXT:    s_endpgm
898;
899; GFX9-LABEL: misaligned_read2_f64:
900; GFX9:       ; %bb.0:
901; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x8
902; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
903; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
904; GFX9-NEXT:    v_add_u32_e32 v2, s0, v4
905; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
906; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset0:14 offset1:15
907; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
908; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
909; GFX9-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
910; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
911; GFX9-NEXT:    s_endpgm
912  %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
913  %arrayidx0 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %x.i
914  %val0 = load double, ptr addrspace(3) %arrayidx0, align 4
915  %add.x = add nsw i32 %x.i, 7
916  %arrayidx1 = getelementptr inbounds double, ptr addrspace(3) %lds, i32 %add.x
917  %val1 = load double, ptr addrspace(3) %arrayidx1, align 4
918  %sum = fadd double %val0, %val1
919  %out.gep = getelementptr inbounds double, ptr addrspace(1) %out, i32 %x.i
920  store double %sum, ptr addrspace(1) %out.gep, align 4
921  ret void
922}
923
; addrspace(3) array of four i32, 4-byte aligned. It is read through
; constant-index getelementptr expressions by the load_constant_*_offsets
; tests that follow.
924@foo = addrspace(3) global [4 x i32] undef, align 4
925
; Loads @foo[0] and @foo[1] through constant addresses and stores their
; integer sum to %out. The checks expect the two adjacent dwords to be
; fetched with a single ds_read_b64 from a zero base register.
926define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) {
927; CI-LABEL: load_constant_adjacent_offsets:
928; CI:       ; %bb.0:
929; CI-NEXT:    v_mov_b32_e32 v0, 0
930; CI-NEXT:    s_mov_b32 m0, -1
931; CI-NEXT:    ds_read_b64 v[0:1], v0
932; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
933; CI-NEXT:    s_mov_b32 s3, 0xf000
934; CI-NEXT:    s_mov_b32 s2, -1
935; CI-NEXT:    s_waitcnt lgkmcnt(0)
936; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
937; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
938; CI-NEXT:    s_endpgm
939;
940; GFX9-LABEL: load_constant_adjacent_offsets:
941; GFX9:       ; %bb.0:
942; GFX9-NEXT:    v_mov_b32_e32 v2, 0
943; GFX9-NEXT:    ds_read_b64 v[0:1], v2
944; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
945; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
946; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
947; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
948; GFX9-NEXT:    s_endpgm
949  %val0 = load i32, ptr addrspace(3) @foo, align 4
950  %val1 = load i32, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 1), align 4
951  %sum = add i32 %val0, %val1
952  store i32 %sum, ptr addrspace(1) %out, align 4
953  ret void
954}
955
; Loads @foo[0] and @foo[2] (non-adjacent, constant addresses) and stores
; their integer sum to %out. The checks expect the pair to be merged into one
; ds_read2_b32 with offset1:2.
956define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) {
957; CI-LABEL: load_constant_disjoint_offsets:
958; CI:       ; %bb.0:
959; CI-NEXT:    v_mov_b32_e32 v0, 0
960; CI-NEXT:    s_mov_b32 m0, -1
961; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:2
962; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
963; CI-NEXT:    s_mov_b32 s3, 0xf000
964; CI-NEXT:    s_mov_b32 s2, -1
965; CI-NEXT:    s_waitcnt lgkmcnt(0)
966; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
967; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
968; CI-NEXT:    s_endpgm
969;
970; GFX9-LABEL: load_constant_disjoint_offsets:
971; GFX9:       ; %bb.0:
972; GFX9-NEXT:    v_mov_b32_e32 v2, 0
973; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:2
974; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
975; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
976; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
977; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
978; GFX9-NEXT:    s_endpgm
979  %val0 = load i32, ptr addrspace(3) @foo, align 4
980  %val1 = load i32, ptr addrspace(3) getelementptr inbounds ([4 x i32], ptr addrspace(3) @foo, i32 0, i32 2), align 4
981  %sum = add i32 %val0, %val1
982  store i32 %sum, ptr addrspace(1) %out, align 4
983  ret void
984}
985
; addrspace(3) array of four i64 declared with only 4-byte alignment, i.e.
; less than the 8-byte size of its elements. Read by
; load_misaligned64_constant_offsets.
986@bar = addrspace(3) global [4 x i64] undef, align 4
987
; Loads the i64 elements @bar[0] and @bar[1] with align 4 and stores their
; 64-bit sum to %out. The checks expect both runs to fetch the 16 contiguous
; bytes with a single ds_read_b128 and then perform the add as a 32-bit
; add / add-with-carry pair.
988define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) %out) {
989; CI-LABEL: load_misaligned64_constant_offsets:
990; CI:       ; %bb.0:
991; CI-NEXT:    v_mov_b32_e32 v0, 0
992; CI-NEXT:    s_mov_b32 m0, -1
993; CI-NEXT:    ds_read_b128 v[0:3], v0
994; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
995; CI-NEXT:    s_mov_b32 s3, 0xf000
996; CI-NEXT:    s_mov_b32 s2, -1
997; CI-NEXT:    s_waitcnt lgkmcnt(0)
998; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
999; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1000; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1001; CI-NEXT:    s_endpgm
1002;
1003; GFX9-LABEL: load_misaligned64_constant_offsets:
1004; GFX9:       ; %bb.0:
1005; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1006; GFX9-NEXT:    ds_read_b128 v[0:3], v4
1007; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1008; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1009; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1010; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
1011; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
1012; GFX9-NEXT:    s_endpgm
1013  %val0 = load i64, ptr addrspace(3) @bar, align 4
1014  %val1 = load i64, ptr addrspace(3) getelementptr inbounds ([4 x i64], ptr addrspace(3) @bar, i32 0, i32 1), align 4
1015  %sum = add i64 %val0, %val1
1016  store i64 %sum, ptr addrspace(1) %out, align 8
1017  ret void
1018}
1019
; addrspace(3) array of 4096 i64 with only 4-byte alignment. Its size lets
; load_misaligned64_constant_large_offsets address elements 2048 and 4095,
; which sit at byte offsets 16384 and 32760.
1020@bar.large = addrspace(3) global [4096 x i64] undef, align 4
1021
; Loads @bar.large[2048] and @bar.large[4095] with align 4 and stores their
; 64-bit sum to %out. The checks expect two independent ds_read_b64
; instructions at byte offsets 16384 (2048 * 8) and 32760 (4095 * 8); the
; loads are not combined into a read2 form.
1022define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspace(1) %out) {
1023; CI-LABEL: load_misaligned64_constant_large_offsets:
1024; CI:       ; %bb.0:
1025; CI-NEXT:    v_mov_b32_e32 v2, 0
1026; CI-NEXT:    s_mov_b32 m0, -1
1027; CI-NEXT:    ds_read_b64 v[0:1], v2 offset:16384
1028; CI-NEXT:    ds_read_b64 v[2:3], v2 offset:32760
1029; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1030; CI-NEXT:    s_mov_b32 s3, 0xf000
1031; CI-NEXT:    s_mov_b32 s2, -1
1032; CI-NEXT:    s_waitcnt lgkmcnt(0)
1033; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
1034; CI-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
1035; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1036; CI-NEXT:    s_endpgm
1037;
1038; GFX9-LABEL: load_misaligned64_constant_large_offsets:
1039; GFX9:       ; %bb.0:
1040; GFX9-NEXT:    v_mov_b32_e32 v4, 0
1041; GFX9-NEXT:    ds_read_b64 v[0:1], v4 offset:16384
1042; GFX9-NEXT:    ds_read_b64 v[2:3], v4 offset:32760
1043; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1044; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
1046; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
1047; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
1048; GFX9-NEXT:    s_endpgm
1049  %val0 = load i64, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 2048), align 4
1050  %val1 = load i64, ptr addrspace(3) getelementptr inbounds ([4096 x i64], ptr addrspace(3) @bar.large, i32 0, i32 4095), align 4
1051  %sum = add i64 %val0, %val1
1052  store i64 %sum, ptr addrspace(1) %out, align 8
1053  ret void
1054}
1055
; Two internal addrspace(3) float arrays (264 and 776 elements, align 4) read
; by sgemm_inner_loop_read2_sequence: @sgemm.lA is indexed from the workgroup
; id and @sgemm.lB from the workitem y id.
1056@sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
1057@sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
1058
; Ten float loads summed in sequence and stored to %C:
;   - four from @sgemm.lA at workgroup-id-x + {0, 1, 16, 17}
;   - six from @sgemm.lB at workitem-id-y + {0, 1, 32, 33, 64, 65}
; The checks expect each adjacent pair (n, n+1) to become one ds_read2_b32,
; five in total. The %lda and %ldb arguments are not referenced in the body.
1059define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 {
1060; CI-LABEL: sgemm_inner_loop_read2_sequence:
1061; CI:       ; %bb.0:
1062; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1063; CI-NEXT:    s_lshl_b32 s4, s8, 2
1064; CI-NEXT:    s_add_i32 s5, s4, 0xc20
1065; CI-NEXT:    s_addk_i32 s4, 0xc60
1066; CI-NEXT:    v_mov_b32_e32 v0, s5
1067; CI-NEXT:    v_mov_b32_e32 v2, s4
1068; CI-NEXT:    v_lshlrev_b32_e32 v8, 2, v1
1069; CI-NEXT:    s_mov_b32 m0, -1
1070; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1071; CI-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
1072; CI-NEXT:    ds_read2_b32 v[4:5], v8 offset1:1
1073; CI-NEXT:    ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
1074; CI-NEXT:    ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
1075; CI-NEXT:    s_waitcnt lgkmcnt(0)
1076; CI-NEXT:    v_add_f32_e32 v0, v0, v1
1077; CI-NEXT:    v_add_f32_e32 v0, v0, v2
1078; CI-NEXT:    v_add_f32_e32 v0, v0, v3
1079; CI-NEXT:    v_add_f32_e32 v0, v0, v4
1080; CI-NEXT:    v_add_f32_e32 v0, v0, v5
1081; CI-NEXT:    v_add_f32_e32 v0, v0, v6
1082; CI-NEXT:    v_add_f32_e32 v0, v0, v7
1083; CI-NEXT:    v_add_f32_e32 v0, v0, v8
1084; CI-NEXT:    s_mov_b32 s3, 0xf000
1085; CI-NEXT:    s_mov_b32 s2, -1
1086; CI-NEXT:    v_add_f32_e32 v0, v0, v9
1087; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1088; CI-NEXT:    s_endpgm
1089;
1090; GFX9-LABEL: sgemm_inner_loop_read2_sequence:
1091; GFX9:       ; %bb.0:
1092; GFX9-NEXT:    s_lshl_b32 s2, s8, 2
1093; GFX9-NEXT:    s_add_i32 s3, s2, 0xc20
1094; GFX9-NEXT:    s_addk_i32 s2, 0xc60
1095; GFX9-NEXT:    v_mov_b32_e32 v0, s3
1096; GFX9-NEXT:    v_mov_b32_e32 v2, s2
1097; GFX9-NEXT:    v_lshlrev_b32_e32 v8, 2, v1
1098; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1099; GFX9-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
1100; GFX9-NEXT:    ds_read2_b32 v[4:5], v8 offset1:1
1101; GFX9-NEXT:    ds_read2_b32 v[6:7], v8 offset0:32 offset1:33
1102; GFX9-NEXT:    ds_read2_b32 v[8:9], v8 offset0:64 offset1:65
1103; GFX9-NEXT:    s_waitcnt lgkmcnt(4)
1104; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
1105; GFX9-NEXT:    s_waitcnt lgkmcnt(3)
1106; GFX9-NEXT:    v_add_f32_e32 v0, v0, v2
1107; GFX9-NEXT:    v_add_f32_e32 v0, v0, v3
1108; GFX9-NEXT:    s_waitcnt lgkmcnt(2)
1109; GFX9-NEXT:    v_add_f32_e32 v0, v0, v4
1110; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1111; GFX9-NEXT:    v_add_f32_e32 v0, v0, v5
1112; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1113; GFX9-NEXT:    v_add_f32_e32 v0, v0, v6
1114; GFX9-NEXT:    v_add_f32_e32 v0, v0, v7
1115; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
1116; GFX9-NEXT:    v_mov_b32_e32 v10, 0
1117; GFX9-NEXT:    v_add_f32_e32 v0, v0, v9
1118; GFX9-NEXT:    global_store_dword v10, v0, s[0:1]
1119; GFX9-NEXT:    s_endpgm
1120  %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
1121  %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
1122  %arrayidx44 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %x.i
1123  %tmp16 = load float, ptr addrspace(3) %arrayidx44, align 4
1124  %add47 = add nsw i32 %x.i, 1
1125  %arrayidx48 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add47
1126  %tmp17 = load float, ptr addrspace(3) %arrayidx48, align 4
1127  %add51 = add nsw i32 %x.i, 16
1128  %arrayidx52 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add51
1129  %tmp18 = load float, ptr addrspace(3) %arrayidx52, align 4
1130  %add55 = add nsw i32 %x.i, 17
1131  %arrayidx56 = getelementptr inbounds [264 x float], ptr addrspace(3) @sgemm.lA, i32 0, i32 %add55
1132  %tmp19 = load float, ptr addrspace(3) %arrayidx56, align 4
1133  %arrayidx60 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %y.i
1134  %tmp20 = load float, ptr addrspace(3) %arrayidx60, align 4
1135  %add63 = add nsw i32 %y.i, 1
1136  %arrayidx64 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add63
1137  %tmp21 = load float, ptr addrspace(3) %arrayidx64, align 4
1138  %add67 = add nsw i32 %y.i, 32
1139  %arrayidx68 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add67
1140  %tmp22 = load float, ptr addrspace(3) %arrayidx68, align 4
1141  %add71 = add nsw i32 %y.i, 33
1142  %arrayidx72 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add71
1143  %tmp23 = load float, ptr addrspace(3) %arrayidx72, align 4
1144  %add75 = add nsw i32 %y.i, 64
1145  %arrayidx76 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add75
1146  %tmp24 = load float, ptr addrspace(3) %arrayidx76, align 4
1147  %add79 = add nsw i32 %y.i, 65
1148  %arrayidx80 = getelementptr inbounds [776 x float], ptr addrspace(3) @sgemm.lB, i32 0, i32 %add79
1149  %tmp25 = load float, ptr addrspace(3) %arrayidx80, align 4
1150  %sum.0 = fadd float %tmp16, %tmp17
1151  %sum.1 = fadd float %sum.0, %tmp18
1152  %sum.2 = fadd float %sum.1, %tmp19
1153  %sum.3 = fadd float %sum.2, %tmp20
1154  %sum.4 = fadd float %sum.3, %tmp21
1155  %sum.5 = fadd float %sum.4, %tmp22
1156  %sum.6 = fadd float %sum.5, %tmp23
1157  %sum.7 = fadd float %sum.6, %tmp24
1158  %sum.8 = fadd float %sum.7, %tmp25
1159  store float %sum.8, ptr addrspace(1) %C, align 4
1160  ret void
1161}
1162
; A single <2 x i32> load from the LDS pointer %in with only 4-byte
; alignment, copied to %out. The checks expect the 8-byte vector to be read
; as two dwords via ds_read2_b32 with offset1:1.
1163define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
1164; CI-LABEL: misaligned_read2_v2i32:
1165; CI:       ; %bb.0:
1166; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
1167; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1168; CI-NEXT:    s_mov_b32 m0, -1
1169; CI-NEXT:    s_mov_b32 s3, 0xf000
1170; CI-NEXT:    s_waitcnt lgkmcnt(0)
1171; CI-NEXT:    v_mov_b32_e32 v0, s2
1172; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1173; CI-NEXT:    s_mov_b32 s2, -1
1174; CI-NEXT:    s_waitcnt lgkmcnt(0)
1175; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1176; CI-NEXT:    s_endpgm
1177;
1178; GFX9-LABEL: misaligned_read2_v2i32:
1179; GFX9:       ; %bb.0:
1180; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
1181; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1182; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1183; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1184; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1185; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1186; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1187; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1188; GFX9-NEXT:    s_endpgm
1189  %load = load <2 x i32>, ptr addrspace(3) %in, align 4
1190  store <2 x i32> %load, ptr addrspace(1) %out, align 8
1191  ret void
1192}
1193
; Scalar counterpart of misaligned_read2_v2i32: a single i64 load from the
; LDS pointer %in with only 4-byte alignment, copied to %out. The checks
; expect the same lowering, one ds_read2_b32 with offset1:1.
1194define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
1195; CI-LABEL: misaligned_read2_i64:
1196; CI:       ; %bb.0:
1197; CI-NEXT:    s_load_dword s2, s[4:5], 0x2
1198; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1199; CI-NEXT:    s_mov_b32 m0, -1
1200; CI-NEXT:    s_mov_b32 s3, 0xf000
1201; CI-NEXT:    s_waitcnt lgkmcnt(0)
1202; CI-NEXT:    v_mov_b32_e32 v0, s2
1203; CI-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1204; CI-NEXT:    s_mov_b32 s2, -1
1205; CI-NEXT:    s_waitcnt lgkmcnt(0)
1206; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1207; CI-NEXT:    s_endpgm
1208;
1209; GFX9-LABEL: misaligned_read2_i64:
1210; GFX9:       ; %bb.0:
1211; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x8
1212; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1213; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1214; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1215; GFX9-NEXT:    v_mov_b32_e32 v0, s2
1216; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
1217; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1218; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1219; GFX9-NEXT:    s_endpgm
1220  %load = load i64, ptr addrspace(3) %in, align 4
1221  store i64 %load, ptr addrspace(1) %out, align 8
1222  ret void
1223}
1224
; Eight float loads from four different LDS base pointers (%arg1..%arg4),
; interleaved in program order with the fmul/fadd/fsub chain that consumes
; them. Each base is read twice: %arg1 and %arg3 at [y][0] and [y][1], %arg2
; and %arg4 at [0][x] and [1][x]. The checks expect the two loads of each base
; to be paired into one ds_read2_b32 (offset1:1 for the row-adjacent pairs,
; offset1:4 for the pairs one row apart), four in total. The result is stored
; to %arg + 10 floats, which is the "offset:40" in the store checks.
1225define amdgpu_kernel void @ds_read_diff_base_interleaving(
1226; CI-LABEL: ds_read_diff_base_interleaving:
1227; CI:       ; %bb.0: ; %bb
1228; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2
1229; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
1230; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1231; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1232; CI-NEXT:    s_mov_b32 m0, -1
1233; CI-NEXT:    s_waitcnt lgkmcnt(0)
1234; CI-NEXT:    v_add_i32_e32 v2, vcc, s0, v1
1235; CI-NEXT:    v_add_i32_e32 v3, vcc, s1, v0
1236; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v1
1237; CI-NEXT:    v_add_i32_e32 v6, vcc, s3, v0
1238; CI-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
1239; CI-NEXT:    ds_read2_b32 v[2:3], v3 offset1:4
1240; CI-NEXT:    ds_read2_b32 v[4:5], v4 offset1:1
1241; CI-NEXT:    ds_read2_b32 v[6:7], v6 offset1:4
1242; CI-NEXT:    s_mov_b32 s7, 0xf000
1243; CI-NEXT:    s_mov_b32 s6, -1
1244; CI-NEXT:    s_waitcnt lgkmcnt(2)
1245; CI-NEXT:    v_mul_f32_e32 v0, v0, v2
1246; CI-NEXT:    v_add_f32_e32 v0, 2.0, v0
1247; CI-NEXT:    s_waitcnt lgkmcnt(0)
1248; CI-NEXT:    v_mul_f32_e32 v2, v4, v6
1249; CI-NEXT:    v_sub_f32_e32 v0, v0, v2
1250; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
1251; CI-NEXT:    v_sub_f32_e32 v0, v0, v1
1252; CI-NEXT:    v_mul_f32_e32 v1, v5, v7
1253; CI-NEXT:    v_sub_f32_e32 v0, v0, v1
1254; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:40
1255; CI-NEXT:    s_endpgm
1256;
1257; GFX9-LABEL: ds_read_diff_base_interleaving:
1258; GFX9:       ; %bb.0: ; %bb
1259; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
1260; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
1261; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1262; GFX9-NEXT:    v_mov_b32_e32 v8, 0
1263; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1264; GFX9-NEXT:    v_add_u32_e32 v2, s0, v1
1265; GFX9-NEXT:    v_add_u32_e32 v3, s1, v0
1266; GFX9-NEXT:    v_add_u32_e32 v4, s2, v1
1267; GFX9-NEXT:    v_add_u32_e32 v6, s3, v0
1268; GFX9-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
1269; GFX9-NEXT:    ds_read2_b32 v[2:3], v3 offset1:4
1270; GFX9-NEXT:    ds_read2_b32 v[4:5], v4 offset1:1
1271; GFX9-NEXT:    ds_read2_b32 v[6:7], v6 offset1:4
1272; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1273; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1274; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v2
1275; GFX9-NEXT:    v_add_f32_e32 v0, 2.0, v0
1276; GFX9-NEXT:    v_mul_f32_e32 v2, v4, v6
1277; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v2
1278; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v3
1279; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
1280; GFX9-NEXT:    v_mul_f32_e32 v1, v5, v7
1281; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v1
1282; GFX9-NEXT:    global_store_dword v8, v0, s[0:1] offset:40
1283; GFX9-NEXT:    s_endpgm
1284  ptr addrspace(1) nocapture %arg,
1285  ptr addrspace(3) %arg1,
1286  ptr addrspace(3) %arg2,
1287  ptr addrspace(3) %arg3,
1288  ptr addrspace(3) %arg4) #1 {
1289bb:
1290  %tmp = getelementptr float, ptr addrspace(1) %arg, i64 10
1291  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.x() #2
1292  %tmp6 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
1293  %tmp7 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg1, i32 0, i32 %tmp6, i32 0
1294  %tmp8 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg2, i32 0, i32 0, i32 %tmp5
1295  %tmp9 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg3, i32 0, i32 %tmp6, i32 0
1296  %tmp10 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg4, i32 0, i32 0, i32 %tmp5
1297  %tmp11 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg1, i32 0, i32 %tmp6, i32 1
1298  %tmp12 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg2, i32 0, i32 1, i32 %tmp5
1299  %tmp13 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg3, i32 0, i32 %tmp6, i32 1
1300  %tmp14 = getelementptr [4 x [4 x float]], ptr addrspace(3) %arg4, i32 0, i32 1, i32 %tmp5
1301  %tmp15 = load float, ptr addrspace(3) %tmp7
1302  %tmp16 = load float, ptr addrspace(3) %tmp8
1303  %tmp17 = fmul float %tmp15, %tmp16
1304  %tmp18 = fadd float 2.000000e+00, %tmp17
1305  %tmp19 = load float, ptr addrspace(3) %tmp9
1306  %tmp20 = load float, ptr addrspace(3) %tmp10
1307  %tmp21 = fmul float %tmp19, %tmp20
1308  %tmp22 = fsub float %tmp18, %tmp21
1309  %tmp23 = load float, ptr addrspace(3) %tmp11
1310  %tmp24 = load float, ptr addrspace(3) %tmp12
1311  %tmp25 = fmul float %tmp23, %tmp24
1312  %tmp26 = fsub float %tmp22, %tmp25
1313  %tmp27 = load float, ptr addrspace(3) %tmp13
1314  %tmp28 = load float, ptr addrspace(3) %tmp14
1315  %tmp29 = fmul float %tmp27, %tmp28
1316  %tmp30 = fsub float %tmp26, %tmp29
1317  store float %tmp30, ptr addrspace(1) %tmp
1318  ret void
1319}
1320
; Loads %arg[x], calls @void_func_void, then loads the adjacent element
; %arg[x + 1] and stores the integer sum of both to %out. The checks expect
; the two loads to stay as separate ds_read_b32 instructions on either side
; of the s_swappc_b64 call, i.e. they are not merged into a ds_read2_b32
; across the call.
1321define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspace(3) %arg) {
1322; CI-LABEL: ds_read_call_read:
1323; CI:       ; %bb.0:
1324; CI-NEXT:    s_getpc_b64 s[40:41]
1325; CI-NEXT:    s_mov_b32 s40, s0
1326; CI-NEXT:    s_load_dwordx4 s[40:43], s[40:41], 0x0
1327; CI-NEXT:    s_mov_b32 s14, s10
1328; CI-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
1329; CI-NEXT:    s_mov_b32 m0, -1
1330; CI-NEXT:    s_mov_b32 s12, s8
1331; CI-NEXT:    s_waitcnt lgkmcnt(0)
1332; CI-NEXT:    s_add_u32 s40, s40, s11
1333; CI-NEXT:    s_mov_b64 s[10:11], s[6:7]
1334; CI-NEXT:    s_load_dwordx2 s[36:37], s[4:5], 0x0
1335; CI-NEXT:    s_load_dword s6, s[4:5], 0x2
1336; CI-NEXT:    s_addc_u32 s41, s41, 0
1337; CI-NEXT:    s_add_u32 s8, s4, 12
1338; CI-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
1339; CI-NEXT:    s_mov_b32 s13, s9
1340; CI-NEXT:    s_waitcnt lgkmcnt(0)
1341; CI-NEXT:    v_add_i32_e32 v40, vcc, s6, v3
1342; CI-NEXT:    ds_read_b32 v41, v40
1343; CI-NEXT:    s_addc_u32 s9, s5, 0
1344; CI-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
1345; CI-NEXT:    v_or_b32_e32 v0, v0, v1
1346; CI-NEXT:    s_mov_b64 s[4:5], s[0:1]
1347; CI-NEXT:    s_mov_b64 s[6:7], s[2:3]
1348; CI-NEXT:    s_mov_b64 s[0:1], s[40:41]
1349; CI-NEXT:    s_mov_b32 s17, void_func_void@abs32@hi
1350; CI-NEXT:    s_mov_b32 s16, void_func_void@abs32@lo
1351; CI-NEXT:    v_or_b32_e32 v31, v0, v2
1352; CI-NEXT:    s_mov_b64 s[2:3], s[42:43]
1353; CI-NEXT:    s_mov_b32 s32, 0
1354; CI-NEXT:    s_mov_b32 s39, 0xf000
1355; CI-NEXT:    s_mov_b32 s38, -1
1356; CI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
1357; CI-NEXT:    ds_read_b32 v0, v40 offset:4
1358; CI-NEXT:    s_waitcnt lgkmcnt(0)
1359; CI-NEXT:    v_add_i32_e32 v0, vcc, v41, v0
1360; CI-NEXT:    buffer_store_dword v0, off, s[36:39], 0
1361; CI-NEXT:    s_endpgm
1362;
1363; GFX9-LABEL: ds_read_call_read:
1364; GFX9:       ; %bb.0:
1365; GFX9-NEXT:    s_getpc_b64 s[36:37]
1366; GFX9-NEXT:    s_mov_b32 s36, s0
1367; GFX9-NEXT:    s_load_dwordx4 s[36:39], s[36:37], 0x0
1368; GFX9-NEXT:    s_mov_b32 s14, s10
1369; GFX9-NEXT:    s_mov_b32 s12, s8
1370; GFX9-NEXT:    s_mov_b32 s13, s9
1371; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
1372; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1373; GFX9-NEXT:    s_add_u32 s36, s36, s11
1374; GFX9-NEXT:    s_mov_b64 s[10:11], s[6:7]
1375; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x8
1376; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
1377; GFX9-NEXT:    s_addc_u32 s37, s37, 0
1378; GFX9-NEXT:    s_add_u32 s8, s4, 12
1379; GFX9-NEXT:    s_addc_u32 s9, s5, 0
1380; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1381; GFX9-NEXT:    v_lshl_add_u32 v41, v0, 2, s6
1382; GFX9-NEXT:    ds_read_b32 v42, v41
1383; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
1384; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
1385; GFX9-NEXT:    s_mov_b64 s[6:7], s[2:3]
1386; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
1387; GFX9-NEXT:    s_mov_b32 s17, void_func_void@abs32@hi
1388; GFX9-NEXT:    s_mov_b32 s16, void_func_void@abs32@lo
1389; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
1390; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
1391; GFX9-NEXT:    s_mov_b32 s32, 0
1392; GFX9-NEXT:    v_mov_b32_e32 v40, 0
1393; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
1394; GFX9-NEXT:    ds_read_b32 v0, v41 offset:4
1395; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1396; GFX9-NEXT:    v_add_u32_e32 v0, v42, v0
1397; GFX9-NEXT:    global_store_dword v40, v0, s[34:35]
1398; GFX9-NEXT:    s_endpgm
1399  %x = call i32 @llvm.amdgcn.workitem.id.x()
1400  %arrayidx0 = getelementptr i32, ptr addrspace(3) %arg, i32 %x
1401  %arrayidx1 = getelementptr i32, ptr addrspace(3) %arrayidx0, i32 1
1402  %v0 = load i32, ptr addrspace(3) %arrayidx0, align 4
1403  call void @void_func_void()
1404  %v1 = load i32, ptr addrspace(3) %arrayidx1, align 4
1405  %r = add i32 %v0, %v1
1406  store i32 %r, ptr addrspace(1) %out, align 4
1407  ret void
1408}
1409
; Two float loads from LDS (addrspace(3)): one at %inptr and one at
; %inptr + 4 floats, with a call to llvm.amdgcn.interp.mov placed between them.
; The expected output for the CI run keeps two separate ds_read_b32
; instructions and rewrites m0 around v_interp_mov_f32 (-1, then s0, then -1
; again). The expected output for the GFX9 runs is a single ds_read2_b32 with
; offset1:4, issued before m0 is written for the interp instruction.
1410define amdgpu_ps <2 x float> @ds_read_interp_read(i32 inreg %prims, ptr addrspace(3) %inptr) {
1411; CI-LABEL: ds_read_interp_read:
1412; CI:       ; %bb.0:
1413; CI-NEXT:    s_mov_b32 m0, -1
1414; CI-NEXT:    ds_read_b32 v2, v0
1415; CI-NEXT:    s_mov_b32 m0, s0
1416; CI-NEXT:    v_interp_mov_f32 v1, p10, attr0.x
1417; CI-NEXT:    s_mov_b32 m0, -1
1418; CI-NEXT:    ds_read_b32 v0, v0 offset:16
1419; CI-NEXT:    s_waitcnt lgkmcnt(0)
1420; CI-NEXT:    v_add_f32_e32 v1, v0, v1
1421; CI-NEXT:    v_mov_b32_e32 v0, v2
1422; CI-NEXT:    ; return to shader part epilog
1423;
1424; GFX9-LABEL: ds_read_interp_read:
1425; GFX9:       ; %bb.0:
1426; GFX9-NEXT:    ds_read2_b32 v[0:1], v0 offset1:4
1427; GFX9-NEXT:    s_mov_b32 m0, s0
1428; GFX9-NEXT:    s_nop 0
1429; GFX9-NEXT:    v_interp_mov_f32_e32 v2, p10, attr0.x
1430; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
1431; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
1432; GFX9-NEXT:    ; return to shader part epilog
  ; First LDS load, directly from %inptr.
1433  %v0 = load float, ptr addrspace(3) %inptr, align 4
  ; Interp intrinsic sits between the two loads; %prims is its last operand.
1434  %intrp = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 %prims)
  ; Second LDS load, 4 float elements past %inptr (shown as offset:16 in the
  ; expected CI output and offset1:4 in the expected GFX9 output).
1435  %ptr1 = getelementptr float, ptr addrspace(3) %inptr, i32 4
1436  %v1 = load float, ptr addrspace(3) %ptr1, align 4
1437  %v1b = fadd float %v1, %intrp
  ; Result vector: element 0 = first load, element 1 = second load + interp value.
1438  %r0 = insertelement <2 x float> undef, float %v0, i32 0
1439  %r1 = insertelement <2 x float> %r0, float %v1b, i32 1
1440  ret <2 x float> %r1
1441}
1442
1443@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1
1444
; Loads a <2 x i32> with align 1 from byte offset 65 of @v2i32_align1 (an LDS
; global) and stores it to %out.
; The expected output for the CI run and for the GFX9 run checked with the
; GFX9-ALIGNED prefix is eight ds_read_u8 loads (offsets 65 through 72) that
; are recombined with shifts and ors. The expected output for the run checked
; with the GFX9-UNALIGNED prefix is a single ds_read_b64 at offset:65.
1445define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) {
1446; CI-LABEL: read2_v2i32_align1_odd_offset:
1447; CI:       ; %bb.0: ; %entry
1448; CI-NEXT:    v_mov_b32_e32 v0, 0
1449; CI-NEXT:    s_mov_b32 m0, -1
1450; CI-NEXT:    ds_read_u8 v1, v0 offset:70
1451; CI-NEXT:    ds_read_u8 v2, v0 offset:72
1452; CI-NEXT:    ds_read_u8 v3, v0 offset:71
1453; CI-NEXT:    ds_read_u8 v4, v0 offset:69
1454; CI-NEXT:    ds_read_u8 v5, v0 offset:68
1455; CI-NEXT:    s_waitcnt lgkmcnt(4)
1456; CI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1457; CI-NEXT:    s_waitcnt lgkmcnt(3)
1458; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
1459; CI-NEXT:    s_waitcnt lgkmcnt(2)
1460; CI-NEXT:    v_or_b32_e32 v2, v2, v3
1461; CI-NEXT:    s_waitcnt lgkmcnt(1)
1462; CI-NEXT:    v_or_b32_e32 v1, v1, v4
1463; CI-NEXT:    ds_read_u8 v4, v0 offset:67
1464; CI-NEXT:    ds_read_u8 v6, v0 offset:66
1465; CI-NEXT:    ds_read_u8 v0, v0 offset:65
1466; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1467; CI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1468; CI-NEXT:    v_or_b32_e32 v1, v2, v1
1469; CI-NEXT:    s_waitcnt lgkmcnt(0)
1470; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v6
1471; CI-NEXT:    v_or_b32_e32 v0, v2, v0
1472; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v5
1473; CI-NEXT:    v_or_b32_e32 v2, v2, v4
1474; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
1475; CI-NEXT:    s_mov_b32 s3, 0xf000
1476; CI-NEXT:    s_mov_b32 s2, -1
1477; CI-NEXT:    v_or_b32_e32 v0, v2, v0
1478; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1479; CI-NEXT:    s_endpgm
1480;
1481; GFX9-ALIGNED-LABEL: read2_v2i32_align1_odd_offset:
1482; GFX9-ALIGNED:       ; %bb.0: ; %entry
1483; GFX9-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0
1484; GFX9-ALIGNED-NEXT:    ds_read_u8 v0, v2 offset:70
1485; GFX9-ALIGNED-NEXT:    ds_read_u8 v3, v2 offset:65
1486; GFX9-ALIGNED-NEXT:    ds_read_u8 v4, v2 offset:66
1487; GFX9-ALIGNED-NEXT:    ds_read_u8 v5, v2 offset:67
1488; GFX9-ALIGNED-NEXT:    ds_read_u8 v6, v2 offset:68
1489; GFX9-ALIGNED-NEXT:    ds_read_u8 v1, v2 offset:69
1490; GFX9-ALIGNED-NEXT:    ds_read_u8 v7, v2 offset:72
1491; GFX9-ALIGNED-NEXT:    ds_read_u8 v8, v2 offset:71
1492; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(7)
1493; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1494; GFX9-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1495; GFX9-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
1496; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v1
1497; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v1, 8, v7
1498; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v1, v1, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1499; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v1, v1, v0
1500; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v0, 8, v4
1501; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v0, v3
1502; GFX9-ALIGNED-NEXT:    v_lshlrev_b32_e32 v3, 8, v6
1503; GFX9-ALIGNED-NEXT:    v_or_b32_sdwa v3, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1504; GFX9-ALIGNED-NEXT:    v_or_b32_e32 v0, v3, v0
1505; GFX9-ALIGNED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1506; GFX9-ALIGNED-NEXT:    s_endpgm
1507;
1508; GFX9-UNALIGNED-LABEL: read2_v2i32_align1_odd_offset:
1509; GFX9-UNALIGNED:       ; %bb.0: ; %entry
1510; GFX9-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0
1511; GFX9-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
1512; GFX9-UNALIGNED-NEXT:    ds_read_b64 v[0:1], v2 offset:65
1513; GFX9-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
1514; GFX9-UNALIGNED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1515; GFX9-UNALIGNED-NEXT:    s_endpgm
1516entry:
  ; Constant-expression address: byte offset 65 into @v2i32_align1, loaded
  ; with align 1.
1517  %load = load <2 x i32>, ptr addrspace(3) getelementptr (i8, ptr addrspace(3) @v2i32_align1, i32 65), align 1
1518  store <2 x i32> %load, ptr addrspace(1) %out
1519  ret void
1520}
1521
; External function with no definition in this file; it is the callee of the
; `call void @void_func_void()` in @ds_read_call_read.
1522declare void @void_func_void() #3
1523
; AMDGPU workgroup-id / workitem-id intrinsics, all sharing attribute group #1.
1524declare i32 @llvm.amdgcn.workgroup.id.x() #1
1525declare i32 @llvm.amdgcn.workgroup.id.y() #1
1526declare i32 @llvm.amdgcn.workitem.id.x() #1
1527declare i32 @llvm.amdgcn.workitem.id.y() #1
1528
; Interp intrinsic called by @ds_read_interp_read; its attributes are written
; inline here rather than through an attribute group.
1529declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) nounwind readnone
1530
1531declare void @llvm.amdgcn.s.barrier() #2
1532
; Attribute groups referenced by the `#N` suffixes on the declarations above
; and on the test functions.
1533attributes #0 = { nounwind }
1534attributes #1 = { nounwind readnone speculatable }
1535attributes #2 = { convergent nounwind }
1536attributes #3 = { nounwind noinline }
1537
; Module flag setting "amdhsa_code_object_version" to 500.
; NOTE(review): presumably this selects code object v5 for the expected
; output - confirm against the AMDGPU backend documentation.
1538!llvm.module.flags = !{!0}
1539!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
1540