xref: /llvm-project/llvm/test/CodeGen/AMDGPU/ds-alignment.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-SDAG
3; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,ALIGNED,ALIGNED-GISEL
4; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-SDAG
5; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck %s -check-prefixes=GCN,UNALIGNED,UNALIGNED-GISEL
6
; ds1align1: copy one i8 from %in to %out (both addrspace(3)), align 1.
; All four RUN configurations share the GCN check body: a single ds_read_u8
; followed by a single ds_write_b8.
7define amdgpu_kernel void @ds1align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
8; GCN-LABEL: ds1align1:
9; GCN:       ; %bb.0:
10; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
11; GCN-NEXT:    s_waitcnt lgkmcnt(0)
12; GCN-NEXT:    v_mov_b32_e32 v0, s0
13; GCN-NEXT:    ds_read_u8 v0, v0
14; GCN-NEXT:    v_mov_b32_e32 v1, s1
15; GCN-NEXT:    s_waitcnt lgkmcnt(0)
16; GCN-NEXT:    ds_write_b8 v1, v0
17; GCN-NEXT:    s_endpgm
18  %val = load i8, ptr addrspace(3) %in, align 1
19  store i8 %val, ptr addrspace(3) %out, align 1
20  ret void
21}
22
; ds2align1: copy one i16 from %in to %out (both addrspace(3)), align 1.
; ALIGNED (-unaligned-access-mode): the access is split into two ds_read_u8 and
; two ds_write_b8. SelectionDAG stores the two loaded bytes directly; GlobalISel
; first merges them with v_lshl_or_b32 and re-extracts the high byte with
; v_lshrrev_b16 before storing.
; UNALIGNED (+unaligned-access-mode): a single ds_read_u16 / ds_write_b16 pair.
23define amdgpu_kernel void @ds2align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
24; ALIGNED-SDAG-LABEL: ds2align1:
25; ALIGNED-SDAG:       ; %bb.0:
26; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
27; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
28; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
29; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
30; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:1
31; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s1
32; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
33; ALIGNED-SDAG-NEXT:    ds_write_b8 v2, v1
34; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
35; ALIGNED-SDAG-NEXT:    ds_write_b8 v2, v0 offset:1
36; ALIGNED-SDAG-NEXT:    s_endpgm
37;
38; ALIGNED-GISEL-LABEL: ds2align1:
39; ALIGNED-GISEL:       ; %bb.0:
40; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
41; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
42; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
43; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
44; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:1
45; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s1
46; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
47; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 8, v1
48; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
49; ALIGNED-GISEL-NEXT:    ds_write_b8 v2, v0
50; ALIGNED-GISEL-NEXT:    ds_write_b8 v2, v1 offset:1
51; ALIGNED-GISEL-NEXT:    s_endpgm
52;
53; UNALIGNED-LABEL: ds2align1:
54; UNALIGNED:       ; %bb.0:
55; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
56; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
57; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
58; UNALIGNED-NEXT:    ds_read_u16 v0, v0
59; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
60; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
61; UNALIGNED-NEXT:    ds_write_b16 v1, v0
62; UNALIGNED-NEXT:    s_endpgm
63  %val = load i16, ptr addrspace(3) %in, align 1
64  store i16 %val, ptr addrspace(3) %out, align 1
65  ret void
66}
67
; ds2align2: copy one i16 from %in to %out (both addrspace(3)), align 2.
; All four RUN configurations share the GCN check body: a single ds_read_u16
; followed by a single ds_write_b16.
68define amdgpu_kernel void @ds2align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
69; GCN-LABEL: ds2align2:
70; GCN:       ; %bb.0:
71; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
72; GCN-NEXT:    s_waitcnt lgkmcnt(0)
73; GCN-NEXT:    v_mov_b32_e32 v0, s0
74; GCN-NEXT:    ds_read_u16 v0, v0
75; GCN-NEXT:    v_mov_b32_e32 v1, s1
76; GCN-NEXT:    s_waitcnt lgkmcnt(0)
77; GCN-NEXT:    ds_write_b16 v1, v0
78; GCN-NEXT:    s_endpgm
79  %val = load i16, ptr addrspace(3) %in, align 2
80  store i16 %val, ptr addrspace(3) %out, align 2
81  ret void
82}
83
; ds4align1: copy one i32 from %in to %out (both addrspace(3)), align 1.
; ALIGNED (-unaligned-access-mode): four ds_read_u8 and four byte-sized stores.
; SelectionDAG stores each loaded byte directly; GlobalISel reassembles the
; dword (v_lshl_or_b32 / v_or3_b32) and then splits it again, using
; ds_write_b8_d16_hi for the byte at offset 2.
; UNALIGNED (+unaligned-access-mode): a single ds_read_b32 / ds_write_b32 pair.
84define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
85; ALIGNED-SDAG-LABEL: ds4align1:
86; ALIGNED-SDAG:       ; %bb.0:
87; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
88; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
89; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
90; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
91; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
92; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
93; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:3
94; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
95; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
96; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v1
97; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
98; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v2 offset:1
99; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
100; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v3 offset:2
101; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
102; ALIGNED-SDAG-NEXT:    ds_write_b8 v4, v0 offset:3
103; ALIGNED-SDAG-NEXT:    s_endpgm
104;
105; ALIGNED-GISEL-LABEL: ds4align1:
106; ALIGNED-GISEL:       ; %bb.0:
107; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
108; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, 8
109; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
110; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
111; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
112; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
113; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:3
114; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:2
115; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
116; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
117; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
118; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
119; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
120; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
121; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
122; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v2, v0, v1
123; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
124; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v0
125; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:1
126; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
127; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v0 offset:2
128; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:3
129; ALIGNED-GISEL-NEXT:    s_endpgm
130;
131; UNALIGNED-LABEL: ds4align1:
132; UNALIGNED:       ; %bb.0:
133; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
134; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
135; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
136; UNALIGNED-NEXT:    ds_read_b32 v0, v0
137; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
138; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
139; UNALIGNED-NEXT:    ds_write_b32 v1, v0
140; UNALIGNED-NEXT:    s_endpgm
141  %val = load i32, ptr addrspace(3) %in, align 1
142  store i32 %val, ptr addrspace(3) %out, align 1
143  ret void
144}
145
; ds4align2: copy one i32 from %in to %out (both addrspace(3)), align 2.
; ALIGNED (-unaligned-access-mode): two ds_read_u16 and two 16-bit stores.
; SelectionDAG stores each loaded half directly; GlobalISel merges the halves
; with v_lshl_or_b32 and stores them with ds_write_b16 + ds_write_b16_d16_hi.
; UNALIGNED (+unaligned-access-mode): a single ds_read_b32 / ds_write_b32 pair.
146define amdgpu_kernel void @ds4align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
147; ALIGNED-SDAG-LABEL: ds4align2:
148; ALIGNED-SDAG:       ; %bb.0:
149; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
150; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
151; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
152; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0
153; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:2
154; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s1
155; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
156; ALIGNED-SDAG-NEXT:    ds_write_b16 v2, v1
157; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
158; ALIGNED-SDAG-NEXT:    ds_write_b16 v2, v0 offset:2
159; ALIGNED-SDAG-NEXT:    s_endpgm
160;
161; ALIGNED-GISEL-LABEL: ds4align2:
162; ALIGNED-GISEL:       ; %bb.0:
163; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
164; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
165; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
166; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
167; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:2
168; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s1
169; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
170; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
171; ALIGNED-GISEL-NEXT:    ds_write_b16 v2, v0
172; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v2, v0 offset:2
173; ALIGNED-GISEL-NEXT:    s_endpgm
174;
175; UNALIGNED-LABEL: ds4align2:
176; UNALIGNED:       ; %bb.0:
177; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
178; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
179; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
180; UNALIGNED-NEXT:    ds_read_b32 v0, v0
181; UNALIGNED-NEXT:    v_mov_b32_e32 v1, s1
182; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
183; UNALIGNED-NEXT:    ds_write_b32 v1, v0
184; UNALIGNED-NEXT:    s_endpgm
185  %val = load i32, ptr addrspace(3) %in, align 2
186  store i32 %val, ptr addrspace(3) %out, align 2
187  ret void
188}
189
; ds4align4: copy one i32 from %in to %out (both addrspace(3)), align 4.
; All four RUN configurations share the GCN check body: a single ds_read_b32
; followed by a single ds_write_b32.
190define amdgpu_kernel void @ds4align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
191; GCN-LABEL: ds4align4:
192; GCN:       ; %bb.0:
193; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
194; GCN-NEXT:    s_waitcnt lgkmcnt(0)
195; GCN-NEXT:    v_mov_b32_e32 v0, s0
196; GCN-NEXT:    ds_read_b32 v0, v0
197; GCN-NEXT:    v_mov_b32_e32 v1, s1
198; GCN-NEXT:    s_waitcnt lgkmcnt(0)
199; GCN-NEXT:    ds_write_b32 v1, v0
200; GCN-NEXT:    s_endpgm
201  %val = load i32, ptr addrspace(3) %in, align 4
202  store i32 %val, ptr addrspace(3) %out, align 4
203  ret void
204}
205
; ds8align1: copy one <2 x i32> (8 bytes) from %in to %out (both addrspace(3)),
; align 1.
; ALIGNED (-unaligned-access-mode): eight ds_read_u8 and eight byte-sized
; stores. SelectionDAG stores the loaded bytes directly; GlobalISel rebuilds
; both dwords (v_lshl_or_b32 / v_or3_b32) and splits them again, using
; ds_write_b8_d16_hi for the bytes at offsets 2 and 6.
; UNALIGNED (+unaligned-access-mode): a single ds_read_b64 / ds_write_b64 pair.
206define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
207; ALIGNED-SDAG-LABEL: ds8align1:
208; ALIGNED-SDAG:       ; %bb.0:
209; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
210; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
211; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
212; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
213; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
214; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
215; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
216; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
217; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
218; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:6
219; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:7
220; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v7, s1
221; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
222; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v5 offset:4
223; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
224; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v6 offset:5
225; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v1
226; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v2 offset:1
227; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v3 offset:2
228; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v4 offset:3
229; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
230; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v8 offset:6
231; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
232; ALIGNED-SDAG-NEXT:    ds_write_b8 v7, v0 offset:7
233; ALIGNED-SDAG-NEXT:    s_endpgm
234;
235; ALIGNED-GISEL-LABEL: ds8align1:
236; ALIGNED-GISEL:       ; %bb.0:
237; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
238; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
239; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
240; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
241; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
242; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
243; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
244; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
245; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
246; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
247; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:7
248; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
249; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
250; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
251; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
252; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
253; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
254; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
255; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
256; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
257; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
258; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
259; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v2
260; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v2, 8, v1
261; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
262; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1
263; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v2 offset:1
264; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, 8
265; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
266; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v3, v1 offset:2
267; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v4 offset:3
268; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
269; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v0 offset:4
270; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1 offset:5
271; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
272; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v3, v0 offset:6
273; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1 offset:7
274; ALIGNED-GISEL-NEXT:    s_endpgm
275;
276; UNALIGNED-LABEL: ds8align1:
277; UNALIGNED:       ; %bb.0:
278; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
279; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
280; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
281; UNALIGNED-NEXT:    ds_read_b64 v[0:1], v0
282; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
283; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
284; UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1]
285; UNALIGNED-NEXT:    s_endpgm
286  %val = load <2 x i32>, ptr addrspace(3) %in, align 1
287  store <2 x i32> %val, ptr addrspace(3) %out, align 1
288  ret void
289}
290
; ds8align2: copy one <2 x i32> (8 bytes) from %in to %out (both addrspace(3)),
; align 2.
; ALIGNED (-unaligned-access-mode): four ds_read_u16 and four 16-bit stores.
; SelectionDAG stores the loaded halves directly; GlobalISel merges each pair
; with v_lshl_or_b32 and stores with ds_write_b16 + ds_write_b16_d16_hi.
; UNALIGNED (+unaligned-access-mode): a single ds_read_b64 / ds_write_b64 pair.
291define amdgpu_kernel void @ds8align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
292; ALIGNED-SDAG-LABEL: ds8align2:
293; ALIGNED-SDAG:       ; %bb.0:
294; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
295; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
296; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
297; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:4
298; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
299; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
300; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:6
301; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
302; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
303; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v1 offset:4
304; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
305; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v2
306; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
307; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v3 offset:2
308; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
309; ALIGNED-SDAG-NEXT:    ds_write_b16 v4, v0 offset:6
310; ALIGNED-SDAG-NEXT:    s_endpgm
311;
312; ALIGNED-GISEL-LABEL: ds8align2:
313; ALIGNED-GISEL:       ; %bb.0:
314; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
315; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
316; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
317; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
318; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
319; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
320; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:6
321; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
322; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
323; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
324; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
325; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
326; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
327; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v1 offset:2
328; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:4
329; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v0 offset:6
330; ALIGNED-GISEL-NEXT:    s_endpgm
331;
332; UNALIGNED-LABEL: ds8align2:
333; UNALIGNED:       ; %bb.0:
334; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
335; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
336; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
337; UNALIGNED-NEXT:    ds_read_b64 v[0:1], v0
338; UNALIGNED-NEXT:    v_mov_b32_e32 v2, s1
339; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
340; UNALIGNED-NEXT:    ds_write_b64 v2, v[0:1]
341; UNALIGNED-NEXT:    s_endpgm
342  %val = load <2 x i32>, ptr addrspace(3) %in, align 2
343  store <2 x i32> %val, ptr addrspace(3) %out, align 2
344  ret void
345}
346
; ds8align4: copy one <2 x i32> (8 bytes) from %in to %out (both addrspace(3)),
; align 4.
; All four RUN configurations share the GCN check body: the access is done as
; two dwords via ds_read2_b32 / ds_write2_b32 with offset1:1, not as a b64.
347define amdgpu_kernel void @ds8align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
348; GCN-LABEL: ds8align4:
349; GCN:       ; %bb.0:
350; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
351; GCN-NEXT:    s_waitcnt lgkmcnt(0)
352; GCN-NEXT:    v_mov_b32_e32 v0, s0
353; GCN-NEXT:    ds_read2_b32 v[0:1], v0 offset1:1
354; GCN-NEXT:    v_mov_b32_e32 v2, s1
355; GCN-NEXT:    s_waitcnt lgkmcnt(0)
356; GCN-NEXT:    ds_write2_b32 v2, v0, v1 offset1:1
357; GCN-NEXT:    s_endpgm
358  %val = load <2 x i32>, ptr addrspace(3) %in, align 4
359  store <2 x i32> %val, ptr addrspace(3) %out, align 4
360  ret void
361}
362
; ds8align8: copy one <2 x i32> (8 bytes) from %in to %out (both addrspace(3)),
; align 8.
; All four RUN configurations share the GCN check body: a single ds_read_b64
; followed by a single ds_write_b64.
363define amdgpu_kernel void @ds8align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
364; GCN-LABEL: ds8align8:
365; GCN:       ; %bb.0:
366; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
367; GCN-NEXT:    s_waitcnt lgkmcnt(0)
368; GCN-NEXT:    v_mov_b32_e32 v0, s0
369; GCN-NEXT:    ds_read_b64 v[0:1], v0
370; GCN-NEXT:    v_mov_b32_e32 v2, s1
371; GCN-NEXT:    s_waitcnt lgkmcnt(0)
372; GCN-NEXT:    ds_write_b64 v2, v[0:1]
373; GCN-NEXT:    s_endpgm
374  %val = load <2 x i32>, ptr addrspace(3) %in, align 8
375  store <2 x i32> %val, ptr addrspace(3) %out, align 8
376  ret void
377}
378
; ds12align1: copy one <3 x i32> (12 bytes) from %in to %out (both
; addrspace(3)), align 1.
; ALIGNED (-unaligned-access-mode): twelve ds_read_u8 and twelve byte-sized
; stores. SelectionDAG stores the loaded bytes directly; GlobalISel rebuilds
; the three dwords (v_lshl_or_b32 / v_or3_b32) and splits them again, using
; ds_write_b8_d16_hi for the bytes at offsets 2, 6 and 10.
; UNALIGNED (+unaligned-access-mode): a single ds_read_b96 / ds_write_b96 pair.
379define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
380; ALIGNED-SDAG-LABEL: ds12align1:
381; ALIGNED-SDAG:       ; %bb.0:
382; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
383; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
384; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
385; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
386; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
387; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
388; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
389; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
390; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
391; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
392; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
393; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
394; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
395; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
396; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:11
397; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v12, s1
398; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
399; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v9 offset:8
400; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
401; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v10 offset:9
402; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v5 offset:4
403; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v6 offset:5
404; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v1
405; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v2 offset:1
406; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v3 offset:2
407; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v4 offset:3
408; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v7 offset:6
409; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v8 offset:7
410; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
411; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v11 offset:10
412; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(11)
413; ALIGNED-SDAG-NEXT:    ds_write_b8 v12, v0 offset:11
414; ALIGNED-SDAG-NEXT:    s_endpgm
415;
416; ALIGNED-GISEL-LABEL: ds12align1:
417; ALIGNED-GISEL:       ; %bb.0:
418; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
419; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
420; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
421; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
422; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
423; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
424; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
425; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
426; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
427; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
428; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
429; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
430; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
431; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
432; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
433; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
434; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
435; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
436; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
437; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:8
438; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:9
439; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:10
440; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:11
441; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
442; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 24, v8
443; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
444; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
445; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 8, v3
446; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
447; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
448; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
449; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
450; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v4, v3
451; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
452; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
453; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v6, v7, v2
454; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1
455; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v3 offset:1
456; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, 8
457; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
458; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v1 offset:2
459; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v5 offset:3
460; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v2
461; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v2 offset:4
462; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:5
463; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
464; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v2 offset:6
465; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:7
466; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
467; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0 offset:8
468; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:9
469; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
470; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v0 offset:10
471; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:11
472; ALIGNED-GISEL-NEXT:    s_endpgm
473;
474; UNALIGNED-LABEL: ds12align1:
475; UNALIGNED:       ; %bb.0:
476; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
477; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
478; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
479; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
480; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
481; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
482; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
483; UNALIGNED-NEXT:    s_endpgm
484  %val = load <3 x i32>, ptr addrspace(3) %in, align 1
485  store <3 x i32> %val, ptr addrspace(3) %out, align 1
486  ret void
487}
488
; ds12align2: copy one <3 x i32> (12 bytes) from %in to %out (both
; addrspace(3)), align 2.
; ALIGNED (-unaligned-access-mode): six ds_read_u16 and six 16-bit stores.
; SelectionDAG stores the loaded halves directly; GlobalISel merges each pair
; with v_lshl_or_b32 and stores with ds_write_b16 + ds_write_b16_d16_hi.
; UNALIGNED (+unaligned-access-mode): a single ds_read_b96 / ds_write_b96 pair.
489define amdgpu_kernel void @ds12align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
490; ALIGNED-SDAG-LABEL: ds12align2:
491; ALIGNED-SDAG:       ; %bb.0:
492; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
493; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
494; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
495; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:8
496; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
497; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
498; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
499; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
500; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v6, s1
501; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:10
502; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
503; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v1 offset:8
504; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
505; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v4 offset:4
506; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v2
507; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v3 offset:2
508; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
509; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v5 offset:6
510; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
511; ALIGNED-SDAG-NEXT:    ds_write_b16 v6, v0 offset:10
512; ALIGNED-SDAG-NEXT:    s_endpgm
513;
514; ALIGNED-GISEL-LABEL: ds12align2:
515; ALIGNED-GISEL:       ; %bb.0:
516; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
517; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
518; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
519; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
520; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
521; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
522; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
523; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
524; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:10
525; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v6, s1
526; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
527; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
528; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
529; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
530; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
531; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v5
532; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v1
533; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v6, v1 offset:2
534; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v2 offset:4
535; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v6, v2 offset:6
536; ALIGNED-GISEL-NEXT:    ds_write_b16 v6, v0 offset:8
537; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v6, v0 offset:10
538; ALIGNED-GISEL-NEXT:    s_endpgm
539;
540; UNALIGNED-LABEL: ds12align2:
541; UNALIGNED:       ; %bb.0:
542; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
543; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
544; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
545; UNALIGNED-NEXT:    ds_read_b96 v[0:2], v0
546; UNALIGNED-NEXT:    v_mov_b32_e32 v3, s1
547; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
548; UNALIGNED-NEXT:    ds_write_b96 v3, v[0:2]
549; UNALIGNED-NEXT:    s_endpgm
550  %val = load <3 x i32>, ptr addrspace(3) %in, align 2
551  store <3 x i32> %val, ptr addrspace(3) %out, align 2
552  ret void
553}
554
; ds12align4: copy one <3 x i32> (12 bytes) from %in to %out (both
; addrspace(3)), align 4.
; ALIGNED (both selectors) and UNALIGNED-SDAG: the access is split into
; ds_read2_b32 (offset1:1) + ds_read_b32 offset:8, and the matching
; ds_write2_b32 + ds_write_b32 offset:8.
; UNALIGNED-GISEL: a single ds_read_b96 / ds_write_b96 pair. This is the first
; case in the file where the two selectors differ under +unaligned-access-mode.
555define amdgpu_kernel void @ds12align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
556; ALIGNED-LABEL: ds12align4:
557; ALIGNED:       ; %bb.0:
558; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
559; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
560; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
561; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
562; ALIGNED-NEXT:    ds_read_b32 v2, v2 offset:8
563; ALIGNED-NEXT:    v_mov_b32_e32 v3, s1
564; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
565; ALIGNED-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
566; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
567; ALIGNED-NEXT:    ds_write_b32 v3, v2 offset:8
568; ALIGNED-NEXT:    s_endpgm
569;
570; UNALIGNED-SDAG-LABEL: ds12align4:
571; UNALIGNED-SDAG:       ; %bb.0:
572; UNALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
573; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
574; UNALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
575; UNALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
576; UNALIGNED-SDAG-NEXT:    ds_read_b32 v2, v2 offset:8
577; UNALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
578; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
579; UNALIGNED-SDAG-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
580; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
581; UNALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
582; UNALIGNED-SDAG-NEXT:    s_endpgm
583;
584; UNALIGNED-GISEL-LABEL: ds12align4:
585; UNALIGNED-GISEL:       ; %bb.0:
586; UNALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
587; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
588; UNALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
589; UNALIGNED-GISEL-NEXT:    ds_read_b96 v[0:2], v0
590; UNALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
591; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
592; UNALIGNED-GISEL-NEXT:    ds_write_b96 v3, v[0:2]
593; UNALIGNED-GISEL-NEXT:    s_endpgm
594  %val = load <3 x i32>, ptr addrspace(3) %in, align 4
595  store <3 x i32> %val, ptr addrspace(3) %out, align 4
596  ret void
597}
598
; ds12align8: copy one <3 x i32> (12 bytes) from %in to %out (both
; addrspace(3)), align 8. Every configuration has its own check body:
;   ALIGNED-SDAG:    ds_read_b64 + ds_read_b32 offset:8, stored the same way.
;   ALIGNED-GISEL:   ds_read2_b32 (offset1:1) + ds_read_b32 offset:8, stored
;                    with ds_write2_b32 + ds_write_b32 offset:8.
;   UNALIGNED-SDAG:  ds_read_b32 offset:8 first, then ds_read_b64; stores are
;                    issued in that same order.
;   UNALIGNED-GISEL: a single ds_read_b96 / ds_write_b96 pair.
599define amdgpu_kernel void @ds12align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
600; ALIGNED-SDAG-LABEL: ds12align8:
601; ALIGNED-SDAG:       ; %bb.0:
602; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
603; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
604; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
605; ALIGNED-SDAG-NEXT:    ds_read_b64 v[0:1], v2
606; ALIGNED-SDAG-NEXT:    ds_read_b32 v2, v2 offset:8
607; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
608; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
609; ALIGNED-SDAG-NEXT:    ds_write_b64 v3, v[0:1]
610; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
611; ALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
612; ALIGNED-SDAG-NEXT:    s_endpgm
613;
614; ALIGNED-GISEL-LABEL: ds12align8:
615; ALIGNED-GISEL:       ; %bb.0:
616; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
617; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
618; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, s0
619; ALIGNED-GISEL-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
620; ALIGNED-GISEL-NEXT:    ds_read_b32 v2, v2 offset:8
621; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
622; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
623; ALIGNED-GISEL-NEXT:    ds_write2_b32 v3, v0, v1 offset1:1
624; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
625; ALIGNED-GISEL-NEXT:    ds_write_b32 v3, v2 offset:8
626; ALIGNED-GISEL-NEXT:    s_endpgm
627;
628; UNALIGNED-SDAG-LABEL: ds12align8:
629; UNALIGNED-SDAG:       ; %bb.0:
630; UNALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
631; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
632; UNALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
633; UNALIGNED-SDAG-NEXT:    ds_read_b32 v2, v0 offset:8
634; UNALIGNED-SDAG-NEXT:    ds_read_b64 v[0:1], v0
635; UNALIGNED-SDAG-NEXT:    v_mov_b32_e32 v3, s1
636; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
637; UNALIGNED-SDAG-NEXT:    ds_write_b32 v3, v2 offset:8
638; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
639; UNALIGNED-SDAG-NEXT:    ds_write_b64 v3, v[0:1]
640; UNALIGNED-SDAG-NEXT:    s_endpgm
641;
642; UNALIGNED-GISEL-LABEL: ds12align8:
643; UNALIGNED-GISEL:       ; %bb.0:
644; UNALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
645; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
646; UNALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
647; UNALIGNED-GISEL-NEXT:    ds_read_b96 v[0:2], v0
648; UNALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
649; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
650; UNALIGNED-GISEL-NEXT:    ds_write_b96 v3, v[0:2]
651; UNALIGNED-GISEL-NEXT:    s_endpgm
652  %val = load <3 x i32>, ptr addrspace(3) %in, align 8
653  store <3 x i32> %val, ptr addrspace(3) %out, align 8
654  ret void
655}
656
; ds12align16: copy one <3 x i32> (12 bytes) from %in to %out (both
; addrspace(3)), align 16.
; All four RUN configurations share the GCN check body: a single ds_read_b96
; followed by a single ds_write_b96.
657define amdgpu_kernel void @ds12align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
658; GCN-LABEL: ds12align16:
659; GCN:       ; %bb.0:
660; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
661; GCN-NEXT:    s_waitcnt lgkmcnt(0)
662; GCN-NEXT:    v_mov_b32_e32 v0, s0
663; GCN-NEXT:    ds_read_b96 v[0:2], v0
664; GCN-NEXT:    v_mov_b32_e32 v3, s1
665; GCN-NEXT:    s_waitcnt lgkmcnt(0)
666; GCN-NEXT:    ds_write_b96 v3, v[0:2]
667; GCN-NEXT:    s_endpgm
668  %val = load <3 x i32>, ptr addrspace(3) %in, align 16
669  store <3 x i32> %val, ptr addrspace(3) %out, align 16
670  ret void
671}
672
; ds16align1: copies a <4 x i32> (16 bytes) from %in to %out, both in
; addrspace(3), with align 1 on the load and on the store.
;  - ALIGNED-SDAG expects 16 ds_read_u8 loads and 16 ds_write_b8 stores.
;  - ALIGNED-GISEL also expects 16 ds_read_u8 loads, packs the bytes into
;    dwords (v_lshl_or_b32 / v_or3_b32), then extracts them again for 16
;    byte-sized stores (ds_write_b8 and ds_write_b8_d16_hi).
;  - UNALIGNED (+unaligned-access-mode, both selectors) expects a single
;    ds_read_b128 / ds_write_b128 pair.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
673define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %out) {
674; ALIGNED-SDAG-LABEL: ds16align1:
675; ALIGNED-SDAG:       ; %bb.0:
676; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
677; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
678; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
679; ALIGNED-SDAG-NEXT:    ds_read_u8 v1, v0
680; ALIGNED-SDAG-NEXT:    ds_read_u8 v2, v0 offset:1
681; ALIGNED-SDAG-NEXT:    ds_read_u8 v3, v0 offset:2
682; ALIGNED-SDAG-NEXT:    ds_read_u8 v4, v0 offset:3
683; ALIGNED-SDAG-NEXT:    ds_read_u8 v5, v0 offset:4
684; ALIGNED-SDAG-NEXT:    ds_read_u8 v6, v0 offset:5
685; ALIGNED-SDAG-NEXT:    ds_read_u8 v7, v0 offset:6
686; ALIGNED-SDAG-NEXT:    ds_read_u8 v8, v0 offset:7
687; ALIGNED-SDAG-NEXT:    ds_read_u8 v9, v0 offset:8
688; ALIGNED-SDAG-NEXT:    ds_read_u8 v10, v0 offset:9
689; ALIGNED-SDAG-NEXT:    ds_read_u8 v11, v0 offset:10
690; ALIGNED-SDAG-NEXT:    ds_read_u8 v12, v0 offset:11
691; ALIGNED-SDAG-NEXT:    ds_read_u8 v13, v0 offset:12
692; ALIGNED-SDAG-NEXT:    ds_read_u8 v14, v0 offset:13
693; ALIGNED-SDAG-NEXT:    ds_read_u8 v15, v0 offset:14
694; ALIGNED-SDAG-NEXT:    ds_read_u8 v0, v0 offset:15
695; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v16, s1
696; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
697; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v13 offset:12
698; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(3)
699; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v14 offset:13
700; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v1
701; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v2 offset:1
702; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v5 offset:4
703; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v6 offset:5
704; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v9 offset:8
705; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v10 offset:9
706; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v3 offset:2
707; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v4 offset:3
708; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v7 offset:6
709; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v8 offset:7
710; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v11 offset:10
711; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v12 offset:11
712; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(14)
713; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v15 offset:14
714; ALIGNED-SDAG-NEXT:    ds_write_b8 v16, v0 offset:15
715; ALIGNED-SDAG-NEXT:    s_endpgm
716;
717; ALIGNED-GISEL-LABEL: ds16align1:
718; ALIGNED-GISEL:       ; %bb.0:
719; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
720; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
721; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
722; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
723; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
724; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:2
725; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:3
726; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:4
727; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:5
728; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:6
729; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:7
730; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
731; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
732; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
733; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
734; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
735; ALIGNED-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
736; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
737; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v6, 8, v5
738; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
739; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v8
740; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
741; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v3, v4, v2
742; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:8
743; ALIGNED-GISEL-NEXT:    ds_read_u8 v4, v0 offset:9
744; ALIGNED-GISEL-NEXT:    ds_read_u8 v5, v0 offset:10
745; ALIGNED-GISEL-NEXT:    ds_read_u8 v6, v0 offset:11
746; ALIGNED-GISEL-NEXT:    ds_read_u8 v7, v0 offset:12
747; ALIGNED-GISEL-NEXT:    ds_read_u8 v8, v0 offset:13
748; ALIGNED-GISEL-NEXT:    ds_read_u8 v9, v0 offset:14
749; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:15
750; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
751; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v4, 8, v3
752; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
753; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 24, v6
754; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
755; ALIGNED-GISEL-NEXT:    v_or3_b32 v3, v4, v5, v3
756; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
757; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v4, v8, 8, v7
758; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
759; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
760; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v9
761; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v4
762; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v4, 8, v1
763; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
764; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1
765; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v4 offset:1
766; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, 8
767; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v6, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
768; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v1 offset:2
769; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v6 offset:3
770; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v2
771; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v2 offset:4
772; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:5
773; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
774; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v2 offset:6
775; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:7
776; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v3
777; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v3 offset:8
778; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:9
779; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
780; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v3 offset:10
781; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:11
782; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
783; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v0 offset:12
784; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:13
785; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
786; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v0 offset:14
787; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:15
788; ALIGNED-GISEL-NEXT:    s_endpgm
789;
790; UNALIGNED-LABEL: ds16align1:
791; UNALIGNED:       ; %bb.0:
792; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
793; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
794; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
795; UNALIGNED-NEXT:    ds_read_b128 v[0:3], v0
796; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
797; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
798; UNALIGNED-NEXT:    ds_write_b128 v4, v[0:3]
799; UNALIGNED-NEXT:    s_endpgm
800  %val = load <4 x i32>, ptr addrspace(3) %in, align 1
801  store <4 x i32> %val, ptr addrspace(3) %out, align 1
802  ret void
803}
804
; ds16align2: copies a <4 x i32> (16 bytes) from %in to %out, both in
; addrspace(3), with align 2 on the load and on the store.
;  - ALIGNED-SDAG expects eight ds_read_u16 loads and eight ds_write_b16
;    stores.
;  - ALIGNED-GISEL expects eight ds_read_u16 loads, combines them pairwise
;    into dwords with v_lshl_or_b32, and stores with alternating
;    ds_write_b16 / ds_write_b16_d16_hi.
;  - UNALIGNED (+unaligned-access-mode, both selectors) expects a single
;    ds_read_b128 / ds_write_b128 pair.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
805define amdgpu_kernel void @ds16align2(ptr addrspace(3) %in, ptr addrspace(3) %out) {
806; ALIGNED-SDAG-LABEL: ds16align2:
807; ALIGNED-SDAG:       ; %bb.0:
808; ALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
809; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
810; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v0, s0
811; ALIGNED-SDAG-NEXT:    ds_read_u16 v1, v0 offset:12
812; ALIGNED-SDAG-NEXT:    ds_read_u16 v2, v0
813; ALIGNED-SDAG-NEXT:    ds_read_u16 v3, v0 offset:2
814; ALIGNED-SDAG-NEXT:    ds_read_u16 v4, v0 offset:4
815; ALIGNED-SDAG-NEXT:    ds_read_u16 v5, v0 offset:6
816; ALIGNED-SDAG-NEXT:    ds_read_u16 v6, v0 offset:8
817; ALIGNED-SDAG-NEXT:    ds_read_u16 v7, v0 offset:10
818; ALIGNED-SDAG-NEXT:    v_mov_b32_e32 v8, s1
819; ALIGNED-SDAG-NEXT:    ds_read_u16 v0, v0 offset:14
820; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
821; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v1 offset:12
822; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
823; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v2
824; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(6)
825; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v4 offset:4
826; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(5)
827; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v6 offset:8
828; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v3 offset:2
829; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v5 offset:6
830; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
831; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v7 offset:10
832; ALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(7)
833; ALIGNED-SDAG-NEXT:    ds_write_b16 v8, v0 offset:14
834; ALIGNED-SDAG-NEXT:    s_endpgm
835;
836; ALIGNED-GISEL-LABEL: ds16align2:
837; ALIGNED-GISEL:       ; %bb.0:
838; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
839; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
840; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
841; ALIGNED-GISEL-NEXT:    ds_read_u16 v1, v0
842; ALIGNED-GISEL-NEXT:    ds_read_u16 v2, v0 offset:2
843; ALIGNED-GISEL-NEXT:    ds_read_u16 v3, v0 offset:4
844; ALIGNED-GISEL-NEXT:    ds_read_u16 v4, v0 offset:6
845; ALIGNED-GISEL-NEXT:    ds_read_u16 v5, v0 offset:8
846; ALIGNED-GISEL-NEXT:    ds_read_u16 v6, v0 offset:10
847; ALIGNED-GISEL-NEXT:    ds_read_u16 v7, v0 offset:12
848; ALIGNED-GISEL-NEXT:    ds_read_u16 v0, v0 offset:14
849; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(6)
850; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 16, v1
851; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(4)
852; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v2, v4, 16, v3
853; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
854; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
855; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v3, v6, 16, v5
856; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
857; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v7
858; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v1
859; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v1 offset:2
860; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v2 offset:4
861; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v2 offset:6
862; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v3 offset:8
863; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v3 offset:10
864; ALIGNED-GISEL-NEXT:    ds_write_b16 v4, v0 offset:12
865; ALIGNED-GISEL-NEXT:    ds_write_b16_d16_hi v4, v0 offset:14
866; ALIGNED-GISEL-NEXT:    s_endpgm
867;
868; UNALIGNED-LABEL: ds16align2:
869; UNALIGNED:       ; %bb.0:
870; UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
871; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
872; UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
873; UNALIGNED-NEXT:    ds_read_b128 v[0:3], v0
874; UNALIGNED-NEXT:    v_mov_b32_e32 v4, s1
875; UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
876; UNALIGNED-NEXT:    ds_write_b128 v4, v[0:3]
877; UNALIGNED-NEXT:    s_endpgm
878  %val = load <4 x i32>, ptr addrspace(3) %in, align 2
879  store <4 x i32> %val, ptr addrspace(3) %out, align 2
880  ret void
881}
882
; ds16align4: copies a <4 x i32> (16 bytes) from %in to %out, both in
; addrspace(3), with align 4 on the load and on the store.
;  - ALIGNED (-unaligned-access-mode, both selectors) expects two
;    ds_read2_b32 loads and two ds_write2_b32 stores.
;  - UNALIGNED-SDAG also expects two ds_read2_b32 / ds_write2_b32 pairs, but
;    with the offset0:2 offset1:3 half issued first.
;  - UNALIGNED-GISEL expects one ds_read2_b64 / ds_write2_b64 pair.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
883define amdgpu_kernel void @ds16align4(ptr addrspace(3) %in, ptr addrspace(3) %out) {
884; ALIGNED-LABEL: ds16align4:
885; ALIGNED:       ; %bb.0:
886; ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
887; ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
888; ALIGNED-NEXT:    v_mov_b32_e32 v2, s0
889; ALIGNED-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
890; ALIGNED-NEXT:    ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
891; ALIGNED-NEXT:    v_mov_b32_e32 v4, s1
892; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
893; ALIGNED-NEXT:    ds_write2_b32 v4, v0, v1 offset1:1
894; ALIGNED-NEXT:    s_waitcnt lgkmcnt(1)
895; ALIGNED-NEXT:    ds_write2_b32 v4, v2, v3 offset0:2 offset1:3
896; ALIGNED-NEXT:    s_endpgm
897;
898; UNALIGNED-SDAG-LABEL: ds16align4:
899; UNALIGNED-SDAG:       ; %bb.0:
900; UNALIGNED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
901; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
902; UNALIGNED-SDAG-NEXT:    v_mov_b32_e32 v2, s0
903; UNALIGNED-SDAG-NEXT:    ds_read2_b32 v[0:1], v2 offset0:2 offset1:3
904; UNALIGNED-SDAG-NEXT:    ds_read2_b32 v[2:3], v2 offset1:1
905; UNALIGNED-SDAG-NEXT:    v_mov_b32_e32 v4, s1
906; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
907; UNALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v0, v1 offset0:2 offset1:3
908; UNALIGNED-SDAG-NEXT:    s_waitcnt lgkmcnt(1)
909; UNALIGNED-SDAG-NEXT:    ds_write2_b32 v4, v2, v3 offset1:1
910; UNALIGNED-SDAG-NEXT:    s_endpgm
911;
912; UNALIGNED-GISEL-LABEL: ds16align4:
913; UNALIGNED-GISEL:       ; %bb.0:
914; UNALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
915; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
916; UNALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
917; UNALIGNED-GISEL-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
918; UNALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
919; UNALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
920; UNALIGNED-GISEL-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
921; UNALIGNED-GISEL-NEXT:    s_endpgm
922  %val = load <4 x i32>, ptr addrspace(3) %in, align 4
923  store <4 x i32> %val, ptr addrspace(3) %out, align 4
924  ret void
925}
926
; ds16align8: copies a <4 x i32> (16 bytes) from %in to %out, both in
; addrspace(3), with align 8 on the load and on the store.
; Every run configuration shares the GCN prefix for this function and expects
; one ds_read2_b64 / ds_write2_b64 pair (offset1:1).
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
927define amdgpu_kernel void @ds16align8(ptr addrspace(3) %in, ptr addrspace(3) %out) {
928; GCN-LABEL: ds16align8:
929; GCN:       ; %bb.0:
930; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
931; GCN-NEXT:    s_waitcnt lgkmcnt(0)
932; GCN-NEXT:    v_mov_b32_e32 v0, s0
933; GCN-NEXT:    ds_read2_b64 v[0:3], v0 offset1:1
934; GCN-NEXT:    v_mov_b32_e32 v4, s1
935; GCN-NEXT:    s_waitcnt lgkmcnt(0)
936; GCN-NEXT:    ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
937; GCN-NEXT:    s_endpgm
938  %val = load <4 x i32>, ptr addrspace(3) %in, align 8
939  store <4 x i32> %val, ptr addrspace(3) %out, align 8
940  ret void
941}
942
; ds16align16: copies a <4 x i32> (16 bytes) from %in to %out, both in
; addrspace(3), with align 16 on the load and on the store.
; Every run configuration shares the GCN prefix for this function and expects
; a single ds_read_b128 / ds_write_b128 pair.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them rather than editing by hand.
943define amdgpu_kernel void @ds16align16(ptr addrspace(3) %in, ptr addrspace(3) %out) {
944; GCN-LABEL: ds16align16:
945; GCN:       ; %bb.0:
946; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
947; GCN-NEXT:    s_waitcnt lgkmcnt(0)
948; GCN-NEXT:    v_mov_b32_e32 v0, s0
949; GCN-NEXT:    ds_read_b128 v[0:3], v0
950; GCN-NEXT:    v_mov_b32_e32 v4, s1
951; GCN-NEXT:    s_waitcnt lgkmcnt(0)
952; GCN-NEXT:    ds_write_b128 v4, v[0:3]
953; GCN-NEXT:    s_endpgm
954  %val = load <4 x i32>, ptr addrspace(3) %in, align 16
955  store <4 x i32> %val, ptr addrspace(3) %out, align 16
956  ret void
957}
958