; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=VI %s

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone

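; Loading and storing a dword-aligned <4 x i8> should be done with a single
; dword load and store.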
define amdgpu_kernel void @test_copy_v4i8(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}

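; The <4 x i8> value loaded once should be stored unmodified to both outputs
; as full dwords.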
define amdgpu_kernel void @test_copy_v4i8_x2(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s2, 0
; SI-NEXT:    s_mov_b32 s3, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
  ret void
}

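; Same copy fanned out to three outputs; still a single dword load and three
; dword stores.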
define amdgpu_kernel void @test_copy_v4i8_x3(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
  ret void
}

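; Same copy fanned out to four outputs; still a single dword load and four
; dword stores.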
define amdgpu_kernel void @test_copy_v4i8_x4(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x11
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_load_dwordx8 s[4:11], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s14, s2
; SI-NEXT:    s_mov_b32 s15, s3
; SI-NEXT:    s_mov_b32 s18, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s19, s3
; SI-NEXT:    s_mov_b32 s22, s2
; SI-NEXT:    s_mov_b32 s23, s3
; SI-NEXT:    s_mov_b32 s12, s6
; SI-NEXT:    s_mov_b32 s13, s7
; SI-NEXT:    s_mov_b32 s16, s8
; SI-NEXT:    s_mov_b32 s17, s9
; SI-NEXT:    s_mov_b32 s20, s10
; SI-NEXT:    s_mov_b32 s21, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; SI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x44
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s18, s10
; VI-NEXT:    s_mov_b32 s19, s11
; VI-NEXT:    s_mov_b32 s22, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s23, s11
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s16, s4
; VI-NEXT:    s_mov_b32 s17, s5
; VI-NEXT:    s_mov_b32 s20, s6
; VI-NEXT:    s_mov_b32 s21, s7
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v0, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[16:19], 0
; VI-NEXT:    buffer_store_dword v0, off, s[20:23], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %val, ptr addrspace(1) %out1, align 4
  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
  store <4 x i8> %val, ptr addrspace(1) %out3, align 4
  ret void
}

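; The add forces the bytes to be unpacked and repacked for the second store,
; but the unmodified value should still be stored as a whole dword.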
define amdgpu_kernel void @test_copy_v4i8_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xd
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    buffer_store_dword v1, off, s[8:11], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %gep, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %add, ptr addrspace(1) %out1, align 4
  ret void
}

; FIXME: Need to handle non-uniform case for function below (load without gep).
define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_x2_extra_use:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s11, 0xf000
; SI-NEXT:    s_mov_b32 s14, 0
; SI-NEXT:    s_mov_b32 s15, s11
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[12:13], s[6:7]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64
; SI-NEXT:    s_mov_b32 s10, -1
; SI-NEXT:    s_mov_b32 s14, s10
; SI-NEXT:    s_mov_b32 s8, s0
; SI-NEXT:    s_mov_b32 s9, s1
; SI-NEXT:    s_mov_b32 s12, s2
; SI-NEXT:    s_mov_b32 s13, s3
; SI-NEXT:    s_mov_b32 s6, s10
; SI-NEXT:    s_mov_b32 s7, s11
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    v_add_i32_e32 v3, vcc, 9, v0
; SI-NEXT:    v_and_b32_e32 v2, 0xff00, v0
; SI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; SI-NEXT:    v_and_b32_e32 v4, 0xff00, v1
; SI-NEXT:    v_add_i32_e32 v1, vcc, 9, v1
; SI-NEXT:    v_or_b32_e32 v2, v2, v3
; SI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; SI-NEXT:    v_add_i32_e32 v2, vcc, 0x900, v2
; SI-NEXT:    v_or_b32_e32 v1, v4, v1
; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT:    v_or_b32_e32 v1, v1, v2
; SI-NEXT:    v_add_i32_e32 v1, vcc, 0x9000000, v1
; SI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_x2_extra_use:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_mov_b32 s11, 0xf000
; VI-NEXT:    s_mov_b32 s10, -1
; VI-NEXT:    s_mov_b32 s14, s10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s7
; VI-NEXT:    v_add_u32_e32 v0, vcc, s6, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s15, s11
; VI-NEXT:    s_mov_b32 s8, s0
; VI-NEXT:    s_mov_b32 s9, s1
; VI-NEXT:    s_mov_b32 s12, s2
; VI-NEXT:    s_mov_b32 s13, s3
; VI-NEXT:    s_mov_b32 s6, s10
; VI-NEXT:    s_mov_b32 s7, s11
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    v_and_b32_e32 v4, 0xffffff00, v1
; VI-NEXT:    v_add_u16_e32 v1, 9, v1
; VI-NEXT:    v_add_u16_e32 v3, 9, v0
; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
; VI-NEXT:    v_and_b32_e32 v2, 0xffffff00, v0
; VI-NEXT:    v_and_b32_e32 v3, 0xff, v3
; VI-NEXT:    v_or_b32_e32 v1, v4, v1
; VI-NEXT:    v_or_b32_e32 v2, v2, v3
; VI-NEXT:    v_add_u16_e32 v1, 0x900, v1
; VI-NEXT:    v_add_u16_e32 v2, 0x900, v2
; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT:    v_or_b32_e32 v1, v2, v1
; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
; VI-NEXT:    buffer_store_dword v1, off, s[12:15], 0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %in.ptr = getelementptr <4 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <4 x i8>, ptr addrspace(1) %in.ptr, align 4
  %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
  store <4 x i8> %val, ptr addrspace(1) %out0, align 4
  store <4 x i8> %add, ptr addrspace(1) %out1, align 4
  store <4 x i8> %val, ptr addrspace(1) %out2, align 4
  ret void
}

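; A dword-aligned <3 x i8> copy uses a dword load with a short store plus a
; byte store.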
define amdgpu_kernel void @test_copy_v3i8_align4(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v3i8_align4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s10, 0
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT:    flat_load_dword v0, v[0:1]
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
; VI-NEXT:    buffer_store_byte v1, off, s[0:3], 0 offset:2
; VI-NEXT:    s_endpgm
  %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr <3 x i8>, ptr addrspace(1) %in, i32 %tid.x
  %val = load <3 x i8>, ptr addrspace(1) %gep, align 4
  store <3 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}

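; With only 2-byte alignment the copy is split into a ushort and a ubyte
; load/store pair.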
define amdgpu_kernel void @test_copy_v3i8_align2(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v3i8_align2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(1)
; SI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_short v1, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, ptr addrspace(1) %in, align 2
  store <3 x i8> %val, ptr addrspace(1) %out, align 2
  ret void
}

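; With 1-byte alignment SI copies byte by byte; VI can still load the low two
; bytes as an unaligned ushort.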
define amdgpu_kernel void @test_copy_v3i8_align1(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v3i8_align1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:2
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v3i8_align1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(1)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:1
; VI-NEXT:    s_endpgm
  %val = load <3 x i8>, ptr addrspace(1) %in, align 1
  store <3 x i8> %val, ptr addrspace(1) %out, align 1
  ret void
}

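; A volatile load must not be split or combined; it stays a single dword load
; (marked glc).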
define amdgpu_kernel void @test_copy_v4i8_volatile_load(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_load:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_load:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0 glc
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load volatile <4 x i8>, ptr addrspace(1) %in, align 4
  store <4 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}

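; A volatile store is scalarized into four byte stores, and the load is split
; to match.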
define amdgpu_kernel void @test_copy_v4i8_volatile_store(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: test_copy_v4i8_volatile_store:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(3)
; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_copy_v4i8_volatile_store:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:3
; VI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:2
; VI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0 offset:1
; VI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(3)
; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0 offset:3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0 offset:2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v2, off, s[4:7], 0 offset:1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %val = load <4 x i8>, ptr addrspace(1) %in, align 4
  store volatile <4 x i8> %val, ptr addrspace(1) %out, align 4
  ret void
}