; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s

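; This file covers extractelement of <N x i8> vectors with both constant and
; dynamic indices. The byte vectors arrive packed in scalar kernel arguments,
; so extracts lower to scalar shifts of the loaded dwords. Note the kernarg
; offsets differ between the two prefixes because SMEM immediate offsets are
; dword-scaled on SI (e.g. 0x2) but byte-scaled on VI (e.g. 0x8).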
define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v1i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v1i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_endpgm
  %p0 = extractelement <1 x i8> %foo, i32 0
  store i8 %p0, ptr addrspace(1) %out
  ret void
}

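; Element 0 is the low byte of the packed kernarg dword; element 1 is
; produced by an s_lshr_b32 of 8 before the two byte stores.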
define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v2i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 8
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v2i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <2 x i8> %foo, i32 0
  %p1 = extractelement <2 x i8> %foo, i32 1
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v3i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v3i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <3 x i8> %foo, i32 0
  %p1 = extractelement <3 x i8> %foo, i32 2
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x2
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x8
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <4 x i8> %foo, i32 0
  %p1 = extractelement <4 x i8> %foo, i32 2
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

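; With no output pointer argument, the volatile stores go to null, and only
; the first dword of the packed vector argument is actually read
; (s_load_dword).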
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v8i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 16
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v8i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s0, s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <8 x i8> %foo, i32 0
  %p1 = extractelement <8 x i8> %foo, i32 2
  store volatile i8 %p1, ptr addrspace(1) null
  store volatile i8 %p0, ptr addrspace(1) null
  ret void
}

define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v16i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v16i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x10
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <16 x i8> %foo, i32 0
  %p1 = extractelement <16 x i8> %foo, i32 2
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v32i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s0, s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 16
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    v_mov_b32_e32 v1, 0
; SI-NEXT:    v_mov_b32_e32 v2, s0
; SI-NEXT:    v_mov_b32_e32 v3, s1
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v32i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s0, s[8:9], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    v_mov_b32_e32 v1, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s0, 16
; VI-NEXT:    v_mov_b32_e32 v3, s1
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v3
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <32 x i8> %foo, i32 0
  %p1 = extractelement <32 x i8> %foo, i32 2
  store volatile i8 %p1, ptr addrspace(1) null
  store volatile i8 %p0, ptr addrspace(1) null
  ret void
}

define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v64i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_load_dword s2, s[8:9], 0x10
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s3, s2, 16
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    s_add_u32 s0, s0, 1
; SI-NEXT:    v_mov_b32_e32 v3, s3
; SI-NEXT:    s_addc_u32 s1, s1, 0
; SI-NEXT:    flat_store_byte v[0:1], v3
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: extract_vector_elt_v64i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x40
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s3, s2, 16
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_add_u32 s0, s0, 1
; VI-NEXT:    v_mov_b32_e32 v2, s3
; VI-NEXT:    s_addc_u32 s1, s1, 0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <64 x i8> %foo, i32 0
  %p1 = extractelement <64 x i8> %foo, i32 2
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p1, ptr addrspace(1) %out
  store volatile i8 %p0, ptr addrspace(1) %out1
  ret void
}

; FIXME: SI generates much worse code for this, which is a pain to match.

; FIXME: 16-bit and 32-bit shift not combined after legalize, due to
; isTypeDesirableForOp in SimplifyDemandedBits.

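; With a variable index, the vector stays packed in an SGPR: the index is
; scaled to a bit offset (s_lshl_b32 by 3) and the element is selected with
; a variable s_lshr before the single-byte store, as checked below.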
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v2i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0xa
; SI-NEXT:    s_load_dword s3, s[8:9], 0x13
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_and_b32 s2, s2, 0xffff
; SI-NEXT:    s_lshl_b32 s3, s3, 3
; SI-NEXT:    s_lshr_b32 s2, s2, s3
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_extract_vector_elt_v2i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x4c
; VI-NEXT:    s_load_dword s3, s[8:9], 0x28
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s2, s2, 3
; VI-NEXT:    s_and_b32 s3, s3, 0xffff
; VI-NEXT:    s_lshr_b32 s2, s3, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %elt = extractelement <2 x i8> %foo, i32 %idx
  store volatile i8 %elt, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v3i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[8:9], 0x13
; SI-NEXT:    s_load_dword s3, s[8:9], 0xa
; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshl_b32 s2, s2, 3
; SI-NEXT:    s_lshr_b32 s2, s3, s2
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_extract_vector_elt_v3i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s2, s[8:9], 0x4c
; VI-NEXT:    s_load_dword s3, s[8:9], 0x28
; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshl_b32 s2, s2, 3
; VI-NEXT:    s_lshr_b32 s2, s3, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    v_mov_b32_e32 v2, s2
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %p0 = extractelement <3 x i8> %foo, i32 %idx
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p0, ptr addrspace(1) %out
  ret void
}

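; The remaining dynamic tests load the vector through a constant-address-space
; pointer rather than a kernarg, so an s_load of the vector data precedes the
; variable shift.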
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v4i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_load_dword s4, s[8:9], 0xc
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
; SI-NEXT:    s_lshl_b32 s3, s4, 3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s2, s2, s3
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_extract_vector_elt_v4i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x30
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshl_b32 s0, s4, 3
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s0, s2, s0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %vec = load <4 x i8>, ptr addrspace(4) %vec.ptr
  %p0 = extractelement <4 x i8> %vec, i32 %idx
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p0, ptr addrspace(1) %out
  ret void
}

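; An 8-byte vector requires a 64-bit variable shift (s_lshr_b64); only the
; low half of the result is stored.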
define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v8i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT:    s_load_dword s4, s[8:9], 0x4
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; SI-NEXT:    s_lshl_b32 s4, s4, 3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    v_mov_b32_e32 v2, s2
; SI-NEXT:    flat_store_byte v[0:1], v2
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: dynamic_extract_vector_elt_v8i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT:    s_load_dword s4, s[8:9], 0x10
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshl_b32 s0, s4, 3
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b64 s[0:1], s[2:3], s0
; VI-NEXT:    v_mov_b32_e32 v2, s0
; VI-NEXT:    flat_store_byte v[0:1], v2
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %vec = load <8 x i8>, ptr addrspace(4) %vec.ptr
  %p0 = extractelement <8 x i8> %vec, i32 %idx
  %out1 = getelementptr i8, ptr addrspace(1) %out, i32 1
  store volatile i8 %p0, ptr addrspace(1) %out
  ret void
}

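; The reduce_load tests check that a wide byte-vector load is narrowed when
; only a few elements are used: all four extracted bytes live in the first
; dword, so the <8 x i8> load shrinks to a single s_load_dword.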
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0123:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b64 s[0:1], 0
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 8
; SI-NEXT:    s_lshr_b32 s2, s0, 16
; SI-NEXT:    s_lshr_b32 s3, s0, 24
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: reduce_load_vector_v8i8_extract_0123:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s0, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    s_lshr_b32 s2, s0, 16
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    s_lshr_b32 s3, s0, 24
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %load = load <8 x i8>, ptr addrspace(4) null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt2 = extractelement <8 x i8> %load, i32 2
  %elt3 = extractelement <8 x i8> %load, i32 3
  store volatile i8 %elt0, ptr addrspace(1) undef, align 1
  store volatile i8 %elt1, ptr addrspace(1) undef, align 1
  store volatile i8 %elt2, ptr addrspace(1) undef, align 1
  store volatile i8 %elt3, ptr addrspace(1) undef, align 1
  ret void
}

define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0145:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b64 s[0:1], 0
; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s2, s0, 8
; SI-NEXT:    s_lshr_b32 s3, s1, 8
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    flat_store_byte v[0:1], v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: reduce_load_vector_v8i8_extract_0145:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s2, s0, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s3, s1, 8
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_byte v[0:1], v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %load = load <8 x i8>, ptr addrspace(4) null
  %elt0 = extractelement <8 x i8> %load, i32 0
  %elt1 = extractelement <8 x i8> %load, i32 1
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt0, ptr addrspace(1) undef, align 1
  store volatile i8 %elt1, ptr addrspace(1) undef, align 1
  store volatile i8 %elt4, ptr addrspace(1) undef, align 1
  store volatile i8 %elt5, ptr addrspace(1) undef, align 1
  ret void
}

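; Only bytes 4 and 5 are used here, so the load is narrowed to one dword and
; the base address folds to null + 4 (s_mov_b64 s[0:1], 4).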
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_45:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b64 s[0:1], 4
; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s1, s0, 8
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: reduce_load_vector_v8i8_extract_45:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b64 s[0:1], 4
; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s1, s0, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %load = load <8 x i8>, ptr addrspace(4) null
  %elt4 = extractelement <8 x i8> %load, i32 4
  %elt5 = extractelement <8 x i8> %load, i32 5
  store volatile i8 %elt4, ptr addrspace(1) undef, align 1
  store volatile i8 %elt5, ptr addrspace(1) undef, align 1
  ret void
}

; FIXME: ought to be able to eliminate high half of load
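; Bytes 0, 1, 4 and 5 all live in the first two dwords, yet the full
; s_load_dwordx4 of the <16 x i8> vector is still emitted.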
define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v16i8_extract_0145:
; SI:       ; %bb.0:
; SI-NEXT:    s_mov_b64 s[0:1], 0
; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_lshr_b32 s2, s0, 8
; SI-NEXT:    s_lshr_b32 s3, s1, 8
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s2
; SI-NEXT:    flat_store_byte v[0:1], v1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    flat_store_byte v[0:1], v0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: reduce_load_vector_v16i8_extract_0145:
; VI:       ; %bb.0:
; VI-NEXT:    s_mov_b64 s[0:1], 0
; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_lshr_b32 s2, s0, 8
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    s_lshr_b32 s3, s1, 8
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    flat_store_byte v[0:1], v1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    flat_store_byte v[0:1], v0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %load = load <16 x i8>, ptr addrspace(4) null
  %elt0 = extractelement <16 x i8> %load, i32 0
  %elt1 = extractelement <16 x i8> %load, i32 1
  %elt4 = extractelement <16 x i8> %load, i32 4
  %elt5 = extractelement <16 x i8> %load, i32 5
  store volatile i8 %elt0, ptr addrspace(1) undef, align 1
  store volatile i8 %elt1, ptr addrspace(1) undef, align 1
  store volatile i8 %elt4, ptr addrspace(1) undef, align 1
  store volatile i8 %elt5, ptr addrspace(1) undef, align 1
  ret void
}

attributes #0 = { nounwind }