; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope --check-prefix=VI

define amdgpu_kernel void @s_sext_i1_to_i32(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i32
  store i32 %sext, ptr addrspace(1) %out, align 4
  ret void
}

define amdgpu_kernel void @test_s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mul_i32 s0, s0, s1
; SI-NEXT:    s_add_i32 s0, s0, s2
; SI-NEXT:    s_ashr_i32 s1, s0, 31
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    v_mov_b32_e32 v1, s1
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mul_i32 s0, s0, s1
; VI-NEXT:    s_add_i32 s0, s0, s2
; VI-NEXT:    s_ashr_i32 s1, s0, 31
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
entry:
  %mul = mul i32 %a, %b
  %add = add i32 %mul, %c
  %sext = sext i32 %add to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    v_mov_b32_e32 v1, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    v_mov_b32_e32 v1, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i32_to_i64(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s4, s6, 31
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    v_mov_b32_e32 v1, s4
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s4, s6, 31
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    v_mov_b32_e32 v1, s4
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i32 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @v_sext_i32_to_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %val = load i32, ptr addrspace(1) %in, align 4
  %sext = sext i32 %val to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i16_to_i64(ptr addrspace(1) %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x100000
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_i64 s[4:5], s[6:7], 0x100000
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %sext = sext i16 %a to i64
  store i64 %sext, ptr addrspace(1) %out, align 8
  ret void
}

define amdgpu_kernel void @s_sext_i1_to_i16(ptr addrspace(1) %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp = icmp eq i32 %a, %b
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; The purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way through the legalizer/optimizer, so that we can
; verify it is selected correctly.  In s_sext_i1_to_i16, the sign_extend
; node is optimized to a select very early.
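; A sketch of that early fold, assuming the usual DAGCombiner rewrite of a
; sign-extended compare (node names here are illustrative, not from a real
; dump):
;   t1: i16 = sign_extend (setcc %a, %b, eq)
;     --> t1': i16 = select (setcc %a, %b, eq), Constant:i16<-1>, Constant:i16<0>
; which is why the checks show a cndmask of 0/-1 instead of any explicit
; sign-extension instruction.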
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s0, s1
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    s_cmp_eq_u32 s2, s3
; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s0, s1
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_cmp_eq_u32 s2, s3
; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %cmp0 = icmp eq i32 %a, %b
  %cmp1 = icmp eq i32 %c, %d
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_cmp_eq_u32 s1, s2
; SI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; SI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_cmp_eq_u32 s1, s2
; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT:    s_cselect_b64 s[0:1], -1, 0
; VI-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
  %cmp0 = icmp eq i32 %a, %tid
  %cmp1 = icmp eq i32 %b, %c
  %cmp = and i1 %cmp0, %cmp1
  %sext = sext i1 %cmp to i16
  store i16 %sext, ptr addrspace(1) %out
  ret void
}

; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instruction, so the input to
; the bfe is a vector register.  To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
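; A sketch of the desired fold, assuming the srl can be hoisted above the
; truncate (primed node names are illustrative):
;   t55': i32 = srl t10, Constant:i32<8>
;   t64': i32 = sign_extend_inreg t55', ValueType:ch:i8
; i.e. a single bfe of the original i32 (offset 8, width 8), with no 16-bit
; shift and no round-trip through a vector register.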
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(ptr addrspace(1) %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s4, s6, 24
; SI-NEXT:    s_bfe_i32 s5, s6, 0x80010
; SI-NEXT:    s_bfe_i32 s7, s6, 0x80008
; SI-NEXT:    s_sext_i32_i8 s6, s6
; SI-NEXT:    v_mov_b32_e32 v0, s6
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s7
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s5
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s4, s6, 24
; VI-NEXT:    s_bfe_i32 s5, s6, 0x80010
; VI-NEXT:    s_bfe_i32 s7, s6, 0x80008
; VI-NEXT:    s_sext_i32_i8 s6, s6
; VI-NEXT:    v_mov_b32_e32 v0, s6
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s7
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s5
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: Need to optimize the same sequence as in the test above to avoid
; this shift.
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; SI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, 24, v0
; VI-NEXT:    v_bfe_i32 v2, v0, 16, 8
; VI-NEXT:    v_bfe_i32 v3, v0, 8, 8
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i32, ptr addrspace(1) %in
  %cast = bitcast i32 %a to <4 x i8>
  %ext = sext <4 x i8> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

; FIXME: s_bfe_i64, same on SI and VI
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(ptr addrspace(1) %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_ashr_i64 s[0:1], s[2:3], 48
; SI-NEXT:    s_ashr_i32 s1, s2, 16
; SI-NEXT:    s_sext_i32_i16 s2, s2
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    s_sext_i32_i16 s3, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_ashr_i32 s1, s2, 16
; VI-NEXT:    s_sext_i32_i16 s2, s2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_ashr_i32 s0, s3, 16
; VI-NEXT:    s_sext_i32_i16 s3, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

define amdgpu_kernel void @v_sext_v4i16_to_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; SI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
; VI-NEXT:    v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT:    v_bfe_i32 v1, v1, 0, 16
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v3, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    s_endpgm
  %a = load i64, ptr addrspace(1) %in
  %cast = bitcast i64 %a to <4 x i16>
  %ext = sext <4 x i16> %cast to <4 x i32>
  %elt0 = extractelement <4 x i32> %ext, i32 0
  %elt1 = extractelement <4 x i32> %ext, i32 1
  %elt2 = extractelement <4 x i32> %ext, i32 2
  %elt3 = extractelement <4 x i32> %ext, i32 3
  store volatile i32 %elt0, ptr addrspace(1) %out
  store volatile i32 %elt1, ptr addrspace(1) %out
  store volatile i32 %elt2, ptr addrspace(1) %out
  store volatile i32 %elt3, ptr addrspace(1) %out
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #1 = { nounwind readnone }