xref: /llvm-project/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
7
; s_shl_v2i16 - shl of two <2 x i16> kernel arguments, stored to %out.
; No memory operand is loaded per lane here; in every check block below the
; two shift operands are taken from s2 and s3 of the leading scalar load.
; Expected lowering, as recorded by the autogenerated checks
;   * GFX9, GFX10 and GFX11 use a single packed v_pk_lshlrev_b16.
;   * VI and CI split the halves with s_lshr_b32 by 16, shift each half with
;     s_lshl_b32, then re-pack using s_and_b32 0xffff and s_or_b32.
8define amdgpu_kernel void @s_shl_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
9; GFX9-LABEL: s_shl_v2i16:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
12; GFX9-NEXT:    s_mov_b32 s7, 0xf000
13; GFX9-NEXT:    s_mov_b32 s6, -1
14; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
15; GFX9-NEXT:    v_mov_b32_e32 v0, s2
16; GFX9-NEXT:    s_mov_b32 s4, s0
17; GFX9-NEXT:    s_mov_b32 s5, s1
18; GFX9-NEXT:    v_pk_lshlrev_b16 v0, s3, v0
19; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
20; GFX9-NEXT:    s_endpgm
21;
22; VI-LABEL: s_shl_v2i16:
23; VI:       ; %bb.0:
24; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
25; VI-NEXT:    s_mov_b32 s7, 0xf000
26; VI-NEXT:    s_mov_b32 s6, -1
27; VI-NEXT:    s_waitcnt lgkmcnt(0)
28; VI-NEXT:    s_mov_b32 s4, s0
29; VI-NEXT:    s_mov_b32 s5, s1
30; VI-NEXT:    s_lshr_b32 s0, s2, 16
31; VI-NEXT:    s_lshr_b32 s1, s3, 16
32; VI-NEXT:    s_lshl_b32 s0, s0, s1
33; VI-NEXT:    s_lshl_b32 s1, s2, s3
34; VI-NEXT:    s_lshl_b32 s0, s0, 16
35; VI-NEXT:    s_and_b32 s1, s1, 0xffff
36; VI-NEXT:    s_or_b32 s0, s1, s0
37; VI-NEXT:    v_mov_b32_e32 v0, s0
38; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
39; VI-NEXT:    s_endpgm
40;
41; CI-LABEL: s_shl_v2i16:
42; CI:       ; %bb.0:
43; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
44; CI-NEXT:    s_mov_b32 s7, 0xf000
45; CI-NEXT:    s_mov_b32 s6, -1
46; CI-NEXT:    s_waitcnt lgkmcnt(0)
47; CI-NEXT:    s_mov_b32 s4, s0
48; CI-NEXT:    s_mov_b32 s5, s1
49; CI-NEXT:    s_lshr_b32 s0, s2, 16
50; CI-NEXT:    s_lshr_b32 s1, s3, 16
51; CI-NEXT:    s_lshl_b32 s0, s0, s1
52; CI-NEXT:    s_lshl_b32 s1, s2, s3
53; CI-NEXT:    s_lshl_b32 s0, s0, 16
54; CI-NEXT:    s_and_b32 s1, s1, 0xffff
55; CI-NEXT:    s_or_b32 s0, s1, s0
56; CI-NEXT:    v_mov_b32_e32 v0, s0
57; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
58; CI-NEXT:    s_endpgm
59;
60; GFX10-LABEL: s_shl_v2i16:
61; GFX10:       ; %bb.0:
62; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
63; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
64; GFX10-NEXT:    s_mov_b32 s6, -1
65; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
66; GFX10-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
67; GFX10-NEXT:    s_mov_b32 s4, s0
68; GFX10-NEXT:    s_mov_b32 s5, s1
69; GFX10-NEXT:    buffer_store_dword v0, off, s[4:7], 0
70; GFX10-NEXT:    s_endpgm
71;
72; GFX11-LABEL: s_shl_v2i16:
73; GFX11:       ; %bb.0:
74; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
75; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
76; GFX11-NEXT:    s_mov_b32 s6, -1
77; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
78; GFX11-NEXT:    v_pk_lshlrev_b16 v0, s3, s2
79; GFX11-NEXT:    s_mov_b32 s4, s0
80; GFX11-NEXT:    s_mov_b32 s5, s1
81; GFX11-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
82; GFX11-NEXT:    s_endpgm
83  %result = shl <2 x i16> %lhs, %rhs
84  store <2 x i16> %result, ptr addrspace(1) %out
85  ret void
86}
87
; v_shl_v2i16 - both shift operands are loaded from memory per workitem.
; %a is the <2 x i16> at %in[tid]; %b is the vector directly after it
; (%b_ptr is %in.gep advanced by one element). Every check block expects the
; pair to be fetched with one 64-bit load (dwordx2, or b64 on GFX11).
; Expected lowering, as recorded by the autogenerated checks
;   * GFX9, GFX10 and GFX11 use a single v_pk_lshlrev_b16.
;   * VI shifts the low half with v_lshlrev_b16 and the high half with the
;     SDWA form (WORD_1 selects), then merges them with v_or_b32.
;   * CI has no 16-bit shift in its checks - it extracts the high halves with
;     v_lshrrev_b32 by 16, uses 32-bit shifts, masks the low result with
;     0xffff and re-packs with a shift by 16 plus v_or_b32.
; GFX11 additionally masks v0 with 0x3ff before scaling it (presumably
; isolating the x component of the workitem id - confirm against the backend).
88define amdgpu_kernel void @v_shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
89; GFX9-LABEL: v_shl_v2i16:
90; GFX9:       ; %bb.0:
91; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
92; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
93; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
95; GFX9-NEXT:    s_waitcnt vmcnt(0)
96; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
97; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
98; GFX9-NEXT:    s_endpgm
99;
100; VI-LABEL: v_shl_v2i16:
101; VI:       ; %bb.0:
102; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
103; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
104; VI-NEXT:    s_waitcnt lgkmcnt(0)
105; VI-NEXT:    v_mov_b32_e32 v1, s3
106; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
107; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
108; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
109; VI-NEXT:    v_mov_b32_e32 v3, s1
110; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
111; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
112; VI-NEXT:    s_waitcnt vmcnt(0)
113; VI-NEXT:    v_lshlrev_b16_e32 v4, v1, v0
114; VI-NEXT:    v_lshlrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
115; VI-NEXT:    v_or_b32_e32 v0, v4, v0
116; VI-NEXT:    flat_store_dword v[2:3], v0
117; VI-NEXT:    s_endpgm
118;
119; CI-LABEL: v_shl_v2i16:
120; CI:       ; %bb.0:
121; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
122; CI-NEXT:    s_mov_b32 s7, 0xf000
123; CI-NEXT:    s_mov_b32 s6, 0
124; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
125; CI-NEXT:    v_mov_b32_e32 v1, 0
126; CI-NEXT:    s_waitcnt lgkmcnt(0)
127; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
128; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
129; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
130; CI-NEXT:    s_waitcnt vmcnt(0)
131; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
132; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
133; CI-NEXT:    v_lshlrev_b32_e32 v2, v3, v2
134; CI-NEXT:    v_lshlrev_b32_e32 v3, v5, v4
135; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
136; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
137; CI-NEXT:    v_or_b32_e32 v2, v2, v3
138; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
139; CI-NEXT:    s_endpgm
140;
141; GFX10-LABEL: v_shl_v2i16:
142; GFX10:       ; %bb.0:
143; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
144; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
145; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
146; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
147; GFX10-NEXT:    s_waitcnt vmcnt(0)
148; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
149; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
150; GFX10-NEXT:    s_endpgm
151;
152; GFX11-LABEL: v_shl_v2i16:
153; GFX11:       ; %bb.0:
154; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
155; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
156; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
157; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
158; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
159; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
160; GFX11-NEXT:    s_waitcnt vmcnt(0)
161; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v1, v0
162; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
163; GFX11-NEXT:    s_endpgm
164  %tid = call i32 @llvm.amdgcn.workitem.id.x()
165  %tid.ext = sext i32 %tid to i64
166  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
167  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  ; The shift-amount vector %b lives in the element right after %a.
168  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in.gep, i32 1
169  %a = load <2 x i16>, ptr addrspace(1) %in.gep
170  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
171  %result = shl <2 x i16> %a, %b
172  store <2 x i16> %result, ptr addrspace(1) %out.gep
173  ret void
174}
175
; shl_v_s_v2i16 - value from memory, shift amount from a kernel argument.
; %vgpr is loaded from %in[tid] and shifted left by the <2 x i16> argument
; %sgpr; the result goes to %out[tid].
; Expected lowering, as recorded by the autogenerated checks
;   * GFX9, GFX10 and GFX11 use v_pk_lshlrev_b16 with the scalar register as
;     the first source (the shift amount) and the loaded VGPR as the second.
;   * VI extracts the high shift amount with s_lshr_b32 by 16, copies it to a
;     VGPR and feeds it to the SDWA shift; the low half uses v_lshlrev_b16
;     with the scalar directly.
;   * CI uses 32-bit v_lshlrev_b32 for each half with scalar shift amounts,
;     then masks with 0xffff and re-packs.
176define amdgpu_kernel void @shl_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
177; GFX9-LABEL: shl_v_s_v2i16:
178; GFX9:       ; %bb.0:
179; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
180; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
181; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
182; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
183; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
184; GFX9-NEXT:    s_waitcnt vmcnt(0)
185; GFX9-NEXT:    v_pk_lshlrev_b16 v1, s6, v1
186; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
187; GFX9-NEXT:    s_endpgm
188;
189; VI-LABEL: shl_v_s_v2i16:
190; VI:       ; %bb.0:
191; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
192; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
193; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
194; VI-NEXT:    s_waitcnt lgkmcnt(0)
195; VI-NEXT:    v_mov_b32_e32 v1, s3
196; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
197; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
198; VI-NEXT:    flat_load_dword v3, v[0:1]
199; VI-NEXT:    v_mov_b32_e32 v1, s1
200; VI-NEXT:    s_lshr_b32 s1, s4, 16
201; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
202; VI-NEXT:    v_mov_b32_e32 v2, s1
203; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
204; VI-NEXT:    s_waitcnt vmcnt(0)
205; VI-NEXT:    v_lshlrev_b16_e32 v4, s4, v3
206; VI-NEXT:    v_lshlrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
207; VI-NEXT:    v_or_b32_e32 v2, v4, v2
208; VI-NEXT:    flat_store_dword v[0:1], v2
209; VI-NEXT:    s_endpgm
210;
211; CI-LABEL: shl_v_s_v2i16:
212; CI:       ; %bb.0:
213; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
214; CI-NEXT:    s_load_dword s8, s[4:5], 0xd
215; CI-NEXT:    s_mov_b32 s7, 0xf000
216; CI-NEXT:    s_mov_b32 s6, 0
217; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
218; CI-NEXT:    s_waitcnt lgkmcnt(0)
219; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
220; CI-NEXT:    v_mov_b32_e32 v1, 0
221; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
222; CI-NEXT:    s_lshr_b32 s4, s8, 16
223; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
224; CI-NEXT:    s_waitcnt vmcnt(0)
225; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
226; CI-NEXT:    v_lshlrev_b32_e32 v2, s8, v2
227; CI-NEXT:    v_lshlrev_b32_e32 v3, s4, v3
228; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
229; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
230; CI-NEXT:    v_or_b32_e32 v2, v2, v3
231; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
232; CI-NEXT:    s_endpgm
233;
234; GFX10-LABEL: shl_v_s_v2i16:
235; GFX10:       ; %bb.0:
236; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
237; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
238; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x34
239; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
240; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
241; GFX10-NEXT:    s_waitcnt vmcnt(0)
242; GFX10-NEXT:    v_pk_lshlrev_b16 v1, s4, v1
243; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
244; GFX10-NEXT:    s_endpgm
245;
246; GFX11-LABEL: shl_v_s_v2i16:
247; GFX11:       ; %bb.0:
248; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
249; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
250; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
251; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
252; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
253; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
255; GFX11-NEXT:    s_waitcnt vmcnt(0)
256; GFX11-NEXT:    v_pk_lshlrev_b16 v1, s4, v1
257; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
258; GFX11-NEXT:    s_endpgm
259  %tid = call i32 @llvm.amdgcn.workitem.id.x()
260  %tid.ext = sext i32 %tid to i64
261  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
262  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
263  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  ; Loaded value is the left operand; the kernel argument is the shift amount.
264  %result = shl <2 x i16> %vgpr, %sgpr
265  store <2 x i16> %result, ptr addrspace(1) %out.gep
266  ret void
267}
268
; shl_s_v_v2i16 - mirror image of shl_v_s_v2i16. The kernel argument %sgpr is
; the value being shifted and the vector loaded from %in[tid] supplies the
; per-element shift amounts.
; Expected lowering, as recorded by the autogenerated checks
;   * GFX9, GFX10 and GFX11 use v_pk_lshlrev_b16 with the loaded VGPR as the
;     first source (shift amount) and the scalar register as the second.
;   * VI uses the e64 encoding of v_lshlrev_b16 for the low half, with the
;     scalar as the second source, and an SDWA shift for the high half.
;   * CI uses the non-reversed v_lshl_b32 with the scalar as the first source.
269define amdgpu_kernel void @shl_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
270; GFX9-LABEL: shl_s_v_v2i16:
271; GFX9:       ; %bb.0:
272; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
273; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
274; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
275; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
276; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
277; GFX9-NEXT:    s_waitcnt vmcnt(0)
278; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, s6
279; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
280; GFX9-NEXT:    s_endpgm
281;
282; VI-LABEL: shl_s_v_v2i16:
283; VI:       ; %bb.0:
284; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
285; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
286; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
287; VI-NEXT:    s_waitcnt lgkmcnt(0)
288; VI-NEXT:    v_mov_b32_e32 v1, s3
289; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
290; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
291; VI-NEXT:    flat_load_dword v3, v[0:1]
292; VI-NEXT:    v_mov_b32_e32 v1, s1
293; VI-NEXT:    s_lshr_b32 s1, s4, 16
294; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
295; VI-NEXT:    v_mov_b32_e32 v2, s1
296; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
297; VI-NEXT:    s_waitcnt vmcnt(0)
298; VI-NEXT:    v_lshlrev_b16_e64 v4, v3, s4
299; VI-NEXT:    v_lshlrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
300; VI-NEXT:    v_or_b32_e32 v2, v4, v2
301; VI-NEXT:    flat_store_dword v[0:1], v2
302; VI-NEXT:    s_endpgm
303;
304; CI-LABEL: shl_s_v_v2i16:
305; CI:       ; %bb.0:
306; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
307; CI-NEXT:    s_load_dword s8, s[4:5], 0xd
308; CI-NEXT:    s_mov_b32 s7, 0xf000
309; CI-NEXT:    s_mov_b32 s6, 0
310; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
311; CI-NEXT:    s_waitcnt lgkmcnt(0)
312; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
313; CI-NEXT:    v_mov_b32_e32 v1, 0
314; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
315; CI-NEXT:    s_lshr_b32 s4, s8, 16
316; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
317; CI-NEXT:    s_waitcnt vmcnt(0)
318; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
319; CI-NEXT:    v_lshl_b32_e32 v2, s8, v2
320; CI-NEXT:    v_lshl_b32_e32 v3, s4, v3
321; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
322; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
323; CI-NEXT:    v_or_b32_e32 v2, v2, v3
324; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
325; CI-NEXT:    s_endpgm
326;
327; GFX10-LABEL: shl_s_v_v2i16:
328; GFX10:       ; %bb.0:
329; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
330; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
331; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x34
332; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
333; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
334; GFX10-NEXT:    s_waitcnt vmcnt(0)
335; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, s4
336; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
337; GFX10-NEXT:    s_endpgm
338;
339; GFX11-LABEL: shl_s_v_v2i16:
340; GFX11:       ; %bb.0:
341; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
342; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
343; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
344; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
345; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
346; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
347; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
348; GFX11-NEXT:    s_waitcnt vmcnt(0)
349; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, s4
350; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
351; GFX11-NEXT:    s_endpgm
352  %tid = call i32 @llvm.amdgcn.workitem.id.x()
353  %tid.ext = sext i32 %tid to i64
354  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
355  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
356  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  ; Kernel argument is the left operand; the loaded value is the shift amount.
357  %result = shl <2 x i16> %sgpr, %vgpr
358  store <2 x i16> %result, ptr addrspace(1) %out.gep
359  ret void
360}
361
; shl_imm_v_v2i16 - constant value, variable shift amount. The splat
; <i16 8, i16 8> is shifted left by the vector loaded from %in[tid].
; Expected lowering, as recorded by the autogenerated checks
;   * GFX9, GFX10 and GFX11 encode 8 as an inline constant in the second
;     source of v_pk_lshlrev_b16, with op_sel_hi:[1,0].
;   * VI uses the inline constant in v_lshlrev_b16_e64 for the low half but
;     materialises 8 in v4 for the SDWA shift of the high half.
;   * CI uses v_lshl_b32 with 8 as the first source for both halves. The
;     low-half mask is 0xfff8 rather than 0xffff (presumably because 8 << n
;     can never set the low three bits - confirm against the combiner).
362define amdgpu_kernel void @shl_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
363; GFX9-LABEL: shl_imm_v_v2i16:
364; GFX9:       ; %bb.0:
365; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
366; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
367; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
368; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
369; GFX9-NEXT:    s_waitcnt vmcnt(0)
370; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
371; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
372; GFX9-NEXT:    s_endpgm
373;
374; VI-LABEL: shl_imm_v_v2i16:
375; VI:       ; %bb.0:
376; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
377; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
378; VI-NEXT:    v_mov_b32_e32 v4, 8
379; VI-NEXT:    s_waitcnt lgkmcnt(0)
380; VI-NEXT:    v_mov_b32_e32 v1, s3
381; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
382; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
383; VI-NEXT:    flat_load_dword v3, v[0:1]
384; VI-NEXT:    v_mov_b32_e32 v1, s1
385; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
386; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
387; VI-NEXT:    s_waitcnt vmcnt(0)
388; VI-NEXT:    v_lshlrev_b16_e64 v2, v3, 8
389; VI-NEXT:    v_lshlrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
390; VI-NEXT:    v_or_b32_e32 v2, v2, v3
391; VI-NEXT:    flat_store_dword v[0:1], v2
392; VI-NEXT:    s_endpgm
393;
394; CI-LABEL: shl_imm_v_v2i16:
395; CI:       ; %bb.0:
396; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
397; CI-NEXT:    s_mov_b32 s7, 0xf000
398; CI-NEXT:    s_mov_b32 s6, 0
399; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
400; CI-NEXT:    v_mov_b32_e32 v1, 0
401; CI-NEXT:    s_waitcnt lgkmcnt(0)
402; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
403; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
404; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
405; CI-NEXT:    s_waitcnt vmcnt(0)
406; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
407; CI-NEXT:    v_lshl_b32_e32 v2, 8, v2
408; CI-NEXT:    v_lshl_b32_e32 v3, 8, v3
409; CI-NEXT:    v_and_b32_e32 v2, 0xfff8, v2
410; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
411; CI-NEXT:    v_or_b32_e32 v2, v2, v3
412; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
413; CI-NEXT:    s_endpgm
414;
415; GFX10-LABEL: shl_imm_v_v2i16:
416; GFX10:       ; %bb.0:
417; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
418; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
419; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
420; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
421; GFX10-NEXT:    s_waitcnt vmcnt(0)
422; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
423; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
424; GFX10-NEXT:    s_endpgm
425;
426; GFX11-LABEL: shl_imm_v_v2i16:
427; GFX11:       ; %bb.0:
428; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
429; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
430; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
431; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
432; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
433; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
434; GFX11-NEXT:    s_waitcnt vmcnt(0)
435; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0]
436; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
437; GFX11-NEXT:    s_endpgm
438  %tid = call i32 @llvm.amdgcn.workitem.id.x()
439  %tid.ext = sext i32 %tid to i64
440  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
441  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
442  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  ; The constant is the value being shifted; %vgpr is the shift amount.
443  %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
444  store <2 x i16> %result, ptr addrspace(1) %out.gep
445  ret void
446}
447
; shl_v_imm_v2i16 - variable value, constant shift amount. The vector loaded
; from %in[tid] is shifted left by the splat <i16 8, i16 8>.
; Expected lowering, as recorded by the autogenerated checks
;   * GFX9, GFX10 and GFX11 encode 8 as an inline constant in the first
;     source of v_pk_lshlrev_b16, with op_sel_hi:[0,1].
;   * VI forms the high half with a 32-bit shift by 8 masked with 0xff000000
;     and the low half with v_lshlrev_b16, then merges them with v_or_b32.
;   * CI needs only one 32-bit shift by 8 followed by a 0xff00ff00 mask.
448define amdgpu_kernel void @shl_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
449; GFX9-LABEL: shl_v_imm_v2i16:
450; GFX9:       ; %bb.0:
451; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
452; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
453; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
454; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
455; GFX9-NEXT:    s_waitcnt vmcnt(0)
456; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
457; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
458; GFX9-NEXT:    s_endpgm
459;
460; VI-LABEL: shl_v_imm_v2i16:
461; VI:       ; %bb.0:
462; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
463; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
464; VI-NEXT:    s_waitcnt lgkmcnt(0)
465; VI-NEXT:    v_mov_b32_e32 v1, s3
466; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
467; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
468; VI-NEXT:    flat_load_dword v3, v[0:1]
469; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
470; VI-NEXT:    v_mov_b32_e32 v1, s1
471; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
472; VI-NEXT:    s_waitcnt vmcnt(0)
473; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
474; VI-NEXT:    v_and_b32_e32 v2, 0xff000000, v2
475; VI-NEXT:    v_lshlrev_b16_e32 v3, 8, v3
476; VI-NEXT:    v_or_b32_e32 v2, v3, v2
477; VI-NEXT:    flat_store_dword v[0:1], v2
478; VI-NEXT:    s_endpgm
479;
480; CI-LABEL: shl_v_imm_v2i16:
481; CI:       ; %bb.0:
482; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
483; CI-NEXT:    s_mov_b32 s7, 0xf000
484; CI-NEXT:    s_mov_b32 s6, 0
485; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
486; CI-NEXT:    v_mov_b32_e32 v1, 0
487; CI-NEXT:    s_waitcnt lgkmcnt(0)
488; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
489; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
490; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
491; CI-NEXT:    s_waitcnt vmcnt(0)
492; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
493; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
494; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
495; CI-NEXT:    s_endpgm
496;
497; GFX10-LABEL: shl_v_imm_v2i16:
498; GFX10:       ; %bb.0:
499; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
500; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
501; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
503; GFX10-NEXT:    s_waitcnt vmcnt(0)
504; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
505; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
506; GFX10-NEXT:    s_endpgm
507;
508; GFX11-LABEL: shl_v_imm_v2i16:
509; GFX11:       ; %bb.0:
510; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
511; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
512; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
513; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
514; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
515; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
516; GFX11-NEXT:    s_waitcnt vmcnt(0)
517; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
518; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
519; GFX11-NEXT:    s_endpgm
520  %tid = call i32 @llvm.amdgcn.workitem.id.x()
521  %tid.ext = sext i32 %tid to i64
522  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
523  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
524  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  ; %vgpr is the value being shifted; the constant is the shift amount.
525  %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
526  store <2 x i16> %result, ptr addrspace(1) %out.gep
527  ret void
528}
529
; v_shl_v4i16 - <4 x i16> counterpart of v_shl_v2i16. %a is the vector at
; %in[tid] and %b is the vector directly after it; the index is scaled by 8
; bytes (v_lshlrev_b32 by 3 in the checks) and both vectors are fetched with
; one 128-bit load (dwordx4, or b128 on GFX11).
; Expected lowering, as recorded by the autogenerated checks
;   * GFX9, GFX10 and GFX11 use two v_pk_lshlrev_b16, one per result dword.
;   * VI uses a v_lshlrev_b16 plus an SDWA shift per dword, merged with
;     v_or_b32.
;   * CI extracts all four high halves with v_lshrrev_b32 by 16, performs four
;     32-bit shifts, then masks with 0xffff and re-packs each dword.
530define amdgpu_kernel void @v_shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
531; GFX9-LABEL: v_shl_v4i16:
532; GFX9:       ; %bb.0:
533; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
534; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
535; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
536; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
537; GFX9-NEXT:    s_waitcnt vmcnt(0)
538; GFX9-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
539; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
540; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
541; GFX9-NEXT:    s_endpgm
542;
543; VI-LABEL: v_shl_v4i16:
544; VI:       ; %bb.0:
545; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
546; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
547; VI-NEXT:    s_waitcnt lgkmcnt(0)
548; VI-NEXT:    v_mov_b32_e32 v1, s3
549; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
550; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
551; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
552; VI-NEXT:    v_mov_b32_e32 v5, s1
553; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
554; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
555; VI-NEXT:    s_waitcnt vmcnt(0)
556; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
557; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
558; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
559; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
560; VI-NEXT:    v_or_b32_e32 v1, v6, v1
561; VI-NEXT:    v_or_b32_e32 v0, v3, v0
562; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
563; VI-NEXT:    s_endpgm
564;
565; CI-LABEL: v_shl_v4i16:
566; CI:       ; %bb.0:
567; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
568; CI-NEXT:    s_mov_b32 s7, 0xf000
569; CI-NEXT:    s_mov_b32 s6, 0
570; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
571; CI-NEXT:    v_mov_b32_e32 v5, 0
572; CI-NEXT:    s_waitcnt lgkmcnt(0)
573; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
574; CI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
575; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
576; CI-NEXT:    s_waitcnt vmcnt(0)
577; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
578; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
579; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
580; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
581; CI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
582; CI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
583; CI-NEXT:    v_lshlrev_b32_e32 v2, v9, v7
584; CI-NEXT:    v_lshlrev_b32_e32 v3, v8, v6
585; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
586; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
587; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
588; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
589; CI-NEXT:    v_or_b32_e32 v1, v1, v2
590; CI-NEXT:    v_or_b32_e32 v0, v0, v3
591; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
592; CI-NEXT:    s_endpgm
593;
594; GFX10-LABEL: v_shl_v4i16:
595; GFX10:       ; %bb.0:
596; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
597; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
598; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
599; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
600; GFX10-NEXT:    s_waitcnt vmcnt(0)
601; GFX10-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
602; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
603; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
604; GFX10-NEXT:    s_endpgm
605;
606; GFX11-LABEL: v_shl_v4i16:
607; GFX11:       ; %bb.0:
608; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
609; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
610; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
611; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
612; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
613; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
614; GFX11-NEXT:    s_waitcnt vmcnt(0)
615; GFX11-NEXT:    v_pk_lshlrev_b16 v1, v3, v1
616; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v2, v0
617; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
618; GFX11-NEXT:    s_endpgm
619  %tid = call i32 @llvm.amdgcn.workitem.id.x()
620  %tid.ext = sext i32 %tid to i64
621  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
622  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  ; The shift-amount vector %b lives in the element right after %a.
623  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in.gep, i32 1
624  %a = load <4 x i16>, ptr addrspace(1) %in.gep
625  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
626  %result = shl <4 x i16> %a, %b
627  store <4 x i16> %result, ptr addrspace(1) %out.gep
628  ret void
629}
630
; shl_v_imm_v4i16 - <4 x i16> counterpart of shl_v_imm_v2i16. The vector
; loaded from %in[tid] is shifted left by a splat of 8.
; Expected lowering, as recorded by the autogenerated checks
;   * GFX9, GFX10 and GFX11 use two v_pk_lshlrev_b16 with inline constant 8
;     and op_sel_hi:[0,1].
;   * VI repeats the v2i16 pattern per dword - a 32-bit shift masked with
;     0xff000000 for the high half, v_lshlrev_b16 for the low half, v_or_b32.
;   * CI treats the two dwords differently. v2 is handled with one shift and
;     a 0xff00ff00 mask, while v3 is split into halves that are each masked
;     with 0xff00 and then recombined.
;     NOTE(review) - the asymmetry looks like a missed combine rather than an
;     intentional choice; the checks only record current llc output.
631define amdgpu_kernel void @shl_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
632; GFX9-LABEL: shl_v_imm_v4i16:
633; GFX9:       ; %bb.0:
634; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
635; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
636; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
637; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
638; GFX9-NEXT:    s_waitcnt vmcnt(0)
639; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
640; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
641; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
642; GFX9-NEXT:    s_endpgm
643;
644; VI-LABEL: shl_v_imm_v4i16:
645; VI:       ; %bb.0:
646; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
647; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
648; VI-NEXT:    s_waitcnt lgkmcnt(0)
649; VI-NEXT:    v_mov_b32_e32 v1, s3
650; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
651; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
652; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
653; VI-NEXT:    v_mov_b32_e32 v3, s1
654; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
655; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
656; VI-NEXT:    s_waitcnt vmcnt(0)
657; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v1
658; VI-NEXT:    v_lshlrev_b16_e32 v5, 8, v0
659; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
660; VI-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
661; VI-NEXT:    v_and_b32_e32 v4, 0xff000000, v4
662; VI-NEXT:    v_and_b32_e32 v0, 0xff000000, v0
663; VI-NEXT:    v_or_b32_e32 v1, v1, v4
664; VI-NEXT:    v_or_b32_e32 v0, v5, v0
665; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
666; VI-NEXT:    s_endpgm
667;
668; CI-LABEL: shl_v_imm_v4i16:
669; CI:       ; %bb.0:
670; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
671; CI-NEXT:    s_mov_b32 s7, 0xf000
672; CI-NEXT:    s_mov_b32 s6, 0
673; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
674; CI-NEXT:    v_mov_b32_e32 v1, 0
675; CI-NEXT:    s_waitcnt lgkmcnt(0)
676; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
677; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
678; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
679; CI-NEXT:    s_waitcnt vmcnt(0)
680; CI-NEXT:    v_lshlrev_b32_e32 v4, 8, v3
681; CI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
682; CI-NEXT:    v_and_b32_e32 v3, 0xff00, v3
683; CI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
684; CI-NEXT:    v_and_b32_e32 v4, 0xff00, v4
685; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
686; CI-NEXT:    v_or_b32_e32 v3, v4, v3
687; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff00, v2
688; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
689; CI-NEXT:    s_endpgm
690;
691; GFX10-LABEL: shl_v_imm_v4i16:
692; GFX10:       ; %bb.0:
693; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
694; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
695; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
696; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
697; GFX10-NEXT:    s_waitcnt vmcnt(0)
698; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
699; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
700; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
701; GFX10-NEXT:    s_endpgm
702;
703; GFX11-LABEL: shl_v_imm_v4i16:
704; GFX11:       ; %bb.0:
705; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
706; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
707; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
708; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
709; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
710; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
711; GFX11-NEXT:    s_waitcnt vmcnt(0)
712; GFX11-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
713; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
714; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
715; GFX11-NEXT:    s_endpgm
716  %tid = call i32 @llvm.amdgcn.workitem.id.x()
717  %tid.ext = sext i32 %tid to i64
718  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
719  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
720  %vgpr = load <4 x i16>, ptr addrspace(1) %in.gep
  ; %vgpr is the value being shifted; the constant splat is the shift amount.
721  %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
722  store <4 x i16> %result, ptr addrspace(1) %out.gep
723  ret void
724}
725
; Intrinsic whose i32 result the vector tests in this file sign-extend and use
; as the element index into %in and %out.
726declare i32 @llvm.amdgcn.workitem.id.x() #1
727
; Attribute group #0 is attached to every kernel defined in this file;
; group #1 is attached to the intrinsic declaration.
728attributes #0 = { nounwind }
729attributes #1 = { nounwind readnone }
730