xref: /llvm-project/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s
4; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=CI %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s
7
; s_lshr_v2i16 - lshr of <2 x i16> where both the value (%lhs) and the shift
; amount (%rhs) are kernel arguments; the result is stored to %out.
; The expected output below uses v_pk_lshrrev_b16 for the gfx900, gfx1010 and
; gfx1100 runs. The tonga and bonaire runs instead shift each 16-bit half with
; s_lshr_b32 and recombine the halves with s_lshl_b32 and s_or_b32.
8define amdgpu_kernel void @s_lshr_v2i16(ptr addrspace(1) %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
9; GFX9-LABEL: s_lshr_v2i16:
10; GFX9:       ; %bb.0:
11; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
12; GFX9-NEXT:    v_mov_b32_e32 v0, 0
13; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
14; GFX9-NEXT:    v_mov_b32_e32 v1, s2
15; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s3, v1
16; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
17; GFX9-NEXT:    s_endpgm
18;
19; VI-LABEL: s_lshr_v2i16:
20; VI:       ; %bb.0:
21; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
22; VI-NEXT:    s_waitcnt lgkmcnt(0)
23; VI-NEXT:    s_and_b32 s4, s2, 0xffff
24; VI-NEXT:    s_lshr_b32 s2, s2, 16
25; VI-NEXT:    s_lshr_b32 s5, s3, 16
26; VI-NEXT:    s_lshr_b32 s2, s2, s5
27; VI-NEXT:    s_lshr_b32 s3, s4, s3
28; VI-NEXT:    s_lshl_b32 s2, s2, 16
29; VI-NEXT:    s_or_b32 s2, s3, s2
30; VI-NEXT:    v_mov_b32_e32 v0, s0
31; VI-NEXT:    v_mov_b32_e32 v1, s1
32; VI-NEXT:    v_mov_b32_e32 v2, s2
33; VI-NEXT:    flat_store_dword v[0:1], v2
34; VI-NEXT:    s_endpgm
35;
36; CI-LABEL: s_lshr_v2i16:
37; CI:       ; %bb.0:
38; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
39; CI-NEXT:    s_mov_b32 s7, 0xf000
40; CI-NEXT:    s_mov_b32 s6, -1
41; CI-NEXT:    s_waitcnt lgkmcnt(0)
42; CI-NEXT:    s_mov_b32 s4, s0
43; CI-NEXT:    s_mov_b32 s5, s1
44; CI-NEXT:    s_and_b32 s0, s2, 0xffff
45; CI-NEXT:    s_lshr_b32 s1, s2, 16
46; CI-NEXT:    s_lshr_b32 s2, s3, 16
47; CI-NEXT:    s_lshr_b32 s1, s1, s2
48; CI-NEXT:    s_lshl_b32 s1, s1, 16
49; CI-NEXT:    s_lshr_b32 s0, s0, s3
50; CI-NEXT:    s_or_b32 s0, s0, s1
51; CI-NEXT:    v_mov_b32_e32 v0, s0
52; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
53; CI-NEXT:    s_endpgm
54;
55; GFX10-LABEL: s_lshr_v2i16:
56; GFX10:       ; %bb.0:
57; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
58; GFX10-NEXT:    v_mov_b32_e32 v0, 0
59; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
60; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
61; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
62; GFX10-NEXT:    s_endpgm
63;
64; GFX11-LABEL: s_lshr_v2i16:
65; GFX11:       ; %bb.0:
66; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
67; GFX11-NEXT:    v_mov_b32_e32 v0, 0
68; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s3, s2
70; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
71; GFX11-NEXT:    s_endpgm
  ; IR under test - one vector lshr of the two arguments, then the store of
  ; its result through %out.
72  %result = lshr <2 x i16> %lhs, %rhs
73  store <2 x i16> %result, ptr addrspace(1) %out
74  ret void
75}
76
; v_lshr_v2i16 - lshr of <2 x i16> where both operands are loaded from %in.
; The value is element %tid of %in and the shift amount is the element right
; after it; the result is stored to element %tid of %out.
; In every run's expected output both operands arrive through one 64-bit load.
; The gfx900, gfx1010 and gfx1100 runs expect a single v_pk_lshrrev_b16, the
; tonga run pairs v_lshrrev_b16 with its SDWA form, and the bonaire run uses
; 32-bit shifts on the separated halves.
77define amdgpu_kernel void @v_lshr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
78; GFX9-LABEL: v_lshr_v2i16:
79; GFX9:       ; %bb.0:
80; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
81; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
82; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
83; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
84; GFX9-NEXT:    s_waitcnt vmcnt(0)
85; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
86; GFX9-NEXT:    global_store_dword v2, v0, s[0:1]
87; GFX9-NEXT:    s_endpgm
88;
89; VI-LABEL: v_lshr_v2i16:
90; VI:       ; %bb.0:
91; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
92; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
93; VI-NEXT:    s_waitcnt lgkmcnt(0)
94; VI-NEXT:    v_mov_b32_e32 v1, s3
95; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
96; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
97; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
98; VI-NEXT:    v_mov_b32_e32 v3, s1
99; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
100; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
101; VI-NEXT:    s_waitcnt vmcnt(0)
102; VI-NEXT:    v_lshrrev_b16_e32 v4, v1, v0
103; VI-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
104; VI-NEXT:    v_or_b32_e32 v0, v4, v0
105; VI-NEXT:    flat_store_dword v[2:3], v0
106; VI-NEXT:    s_endpgm
107;
108; CI-LABEL: v_lshr_v2i16:
109; CI:       ; %bb.0:
110; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
111; CI-NEXT:    s_mov_b32 s7, 0xf000
112; CI-NEXT:    s_mov_b32 s6, 0
113; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
114; CI-NEXT:    v_mov_b32_e32 v1, 0
115; CI-NEXT:    s_waitcnt lgkmcnt(0)
116; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
117; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
118; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
119; CI-NEXT:    s_waitcnt vmcnt(0)
120; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
121; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
122; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
123; CI-NEXT:    v_lshrrev_b32_e32 v2, v3, v2
124; CI-NEXT:    v_lshrrev_b32_e32 v3, v5, v4
125; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
126; CI-NEXT:    v_or_b32_e32 v2, v2, v3
127; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
128; CI-NEXT:    s_endpgm
129;
130; GFX10-LABEL: v_lshr_v2i16:
131; GFX10:       ; %bb.0:
132; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
133; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
134; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
135; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
136; GFX10-NEXT:    s_waitcnt vmcnt(0)
137; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
138; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
139; GFX10-NEXT:    s_endpgm
140;
141; GFX11-LABEL: v_lshr_v2i16:
142; GFX11:       ; %bb.0:
143; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
144; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
145; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
146; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
147; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
148; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
149; GFX11-NEXT:    s_waitcnt vmcnt(0)
150; GFX11-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
151; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
152; GFX11-NEXT:    s_endpgm
  ; Index %in and %out by the workitem id, sign-extended to i64.
153  %tid = call i32 @llvm.amdgcn.workitem.id.x()
154  %tid.ext = sext i32 %tid to i64
155  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
156  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  ; %b_ptr is one <2 x i16> element past %in.gep; this gep has no inbounds flag.
157  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in.gep, i32 1
158  %a = load <2 x i16>, ptr addrspace(1) %in.gep
159  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
  ; %a is the shifted value and %b the per-lane shift amount.
160  %result = lshr <2 x i16> %a, %b
161  store <2 x i16> %result, ptr addrspace(1) %out.gep
162  ret void
163}
164
; lshr_v_s_v2i16 - lshr of <2 x i16> where the value is loaded from element
; %tid of %in and the shift amount is the kernel argument %sgpr.
; In the gfx900, gfx1010 and gfx1100 expected output the argument is used
; straight from its scalar register as the first source of v_pk_lshrrev_b16.
; The tonga and bonaire runs additionally shift the argument right by 16 with
; s_lshr_b32 and handle the two halves with separate shift instructions.
165define amdgpu_kernel void @lshr_v_s_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
166; GFX9-LABEL: lshr_v_s_v2i16:
167; GFX9:       ; %bb.0:
168; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
169; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
170; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
171; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
172; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
173; GFX9-NEXT:    s_waitcnt vmcnt(0)
174; GFX9-NEXT:    v_pk_lshrrev_b16 v1, s6, v1
175; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
176; GFX9-NEXT:    s_endpgm
177;
178; VI-LABEL: lshr_v_s_v2i16:
179; VI:       ; %bb.0:
180; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
181; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
182; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
183; VI-NEXT:    s_waitcnt lgkmcnt(0)
184; VI-NEXT:    v_mov_b32_e32 v1, s3
185; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
186; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
187; VI-NEXT:    flat_load_dword v3, v[0:1]
188; VI-NEXT:    v_mov_b32_e32 v1, s1
189; VI-NEXT:    s_lshr_b32 s1, s4, 16
190; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
191; VI-NEXT:    v_mov_b32_e32 v2, s1
192; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
193; VI-NEXT:    s_waitcnt vmcnt(0)
194; VI-NEXT:    v_lshrrev_b16_e32 v4, s4, v3
195; VI-NEXT:    v_lshrrev_b16_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
196; VI-NEXT:    v_or_b32_e32 v2, v4, v2
197; VI-NEXT:    flat_store_dword v[0:1], v2
198; VI-NEXT:    s_endpgm
199;
200; CI-LABEL: lshr_v_s_v2i16:
201; CI:       ; %bb.0:
202; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
203; CI-NEXT:    s_load_dword s8, s[4:5], 0xd
204; CI-NEXT:    s_mov_b32 s7, 0xf000
205; CI-NEXT:    s_mov_b32 s6, 0
206; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
207; CI-NEXT:    s_waitcnt lgkmcnt(0)
208; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
209; CI-NEXT:    v_mov_b32_e32 v1, 0
210; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
211; CI-NEXT:    s_lshr_b32 s4, s8, 16
212; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
213; CI-NEXT:    s_waitcnt vmcnt(0)
214; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
215; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
216; CI-NEXT:    v_lshrrev_b32_e32 v3, s4, v3
217; CI-NEXT:    v_lshrrev_b32_e32 v2, s8, v2
218; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
219; CI-NEXT:    v_or_b32_e32 v2, v2, v3
220; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
221; CI-NEXT:    s_endpgm
222;
223; GFX10-LABEL: lshr_v_s_v2i16:
224; GFX10:       ; %bb.0:
225; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
226; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
227; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x34
228; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
229; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
230; GFX10-NEXT:    s_waitcnt vmcnt(0)
231; GFX10-NEXT:    v_pk_lshrrev_b16 v1, s4, v1
232; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
233; GFX10-NEXT:    s_endpgm
234;
235; GFX11-LABEL: lshr_v_s_v2i16:
236; GFX11:       ; %bb.0:
237; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
238; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
239; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
240; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
241; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
242; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
243; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
244; GFX11-NEXT:    s_waitcnt vmcnt(0)
245; GFX11-NEXT:    v_pk_lshrrev_b16 v1, s4, v1
246; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
247; GFX11-NEXT:    s_endpgm
  ; Index %in and %out by the workitem id, sign-extended to i64.
248  %tid = call i32 @llvm.amdgcn.workitem.id.x()
249  %tid.ext = sext i32 %tid to i64
250  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
251  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
252  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  ; Loaded value on the left, kernel-argument shift amount on the right.
253  %result = lshr <2 x i16> %vgpr, %sgpr
254  store <2 x i16> %result, ptr addrspace(1) %out.gep
255  ret void
256}
257
; lshr_s_v_v2i16 - lshr of <2 x i16> where the value is the kernel argument
; %sgpr and the shift amount is loaded from element %tid of %in. This is the
; operand order opposite to lshr_v_s_v2i16.
; In the gfx900, gfx1010 and gfx1100 expected output the argument's scalar
; register is the last source of v_pk_lshrrev_b16. The bonaire run expects the
; non-reversed v_lshr_b32 after masking the argument with 0xffff and shifting
; it right by 16.
258define amdgpu_kernel void @lshr_s_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in, <2 x i16> %sgpr) #0 {
259; GFX9-LABEL: lshr_s_v_v2i16:
260; GFX9:       ; %bb.0:
261; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
262; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x34
263; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
264; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
265; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
266; GFX9-NEXT:    s_waitcnt vmcnt(0)
267; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, s6
268; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
269; GFX9-NEXT:    s_endpgm
270;
271; VI-LABEL: lshr_s_v_v2i16:
272; VI:       ; %bb.0:
273; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
274; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
275; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
276; VI-NEXT:    s_waitcnt lgkmcnt(0)
277; VI-NEXT:    v_mov_b32_e32 v1, s3
278; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
279; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
280; VI-NEXT:    flat_load_dword v3, v[0:1]
281; VI-NEXT:    v_mov_b32_e32 v1, s1
282; VI-NEXT:    s_lshr_b32 s1, s4, 16
283; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
284; VI-NEXT:    v_mov_b32_e32 v2, s1
285; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
286; VI-NEXT:    s_waitcnt vmcnt(0)
287; VI-NEXT:    v_lshrrev_b16_e64 v4, v3, s4
288; VI-NEXT:    v_lshrrev_b16_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
289; VI-NEXT:    v_or_b32_e32 v2, v4, v2
290; VI-NEXT:    flat_store_dword v[0:1], v2
291; VI-NEXT:    s_endpgm
292;
293; CI-LABEL: lshr_s_v_v2i16:
294; CI:       ; %bb.0:
295; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
296; CI-NEXT:    s_load_dword s8, s[4:5], 0xd
297; CI-NEXT:    s_mov_b32 s7, 0xf000
298; CI-NEXT:    s_mov_b32 s6, 0
299; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
300; CI-NEXT:    s_waitcnt lgkmcnt(0)
301; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
302; CI-NEXT:    v_mov_b32_e32 v1, 0
303; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
304; CI-NEXT:    s_lshr_b32 s4, s8, 16
305; CI-NEXT:    s_and_b32 s5, s8, 0xffff
306; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
307; CI-NEXT:    s_waitcnt vmcnt(0)
308; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
309; CI-NEXT:    v_lshr_b32_e32 v3, s4, v3
310; CI-NEXT:    v_lshr_b32_e32 v2, s5, v2
311; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
312; CI-NEXT:    v_or_b32_e32 v2, v2, v3
313; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
314; CI-NEXT:    s_endpgm
315;
316; GFX10-LABEL: lshr_s_v_v2i16:
317; GFX10:       ; %bb.0:
318; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
319; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
320; GFX10-NEXT:    s_load_dword s4, s[4:5], 0x34
321; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
323; GFX10-NEXT:    s_waitcnt vmcnt(0)
324; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v1, s4
325; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
326; GFX10-NEXT:    s_endpgm
327;
328; GFX11-LABEL: lshr_s_v_v2i16:
329; GFX11:       ; %bb.0:
330; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
331; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
332; GFX11-NEXT:    s_load_b32 s4, s[4:5], 0x34
333; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
334; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
335; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
336; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
337; GFX11-NEXT:    s_waitcnt vmcnt(0)
338; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v1, s4
339; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
340; GFX11-NEXT:    s_endpgm
  ; Index %in and %out by the workitem id, sign-extended to i64.
341  %tid = call i32 @llvm.amdgcn.workitem.id.x()
342  %tid.ext = sext i32 %tid to i64
343  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
344  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
345  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  ; Kernel-argument value on the left, loaded shift amount on the right.
346  %result = lshr <2 x i16> %sgpr, %vgpr
347  store <2 x i16> %result, ptr addrspace(1) %out.gep
348  ret void
349}
350
; lshr_imm_v_v2i16 - lshr of the constant vector <8, 8> by a per-lane shift
; amount loaded from element %tid of %in; the result goes to element %tid of
; %out.
; The gfx900, gfx1010 and gfx1100 runs expect the constant as an inline 8 in
; the last source of v_pk_lshrrev_b16 with op_sel_hi:[1,0]. The tonga run
; expects 8 to be copied into a VGPR for the SDWA shift, and the bonaire run
; expects v_lshr_b32 with an immediate 8 for each half.
351define amdgpu_kernel void @lshr_imm_v_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
352; GFX9-LABEL: lshr_imm_v_v2i16:
353; GFX9:       ; %bb.0:
354; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
355; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
356; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
357; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
358; GFX9-NEXT:    s_waitcnt vmcnt(0)
359; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
360; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
361; GFX9-NEXT:    s_endpgm
362;
363; VI-LABEL: lshr_imm_v_v2i16:
364; VI:       ; %bb.0:
365; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
366; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
367; VI-NEXT:    v_mov_b32_e32 v4, 8
368; VI-NEXT:    s_waitcnt lgkmcnt(0)
369; VI-NEXT:    v_mov_b32_e32 v1, s3
370; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
371; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
372; VI-NEXT:    flat_load_dword v3, v[0:1]
373; VI-NEXT:    v_mov_b32_e32 v1, s1
374; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
375; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
376; VI-NEXT:    s_waitcnt vmcnt(0)
377; VI-NEXT:    v_lshrrev_b16_e64 v2, v3, 8
378; VI-NEXT:    v_lshrrev_b16_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
379; VI-NEXT:    v_or_b32_e32 v2, v2, v3
380; VI-NEXT:    flat_store_dword v[0:1], v2
381; VI-NEXT:    s_endpgm
382;
383; CI-LABEL: lshr_imm_v_v2i16:
384; CI:       ; %bb.0:
385; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
386; CI-NEXT:    s_mov_b32 s7, 0xf000
387; CI-NEXT:    s_mov_b32 s6, 0
388; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
389; CI-NEXT:    v_mov_b32_e32 v1, 0
390; CI-NEXT:    s_waitcnt lgkmcnt(0)
391; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
392; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
393; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
394; CI-NEXT:    s_waitcnt vmcnt(0)
395; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
396; CI-NEXT:    v_lshr_b32_e32 v3, 8, v3
397; CI-NEXT:    v_lshr_b32_e32 v2, 8, v2
398; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
399; CI-NEXT:    v_or_b32_e32 v2, v2, v3
400; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
401; CI-NEXT:    s_endpgm
402;
403; GFX10-LABEL: lshr_imm_v_v2i16:
404; GFX10:       ; %bb.0:
405; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
406; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
407; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
408; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
409; GFX10-NEXT:    s_waitcnt vmcnt(0)
410; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
411; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
412; GFX10-NEXT:    s_endpgm
413;
414; GFX11-LABEL: lshr_imm_v_v2i16:
415; GFX11:       ; %bb.0:
416; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
417; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
418; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
419; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
420; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
421; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
422; GFX11-NEXT:    s_waitcnt vmcnt(0)
423; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0]
424; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
425; GFX11-NEXT:    s_endpgm
  ; Index %in and %out by the workitem id, sign-extended to i64.
426  %tid = call i32 @llvm.amdgcn.workitem.id.x()
427  %tid.ext = sext i32 %tid to i64
428  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
429  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
430  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  ; The constant is the value being shifted; the loaded vector is the amount.
431  %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
432  store <2 x i16> %result, ptr addrspace(1) %out.gep
433  ret void
434}
435
; lshr_v_imm_v2i16 - lshr of a <2 x i16> value loaded from element %tid of
; %in by the constant amount 8 in both lanes; the result goes to element %tid
; of %out.
; The gfx900, gfx1010 and gfx1100 runs expect the constant as an inline 8 in
; the first source of v_pk_lshrrev_b16 with op_sel_hi:[0,1]. The tonga and
; bonaire runs contain no 16-bit shift instructions. Tonga expects 32-bit
; shifts combined by an SDWA v_or_b32 that selects BYTE_1, and bonaire expects
; one 32-bit shift by 8 followed by a mask with 0xff00ff.
436define amdgpu_kernel void @lshr_v_imm_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
437; GFX9-LABEL: lshr_v_imm_v2i16:
438; GFX9:       ; %bb.0:
439; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
440; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
441; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
442; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
443; GFX9-NEXT:    s_waitcnt vmcnt(0)
444; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
445; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
446; GFX9-NEXT:    s_endpgm
447;
448; VI-LABEL: lshr_v_imm_v2i16:
449; VI:       ; %bb.0:
450; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
451; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
452; VI-NEXT:    s_waitcnt lgkmcnt(0)
453; VI-NEXT:    v_mov_b32_e32 v1, s3
454; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
455; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
456; VI-NEXT:    flat_load_dword v3, v[0:1]
457; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
458; VI-NEXT:    v_mov_b32_e32 v1, s1
459; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
460; VI-NEXT:    s_waitcnt vmcnt(0)
461; VI-NEXT:    v_lshrrev_b32_e32 v2, 24, v3
462; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
463; VI-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
464; VI-NEXT:    flat_store_dword v[0:1], v2
465; VI-NEXT:    s_endpgm
466;
467; CI-LABEL: lshr_v_imm_v2i16:
468; CI:       ; %bb.0:
469; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
470; CI-NEXT:    s_mov_b32 s7, 0xf000
471; CI-NEXT:    s_mov_b32 s6, 0
472; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
473; CI-NEXT:    v_mov_b32_e32 v1, 0
474; CI-NEXT:    s_waitcnt lgkmcnt(0)
475; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
476; CI-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
477; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
478; CI-NEXT:    s_waitcnt vmcnt(0)
479; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
480; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff, v2
481; CI-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
482; CI-NEXT:    s_endpgm
483;
484; GFX10-LABEL: lshr_v_imm_v2i16:
485; GFX10:       ; %bb.0:
486; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
487; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
488; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
489; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
490; GFX10-NEXT:    s_waitcnt vmcnt(0)
491; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
492; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
493; GFX10-NEXT:    s_endpgm
494;
495; GFX11-LABEL: lshr_v_imm_v2i16:
496; GFX11:       ; %bb.0:
497; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
498; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
499; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
500; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
501; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
503; GFX11-NEXT:    s_waitcnt vmcnt(0)
504; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
505; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
506; GFX11-NEXT:    s_endpgm
  ; Index %in and %out by the workitem id, sign-extended to i64.
507  %tid = call i32 @llvm.amdgcn.workitem.id.x()
508  %tid.ext = sext i32 %tid to i64
509  %in.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
510  %out.gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i64 %tid.ext
511  %vgpr = load <2 x i16>, ptr addrspace(1) %in.gep
  ; The loaded vector is the value being shifted; the constant is the amount.
512  %result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
513  store <2 x i16> %result, ptr addrspace(1) %out.gep
514  ret void
515}
516
; v_lshr_v4i16 - the <4 x i16> counterpart of v_lshr_v2i16. The value is
; element %tid of %in, the shift amount is the element right after it, and
; the result is stored to element %tid of %out.
; In every run's expected output both operands arrive through one 128-bit load.
; The gfx900, gfx1010 and gfx1100 runs expect two v_pk_lshrrev_b16
; instructions, one per 32-bit half of the vector.
517define amdgpu_kernel void @v_lshr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
518; GFX9-LABEL: v_lshr_v4i16:
519; GFX9:       ; %bb.0:
520; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
521; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
522; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
523; GFX9-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
524; GFX9-NEXT:    s_waitcnt vmcnt(0)
525; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
526; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
527; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
528; GFX9-NEXT:    s_endpgm
529;
530; VI-LABEL: v_lshr_v4i16:
531; VI:       ; %bb.0:
532; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
533; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
534; VI-NEXT:    s_waitcnt lgkmcnt(0)
535; VI-NEXT:    v_mov_b32_e32 v1, s3
536; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
537; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
538; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
539; VI-NEXT:    v_mov_b32_e32 v5, s1
540; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
541; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
542; VI-NEXT:    s_waitcnt vmcnt(0)
543; VI-NEXT:    v_lshrrev_b16_e32 v6, v3, v1
544; VI-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
545; VI-NEXT:    v_lshrrev_b16_e32 v3, v2, v0
546; VI-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
547; VI-NEXT:    v_or_b32_e32 v1, v6, v1
548; VI-NEXT:    v_or_b32_e32 v0, v3, v0
549; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
550; VI-NEXT:    s_endpgm
551;
552; CI-LABEL: v_lshr_v4i16:
553; CI:       ; %bb.0:
554; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
555; CI-NEXT:    s_mov_b32 s7, 0xf000
556; CI-NEXT:    s_mov_b32 s6, 0
557; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
558; CI-NEXT:    v_mov_b32_e32 v5, 0
559; CI-NEXT:    s_waitcnt lgkmcnt(0)
560; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
561; CI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
562; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
563; CI-NEXT:    s_waitcnt vmcnt(0)
564; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
565; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
566; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
567; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
568; CI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
569; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
570; CI-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
571; CI-NEXT:    v_lshrrev_b32_e32 v3, v9, v7
572; CI-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
573; CI-NEXT:    v_lshrrev_b32_e32 v2, v8, v6
574; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
575; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
576; CI-NEXT:    v_or_b32_e32 v1, v1, v3
577; CI-NEXT:    v_or_b32_e32 v0, v0, v2
578; CI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
579; CI-NEXT:    s_endpgm
580;
581; GFX10-LABEL: v_lshr_v4i16:
582; GFX10:       ; %bb.0:
583; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
584; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
585; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
586; GFX10-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
587; GFX10-NEXT:    s_waitcnt vmcnt(0)
588; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
589; GFX10-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
590; GFX10-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
591; GFX10-NEXT:    s_endpgm
592;
593; GFX11-LABEL: v_lshr_v4i16:
594; GFX11:       ; %bb.0:
595; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
596; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
597; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
598; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
599; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
600; GFX11-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
601; GFX11-NEXT:    s_waitcnt vmcnt(0)
602; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
603; GFX11-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
604; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
605; GFX11-NEXT:    s_endpgm
  ; Index %in and %out by the workitem id, sign-extended to i64.
606  %tid = call i32 @llvm.amdgcn.workitem.id.x()
607  %tid.ext = sext i32 %tid to i64
608  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
609  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
  ; %b_ptr is one <4 x i16> element past %in.gep; this gep has no inbounds flag.
610  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in.gep, i32 1
611  %a = load <4 x i16>, ptr addrspace(1) %in.gep
612  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
  ; %a is the shifted value and %b the per-lane shift amount.
613  %result = lshr <4 x i16> %a, %b
614  store <4 x i16> %result, ptr addrspace(1) %out.gep
615  ret void
616}
617
; lshr_v_imm_v4i16 - the <4 x i16> counterpart of lshr_v_imm_v2i16. A value
; loaded from element %tid of %in is shifted right by the constant 8 in all
; four lanes and stored to element %tid of %out.
; The gfx900, gfx1010 and gfx1100 runs expect two v_pk_lshrrev_b16
; instructions with an inline 8 and op_sel_hi:[0,1]. The bonaire run expects
; a 32-bit shift by 8 plus a 0xff00ff mask for each dword, and the tonga run
; expects 32-bit shifts combined by SDWA v_or_b32 instructions.
618define amdgpu_kernel void @lshr_v_imm_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
619; GFX9-LABEL: lshr_v_imm_v4i16:
620; GFX9:       ; %bb.0:
621; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
622; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
623; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
624; GFX9-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
625; GFX9-NEXT:    s_waitcnt vmcnt(0)
626; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
627; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
628; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
629; GFX9-NEXT:    s_endpgm
630;
631; VI-LABEL: lshr_v_imm_v4i16:
632; VI:       ; %bb.0:
633; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
634; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
635; VI-NEXT:    s_waitcnt lgkmcnt(0)
636; VI-NEXT:    v_mov_b32_e32 v1, s3
637; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
638; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
639; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
640; VI-NEXT:    v_mov_b32_e32 v3, s1
641; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
642; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
643; VI-NEXT:    s_waitcnt vmcnt(0)
644; VI-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
645; VI-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
646; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
647; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
648; VI-NEXT:    v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
649; VI-NEXT:    v_or_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
650; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
651; VI-NEXT:    s_endpgm
652;
653; CI-LABEL: lshr_v_imm_v4i16:
654; CI:       ; %bb.0:
655; CI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
656; CI-NEXT:    s_mov_b32 s7, 0xf000
657; CI-NEXT:    s_mov_b32 s6, 0
658; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
659; CI-NEXT:    v_mov_b32_e32 v1, 0
660; CI-NEXT:    s_waitcnt lgkmcnt(0)
661; CI-NEXT:    s_mov_b64 s[4:5], s[2:3]
662; CI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
663; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
664; CI-NEXT:    s_waitcnt vmcnt(0)
665; CI-NEXT:    v_lshrrev_b32_e32 v3, 8, v3
666; CI-NEXT:    v_lshrrev_b32_e32 v2, 8, v2
667; CI-NEXT:    v_and_b32_e32 v3, 0xff00ff, v3
668; CI-NEXT:    v_and_b32_e32 v2, 0xff00ff, v2
669; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
670; CI-NEXT:    s_endpgm
671;
672; GFX10-LABEL: lshr_v_imm_v4i16:
673; GFX10:       ; %bb.0:
674; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
675; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
676; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
678; GFX10-NEXT:    s_waitcnt vmcnt(0)
679; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
680; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
681; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
682; GFX10-NEXT:    s_endpgm
683;
684; GFX11-LABEL: lshr_v_imm_v4i16:
685; GFX11:       ; %bb.0:
686; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
687; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
688; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
689; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
690; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
691; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
692; GFX11-NEXT:    s_waitcnt vmcnt(0)
693; GFX11-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
694; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
695; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
696; GFX11-NEXT:    s_endpgm
  ; Index %in and %out by the workitem id, sign-extended to i64.
697  %tid = call i32 @llvm.amdgcn.workitem.id.x()
698  %tid.ext = sext i32 %tid to i64
699  %in.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i64 %tid.ext
700  %out.gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i64 %tid.ext
701  %vgpr = load <4 x i16>, ptr addrspace(1) %in.gep
  ; The loaded vector is the value being shifted; the constant is the amount.
702  %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
703  store <4 x i16> %result, ptr addrspace(1) %out.gep
704  ret void
705}
706
; Intrinsic that the vector-operand kernels in this file call to obtain the
; i32 workitem id used as the element index into %in and %out.
707declare i32 @llvm.amdgcn.workitem.id.x() #1
708
; Attribute group #0 is attached to the kernel definitions in this file and
; group #1 to the intrinsic declaration.
709attributes #0 = { nounwind }
710attributes #1 = { nounwind readnone }
711