xref: /llvm-project/llvm/test/CodeGen/AMDGPU/rotl.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600 %s
3; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
4; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
7
; Scalar 32-bit rotate-left written as the shl/lshr/or idiom:
;   result = (%x << %y) | (%x >> (32 - %y)), stored through %in.
; The autogenerated checks below expect every target to select a single
; bit-align instruction (BIT_ALIGN_INT / v_alignbit_b32) whose shift operand
; is the "32 - y" subtract.
8define amdgpu_kernel void @rotl_i32(ptr addrspace(1) %in, i32 %x, i32 %y) {
9; R600-LABEL: rotl_i32:
10; R600:       ; %bb.0: ; %entry
11; R600-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
12; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
13; R600-NEXT:    CF_END
14; R600-NEXT:    PAD
15; R600-NEXT:    ALU clause starting at 4:
16; R600-NEXT:     SUB_INT * T0.W, literal.x, KC0[2].W,
17; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
18; R600-NEXT:     BIT_ALIGN_INT T0.X, KC0[2].Z, KC0[2].Z, PV.W,
19; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
20; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
21;
22; SI-LABEL: rotl_i32:
23; SI:       ; %bb.0: ; %entry
24; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
25; SI-NEXT:    s_mov_b32 s7, 0xf000
26; SI-NEXT:    s_waitcnt lgkmcnt(0)
27; SI-NEXT:    s_sub_i32 s3, 32, s3
28; SI-NEXT:    s_mov_b32 s6, -1
29; SI-NEXT:    s_mov_b32 s4, s0
30; SI-NEXT:    s_mov_b32 s5, s1
31; SI-NEXT:    v_mov_b32_e32 v0, s3
32; SI-NEXT:    v_alignbit_b32 v0, s2, s2, v0
33; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
34; SI-NEXT:    s_endpgm
35;
36; GFX8-LABEL: rotl_i32:
37; GFX8:       ; %bb.0: ; %entry
38; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
39; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
40; GFX8-NEXT:    s_sub_i32 s3, 32, s3
41; GFX8-NEXT:    v_mov_b32_e32 v0, s3
42; GFX8-NEXT:    v_alignbit_b32 v2, s2, s2, v0
43; GFX8-NEXT:    v_mov_b32_e32 v0, s0
44; GFX8-NEXT:    v_mov_b32_e32 v1, s1
45; GFX8-NEXT:    flat_store_dword v[0:1], v2
46; GFX8-NEXT:    s_endpgm
47;
48; GFX10-LABEL: rotl_i32:
49; GFX10:       ; %bb.0: ; %entry
50; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
51; GFX10-NEXT:    v_mov_b32_e32 v0, 0
52; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
53; GFX10-NEXT:    s_sub_i32 s3, 32, s3
54; GFX10-NEXT:    v_alignbit_b32 v1, s2, s2, s3
55; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
56; GFX10-NEXT:    s_endpgm
57;
58; GFX11-LABEL: rotl_i32:
59; GFX11:       ; %bb.0: ; %entry
60; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
61; GFX11-NEXT:    v_mov_b32_e32 v0, 0
62; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
63; GFX11-NEXT:    s_sub_i32 s3, 32, s3
64; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
65; GFX11-NEXT:    v_alignbit_b32 v1, s2, s2, s3
66; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
67; GFX11-NEXT:    s_endpgm
68entry:
  ; Bits of %x moved up by the rotate amount.
69  %0 = shl i32 %x, %y
  ; Complementary amount used to bring the wrapped-around bits back down.
  ; NOTE(review): for %y == 0 this is 32, and an lshr by the full bit width
  ; is poison per the LangRef - presumably acceptable for a pattern-matching
  ; codegen test; confirm before reusing this idiom elsewhere.
70  %1 = sub i32 32, %y
71  %2 = lshr i32 %x, %1
  ; Combine both halves into the rotated value and write it out.
72  %3 = or i32 %0, %2
73  store i32 %3, ptr addrspace(1) %in
74  ret void
75}
76
; Two-lane vector form of the rotate-left idiom: each i32 lane of %x is
; rotated left by the matching lane of %y via shl / (32 - y) lshr / or, and
; the <2 x i32> result is stored through %in.
; The autogenerated checks below expect one subtract plus one bit-align
; instruction (BIT_ALIGN_INT / v_alignbit_b32) per lane and a single
; two-dword store.
77define amdgpu_kernel void @rotl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
78; R600-LABEL: rotl_v2i32:
79; R600:       ; %bb.0: ; %entry
80; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
81; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
82; R600-NEXT:    CF_END
83; R600-NEXT:    PAD
84; R600-NEXT:    ALU clause starting at 4:
85; R600-NEXT:     SUB_INT * T0.W, literal.x, KC0[3].Z,
86; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
87; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].X, PV.W,
88; R600-NEXT:     SUB_INT * T0.W, literal.x, KC0[3].Y,
89; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
90; R600-NEXT:     BIT_ALIGN_INT T0.X, KC0[2].W, KC0[2].W, PV.W,
91; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
92; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
93;
94; SI-LABEL: rotl_v2i32:
95; SI:       ; %bb.0: ; %entry
96; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
97; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
98; SI-NEXT:    s_mov_b32 s7, 0xf000
99; SI-NEXT:    s_mov_b32 s6, -1
100; SI-NEXT:    s_waitcnt lgkmcnt(0)
101; SI-NEXT:    s_sub_i32 s3, 32, s3
102; SI-NEXT:    s_sub_i32 s2, 32, s2
103; SI-NEXT:    v_mov_b32_e32 v0, s3
104; SI-NEXT:    v_alignbit_b32 v1, s1, s1, v0
105; SI-NEXT:    v_mov_b32_e32 v0, s2
106; SI-NEXT:    v_alignbit_b32 v0, s0, s0, v0
107; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
108; SI-NEXT:    s_endpgm
109;
110; GFX8-LABEL: rotl_v2i32:
111; GFX8:       ; %bb.0: ; %entry
112; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
113; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
114; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
115; GFX8-NEXT:    s_sub_i32 s2, 32, s2
116; GFX8-NEXT:    s_sub_i32 s3, 32, s3
117; GFX8-NEXT:    v_mov_b32_e32 v0, s3
118; GFX8-NEXT:    v_mov_b32_e32 v2, s2
119; GFX8-NEXT:    v_alignbit_b32 v1, s1, s1, v0
120; GFX8-NEXT:    v_alignbit_b32 v0, s0, s0, v2
121; GFX8-NEXT:    v_mov_b32_e32 v2, s4
122; GFX8-NEXT:    v_mov_b32_e32 v3, s5
123; GFX8-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
124; GFX8-NEXT:    s_endpgm
125;
126; GFX10-LABEL: rotl_v2i32:
127; GFX10:       ; %bb.0: ; %entry
128; GFX10-NEXT:    s_clause 0x1
129; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
130; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
131; GFX10-NEXT:    v_mov_b32_e32 v2, 0
132; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
133; GFX10-NEXT:    s_sub_i32 s3, 32, s3
134; GFX10-NEXT:    s_sub_i32 s2, 32, s2
135; GFX10-NEXT:    v_alignbit_b32 v1, s1, s1, s3
136; GFX10-NEXT:    v_alignbit_b32 v0, s0, s0, s2
137; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
138; GFX10-NEXT:    s_endpgm
139;
140; GFX11-LABEL: rotl_v2i32:
141; GFX11:       ; %bb.0: ; %entry
142; GFX11-NEXT:    s_clause 0x1
143; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
144; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
145; GFX11-NEXT:    v_mov_b32_e32 v2, 0
146; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
147; GFX11-NEXT:    s_sub_i32 s3, 32, s3
148; GFX11-NEXT:    s_sub_i32 s2, 32, s2
149; GFX11-NEXT:    v_alignbit_b32 v1, s1, s1, s3
150; GFX11-NEXT:    v_alignbit_b32 v0, s0, s0, s2
151; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
152; GFX11-NEXT:    s_endpgm
153entry:
  ; Per-lane left shift by the rotate amount.
154  %0 = shl <2 x i32> %x, %y
  ; Per-lane complementary amount (32 - y) for the wrapped-around bits.
155  %1 = sub <2 x i32> <i32 32, i32 32>, %y
156  %2 = lshr <2 x i32> %x, %1
  ; Merge the two partial results lane by lane and store the vector.
157  %3 = or <2 x i32> %0, %2
158  store <2 x i32> %3, ptr addrspace(1) %in
159  ret void
160}
161
; Four-lane vector form of the rotate-left idiom: each i32 lane of %x is
; rotated left by the matching lane of %y via shl / (32 - y) lshr / or, and
; the <4 x i32> result is stored through %in.
; The autogenerated checks below expect four subtracts and four bit-align
; instructions (BIT_ALIGN_INT / v_alignbit_b32), followed by one four-dword
; store.
162define amdgpu_kernel void @rotl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
163; R600-LABEL: rotl_v4i32:
164; R600:       ; %bb.0: ; %entry
165; R600-NEXT:    ALU 13, @4, KC0[CB0:0-32], KC1[]
166; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
167; R600-NEXT:    CF_END
168; R600-NEXT:    PAD
169; R600-NEXT:    ALU clause starting at 4:
170; R600-NEXT:     SUB_INT * T0.W, literal.x, KC0[5].X,
171; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
172; R600-NEXT:     BIT_ALIGN_INT T0.W, KC0[4].X, KC0[4].X, PV.W,
173; R600-NEXT:     SUB_INT * T1.W, literal.x, KC0[4].W,
174; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
175; R600-NEXT:     BIT_ALIGN_INT T0.Z, KC0[3].W, KC0[3].W, PS,
176; R600-NEXT:     SUB_INT * T1.W, literal.x, KC0[4].Z,
177; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
178; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].Z, KC0[3].Z, PV.W,
179; R600-NEXT:     SUB_INT * T1.W, literal.x, KC0[4].Y,
180; R600-NEXT:    32(4.484155e-44), 0(0.000000e+00)
181; R600-NEXT:     BIT_ALIGN_INT T0.X, KC0[3].Y, KC0[3].Y, PV.W,
182; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
183; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
184;
185; SI-LABEL: rotl_v4i32:
186; SI:       ; %bb.0: ; %entry
187; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
188; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
189; SI-NEXT:    s_mov_b32 s3, 0xf000
190; SI-NEXT:    s_mov_b32 s2, -1
191; SI-NEXT:    s_waitcnt lgkmcnt(0)
192; SI-NEXT:    s_sub_i32 s4, 32, s12
193; SI-NEXT:    s_sub_i32 s5, 32, s13
194; SI-NEXT:    s_sub_i32 s6, 32, s15
195; SI-NEXT:    s_sub_i32 s7, 32, s14
196; SI-NEXT:    v_mov_b32_e32 v0, s6
197; SI-NEXT:    v_alignbit_b32 v3, s11, s11, v0
198; SI-NEXT:    v_mov_b32_e32 v0, s7
199; SI-NEXT:    v_alignbit_b32 v2, s10, s10, v0
200; SI-NEXT:    v_mov_b32_e32 v0, s5
201; SI-NEXT:    v_alignbit_b32 v1, s9, s9, v0
202; SI-NEXT:    v_mov_b32_e32 v0, s4
203; SI-NEXT:    v_alignbit_b32 v0, s8, s8, v0
204; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
205; SI-NEXT:    s_endpgm
206;
207; GFX8-LABEL: rotl_v4i32:
208; GFX8:       ; %bb.0: ; %entry
209; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
210; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
211; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
212; GFX8-NEXT:    s_sub_i32 s5, 32, s15
213; GFX8-NEXT:    s_sub_i32 s4, 32, s14
214; GFX8-NEXT:    v_mov_b32_e32 v0, s5
215; GFX8-NEXT:    s_sub_i32 s3, 32, s13
216; GFX8-NEXT:    v_alignbit_b32 v3, s11, s11, v0
217; GFX8-NEXT:    v_mov_b32_e32 v0, s4
218; GFX8-NEXT:    s_sub_i32 s2, 32, s12
219; GFX8-NEXT:    v_alignbit_b32 v2, s10, s10, v0
220; GFX8-NEXT:    v_mov_b32_e32 v0, s3
221; GFX8-NEXT:    v_alignbit_b32 v1, s9, s9, v0
222; GFX8-NEXT:    v_mov_b32_e32 v0, s2
223; GFX8-NEXT:    v_mov_b32_e32 v5, s1
224; GFX8-NEXT:    v_alignbit_b32 v0, s8, s8, v0
225; GFX8-NEXT:    v_mov_b32_e32 v4, s0
226; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
227; GFX8-NEXT:    s_endpgm
228;
229; GFX10-LABEL: rotl_v4i32:
230; GFX10:       ; %bb.0: ; %entry
231; GFX10-NEXT:    s_clause 0x1
232; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
233; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
234; GFX10-NEXT:    v_mov_b32_e32 v4, 0
235; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
236; GFX10-NEXT:    s_sub_i32 s2, 32, s12
237; GFX10-NEXT:    s_sub_i32 s3, 32, s13
238; GFX10-NEXT:    s_sub_i32 s4, 32, s15
239; GFX10-NEXT:    s_sub_i32 s5, 32, s14
240; GFX10-NEXT:    v_alignbit_b32 v3, s11, s11, s4
241; GFX10-NEXT:    v_alignbit_b32 v2, s10, s10, s5
242; GFX10-NEXT:    v_alignbit_b32 v1, s9, s9, s3
243; GFX10-NEXT:    v_alignbit_b32 v0, s8, s8, s2
244; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
245; GFX10-NEXT:    s_endpgm
246;
247; GFX11-LABEL: rotl_v4i32:
248; GFX11:       ; %bb.0: ; %entry
249; GFX11-NEXT:    s_clause 0x1
250; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
251; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
252; GFX11-NEXT:    v_mov_b32_e32 v4, 0
253; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
254; GFX11-NEXT:    s_sub_i32 s2, 32, s12
255; GFX11-NEXT:    s_sub_i32 s3, 32, s13
256; GFX11-NEXT:    s_sub_i32 s4, 32, s15
257; GFX11-NEXT:    s_sub_i32 s5, 32, s14
258; GFX11-NEXT:    v_alignbit_b32 v3, s11, s11, s4
259; GFX11-NEXT:    v_alignbit_b32 v2, s10, s10, s5
260; GFX11-NEXT:    v_alignbit_b32 v1, s9, s9, s3
261; GFX11-NEXT:    v_alignbit_b32 v0, s8, s8, s2
262; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
263; GFX11-NEXT:    s_endpgm
264entry:
  ; Per-lane left shift by the rotate amount.
265  %0 = shl <4 x i32> %x, %y
  ; Per-lane complementary amount (32 - y) for the wrapped-around bits.
266  %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
267  %2 = lshr <4 x i32> %x, %1
  ; Merge the two partial results lane by lane and store the vector.
268  %3 = or <4 x i32> %0, %2
269  store <4 x i32> %3, ptr addrspace(1) %in
270  ret void
271}
272
; 16-bit funnel-shift-left intrinsic. The only call in this file passes the
; same value for both data operands, which makes it a 16-bit rotate-left.
273declare i16 @llvm.fshl.i16(i16, i16, i16)
274
; 16-bit rotate-left expressed through the fshl intrinsic rather than the
; shl/lshr/or idiom. This is a regular (non-kernel) function: it loads the
; value from %sourceA[16] and the rotate amount from %sourceB[24], rotates,
; and stores the result to %destValues[4] (all indices in i16 elements).
; The autogenerated checks below show the rotate being expanded into a
; shift-left, a shift-right by the negated amount and an OR: R600 and the
; default amdgcn run use 32-bit shifts with the amounts masked to 15, while
; the tonga, gfx1010 and gfx1100 runs use 16-bit shift instructions.
275define void @test_rotl_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) {
276; R600-LABEL: test_rotl_i16:
277; R600:       ; %bb.0: ; %entry
278; R600-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
279; R600-NEXT:    TEX 0 @8
280; R600-NEXT:    ALU 0, @13, KC0[CB0:0-32], KC1[]
281; R600-NEXT:    TEX 0 @10
282; R600-NEXT:    ALU 21, @14, KC0[CB0:0-32], KC1[]
283; R600-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
284; R600-NEXT:    CF_END
285; R600-NEXT:    PAD
286; R600-NEXT:    Fetch clause starting at 8:
287; R600-NEXT:     VTX_READ_16 T0.X, T0.X, 48, #1
288; R600-NEXT:    Fetch clause starting at 10:
289; R600-NEXT:     VTX_READ_16 T1.X, T1.X, 32, #1
290; R600-NEXT:    ALU clause starting at 12:
291; R600-NEXT:     MOV * T0.X, KC0[2].Z,
292; R600-NEXT:    ALU clause starting at 13:
293; R600-NEXT:     MOV * T1.X, KC0[2].Y,
294; R600-NEXT:    ALU clause starting at 14:
295; R600-NEXT:     SUB_INT T0.W, 0.0, T0.X,
296; R600-NEXT:     AND_INT * T1.W, T0.X, literal.x,
297; R600-NEXT:    15(2.101948e-44), 0(0.000000e+00)
298; R600-NEXT:     AND_INT * T0.W, PV.W, literal.x,
299; R600-NEXT:    15(2.101948e-44), 0(0.000000e+00)
300; R600-NEXT:     LSHR T0.Z, T1.X, PV.W,
301; R600-NEXT:     LSHL T0.W, T1.X, T1.W,
302; R600-NEXT:     ADD_INT * T1.W, KC0[2].W, literal.x,
303; R600-NEXT:    8(1.121039e-44), 0(0.000000e+00)
304; R600-NEXT:     AND_INT T2.W, PS, literal.x,
305; R600-NEXT:     OR_INT * T0.W, PV.W, PV.Z,
306; R600-NEXT:    3(4.203895e-45), 0(0.000000e+00)
307; R600-NEXT:     AND_INT T0.W, PS, literal.x,
308; R600-NEXT:     LSHL * T2.W, PV.W, literal.y,
309; R600-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
310; R600-NEXT:     LSHL T0.X, PV.W, PS,
311; R600-NEXT:     LSHL * T0.W, literal.x, PS,
312; R600-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
313; R600-NEXT:     MOV T0.Y, 0.0,
314; R600-NEXT:     MOV * T0.Z, 0.0,
315; R600-NEXT:     LSHR * T1.X, T1.W, literal.x,
316; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
317;
318; SI-LABEL: test_rotl_i16:
319; SI:       ; %bb.0: ; %entry
320; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
321; SI-NEXT:    s_mov_b32 s6, 0
322; SI-NEXT:    s_mov_b32 s7, 0xf000
323; SI-NEXT:    s_mov_b32 s4, s6
324; SI-NEXT:    s_mov_b32 s5, s6
325; SI-NEXT:    buffer_load_ushort v2, v[2:3], s[4:7], 0 addr64 offset:48
326; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 offset:32
327; SI-NEXT:    s_waitcnt vmcnt(1)
328; SI-NEXT:    v_and_b32_e32 v1, 15, v2
329; SI-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
330; SI-NEXT:    s_waitcnt vmcnt(0)
331; SI-NEXT:    v_lshlrev_b32_e32 v1, v1, v0
332; SI-NEXT:    v_and_b32_e32 v2, 15, v2
333; SI-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
334; SI-NEXT:    v_or_b32_e32 v0, v1, v0
335; SI-NEXT:    buffer_store_short v0, v[4:5], s[4:7], 0 addr64 offset:8
336; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
337; SI-NEXT:    s_setpc_b64 s[30:31]
338;
339; GFX8-LABEL: test_rotl_i16:
340; GFX8:       ; %bb.0: ; %entry
341; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
342; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
343; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
344; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 48, v2
345; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
346; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
347; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
348; GFX8-NEXT:    s_waitcnt vmcnt(0)
349; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v2, v0
350; GFX8-NEXT:    v_sub_u16_e32 v2, 0, v2
351; GFX8-NEXT:    v_lshrrev_b16_e32 v0, v2, v0
352; GFX8-NEXT:    v_or_b32_e32 v2, v1, v0
353; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 8, v4
354; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
355; GFX8-NEXT:    flat_store_short v[0:1], v2
356; GFX8-NEXT:    s_waitcnt vmcnt(0)
357; GFX8-NEXT:    s_setpc_b64 s[30:31]
358;
359; GFX10-LABEL: test_rotl_i16:
360; GFX10:       ; %bb.0: ; %entry
361; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362; GFX10-NEXT:    global_load_ushort v6, v[2:3], off offset:48
363; GFX10-NEXT:    global_load_ushort v7, v[0:1], off offset:32
364; GFX10-NEXT:    s_waitcnt vmcnt(1)
365; GFX10-NEXT:    v_sub_nc_u16 v0, 0, v6
366; GFX10-NEXT:    s_waitcnt vmcnt(0)
367; GFX10-NEXT:    v_lshlrev_b16 v1, v6, v7
368; GFX10-NEXT:    v_lshrrev_b16 v0, v0, v7
369; GFX10-NEXT:    v_or_b32_e32 v0, v1, v0
370; GFX10-NEXT:    global_store_short v[4:5], v0, off offset:8
371; GFX10-NEXT:    s_setpc_b64 s[30:31]
372;
373; GFX11-LABEL: test_rotl_i16:
374; GFX11:       ; %bb.0: ; %entry
375; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
376; GFX11-NEXT:    global_load_u16 v2, v[2:3], off offset:48
377; GFX11-NEXT:    global_load_u16 v0, v[0:1], off offset:32
378; GFX11-NEXT:    s_waitcnt vmcnt(1)
379; GFX11-NEXT:    v_sub_nc_u16 v1, 0, v2
380; GFX11-NEXT:    s_waitcnt vmcnt(0)
381; GFX11-NEXT:    v_lshlrev_b16 v2, v2, v0
382; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
383; GFX11-NEXT:    v_lshrrev_b16 v0, v1, v0
384; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
385; GFX11-NEXT:    global_store_b16 v[4:5], v0, off offset:8
386; GFX11-NEXT:    s_setpc_b64 s[30:31]
387entry:
  ; Value to rotate: element 16 of %sourceA (byte offset 32 in the checks).
388  %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %sourceA, i64 16
389  %a = load i16, ptr addrspace(1) %arrayidx
  ; Rotate amount: element 24 of %sourceB (byte offset 48 in the checks).
390  %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %sourceB, i64 24
391  %b = load i16, ptr addrspace(1) %arrayidx2
  ; Funnel shift with both data operands equal to %a, i.e. rotate %a left by %b.
392  %c = tail call i16 @llvm.fshl.i16(i16 %a, i16 %a, i16 %b)
  ; Result goes to element 4 of %destValues (byte offset 8 in the checks).
393  %arrayidx5 = getelementptr inbounds i16, ptr addrspace(1) %destValues, i64 4
394  store i16 %c, ptr addrspace(1) %arrayidx5
395  ret void
396}
397