1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,SI
3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,VI
4; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX89,GFX9
5; RUN: llc < %s -mtriple=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
6; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX10
7; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -check-prefixes=GFX11
8
9declare i32 @llvm.fshr.i32(i32, i32, i32)
10declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>)
11declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>)
12declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
13declare i16 @llvm.fshr.i16(i16, i16, i16)
14declare <2 x i16> @llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>)
15declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>)
16declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>)
17declare i64 @llvm.fshr.i64(i64, i64, i64)
18declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
19declare i24 @llvm.fshr.i24(i24, i24, i24)
20declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>)
21
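; A variable-amount funnel shift of i32 selects to a single v_alignbit_b32 (BIT_ALIGN_INT on R600).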
22define amdgpu_kernel void @fshr_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) {
23; SI-LABEL: fshr_i32:
24; SI:       ; %bb.0: ; %entry
25; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
26; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
27; SI-NEXT:    s_mov_b32 s7, 0xf000
28; SI-NEXT:    s_mov_b32 s6, -1
29; SI-NEXT:    s_waitcnt lgkmcnt(0)
30; SI-NEXT:    v_mov_b32_e32 v0, s1
31; SI-NEXT:    v_mov_b32_e32 v1, s2
32; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v1
33; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: fshr_i32:
37; VI:       ; %bb.0: ; %entry
38; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
39; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
40; VI-NEXT:    s_waitcnt lgkmcnt(0)
41; VI-NEXT:    v_mov_b32_e32 v0, s1
42; VI-NEXT:    v_mov_b32_e32 v1, s2
43; VI-NEXT:    v_alignbit_b32 v2, s0, v0, v1
44; VI-NEXT:    v_mov_b32_e32 v0, s4
45; VI-NEXT:    v_mov_b32_e32 v1, s5
46; VI-NEXT:    flat_store_dword v[0:1], v2
47; VI-NEXT:    s_endpgm
48;
49; GFX9-LABEL: fshr_i32:
50; GFX9:       ; %bb.0: ; %entry
51; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
52; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
53; GFX9-NEXT:    v_mov_b32_e32 v0, 0
54; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
55; GFX9-NEXT:    v_mov_b32_e32 v1, s1
56; GFX9-NEXT:    v_mov_b32_e32 v2, s2
57; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, v2
58; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
59; GFX9-NEXT:    s_endpgm
60;
61; R600-LABEL: fshr_i32:
62; R600:       ; %bb.0: ; %entry
63; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
64; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
65; R600-NEXT:    CF_END
66; R600-NEXT:    PAD
67; R600-NEXT:    ALU clause starting at 4:
68; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
69; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
70; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, KC0[3].X,
71;
72; GFX10-LABEL: fshr_i32:
73; GFX10:       ; %bb.0: ; %entry
74; GFX10-NEXT:    s_clause 0x1
75; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
76; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
77; GFX10-NEXT:    v_mov_b32_e32 v1, 0
78; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
79; GFX10-NEXT:    v_mov_b32_e32 v0, s2
80; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, v0
81; GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
82; GFX10-NEXT:    s_endpgm
83;
84; GFX11-LABEL: fshr_i32:
85; GFX11:       ; %bb.0: ; %entry
86; GFX11-NEXT:    s_clause 0x1
87; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
88; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
89; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
90; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
91; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
92; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, v0
93; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
94; GFX11-NEXT:    s_endpgm
95entry:
96  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
97  store i32 %0, ptr addrspace(1) %in
98  ret void
99}
100
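; A constant shift amount is folded directly into the immediate operand of v_alignbit_b32.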
101define amdgpu_kernel void @fshr_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
102; SI-LABEL: fshr_i32_imm:
103; SI:       ; %bb.0: ; %entry
104; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
105; SI-NEXT:    s_mov_b32 s7, 0xf000
106; SI-NEXT:    s_mov_b32 s6, -1
107; SI-NEXT:    s_waitcnt lgkmcnt(0)
108; SI-NEXT:    v_mov_b32_e32 v0, s3
109; SI-NEXT:    s_mov_b32 s4, s0
110; SI-NEXT:    s_mov_b32 s5, s1
111; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 7
112; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
113; SI-NEXT:    s_endpgm
114;
115; VI-LABEL: fshr_i32_imm:
116; VI:       ; %bb.0: ; %entry
117; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
118; VI-NEXT:    s_waitcnt lgkmcnt(0)
119; VI-NEXT:    v_mov_b32_e32 v0, s3
120; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 7
121; VI-NEXT:    v_mov_b32_e32 v0, s0
122; VI-NEXT:    v_mov_b32_e32 v1, s1
123; VI-NEXT:    flat_store_dword v[0:1], v2
124; VI-NEXT:    s_endpgm
125;
126; GFX9-LABEL: fshr_i32_imm:
127; GFX9:       ; %bb.0: ; %entry
128; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
129; GFX9-NEXT:    v_mov_b32_e32 v0, 0
130; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
131; GFX9-NEXT:    v_mov_b32_e32 v1, s3
132; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 7
133; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
134; GFX9-NEXT:    s_endpgm
135;
136; R600-LABEL: fshr_i32_imm:
137; R600:       ; %bb.0: ; %entry
138; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
139; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
140; R600-NEXT:    CF_END
141; R600-NEXT:    PAD
142; R600-NEXT:    ALU clause starting at 4:
143; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
144; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
145; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
146; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
147;
148; GFX10-LABEL: fshr_i32_imm:
149; GFX10:       ; %bb.0: ; %entry
150; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
151; GFX10-NEXT:    v_mov_b32_e32 v0, 0
152; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
153; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 7
154; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
155; GFX10-NEXT:    s_endpgm
156;
157; GFX11-LABEL: fshr_i32_imm:
158; GFX11:       ; %bb.0: ; %entry
159; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
160; GFX11-NEXT:    v_mov_b32_e32 v0, 0
161; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
162; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 7
163; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
164; GFX11-NEXT:    s_endpgm
165entry:
166  %0 = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 7)
167  store i32 %0, ptr addrspace(1) %in
168  ret void
169}
170
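; Vector funnel shifts are scalarized into one v_alignbit_b32 per element.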
171define amdgpu_kernel void @fshr_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
172; SI-LABEL: fshr_v2i32:
173; SI:       ; %bb.0: ; %entry
174; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
175; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0xf
176; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
177; SI-NEXT:    s_mov_b32 s7, 0xf000
178; SI-NEXT:    s_mov_b32 s6, -1
179; SI-NEXT:    s_waitcnt lgkmcnt(0)
180; SI-NEXT:    v_mov_b32_e32 v0, s3
181; SI-NEXT:    v_mov_b32_e32 v1, s9
182; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
183; SI-NEXT:    v_mov_b32_e32 v0, s2
184; SI-NEXT:    v_mov_b32_e32 v2, s8
185; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
186; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
187; SI-NEXT:    s_endpgm
188;
189; VI-LABEL: fshr_v2i32:
190; VI:       ; %bb.0: ; %entry
191; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
192; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
193; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
194; VI-NEXT:    s_waitcnt lgkmcnt(0)
195; VI-NEXT:    v_mov_b32_e32 v0, s3
196; VI-NEXT:    v_mov_b32_e32 v1, s7
197; VI-NEXT:    v_mov_b32_e32 v2, s2
198; VI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
199; VI-NEXT:    v_mov_b32_e32 v0, s6
200; VI-NEXT:    v_alignbit_b32 v0, s0, v2, v0
201; VI-NEXT:    v_mov_b32_e32 v2, s4
202; VI-NEXT:    v_mov_b32_e32 v3, s5
203; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
204; VI-NEXT:    s_endpgm
205;
206; GFX9-LABEL: fshr_v2i32:
207; GFX9:       ; %bb.0: ; %entry
208; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
209; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
210; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
211; GFX9-NEXT:    v_mov_b32_e32 v2, 0
212; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
213; GFX9-NEXT:    v_mov_b32_e32 v0, s3
214; GFX9-NEXT:    v_mov_b32_e32 v1, s7
215; GFX9-NEXT:    v_alignbit_b32 v1, s1, v0, v1
216; GFX9-NEXT:    v_mov_b32_e32 v0, s2
217; GFX9-NEXT:    v_mov_b32_e32 v3, s6
218; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
219; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
220; GFX9-NEXT:    s_endpgm
221;
222; R600-LABEL: fshr_v2i32:
223; R600:       ; %bb.0: ; %entry
224; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
225; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
226; R600-NEXT:    CF_END
227; R600-NEXT:    PAD
228; R600-NEXT:    ALU clause starting at 4:
229; R600-NEXT:     MOV * T0.W, KC0[4].X,
230; R600-NEXT:     BIT_ALIGN_INT T0.Y, KC0[3].X, KC0[3].Z, PV.W,
231; R600-NEXT:     MOV * T0.W, KC0[3].W,
232; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, PV.W,
233; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
234; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
235;
236; GFX10-LABEL: fshr_v2i32:
237; GFX10:       ; %bb.0: ; %entry
238; GFX10-NEXT:    s_clause 0x2
239; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
240; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
241; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
242; GFX10-NEXT:    v_mov_b32_e32 v3, 0
243; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
244; GFX10-NEXT:    v_mov_b32_e32 v0, s7
245; GFX10-NEXT:    v_mov_b32_e32 v2, s6
246; GFX10-NEXT:    v_alignbit_b32 v1, s1, s3, v0
247; GFX10-NEXT:    v_alignbit_b32 v0, s0, s2, v2
248; GFX10-NEXT:    global_store_dwordx2 v3, v[0:1], s[8:9]
249; GFX10-NEXT:    s_endpgm
250;
251; GFX11-LABEL: fshr_v2i32:
252; GFX11:       ; %bb.0: ; %entry
253; GFX11-NEXT:    s_clause 0x2
254; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
255; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
256; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
257; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
258; GFX11-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s7
259; GFX11-NEXT:    v_mov_b32_e32 v2, s6
260; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
261; GFX11-NEXT:    v_alignbit_b32 v1, s1, s3, v0
262; GFX11-NEXT:    v_alignbit_b32 v0, s0, s2, v2
263; GFX11-NEXT:    global_store_b64 v3, v[0:1], s[4:5]
264; GFX11-NEXT:    s_endpgm
265entry:
266  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
267  store <2 x i32> %0, ptr addrspace(1) %in
268  ret void
269}
270
271define amdgpu_kernel void @fshr_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
272; SI-LABEL: fshr_v2i32_imm:
273; SI:       ; %bb.0: ; %entry
274; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
275; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
276; SI-NEXT:    s_mov_b32 s7, 0xf000
277; SI-NEXT:    s_mov_b32 s6, -1
278; SI-NEXT:    s_waitcnt lgkmcnt(0)
279; SI-NEXT:    v_mov_b32_e32 v0, s3
280; SI-NEXT:    v_mov_b32_e32 v2, s2
281; SI-NEXT:    v_alignbit_b32 v1, s1, v0, 9
282; SI-NEXT:    v_alignbit_b32 v0, s0, v2, 7
283; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
284; SI-NEXT:    s_endpgm
285;
286; VI-LABEL: fshr_v2i32_imm:
287; VI:       ; %bb.0: ; %entry
288; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
289; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
290; VI-NEXT:    s_waitcnt lgkmcnt(0)
291; VI-NEXT:    v_mov_b32_e32 v0, s3
292; VI-NEXT:    v_mov_b32_e32 v2, s2
293; VI-NEXT:    v_alignbit_b32 v1, s1, v0, 9
294; VI-NEXT:    v_alignbit_b32 v0, s0, v2, 7
295; VI-NEXT:    v_mov_b32_e32 v2, s4
296; VI-NEXT:    v_mov_b32_e32 v3, s5
297; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
298; VI-NEXT:    s_endpgm
299;
300; GFX9-LABEL: fshr_v2i32_imm:
301; GFX9:       ; %bb.0: ; %entry
302; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
303; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
304; GFX9-NEXT:    v_mov_b32_e32 v2, 0
305; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
306; GFX9-NEXT:    v_mov_b32_e32 v0, s3
307; GFX9-NEXT:    v_mov_b32_e32 v3, s2
308; GFX9-NEXT:    v_alignbit_b32 v1, s1, v0, 9
309; GFX9-NEXT:    v_alignbit_b32 v0, s0, v3, 7
310; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
311; GFX9-NEXT:    s_endpgm
312;
313; R600-LABEL: fshr_v2i32_imm:
314; R600:       ; %bb.0: ; %entry
315; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
316; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
317; R600-NEXT:    CF_END
318; R600-NEXT:    PAD
319; R600-NEXT:    ALU clause starting at 4:
320; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
321; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
322; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
323; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
324; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
325; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
326;
327; GFX10-LABEL: fshr_v2i32_imm:
328; GFX10:       ; %bb.0: ; %entry
329; GFX10-NEXT:    s_clause 0x1
330; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
331; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
332; GFX10-NEXT:    v_mov_b32_e32 v2, 0
333; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
334; GFX10-NEXT:    v_alignbit_b32 v1, s1, s3, 9
335; GFX10-NEXT:    v_alignbit_b32 v0, s0, s2, 7
336; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
337; GFX10-NEXT:    s_endpgm
338;
339; GFX11-LABEL: fshr_v2i32_imm:
340; GFX11:       ; %bb.0: ; %entry
341; GFX11-NEXT:    s_clause 0x1
342; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
343; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
344; GFX11-NEXT:    v_mov_b32_e32 v2, 0
345; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
346; GFX11-NEXT:    v_alignbit_b32 v1, s1, s3, 9
347; GFX11-NEXT:    v_alignbit_b32 v0, s0, s2, 7
348; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
349; GFX11-NEXT:    s_endpgm
350entry:
351  %0 = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
352  store <2 x i32> %0, ptr addrspace(1) %in
353  ret void
354}
355
356define amdgpu_kernel void @fshr_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
357; SI-LABEL: fshr_v4i32:
358; SI:       ; %bb.0: ; %entry
359; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
360; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x15
361; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
362; SI-NEXT:    s_mov_b32 s7, 0xf000
363; SI-NEXT:    s_mov_b32 s6, -1
364; SI-NEXT:    s_waitcnt lgkmcnt(0)
365; SI-NEXT:    v_mov_b32_e32 v0, s15
366; SI-NEXT:    v_mov_b32_e32 v1, s3
367; SI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
368; SI-NEXT:    v_mov_b32_e32 v0, s14
369; SI-NEXT:    v_mov_b32_e32 v1, s2
370; SI-NEXT:    v_alignbit_b32 v2, s10, v0, v1
371; SI-NEXT:    v_mov_b32_e32 v0, s13
372; SI-NEXT:    v_mov_b32_e32 v1, s1
373; SI-NEXT:    v_alignbit_b32 v1, s9, v0, v1
374; SI-NEXT:    v_mov_b32_e32 v0, s12
375; SI-NEXT:    v_mov_b32_e32 v4, s0
376; SI-NEXT:    v_alignbit_b32 v0, s8, v0, v4
377; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
378; SI-NEXT:    s_endpgm
379;
380; VI-LABEL: fshr_v4i32:
381; VI:       ; %bb.0: ; %entry
382; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
383; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
384; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
385; VI-NEXT:    s_waitcnt lgkmcnt(0)
386; VI-NEXT:    v_mov_b32_e32 v0, s15
387; VI-NEXT:    v_mov_b32_e32 v1, s3
388; VI-NEXT:    v_mov_b32_e32 v2, s14
389; VI-NEXT:    v_alignbit_b32 v3, s11, v0, v1
390; VI-NEXT:    v_mov_b32_e32 v0, s2
391; VI-NEXT:    v_alignbit_b32 v2, s10, v2, v0
392; VI-NEXT:    v_mov_b32_e32 v0, s13
393; VI-NEXT:    v_mov_b32_e32 v1, s1
394; VI-NEXT:    v_alignbit_b32 v1, s9, v0, v1
395; VI-NEXT:    v_mov_b32_e32 v0, s12
396; VI-NEXT:    v_mov_b32_e32 v4, s0
397; VI-NEXT:    v_alignbit_b32 v0, s8, v0, v4
398; VI-NEXT:    v_mov_b32_e32 v4, s4
399; VI-NEXT:    v_mov_b32_e32 v5, s5
400; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
401; VI-NEXT:    s_endpgm
402;
403; GFX9-LABEL: fshr_v4i32:
404; GFX9:       ; %bb.0: ; %entry
405; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
406; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
407; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
408; GFX9-NEXT:    v_mov_b32_e32 v4, 0
409; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
410; GFX9-NEXT:    v_mov_b32_e32 v0, s15
411; GFX9-NEXT:    v_mov_b32_e32 v1, s3
412; GFX9-NEXT:    v_alignbit_b32 v3, s11, v0, v1
413; GFX9-NEXT:    v_mov_b32_e32 v0, s14
414; GFX9-NEXT:    v_mov_b32_e32 v1, s2
415; GFX9-NEXT:    v_alignbit_b32 v2, s10, v0, v1
416; GFX9-NEXT:    v_mov_b32_e32 v0, s13
417; GFX9-NEXT:    v_mov_b32_e32 v1, s1
418; GFX9-NEXT:    v_alignbit_b32 v1, s9, v0, v1
419; GFX9-NEXT:    v_mov_b32_e32 v0, s12
420; GFX9-NEXT:    v_mov_b32_e32 v5, s0
421; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, v5
422; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
423; GFX9-NEXT:    s_endpgm
424;
425; R600-LABEL: fshr_v4i32:
426; R600:       ; %bb.0: ; %entry
427; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
428; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
429; R600-NEXT:    CF_END
430; R600-NEXT:    PAD
431; R600-NEXT:    ALU clause starting at 4:
432; R600-NEXT:     MOV * T0.W, KC0[6].X,
433; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, PV.W,
434; R600-NEXT:     MOV * T1.W, KC0[5].W,
435; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, PV.W,
436; R600-NEXT:     MOV * T1.W, KC0[5].Z,
437; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, PV.W,
438; R600-NEXT:     MOV * T1.W, KC0[5].Y,
439; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, PV.W,
440; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
441; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
442;
443; GFX10-LABEL: fshr_v4i32:
444; GFX10:       ; %bb.0: ; %entry
445; GFX10-NEXT:    s_clause 0x2
446; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
447; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
448; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
449; GFX10-NEXT:    v_mov_b32_e32 v6, 0
450; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
451; GFX10-NEXT:    v_mov_b32_e32 v0, s3
452; GFX10-NEXT:    v_mov_b32_e32 v1, s2
453; GFX10-NEXT:    v_mov_b32_e32 v4, s1
454; GFX10-NEXT:    v_mov_b32_e32 v5, s0
455; GFX10-NEXT:    v_alignbit_b32 v3, s11, s15, v0
456; GFX10-NEXT:    v_alignbit_b32 v2, s10, s14, v1
457; GFX10-NEXT:    v_alignbit_b32 v1, s9, s13, v4
458; GFX10-NEXT:    v_alignbit_b32 v0, s8, s12, v5
459; GFX10-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
460; GFX10-NEXT:    s_endpgm
461;
462; GFX11-LABEL: fshr_v4i32:
463; GFX11:       ; %bb.0: ; %entry
464; GFX11-NEXT:    s_clause 0x2
465; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54
466; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
467; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
468; GFX11-NEXT:    v_mov_b32_e32 v6, 0
469; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
470; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
471; GFX11-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
472; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
473; GFX11-NEXT:    v_alignbit_b32 v3, s11, s15, v0
474; GFX11-NEXT:    v_alignbit_b32 v2, s10, s14, v1
475; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
476; GFX11-NEXT:    v_alignbit_b32 v1, s9, s13, v4
477; GFX11-NEXT:    v_alignbit_b32 v0, s8, s12, v5
478; GFX11-NEXT:    global_store_b128 v6, v[0:3], s[4:5]
479; GFX11-NEXT:    s_endpgm
480entry:
481  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
482  store <4 x i32> %0, ptr addrspace(1) %in
483  ret void
484}
485
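; The out-of-range amount 33 is reduced modulo the bit width and emitted as 1.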
486define amdgpu_kernel void @fshr_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
487; SI-LABEL: fshr_v4i32_imm:
488; SI:       ; %bb.0: ; %entry
489; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
490; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
491; SI-NEXT:    s_mov_b32 s3, 0xf000
492; SI-NEXT:    s_mov_b32 s2, -1
493; SI-NEXT:    s_waitcnt lgkmcnt(0)
494; SI-NEXT:    v_mov_b32_e32 v0, s15
495; SI-NEXT:    v_mov_b32_e32 v1, s14
496; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
497; SI-NEXT:    v_mov_b32_e32 v0, s13
498; SI-NEXT:    v_alignbit_b32 v2, s10, v1, 9
499; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 7
500; SI-NEXT:    v_mov_b32_e32 v0, s12
501; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
502; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
503; SI-NEXT:    s_endpgm
504;
505; VI-LABEL: fshr_v4i32_imm:
506; VI:       ; %bb.0: ; %entry
507; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
508; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
509; VI-NEXT:    s_waitcnt lgkmcnt(0)
510; VI-NEXT:    v_mov_b32_e32 v0, s15
511; VI-NEXT:    v_mov_b32_e32 v1, s14
512; VI-NEXT:    v_mov_b32_e32 v4, s13
513; VI-NEXT:    v_alignbit_b32 v3, s11, v0, 1
514; VI-NEXT:    v_alignbit_b32 v2, s10, v1, 9
515; VI-NEXT:    v_alignbit_b32 v1, s9, v4, 7
516; VI-NEXT:    v_mov_b32_e32 v0, s12
517; VI-NEXT:    v_mov_b32_e32 v5, s1
518; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
519; VI-NEXT:    v_mov_b32_e32 v4, s0
520; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
521; VI-NEXT:    s_endpgm
522;
523; GFX9-LABEL: fshr_v4i32_imm:
524; GFX9:       ; %bb.0: ; %entry
525; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
526; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
527; GFX9-NEXT:    v_mov_b32_e32 v4, 0
528; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
529; GFX9-NEXT:    v_mov_b32_e32 v0, s15
530; GFX9-NEXT:    v_mov_b32_e32 v1, s14
531; GFX9-NEXT:    v_alignbit_b32 v3, s11, v0, 1
532; GFX9-NEXT:    v_mov_b32_e32 v0, s13
533; GFX9-NEXT:    v_alignbit_b32 v2, s10, v1, 9
534; GFX9-NEXT:    v_alignbit_b32 v1, s9, v0, 7
535; GFX9-NEXT:    v_mov_b32_e32 v0, s12
536; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 1
537; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
538; GFX9-NEXT:    s_endpgm
539;
540; R600-LABEL: fshr_v4i32_imm:
541; R600:       ; %bb.0: ; %entry
542; R600-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
543; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
544; R600-NEXT:    CF_END
545; R600-NEXT:    PAD
546; R600-NEXT:    ALU clause starting at 4:
547; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
548; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
549; R600-NEXT:    9(1.261169e-44), 0(0.000000e+00)
550; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
551; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
552; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, 1,
553; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
554; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
555;
556; GFX10-LABEL: fshr_v4i32_imm:
557; GFX10:       ; %bb.0: ; %entry
558; GFX10-NEXT:    s_clause 0x1
559; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
560; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
561; GFX10-NEXT:    v_mov_b32_e32 v4, 0
562; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
563; GFX10-NEXT:    v_alignbit_b32 v3, s11, s15, 1
564; GFX10-NEXT:    v_alignbit_b32 v2, s10, s14, 9
565; GFX10-NEXT:    v_alignbit_b32 v1, s9, s13, 7
566; GFX10-NEXT:    v_alignbit_b32 v0, s8, s12, 1
567; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
568; GFX10-NEXT:    s_endpgm
569;
570; GFX11-LABEL: fshr_v4i32_imm:
571; GFX11:       ; %bb.0: ; %entry
572; GFX11-NEXT:    s_clause 0x1
573; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
574; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
575; GFX11-NEXT:    v_mov_b32_e32 v4, 0
576; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
577; GFX11-NEXT:    v_alignbit_b32 v3, s11, s15, 1
578; GFX11-NEXT:    v_alignbit_b32 v2, s10, s14, 9
579; GFX11-NEXT:    v_alignbit_b32 v1, s9, s13, 7
580; GFX11-NEXT:    v_alignbit_b32 v0, s8, s12, 1
581; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
582; GFX11-NEXT:    s_endpgm
583entry:
584  %0 = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
585  store <4 x i32> %0, ptr addrspace(1) %in
586  ret void
587}
588
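; Non-kernel variants with all operands in VGPRs: i32 funnel shifts are still a single v_alignbit_b32 per element.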
589define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) {
590; GFX89-LABEL: v_fshr_i32:
591; GFX89:       ; %bb.0:
592; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
593; GFX89-NEXT:    v_alignbit_b32 v0, v0, v1, v2
594; GFX89-NEXT:    s_setpc_b64 s[30:31]
595;
596; R600-LABEL: v_fshr_i32:
597; R600:       ; %bb.0:
598; R600-NEXT:    CF_END
599; R600-NEXT:    PAD
600;
601; GFX10-LABEL: v_fshr_i32:
602; GFX10:       ; %bb.0:
603; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
604; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
605; GFX10-NEXT:    s_setpc_b64 s[30:31]
606;
607; GFX11-LABEL: v_fshr_i32:
608; GFX11:       ; %bb.0:
609; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
610; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
611; GFX11-NEXT:    s_setpc_b64 s[30:31]
612  %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2)
613  ret i32 %ret
614}
615
616define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) {
617; GFX89-LABEL: v_fshr_v2i32:
618; GFX89:       ; %bb.0:
619; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
620; GFX89-NEXT:    v_alignbit_b32 v0, v0, v2, v4
621; GFX89-NEXT:    v_alignbit_b32 v1, v1, v3, v5
622; GFX89-NEXT:    s_setpc_b64 s[30:31]
623;
624; R600-LABEL: v_fshr_v2i32:
625; R600:       ; %bb.0:
626; R600-NEXT:    CF_END
627; R600-NEXT:    PAD
628;
629; GFX10-LABEL: v_fshr_v2i32:
630; GFX10:       ; %bb.0:
631; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
632; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
633; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
634; GFX10-NEXT:    s_setpc_b64 s[30:31]
635;
636; GFX11-LABEL: v_fshr_v2i32:
637; GFX11:       ; %bb.0:
638; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
639; GFX11-NEXT:    v_alignbit_b32 v0, v0, v2, v4
640; GFX11-NEXT:    v_alignbit_b32 v1, v1, v3, v5
641; GFX11-NEXT:    s_setpc_b64 s[30:31]
642  %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2)
643  ret <2 x i32> %ret
644}
645
646define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) {
647; GFX89-LABEL: v_fshr_v3i32:
648; GFX89:       ; %bb.0:
649; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
650; GFX89-NEXT:    v_alignbit_b32 v0, v0, v3, v6
651; GFX89-NEXT:    v_alignbit_b32 v1, v1, v4, v7
652; GFX89-NEXT:    v_alignbit_b32 v2, v2, v5, v8
653; GFX89-NEXT:    s_setpc_b64 s[30:31]
654;
655; R600-LABEL: v_fshr_v3i32:
656; R600:       ; %bb.0:
657; R600-NEXT:    CF_END
658; R600-NEXT:    PAD
659;
660; GFX10-LABEL: v_fshr_v3i32:
661; GFX10:       ; %bb.0:
662; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
663; GFX10-NEXT:    v_alignbit_b32 v0, v0, v3, v6
664; GFX10-NEXT:    v_alignbit_b32 v1, v1, v4, v7
665; GFX10-NEXT:    v_alignbit_b32 v2, v2, v5, v8
666; GFX10-NEXT:    s_setpc_b64 s[30:31]
667;
668; GFX11-LABEL: v_fshr_v3i32:
669; GFX11:       ; %bb.0:
670; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
671; GFX11-NEXT:    v_alignbit_b32 v0, v0, v3, v6
672; GFX11-NEXT:    v_alignbit_b32 v1, v1, v4, v7
673; GFX11-NEXT:    v_alignbit_b32 v2, v2, v5, v8
674; GFX11-NEXT:    s_setpc_b64 s[30:31]
675  %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2)
676  ret <3 x i32> %ret
677}
678
679define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) {
680; GFX89-LABEL: v_fshr_v4i32:
681; GFX89:       ; %bb.0:
682; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
683; GFX89-NEXT:    v_alignbit_b32 v0, v0, v4, v8
684; GFX89-NEXT:    v_alignbit_b32 v1, v1, v5, v9
685; GFX89-NEXT:    v_alignbit_b32 v2, v2, v6, v10
686; GFX89-NEXT:    v_alignbit_b32 v3, v3, v7, v11
687; GFX89-NEXT:    s_setpc_b64 s[30:31]
688;
689; R600-LABEL: v_fshr_v4i32:
690; R600:       ; %bb.0:
691; R600-NEXT:    CF_END
692; R600-NEXT:    PAD
693;
694; GFX10-LABEL: v_fshr_v4i32:
695; GFX10:       ; %bb.0:
696; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697; GFX10-NEXT:    v_alignbit_b32 v0, v0, v4, v8
698; GFX10-NEXT:    v_alignbit_b32 v1, v1, v5, v9
699; GFX10-NEXT:    v_alignbit_b32 v2, v2, v6, v10
700; GFX10-NEXT:    v_alignbit_b32 v3, v3, v7, v11
701; GFX10-NEXT:    s_setpc_b64 s[30:31]
702;
703; GFX11-LABEL: v_fshr_v4i32:
704; GFX11:       ; %bb.0:
705; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
706; GFX11-NEXT:    v_alignbit_b32 v0, v0, v4, v8
707; GFX11-NEXT:    v_alignbit_b32 v1, v1, v5, v9
708; GFX11-NEXT:    v_alignbit_b32 v2, v2, v6, v10
709; GFX11-NEXT:    v_alignbit_b32 v3, v3, v7, v11
710; GFX11-NEXT:    s_setpc_b64 s[30:31]
711  %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2)
712  ret <4 x i32> %ret
713}
714
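; There is no 16-bit alignbit: SI adjusts the operands and reuses the 32-bit v_alignbit_b32, while VI and later expand to 16-bit shifts and an OR.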
715define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) {
716; SI-LABEL: v_fshr_i16:
717; SI:       ; %bb.0:
718; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
719; SI-NEXT:    v_or_b32_e32 v2, 16, v2
720; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
721; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
722; SI-NEXT:    s_setpc_b64 s[30:31]
723;
724; VI-LABEL: v_fshr_i16:
725; VI:       ; %bb.0:
726; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
727; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
728; VI-NEXT:    v_xor_b32_e32 v3, -1, v2
729; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
730; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
731; VI-NEXT:    v_or_b32_e32 v0, v0, v1
732; VI-NEXT:    s_setpc_b64 s[30:31]
733;
734; GFX9-LABEL: v_fshr_i16:
735; GFX9:       ; %bb.0:
736; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
737; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
738; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
739; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
740; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
741; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
742; GFX9-NEXT:    s_setpc_b64 s[30:31]
743;
744; R600-LABEL: v_fshr_i16:
745; R600:       ; %bb.0:
746; R600-NEXT:    CF_END
747; R600-NEXT:    PAD
748;
749; GFX10-LABEL: v_fshr_i16:
750; GFX10:       ; %bb.0:
751; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
752; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
753; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
754; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
755; GFX10-NEXT:    v_lshlrev_b16 v0, v3, v0
756; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
757; GFX10-NEXT:    s_setpc_b64 s[30:31]
758;
759; GFX11-LABEL: v_fshr_i16:
760; GFX11:       ; %bb.0:
761; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
762; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
763; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
764; GFX11-NEXT:    v_lshrrev_b16 v1, v2, v1
765; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
766; GFX11-NEXT:    v_lshlrev_b16 v0, v3, v0
767; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
768; GFX11-NEXT:    s_setpc_b64 s[30:31]
769  %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2)
770  ret i16 %ret
771}
772
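; For <2 x i16>, GFX9 and later use packed 16-bit shifts with the per-lane amounts masked to 0xf; VI handles the high halves with SDWA operands.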
773define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) {
774; SI-LABEL: v_fshr_v2i16:
775; SI:       ; %bb.0:
776; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
777; SI-NEXT:    v_or_b32_e32 v5, 16, v5
778; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
779; SI-NEXT:    v_or_b32_e32 v4, 16, v4
780; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
781; SI-NEXT:    v_alignbit_b32 v1, v1, v3, v5
782; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
783; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
784; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
785; SI-NEXT:    v_or_b32_e32 v0, v0, v3
786; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
787; SI-NEXT:    s_setpc_b64 s[30:31]
788;
789; VI-LABEL: v_fshr_v2i16:
790; VI:       ; %bb.0:
791; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
792; VI-NEXT:    v_mov_b32_e32 v4, 1
793; VI-NEXT:    v_mov_b32_e32 v5, -1
794; VI-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
795; VI-NEXT:    v_xor_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
796; VI-NEXT:    v_lshrrev_b16_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
797; VI-NEXT:    v_lshlrev_b16_e32 v4, v5, v4
798; VI-NEXT:    v_or_b32_sdwa v3, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
799; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
800; VI-NEXT:    v_xor_b32_e32 v4, -1, v2
801; VI-NEXT:    v_lshlrev_b16_e32 v0, v4, v0
802; VI-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
803; VI-NEXT:    v_or_b32_e32 v0, v0, v1
804; VI-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
805; VI-NEXT:    s_setpc_b64 s[30:31]
806;
807; GFX9-LABEL: v_fshr_v2i16:
808; GFX9:       ; %bb.0:
809; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
810; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v2
811; GFX9-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
812; GFX9-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
813; GFX9-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
814; GFX9-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
815; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
816; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
817; GFX9-NEXT:    s_setpc_b64 s[30:31]
818;
819; R600-LABEL: v_fshr_v2i16:
820; R600:       ; %bb.0:
821; R600-NEXT:    CF_END
822; R600-NEXT:    PAD
823;
824; GFX10-LABEL: v_fshr_v2i16:
825; GFX10:       ; %bb.0:
826; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
827; GFX10-NEXT:    v_xor_b32_e32 v3, -1, v2
828; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
829; GFX10-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
830; GFX10-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
831; GFX10-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
832; GFX10-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
833; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
834; GFX10-NEXT:    s_setpc_b64 s[30:31]
835;
836; GFX11-LABEL: v_fshr_v2i16:
837; GFX11:       ; %bb.0:
838; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
839; GFX11-NEXT:    v_xor_b32_e32 v3, -1, v2
840; GFX11-NEXT:    v_pk_lshlrev_b16 v0, 1, v0 op_sel_hi:[0,1]
841; GFX11-NEXT:    v_and_b32_e32 v2, 0xf000f, v2
842; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
843; GFX11-NEXT:    v_and_b32_e32 v3, 0xf000f, v3
844; GFX11-NEXT:    v_pk_lshrrev_b16 v1, v2, v1
845; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
846; GFX11-NEXT:    v_pk_lshlrev_b16 v0, v3, v0
847; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
848; GFX11-NEXT:    s_setpc_b64 s[30:31]
849  %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2)
850  ret <2 x i16> %ret
851}
852
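; The <3 x i16> case follows the per-lane expansion, repacking the low two lanes with v_perm_b32 on GFX9 and later.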
853define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) {
854; SI-LABEL: v_fshr_v3i16:
855; SI:       ; %bb.0:
856; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
857; SI-NEXT:    v_or_b32_e32 v7, 16, v7
858; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
859; SI-NEXT:    v_alignbit_b32 v1, v1, v4, v7
860; SI-NEXT:    v_or_b32_e32 v4, 16, v6
861; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
862; SI-NEXT:    v_alignbit_b32 v0, v0, v3, v4
863; SI-NEXT:    v_or_b32_e32 v3, 16, v8
864; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
865; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
866; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
867; SI-NEXT:    v_alignbit_b32 v3, v2, v4, v3
868; SI-NEXT:    v_or_b32_e32 v0, v0, v1
869; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v3
870; SI-NEXT:    v_alignbit_b32 v1, v3, v1, 16
871; SI-NEXT:    s_setpc_b64 s[30:31]
872;
873; VI-LABEL: v_fshr_v3i16:
874; VI:       ; %bb.0:
875; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
876; VI-NEXT:    v_mov_b32_e32 v7, 1
877; VI-NEXT:    v_mov_b32_e32 v8, -1
878; VI-NEXT:    v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
879; VI-NEXT:    v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
880; VI-NEXT:    v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
881; VI-NEXT:    v_lshlrev_b16_e32 v7, v8, v7
882; VI-NEXT:    v_or_b32_sdwa v6, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
883; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
884; VI-NEXT:    v_xor_b32_e32 v7, -1, v5
885; VI-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
886; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
887; VI-NEXT:    v_or_b32_e32 v1, v1, v3
888; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
889; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
890; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
891; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
892; VI-NEXT:    v_or_b32_e32 v0, v0, v2
893; VI-NEXT:    v_or_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
894; VI-NEXT:    s_setpc_b64 s[30:31]
895;
896; GFX9-LABEL: v_fshr_v3i16:
897; GFX9:       ; %bb.0:
898; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
899; GFX9-NEXT:    v_mov_b32_e32 v7, 1
900; GFX9-NEXT:    v_mov_b32_e32 v8, -1
901; GFX9-NEXT:    v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
902; GFX9-NEXT:    v_xor_b32_sdwa v8, v4, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
903; GFX9-NEXT:    v_lshrrev_b16_sdwa v6, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
904; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v8, v7
905; GFX9-NEXT:    v_or_b32_e32 v6, v7, v6
906; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
907; GFX9-NEXT:    v_xor_b32_e32 v7, -1, v5
908; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v7, v1
909; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
910; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
911; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
912; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
913; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
914; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
915; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
916; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
917; GFX9-NEXT:    v_perm_b32 v0, v6, v0, s4
918; GFX9-NEXT:    s_setpc_b64 s[30:31]
919;
920; R600-LABEL: v_fshr_v3i16:
921; R600:       ; %bb.0:
922; R600-NEXT:    CF_END
923; R600-NEXT:    PAD
924;
925; GFX10-LABEL: v_fshr_v3i16:
926; GFX10:       ; %bb.0:
927; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
928; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
929; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
930; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
931; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
932; GFX10-NEXT:    v_xor_b32_e32 v10, -1, v4
933; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
934; GFX10-NEXT:    v_xor_b32_e32 v9, -1, v7
935; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
936; GFX10-NEXT:    v_lshrrev_b16 v7, v7, v8
937; GFX10-NEXT:    v_lshlrev_b16 v0, v10, v0
938; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
939; GFX10-NEXT:    v_lshlrev_b16 v6, v9, v6
940; GFX10-NEXT:    v_xor_b32_e32 v4, -1, v5
941; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
942; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
943; GFX10-NEXT:    v_or_b32_e32 v5, v6, v7
944; GFX10-NEXT:    v_lshlrev_b16 v1, v4, v1
945; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
946; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
947; GFX10-NEXT:    s_setpc_b64 s[30:31]
948;
949; GFX11-LABEL: v_fshr_v3i16:
950; GFX11:       ; %bb.0:
951; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
952; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
953; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
954; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
955; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
956; GFX11-NEXT:    v_xor_b32_e32 v10, -1, v4
957; GFX11-NEXT:    v_lshlrev_b16 v6, 1, v6
958; GFX11-NEXT:    v_xor_b32_e32 v9, -1, v7
959; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
960; GFX11-NEXT:    v_lshrrev_b16 v7, v7, v8
961; GFX11-NEXT:    v_lshlrev_b16 v0, v10, v0
962; GFX11-NEXT:    v_lshrrev_b16 v2, v4, v2
963; GFX11-NEXT:    v_lshlrev_b16 v6, v9, v6
964; GFX11-NEXT:    v_xor_b32_e32 v4, -1, v5
965; GFX11-NEXT:    v_lshrrev_b16 v3, v5, v3
966; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
967; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
968; GFX11-NEXT:    v_or_b32_e32 v5, v6, v7
969; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
970; GFX11-NEXT:    v_lshlrev_b16 v1, v4, v1
971; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
972; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
973; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
974; GFX11-NEXT:    s_setpc_b64 s[30:31]
975  %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2)
976  ret <3 x i16> %ret
977}
978
979define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) {
980; SI-LABEL: v_fshr_v4i16:
981; SI:       ; %bb.0:
982; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
983; SI-NEXT:    v_or_b32_e32 v9, 16, v9
984; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
985; SI-NEXT:    v_alignbit_b32 v1, v1, v5, v9
986; SI-NEXT:    v_or_b32_e32 v5, 16, v8
987; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
988; SI-NEXT:    v_alignbit_b32 v0, v0, v4, v5
989; SI-NEXT:    v_or_b32_e32 v4, 16, v11
990; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
991; SI-NEXT:    v_alignbit_b32 v3, v3, v5, v4
992; SI-NEXT:    v_or_b32_e32 v5, 16, v10
993; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
994; SI-NEXT:    v_alignbit_b32 v2, v2, v6, v5
995; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
996; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
997; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
998; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
999; SI-NEXT:    v_or_b32_e32 v2, v2, v4
1000; SI-NEXT:    v_or_b32_e32 v0, v0, v1
1001; SI-NEXT:    v_alignbit_b32 v1, v2, v1, 16
1002; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1003; SI-NEXT:    s_setpc_b64 s[30:31]
1004;
1005; VI-LABEL: v_fshr_v4i16:
1006; VI:       ; %bb.0:
1007; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1008; VI-NEXT:    v_mov_b32_e32 v7, 1
1009; VI-NEXT:    v_mov_b32_e32 v9, -1
1010; VI-NEXT:    v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1011; VI-NEXT:    v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1012; VI-NEXT:    v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1013; VI-NEXT:    v_lshlrev_b16_e32 v8, v10, v8
1014; VI-NEXT:    v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1015; VI-NEXT:    v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1016; VI-NEXT:    v_or_b32_sdwa v6, v8, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1017; VI-NEXT:    v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1018; VI-NEXT:    v_lshlrev_b16_e32 v7, v9, v7
1019; VI-NEXT:    v_or_b32_sdwa v7, v7, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1020; VI-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
1021; VI-NEXT:    v_xor_b32_e32 v8, -1, v5
1022; VI-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
1023; VI-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
1024; VI-NEXT:    v_or_b32_e32 v1, v1, v3
1025; VI-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
1026; VI-NEXT:    v_xor_b32_e32 v3, -1, v4
1027; VI-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
1028; VI-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
1029; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1030; VI-NEXT:    v_or_b32_sdwa v0, v0, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1031; VI-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
1032; VI-NEXT:    s_setpc_b64 s[30:31]
1033;
1034; GFX9-LABEL: v_fshr_v4i16:
1035; GFX9:       ; %bb.0:
1036; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1037; GFX9-NEXT:    v_mov_b32_e32 v7, 1
1038; GFX9-NEXT:    v_mov_b32_e32 v9, -1
1039; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1040; GFX9-NEXT:    v_xor_b32_sdwa v10, v5, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1041; GFX9-NEXT:    v_lshrrev_b16_sdwa v6, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1042; GFX9-NEXT:    v_lshlrev_b16_e32 v8, v10, v8
1043; GFX9-NEXT:    v_lshlrev_b16_sdwa v7, v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
1044; GFX9-NEXT:    v_xor_b32_sdwa v9, v4, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1045; GFX9-NEXT:    v_or_b32_e32 v6, v8, v6
1046; GFX9-NEXT:    v_lshrrev_b16_sdwa v8, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1047; GFX9-NEXT:    v_lshlrev_b16_e32 v7, v9, v7
1048; GFX9-NEXT:    v_or_b32_e32 v7, v7, v8
1049; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 1, v1
1050; GFX9-NEXT:    v_xor_b32_e32 v8, -1, v5
1051; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v8, v1
1052; GFX9-NEXT:    v_lshrrev_b16_e32 v3, v5, v3
1053; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
1054; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 1, v0
1055; GFX9-NEXT:    v_xor_b32_e32 v3, -1, v4
1056; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v3, v0
1057; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v4, v2
1058; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
1059; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
1060; GFX9-NEXT:    v_perm_b32 v0, v7, v0, s4
1061; GFX9-NEXT:    v_perm_b32 v1, v6, v1, s4
1062; GFX9-NEXT:    s_setpc_b64 s[30:31]
1063;
1064; R600-LABEL: v_fshr_v4i16:
1065; R600:       ; %bb.0:
1066; R600-NEXT:    CF_END
1067; R600-NEXT:    PAD
1068;
1069; GFX10-LABEL: v_fshr_v4i16:
1070; GFX10:       ; %bb.0:
1071; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1072; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1073; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
1074; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
1075; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
1076; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
1077; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
1078; GFX10-NEXT:    v_lshrrev_b16 v6, v7, v6
1079; GFX10-NEXT:    v_lshlrev_b16 v8, 1, v8
1080; GFX10-NEXT:    v_xor_b32_e32 v7, -1, v7
1081; GFX10-NEXT:    v_lshlrev_b16 v9, 1, v9
1082; GFX10-NEXT:    v_xor_b32_e32 v12, -1, v10
1083; GFX10-NEXT:    v_lshlrev_b16 v1, 1, v1
1084; GFX10-NEXT:    v_xor_b32_e32 v13, -1, v5
1085; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
1086; GFX10-NEXT:    v_xor_b32_e32 v14, -1, v4
1087; GFX10-NEXT:    v_lshlrev_b16 v7, v7, v8
1088; GFX10-NEXT:    v_lshrrev_b16 v8, v10, v11
1089; GFX10-NEXT:    v_lshlrev_b16 v9, v12, v9
1090; GFX10-NEXT:    v_lshlrev_b16 v1, v13, v1
1091; GFX10-NEXT:    v_lshlrev_b16 v0, v14, v0
1092; GFX10-NEXT:    v_lshrrev_b16 v2, v4, v2
1093; GFX10-NEXT:    v_lshrrev_b16 v3, v5, v3
1094; GFX10-NEXT:    v_or_b32_e32 v4, v7, v6
1095; GFX10-NEXT:    v_or_b32_e32 v5, v9, v8
1096; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
1097; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
1098; GFX10-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
1099; GFX10-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
1100; GFX10-NEXT:    s_setpc_b64 s[30:31]
1101;
1102; GFX11-LABEL: v_fshr_v4i16:
1103; GFX11:       ; %bb.0:
1104; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1105; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
1106; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v5
1107; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
1108; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v0
1109; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
1110; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
1111; GFX11-NEXT:    v_lshrrev_b16 v6, v7, v6
1112; GFX11-NEXT:    v_lshlrev_b16 v8, 1, v8
1113; GFX11-NEXT:    v_xor_b32_e32 v7, -1, v7
1114; GFX11-NEXT:    v_lshlrev_b16 v9, 1, v9
1115; GFX11-NEXT:    v_xor_b32_e32 v12, -1, v10
1116; GFX11-NEXT:    v_lshlrev_b16 v1, 1, v1
1117; GFX11-NEXT:    v_xor_b32_e32 v13, -1, v5
1118; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
1119; GFX11-NEXT:    v_xor_b32_e32 v14, -1, v4
1120; GFX11-NEXT:    v_lshlrev_b16 v7, v7, v8
1121; GFX11-NEXT:    v_lshrrev_b16 v8, v10, v11
1122; GFX11-NEXT:    v_lshlrev_b16 v9, v12, v9
1123; GFX11-NEXT:    v_lshlrev_b16 v1, v13, v1
1124; GFX11-NEXT:    v_lshlrev_b16 v0, v14, v0
1125; GFX11-NEXT:    v_lshrrev_b16 v2, v4, v2
1126; GFX11-NEXT:    v_lshrrev_b16 v3, v5, v3
1127; GFX11-NEXT:    v_or_b32_e32 v4, v7, v6
1128; GFX11-NEXT:    v_or_b32_e32 v5, v9, v8
1129; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1130; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
1131; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
1132; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
1133; GFX11-NEXT:    v_perm_b32 v0, v5, v0, 0x5040100
1134; GFX11-NEXT:    v_perm_b32 v1, v4, v1, 0x5040100
1135; GFX11-NEXT:    s_setpc_b64 s[30:31]
1136  %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2)
1137  ret <4 x i16> %ret
1138}
1139
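; 64-bit funnel shifts expand to a left shift by one, a right shift by the amount, a left shift by the inverted amount, and ORs of the halves.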
1140define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) {
1141; SI-LABEL: v_fshr_i64:
1142; SI:       ; %bb.0:
1143; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1144; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
1145; SI-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
1146; SI-NEXT:    v_not_b32_e32 v4, v4
1147; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
1148; SI-NEXT:    v_or_b32_e32 v1, v1, v3
1149; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1150; SI-NEXT:    s_setpc_b64 s[30:31]
1151;
1152; VI-LABEL: v_fshr_i64:
1153; VI:       ; %bb.0:
1154; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1155; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1156; VI-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1157; VI-NEXT:    v_not_b32_e32 v4, v4
1158; VI-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
1159; VI-NEXT:    v_or_b32_e32 v1, v1, v3
1160; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1161; VI-NEXT:    s_setpc_b64 s[30:31]
1162;
1163; GFX9-LABEL: v_fshr_i64:
1164; GFX9:       ; %bb.0:
1165; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1167; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1168; GFX9-NEXT:    v_not_b32_e32 v4, v4
1169; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
1170; GFX9-NEXT:    v_or_b32_e32 v1, v1, v3
1171; GFX9-NEXT:    v_or_b32_e32 v0, v0, v2
1172; GFX9-NEXT:    s_setpc_b64 s[30:31]
1173;
1174; R600-LABEL: v_fshr_i64:
1175; R600:       ; %bb.0:
1176; R600-NEXT:    CF_END
1177; R600-NEXT:    PAD
1178;
1179; GFX10-LABEL: v_fshr_i64:
1180; GFX10:       ; %bb.0:
1181; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1182; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1183; GFX10-NEXT:    v_not_b32_e32 v5, v4
1184; GFX10-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1185; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
1186; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
1187; GFX10-NEXT:    v_or_b32_e32 v1, v1, v3
1188; GFX10-NEXT:    s_setpc_b64 s[30:31]
1189;
1190; GFX11-LABEL: v_fshr_i64:
1191; GFX11:       ; %bb.0:
1192; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1193; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1194; GFX11-NEXT:    v_not_b32_e32 v5, v4
1195; GFX11-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
1196; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1197; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v5, v[0:1]
1198; GFX11-NEXT:    v_or_b32_e32 v0, v0, v2
1199; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
1200; GFX11-NEXT:    v_or_b32_e32 v1, v1, v3
1201; GFX11-NEXT:    s_setpc_b64 s[30:31]
1202  %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2)
1203  ret i64 %ret
1204}
1205
1206define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) {
1207; SI-LABEL: v_fshr_v2i64:
1208; SI:       ; %bb.0:
1209; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1210; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
1211; SI-NEXT:    v_lshr_b64 v[4:5], v[4:5], v8
1212; SI-NEXT:    v_not_b32_e32 v8, v8
1213; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
1214; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], 1
1215; SI-NEXT:    v_or_b32_e32 v1, v1, v5
1216; SI-NEXT:    v_lshr_b64 v[5:6], v[6:7], v10
1217; SI-NEXT:    v_not_b32_e32 v7, v10
1218; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v7
1219; SI-NEXT:    v_or_b32_e32 v0, v0, v4
1220; SI-NEXT:    v_or_b32_e32 v3, v3, v6
1221; SI-NEXT:    v_or_b32_e32 v2, v2, v5
1222; SI-NEXT:    s_setpc_b64 s[30:31]
1223;
1224; VI-LABEL: v_fshr_v2i64:
1225; VI:       ; %bb.0:
1226; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1227; VI-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1228; VI-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1229; VI-NEXT:    v_not_b32_e32 v8, v8
1230; VI-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1231; VI-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1232; VI-NEXT:    v_or_b32_e32 v1, v1, v5
1233; VI-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1234; VI-NEXT:    v_not_b32_e32 v7, v10
1235; VI-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1236; VI-NEXT:    v_or_b32_e32 v0, v0, v4
1237; VI-NEXT:    v_or_b32_e32 v3, v3, v6
1238; VI-NEXT:    v_or_b32_e32 v2, v2, v5
1239; VI-NEXT:    s_setpc_b64 s[30:31]
1240;
1241; GFX9-LABEL: v_fshr_v2i64:
1242; GFX9:       ; %bb.0:
1243; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1244; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1245; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1246; GFX9-NEXT:    v_not_b32_e32 v8, v8
1247; GFX9-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
1248; GFX9-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1249; GFX9-NEXT:    v_or_b32_e32 v1, v1, v5
1250; GFX9-NEXT:    v_lshrrev_b64 v[5:6], v10, v[6:7]
1251; GFX9-NEXT:    v_not_b32_e32 v7, v10
1252; GFX9-NEXT:    v_lshlrev_b64 v[2:3], v7, v[2:3]
1253; GFX9-NEXT:    v_or_b32_e32 v0, v0, v4
1254; GFX9-NEXT:    v_or_b32_e32 v3, v3, v6
1255; GFX9-NEXT:    v_or_b32_e32 v2, v2, v5
1256; GFX9-NEXT:    s_setpc_b64 s[30:31]
1257;
1258; R600-LABEL: v_fshr_v2i64:
1259; R600:       ; %bb.0:
1260; R600-NEXT:    CF_END
1261; R600-NEXT:    PAD
1262;
1263; GFX10-LABEL: v_fshr_v2i64:
1264; GFX10:       ; %bb.0:
1265; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1266; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1267; GFX10-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1268; GFX10-NEXT:    v_not_b32_e32 v9, v8
1269; GFX10-NEXT:    v_not_b32_e32 v11, v10
1270; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1271; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
1272; GFX10-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1273; GFX10-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
1274; GFX10-NEXT:    v_or_b32_e32 v0, v0, v4
1275; GFX10-NEXT:    v_or_b32_e32 v1, v1, v5
1276; GFX10-NEXT:    v_or_b32_e32 v2, v2, v6
1277; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
1278; GFX10-NEXT:    s_setpc_b64 s[30:31]
1279;
1280; GFX11-LABEL: v_fshr_v2i64:
1281; GFX11:       ; %bb.0:
1282; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1283; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
1284; GFX11-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
1285; GFX11-NEXT:    v_not_b32_e32 v9, v8
1286; GFX11-NEXT:    v_not_b32_e32 v11, v10
1287; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v8, v[4:5]
1288; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v10, v[6:7]
1289; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
1290; GFX11-NEXT:    v_lshlrev_b64 v[0:1], v9, v[0:1]
1291; GFX11-NEXT:    v_lshlrev_b64 v[2:3], v11, v[2:3]
1292; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1293; GFX11-NEXT:    v_or_b32_e32 v0, v0, v4
1294; GFX11-NEXT:    v_or_b32_e32 v1, v1, v5
1295; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
1296; GFX11-NEXT:    v_or_b32_e32 v2, v2, v6
1297; GFX11-NEXT:    v_or_b32_e32 v3, v3, v7
1298; GFX11-NEXT:    s_setpc_b64 s[30:31]
1299  %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2)
1300  ret <2 x i64> %ret
1301}
1302
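; The i24 amount is reduced modulo 24 with a multiply-high by 0xaaaaaab (~2^32/24), then a 32-bit v_alignbit_b32 operates on the second source pre-shifted left by 8.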
define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
; SI-LABEL: v_fshr_i24:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
; SI-NEXT:    s_mov_b32 s4, 0xaaaaaab
; SI-NEXT:    v_mul_hi_u32 v3, v3, s4
; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
; SI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_i24:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
; VI-NEXT:    s_mov_b32 s4, 0xaaaaaab
; VI-NEXT:    v_mul_hi_u32 v3, v3, s4
; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
; VI-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_i24:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaab
; GFX9-NEXT:    v_mul_hi_u32 v3, v3, s4
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
; GFX9-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_i24:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_i24:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX10-NEXT:    v_mul_hi_u32 v3, 0xaaaaaab, v3
; GFX10-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
; GFX10-NEXT:    v_add_nc_u32_e32 v2, 8, v2
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_i24:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_mul_hi_u32 v3, 0xaaaaaab, v3
; GFX11-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
; GFX11-NEXT:    v_add_nc_u32_e32 v2, 8, v2
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
  ret i24 %ret
}

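; Same expansion as v_fshr_i24, applied independently to each of the two elements.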
define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) {
; SI-LABEL: v_fshr_v2i24:
; SI:       ; %bb.0:
; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
; SI-NEXT:    s_mov_b32 s4, 0xaaaaaab
; SI-NEXT:    v_mul_hi_u32 v6, v6, s4
; SI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; SI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
; SI-NEXT:    v_mul_hi_u32 v6, v7, s4
; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v4
; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v6
; SI-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
; SI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
; SI-NEXT:    s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fshr_v2i24:
; VI:       ; %bb.0:
; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
; VI-NEXT:    s_mov_b32 s4, 0xaaaaaab
; VI-NEXT:    v_mul_hi_u32 v6, v6, s4
; VI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; VI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
; VI-NEXT:    v_mul_hi_u32 v6, v7, s4
; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
; VI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v6
; VI-NEXT:    v_sub_u32_e32 v3, vcc, v5, v3
; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
; VI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
; VI-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fshr_v2i24:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaab
; GFX9-NEXT:    v_mul_hi_u32 v6, v6, s4
; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX9-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
; GFX9-NEXT:    v_mul_hi_u32 v6, v7, s4
; GFX9-NEXT:    v_add_u32_e32 v4, 8, v4
; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v6
; GFX9-NEXT:    v_sub_u32_e32 v3, v5, v3
; GFX9-NEXT:    v_add_u32_e32 v3, 8, v3
; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; R600-LABEL: v_fshr_v2i24:
; R600:       ; %bb.0:
; R600-NEXT:    CF_END
; R600-NEXT:    PAD
;
; GFX10-LABEL: v_fshr_v2i24:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
; GFX10-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; GFX10-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v6
; GFX10-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v7
; GFX10-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
; GFX10-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
; GFX10-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
; GFX10-NEXT:    v_add_nc_u32_e32 v4, 8, v4
; GFX10-NEXT:    v_add_nc_u32_e32 v5, 8, v5
; GFX10-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; GFX10-NEXT:    v_alignbit_b32 v1, v1, v3, v5
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fshr_v2i24:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
; GFX11-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v6
; GFX11-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v7
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
; GFX11-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
; GFX11-NEXT:    v_sub_nc_u32_e32 v5, v5, v7
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_add_nc_u32_e32 v4, 8, v4
; GFX11-NEXT:    v_add_nc_u32_e32 v5, 8, v5
; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT:    v_alignbit_b32 v0, v0, v2, v4
; GFX11-NEXT:    v_alignbit_b32 v1, v1, v3, v5
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2)
  ret <2 x i24> %ret
}