xref: /llvm-project/llvm/test/CodeGen/AMDGPU/GlobalISel/lshr.ll (revision 5d9c717597aef72e4ba27a2b143e9753c513e5c9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
6; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s
7
; Variable-amount lshr on i8 in a regular (non-shader) function.
; Per the checks below, GFX6 and GFX10+ mask both operands with 0xff before
; shifting, while GFX8 and GFX9 fold the byte selection into one SDWA shift.
8define i8 @v_lshr_i8(i8 %value, i8 %amount) {
9; GFX6-LABEL: v_lshr_i8:
10; GFX6:       ; %bb.0:
11; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
12; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
13; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
14; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
15; GFX6-NEXT:    s_setpc_b64 s[30:31]
16;
17; GFX8-LABEL: v_lshr_i8:
18; GFX8:       ; %bb.0:
19; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
20; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
21; GFX8-NEXT:    s_setpc_b64 s[30:31]
22;
23; GFX9-LABEL: v_lshr_i8:
24; GFX9:       ; %bb.0:
25; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
26; GFX9-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
27; GFX9-NEXT:    s_setpc_b64 s[30:31]
28;
29; GFX10PLUS-LABEL: v_lshr_i8:
30; GFX10PLUS:       ; %bb.0:
31; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
32; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xff, v1
33; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xff, v0
34; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, v1, v0
35; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
36  %result = lshr i8 %value, %amount
37  ret i8 %result
38}
39
; lshr of i8 by the constant 7 (the largest in-range amount for i8).
; Per the checks below, GFX6 emits a single v_bfe_u32 (operands 7, 1), GFX8 and
; GFX9 materialize the 7 in v1 for an SDWA 16-bit shift, and GFX10+ mask the
; value with 0xff and then shift by an inline 7.
40define i8 @v_lshr_i8_7(i8 %value) {
41; GFX6-LABEL: v_lshr_i8_7:
42; GFX6:       ; %bb.0:
43; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
44; GFX6-NEXT:    v_bfe_u32 v0, v0, 7, 1
45; GFX6-NEXT:    s_setpc_b64 s[30:31]
46;
47; GFX8-LABEL: v_lshr_i8_7:
48; GFX8:       ; %bb.0:
49; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50; GFX8-NEXT:    v_mov_b32_e32 v1, 7
51; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
52; GFX8-NEXT:    s_setpc_b64 s[30:31]
53;
54; GFX9-LABEL: v_lshr_i8_7:
55; GFX9:       ; %bb.0:
56; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57; GFX9-NEXT:    v_mov_b32_e32 v1, 7
58; GFX9-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
59; GFX9-NEXT:    s_setpc_b64 s[30:31]
60;
61; GFX10PLUS-LABEL: v_lshr_i8_7:
62; GFX10PLUS:       ; %bb.0:
63; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xff, v0
65; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 7, v0
66; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
67  %result = lshr i8 %value, 7
68  ret i8 %result
69}
70
; amdgpu_ps variant of the i8 variable shift with both operands marked inreg.
; Every target masks only the value (s0) with 0xff and then does a 32-bit
; s_lshr_b32 by s1. The two check groups are identical; they stay separate
; because the RUN lines use disjoint prefix sets for pre-GFX10 and GFX10+.
71define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
72; GCN-LABEL: s_lshr_i8:
73; GCN:       ; %bb.0:
74; GCN-NEXT:    s_and_b32 s0, s0, 0xff
75; GCN-NEXT:    s_lshr_b32 s0, s0, s1
76; GCN-NEXT:    ; return to shader part epilog
77;
78; GFX10PLUS-LABEL: s_lshr_i8:
79; GFX10PLUS:       ; %bb.0:
80; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
81; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
82; GFX10PLUS-NEXT:    ; return to shader part epilog
83  %result = lshr i8 %value, %amount
84  ret i8 %result
85}
86
; Scalar (inreg) lshr of i8 by the constant 7.
; All targets are expected to emit one s_bfe_u32 with immediate 0x10007 in
; place of a separate mask and shift.
87define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) {
88; GCN-LABEL: s_lshr_i8_7:
89; GCN:       ; %bb.0:
90; GCN-NEXT:    s_bfe_u32 s0, s0, 0x10007
91; GCN-NEXT:    ; return to shader part epilog
92;
93; GFX10PLUS-LABEL: s_lshr_i8_7:
94; GFX10PLUS:       ; %bb.0:
95; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x10007
96; GFX10PLUS-NEXT:    ; return to shader part epilog
97  %result = lshr i8 %value, 7
98  ret i8 %result
99}
100
101
; Variable-amount lshr on the non-power-of-two type i24.
; On every target both operands are masked with 0xffffff and the shift is done
; with a 32-bit v_lshrrev_b32.
102define i24 @v_lshr_i24(i24 %value, i24 %amount) {
103; GCN-LABEL: v_lshr_i24:
104; GCN:       ; %bb.0:
105; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
106; GCN-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
107; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
108; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
109; GCN-NEXT:    s_setpc_b64 s[30:31]
110;
111; GFX10PLUS-LABEL: v_lshr_i24:
112; GFX10PLUS:       ; %bb.0:
113; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
115; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
116; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
117; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
118  %result = lshr i24 %value, %amount
119  ret i24 %result
120}
121
; lshr of i24 by the constant 7.
; Every target is expected to select a single v_bfe_u32 (operands 7, 17), so no
; separate mask instruction appears.
122define i24 @v_lshr_i24_7(i24 %value) {
123; GCN-LABEL: v_lshr_i24_7:
124; GCN:       ; %bb.0:
125; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
126; GCN-NEXT:    v_bfe_u32 v0, v0, 7, 17
127; GCN-NEXT:    s_setpc_b64 s[30:31]
128;
129; GFX10PLUS-LABEL: v_lshr_i24_7:
130; GFX10PLUS:       ; %bb.0:
131; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
132; GFX10PLUS-NEXT:    v_bfe_u32 v0, v0, 7, 17
133; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
134  %result = lshr i24 %value, 7
135  ret i24 %result
136}
137
; Scalar (inreg) variable-amount lshr on i24.
; Only the value (s0) is masked with 0xffffff; the amount in s1 feeds
; s_lshr_b32 directly on every target.
138define amdgpu_ps i24 @s_lshr_i24(i24 inreg %value, i24 inreg %amount) {
139; GCN-LABEL: s_lshr_i24:
140; GCN:       ; %bb.0:
141; GCN-NEXT:    s_and_b32 s0, s0, 0xffffff
142; GCN-NEXT:    s_lshr_b32 s0, s0, s1
143; GCN-NEXT:    ; return to shader part epilog
144;
145; GFX10PLUS-LABEL: s_lshr_i24:
146; GFX10PLUS:       ; %bb.0:
147; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffffff
148; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
149; GFX10PLUS-NEXT:    ; return to shader part epilog
150  %result = lshr i24 %value, %amount
151  ret i24 %result
152}
153
; Scalar (inreg) lshr of i24 by the constant 7.
; All targets are expected to emit one s_bfe_u32 with immediate 0x110007.
154define amdgpu_ps i24 @s_lshr_i24_7(i24 inreg %value) {
155; GCN-LABEL: s_lshr_i24_7:
156; GCN:       ; %bb.0:
157; GCN-NEXT:    s_bfe_u32 s0, s0, 0x110007
158; GCN-NEXT:    ; return to shader part epilog
159;
160; GFX10PLUS-LABEL: s_lshr_i24_7:
161; GFX10PLUS:       ; %bb.0:
162; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x110007
163; GFX10PLUS-NEXT:    ; return to shader part epilog
164  %result = lshr i24 %value, 7
165  ret i24 %result
166}
167
; Baseline case: variable-amount lshr on i32 in a regular function.
; Every target selects a single v_lshrrev_b32_e32 with no masking.
168define i32 @v_lshr_i32(i32 %value, i32 %amount) {
169; GCN-LABEL: v_lshr_i32:
170; GCN:       ; %bb.0:
171; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
172; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
173; GCN-NEXT:    s_setpc_b64 s[30:31]
174;
175; GFX10PLUS-LABEL: v_lshr_i32:
176; GFX10PLUS:       ; %bb.0:
177; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
178; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
179; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
180  %result = lshr i32 %value, %amount
181  ret i32 %result
182}
183
; lshr of i32 by the constant 31.
; The constant is expected to appear directly as the first operand of
; v_lshrrev_b32_e32 on every target.
184define i32 @v_lshr_i32_31(i32 %value) {
185; GCN-LABEL: v_lshr_i32_31:
186; GCN:       ; %bb.0:
187; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
188; GCN-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
189; GCN-NEXT:    s_setpc_b64 s[30:31]
190;
191; GFX10PLUS-LABEL: v_lshr_i32_31:
192; GFX10PLUS:       ; %bb.0:
193; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
194; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
195; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
196  %result = lshr i32 %value, 31
197  ret i32 %result
198}
199
; Scalar (inreg) variable-amount lshr on i32.
; Every target selects a single s_lshr_b32 of s0 by s1.
200define amdgpu_ps i32 @s_lshr_i32(i32 inreg %value, i32 inreg %amount) {
201; GCN-LABEL: s_lshr_i32:
202; GCN:       ; %bb.0:
203; GCN-NEXT:    s_lshr_b32 s0, s0, s1
204; GCN-NEXT:    ; return to shader part epilog
205;
206; GFX10PLUS-LABEL: s_lshr_i32:
207; GFX10PLUS:       ; %bb.0:
208; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
209; GFX10PLUS-NEXT:    ; return to shader part epilog
210  %result = lshr i32 %value, %amount
211  ret i32 %result
212}
213
; Scalar (inreg) lshr of i32 by the constant 31.
; Every target selects a single s_lshr_b32 with the 31 as an immediate operand.
214define amdgpu_ps i32 @s_lshr_i32_31(i32 inreg %value) {
215; GCN-LABEL: s_lshr_i32_31:
216; GCN:       ; %bb.0:
217; GCN-NEXT:    s_lshr_b32 s0, s0, 31
218; GCN-NEXT:    ; return to shader part epilog
219;
220; GFX10PLUS-LABEL: s_lshr_i32_31:
221; GFX10PLUS:       ; %bb.0:
222; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, 31
223; GFX10PLUS-NEXT:    ; return to shader part epilog
224  %result = lshr i32 %value, 31
225  ret i32 %result
226}
227
; Mixed operands: the value is inreg (s0 in the checks) and the amount is a
; regular argument (v0). GFX6 uses the non-reversed v_lshr_b32_e32 with s0 as
; the first source; GFX8 and later use v_lshrrev_b32_e64 with s0 as the last
; source. The i32 result is bitcast to float for the return.
228define amdgpu_ps float @lshr_i32_sv(i32 inreg %value, i32 %amount) {
229; GFX6-LABEL: lshr_i32_sv:
230; GFX6:       ; %bb.0:
231; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
232; GFX6-NEXT:    ; return to shader part epilog
233;
234; GFX8-LABEL: lshr_i32_sv:
235; GFX8:       ; %bb.0:
236; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
237; GFX8-NEXT:    ; return to shader part epilog
238;
239; GFX9-LABEL: lshr_i32_sv:
240; GFX9:       ; %bb.0:
241; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
242; GFX9-NEXT:    ; return to shader part epilog
243;
244; GFX10PLUS-LABEL: lshr_i32_sv:
245; GFX10PLUS:       ; %bb.0:
246; GFX10PLUS-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
247; GFX10PLUS-NEXT:    ; return to shader part epilog
248  %result = lshr i32 %value, %amount
249  %cast = bitcast i32 %result to float
250  ret float %cast
251}
252
; Mixed operands, mirrored: the value is a regular argument (v0) and the amount
; is inreg (s0). Every target takes s0 directly as the first operand of
; v_lshrrev_b32_e32. The i32 result is bitcast to float for the return.
253define amdgpu_ps float @lshr_i32_vs(i32 %value, i32 inreg %amount) {
254; GCN-LABEL: lshr_i32_vs:
255; GCN:       ; %bb.0:
256; GCN-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
257; GCN-NEXT:    ; return to shader part epilog
258;
259; GFX10PLUS-LABEL: lshr_i32_vs:
260; GFX10PLUS:       ; %bb.0:
261; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
262; GFX10PLUS-NEXT:    ; return to shader part epilog
263  %result = lshr i32 %value, %amount
264  %cast = bitcast i32 %result to float
265  ret float %cast
266}
267
; <2 x i32> variable-amount lshr in a regular function.
; The checks expect one v_lshrrev_b32_e32 per element: values in v0-v1,
; amounts in v2-v3.
268define <2 x i32> @v_lshr_v2i32(<2 x i32> %value, <2 x i32> %amount) {
269; GCN-LABEL: v_lshr_v2i32:
270; GCN:       ; %bb.0:
271; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
272; GCN-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
273; GCN-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
274; GCN-NEXT:    s_setpc_b64 s[30:31]
275;
276; GFX10PLUS-LABEL: v_lshr_v2i32:
277; GFX10PLUS:       ; %bb.0:
278; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
279; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
280; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
281; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
282  %result = lshr <2 x i32> %value, %amount
283  ret <2 x i32> %result
284}
285
; <2 x i32> lshr by a splat of the constant 31.
; The checks expect one v_lshrrev_b32_e32 per element, each with the 31 as an
; immediate operand.
286define <2 x i32> @v_lshr_v2i32_31(<2 x i32> %value) {
287; GCN-LABEL: v_lshr_v2i32_31:
288; GCN:       ; %bb.0:
289; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
290; GCN-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
291; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v1
292; GCN-NEXT:    s_setpc_b64 s[30:31]
293;
294; GFX10PLUS-LABEL: v_lshr_v2i32_31:
295; GFX10PLUS:       ; %bb.0:
296; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
297; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
298; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, 31, v1
299; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
300  %result = lshr <2 x i32> %value, <i32 31, i32 31>
301  ret <2 x i32> %result
302}
303
; Scalar (inreg) <2 x i32> variable-amount lshr.
; The checks expect one s_lshr_b32 per element: values in s0-s1, amounts in
; s2-s3.
304define amdgpu_ps <2 x i32> @s_lshr_v2i32(<2 x i32> inreg %value, <2 x i32> inreg %amount) {
305; GCN-LABEL: s_lshr_v2i32:
306; GCN:       ; %bb.0:
307; GCN-NEXT:    s_lshr_b32 s0, s0, s2
308; GCN-NEXT:    s_lshr_b32 s1, s1, s3
309; GCN-NEXT:    ; return to shader part epilog
310;
311; GFX10PLUS-LABEL: s_lshr_v2i32:
312; GFX10PLUS:       ; %bb.0:
313; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s2
314; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s3
315; GFX10PLUS-NEXT:    ; return to shader part epilog
316  %result = lshr <2 x i32> %value, %amount
317  ret <2 x i32> %result
318}
319
; <3 x i32> (odd element count) variable-amount lshr in a regular function.
; The checks expect exactly three per-element v_lshrrev_b32_e32 shifts: values
; in v0-v2, amounts in v3-v5.
320define <3 x i32> @v_lshr_v3i32(<3 x i32> %value, <3 x i32> %amount) {
321; GCN-LABEL: v_lshr_v3i32:
322; GCN:       ; %bb.0:
323; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
324; GCN-NEXT:    v_lshrrev_b32_e32 v0, v3, v0
325; GCN-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
326; GCN-NEXT:    v_lshrrev_b32_e32 v2, v5, v2
327; GCN-NEXT:    s_setpc_b64 s[30:31]
328;
329; GFX10PLUS-LABEL: v_lshr_v3i32:
330; GFX10PLUS:       ; %bb.0:
331; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
332; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v3, v0
333; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
334; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v2, v5, v2
335; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
336  %result = lshr <3 x i32> %value, %amount
337  ret <3 x i32> %result
338}
339
; Scalar (inreg) <3 x i32> variable-amount lshr.
; The checks expect three s_lshr_b32 instructions: values in s0-s2, amounts in
; s3-s5.
340define amdgpu_ps <3 x i32> @s_lshr_v3i32(<3 x i32> inreg %value, <3 x i32> inreg %amount) {
341; GCN-LABEL: s_lshr_v3i32:
342; GCN:       ; %bb.0:
343; GCN-NEXT:    s_lshr_b32 s0, s0, s3
344; GCN-NEXT:    s_lshr_b32 s1, s1, s4
345; GCN-NEXT:    s_lshr_b32 s2, s2, s5
346; GCN-NEXT:    ; return to shader part epilog
347;
348; GFX10PLUS-LABEL: s_lshr_v3i32:
349; GFX10PLUS:       ; %bb.0:
350; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s3
351; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s4
352; GFX10PLUS-NEXT:    s_lshr_b32 s2, s2, s5
353; GFX10PLUS-NEXT:    ; return to shader part epilog
354  %result = lshr <3 x i32> %value, %amount
355  ret <3 x i32> %result
356}
357
; <4 x i32> variable-amount lshr in a regular function.
; The checks expect four per-element v_lshrrev_b32_e32 shifts: values in v0-v3,
; amounts in v4-v7.
358define <4 x i32> @v_lshr_v4i32(<4 x i32> %value, <4 x i32> %amount) {
359; GCN-LABEL: v_lshr_v4i32:
360; GCN:       ; %bb.0:
361; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
362; GCN-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
363; GCN-NEXT:    v_lshrrev_b32_e32 v1, v5, v1
364; GCN-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
365; GCN-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
366; GCN-NEXT:    s_setpc_b64 s[30:31]
367;
368; GFX10PLUS-LABEL: v_lshr_v4i32:
369; GFX10PLUS:       ; %bb.0:
370; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
371; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
372; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v5, v1
373; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
374; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
375; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
376  %result = lshr <4 x i32> %value, %amount
377  ret <4 x i32> %result
378}
379
; Scalar (inreg) <4 x i32> variable-amount lshr.
; The checks expect four s_lshr_b32 instructions: values in s0-s3, amounts in
; s4-s7.
380define amdgpu_ps <4 x i32> @s_lshr_v4i32(<4 x i32> inreg %value, <4 x i32> inreg %amount) {
381; GCN-LABEL: s_lshr_v4i32:
382; GCN:       ; %bb.0:
383; GCN-NEXT:    s_lshr_b32 s0, s0, s4
384; GCN-NEXT:    s_lshr_b32 s1, s1, s5
385; GCN-NEXT:    s_lshr_b32 s2, s2, s6
386; GCN-NEXT:    s_lshr_b32 s3, s3, s7
387; GCN-NEXT:    ; return to shader part epilog
388;
389; GFX10PLUS-LABEL: s_lshr_v4i32:
390; GFX10PLUS:       ; %bb.0:
391; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s4
392; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s5
393; GFX10PLUS-NEXT:    s_lshr_b32 s2, s2, s6
394; GFX10PLUS-NEXT:    s_lshr_b32 s3, s3, s7
395; GFX10PLUS-NEXT:    ; return to shader part epilog
396  %result = lshr <4 x i32> %value, %amount
397  ret <4 x i32> %result
398}
399
; <5 x i32> (odd, non-power-of-two element count) variable-amount lshr.
; The checks expect exactly five per-element v_lshrrev_b32_e32 shifts: values
; in v0-v4, amounts in v5-v9.
400define <5 x i32> @v_lshr_v5i32(<5 x i32> %value, <5 x i32> %amount) {
401; GCN-LABEL: v_lshr_v5i32:
402; GCN:       ; %bb.0:
403; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
404; GCN-NEXT:    v_lshrrev_b32_e32 v0, v5, v0
405; GCN-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
406; GCN-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
407; GCN-NEXT:    v_lshrrev_b32_e32 v3, v8, v3
408; GCN-NEXT:    v_lshrrev_b32_e32 v4, v9, v4
409; GCN-NEXT:    s_setpc_b64 s[30:31]
410;
411; GFX10PLUS-LABEL: v_lshr_v5i32:
412; GFX10PLUS:       ; %bb.0:
413; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
414; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v5, v0
415; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
416; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
417; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v3, v8, v3
418; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v4, v9, v4
419; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
420  %result = lshr <5 x i32> %value, %amount
421  ret <5 x i32> %result
422}
423
; Scalar (inreg) <5 x i32> variable-amount lshr.
; The checks expect five s_lshr_b32 instructions: values in s0-s4, amounts in
; s5-s9.
424define amdgpu_ps <5 x i32> @s_lshr_v5i32(<5 x i32> inreg %value, <5 x i32> inreg %amount) {
425; GCN-LABEL: s_lshr_v5i32:
426; GCN:       ; %bb.0:
427; GCN-NEXT:    s_lshr_b32 s0, s0, s5
428; GCN-NEXT:    s_lshr_b32 s1, s1, s6
429; GCN-NEXT:    s_lshr_b32 s2, s2, s7
430; GCN-NEXT:    s_lshr_b32 s3, s3, s8
431; GCN-NEXT:    s_lshr_b32 s4, s4, s9
432; GCN-NEXT:    ; return to shader part epilog
433;
434; GFX10PLUS-LABEL: s_lshr_v5i32:
435; GFX10PLUS:       ; %bb.0:
436; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s5
437; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s6
438; GFX10PLUS-NEXT:    s_lshr_b32 s2, s2, s7
439; GFX10PLUS-NEXT:    s_lshr_b32 s3, s3, s8
440; GFX10PLUS-NEXT:    s_lshr_b32 s4, s4, s9
441; GFX10PLUS-NEXT:    ; return to shader part epilog
442  %result = lshr <5 x i32> %value, %amount
443  ret <5 x i32> %result
444}
445
; <16 x i32> variable-amount lshr in a regular function (32 dwords of args).
; In the checks the first 15 amounts are in v16-v30, while the amount for the
; last element is loaded from the stack at s32 and is only used after an
; s_waitcnt vmcnt(0). Pre-GFX10 targets reload into v16 after its first use;
; GFX10 and GFX11 load into v31. GFX10 and GFX11 are checked separately here
; because the load differs (buffer_load_dword vs. scratch_load_b32).
446define <16 x i32> @v_lshr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
447; GCN-LABEL: v_lshr_v16i32:
448; GCN:       ; %bb.0:
449; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450; GCN-NEXT:    v_lshrrev_b32_e32 v0, v16, v0
451; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
452; GCN-NEXT:    v_lshrrev_b32_e32 v1, v17, v1
453; GCN-NEXT:    v_lshrrev_b32_e32 v2, v18, v2
454; GCN-NEXT:    v_lshrrev_b32_e32 v3, v19, v3
455; GCN-NEXT:    v_lshrrev_b32_e32 v4, v20, v4
456; GCN-NEXT:    v_lshrrev_b32_e32 v5, v21, v5
457; GCN-NEXT:    v_lshrrev_b32_e32 v6, v22, v6
458; GCN-NEXT:    v_lshrrev_b32_e32 v7, v23, v7
459; GCN-NEXT:    v_lshrrev_b32_e32 v8, v24, v8
460; GCN-NEXT:    v_lshrrev_b32_e32 v9, v25, v9
461; GCN-NEXT:    v_lshrrev_b32_e32 v10, v26, v10
462; GCN-NEXT:    v_lshrrev_b32_e32 v11, v27, v11
463; GCN-NEXT:    v_lshrrev_b32_e32 v12, v28, v12
464; GCN-NEXT:    v_lshrrev_b32_e32 v13, v29, v13
465; GCN-NEXT:    v_lshrrev_b32_e32 v14, v30, v14
466; GCN-NEXT:    s_waitcnt vmcnt(0)
467; GCN-NEXT:    v_lshrrev_b32_e32 v15, v16, v15
468; GCN-NEXT:    s_setpc_b64 s[30:31]
469;
470; GFX10-LABEL: v_lshr_v16i32:
471; GFX10:       ; %bb.0:
472; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
473; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
474; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v16, v0
475; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v17, v1
476; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v18, v2
477; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v19, v3
478; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v20, v4
479; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v21, v5
480; GFX10-NEXT:    v_lshrrev_b32_e32 v6, v22, v6
481; GFX10-NEXT:    v_lshrrev_b32_e32 v7, v23, v7
482; GFX10-NEXT:    v_lshrrev_b32_e32 v8, v24, v8
483; GFX10-NEXT:    v_lshrrev_b32_e32 v9, v25, v9
484; GFX10-NEXT:    v_lshrrev_b32_e32 v10, v26, v10
485; GFX10-NEXT:    v_lshrrev_b32_e32 v11, v27, v11
486; GFX10-NEXT:    v_lshrrev_b32_e32 v12, v28, v12
487; GFX10-NEXT:    v_lshrrev_b32_e32 v13, v29, v13
488; GFX10-NEXT:    v_lshrrev_b32_e32 v14, v30, v14
489; GFX10-NEXT:    s_waitcnt vmcnt(0)
490; GFX10-NEXT:    v_lshrrev_b32_e32 v15, v31, v15
491; GFX10-NEXT:    s_setpc_b64 s[30:31]
492;
493; GFX11-LABEL: v_lshr_v16i32:
494; GFX11:       ; %bb.0:
495; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
496; GFX11-NEXT:    scratch_load_b32 v31, off, s32
497; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v16, v0
498; GFX11-NEXT:    v_lshrrev_b32_e32 v1, v17, v1
499; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v18, v2
500; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v19, v3
501; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v20, v4
502; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v21, v5
503; GFX11-NEXT:    v_lshrrev_b32_e32 v6, v22, v6
504; GFX11-NEXT:    v_lshrrev_b32_e32 v7, v23, v7
505; GFX11-NEXT:    v_lshrrev_b32_e32 v8, v24, v8
506; GFX11-NEXT:    v_lshrrev_b32_e32 v9, v25, v9
507; GFX11-NEXT:    v_lshrrev_b32_e32 v10, v26, v10
508; GFX11-NEXT:    v_lshrrev_b32_e32 v11, v27, v11
509; GFX11-NEXT:    v_lshrrev_b32_e32 v12, v28, v12
510; GFX11-NEXT:    v_lshrrev_b32_e32 v13, v29, v13
511; GFX11-NEXT:    v_lshrrev_b32_e32 v14, v30, v14
512; GFX11-NEXT:    s_waitcnt vmcnt(0)
513; GFX11-NEXT:    v_lshrrev_b32_e32 v15, v31, v15
514; GFX11-NEXT:    s_setpc_b64 s[30:31]
515  %result = lshr <16 x i32> %value, %amount
516  ret <16 x i32> %result
517}
518
; Scalar (inreg) <16 x i32> variable-amount lshr.
; The checks expect sixteen s_lshr_b32 instructions and no memory access:
; values in s0-s15, amounts in s16-s31.
519define amdgpu_ps <16 x i32> @s_lshr_v16i32(<16 x i32> inreg %value, <16 x i32> inreg %amount) {
520; GCN-LABEL: s_lshr_v16i32:
521; GCN:       ; %bb.0:
522; GCN-NEXT:    s_lshr_b32 s0, s0, s16
523; GCN-NEXT:    s_lshr_b32 s1, s1, s17
524; GCN-NEXT:    s_lshr_b32 s2, s2, s18
525; GCN-NEXT:    s_lshr_b32 s3, s3, s19
526; GCN-NEXT:    s_lshr_b32 s4, s4, s20
527; GCN-NEXT:    s_lshr_b32 s5, s5, s21
528; GCN-NEXT:    s_lshr_b32 s6, s6, s22
529; GCN-NEXT:    s_lshr_b32 s7, s7, s23
530; GCN-NEXT:    s_lshr_b32 s8, s8, s24
531; GCN-NEXT:    s_lshr_b32 s9, s9, s25
532; GCN-NEXT:    s_lshr_b32 s10, s10, s26
533; GCN-NEXT:    s_lshr_b32 s11, s11, s27
534; GCN-NEXT:    s_lshr_b32 s12, s12, s28
535; GCN-NEXT:    s_lshr_b32 s13, s13, s29
536; GCN-NEXT:    s_lshr_b32 s14, s14, s30
537; GCN-NEXT:    s_lshr_b32 s15, s15, s31
538; GCN-NEXT:    ; return to shader part epilog
539;
540; GFX10PLUS-LABEL: s_lshr_v16i32:
541; GFX10PLUS:       ; %bb.0:
542; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s16
543; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s17
544; GFX10PLUS-NEXT:    s_lshr_b32 s2, s2, s18
545; GFX10PLUS-NEXT:    s_lshr_b32 s3, s3, s19
546; GFX10PLUS-NEXT:    s_lshr_b32 s4, s4, s20
547; GFX10PLUS-NEXT:    s_lshr_b32 s5, s5, s21
548; GFX10PLUS-NEXT:    s_lshr_b32 s6, s6, s22
549; GFX10PLUS-NEXT:    s_lshr_b32 s7, s7, s23
550; GFX10PLUS-NEXT:    s_lshr_b32 s8, s8, s24
551; GFX10PLUS-NEXT:    s_lshr_b32 s9, s9, s25
552; GFX10PLUS-NEXT:    s_lshr_b32 s10, s10, s26
553; GFX10PLUS-NEXT:    s_lshr_b32 s11, s11, s27
554; GFX10PLUS-NEXT:    s_lshr_b32 s12, s12, s28
555; GFX10PLUS-NEXT:    s_lshr_b32 s13, s13, s29
556; GFX10PLUS-NEXT:    s_lshr_b32 s14, s14, s30
557; GFX10PLUS-NEXT:    s_lshr_b32 s15, s15, s31
558; GFX10PLUS-NEXT:    ; return to shader part epilog
559  %result = lshr <16 x i32> %value, %amount
560  ret <16 x i32> %result
561}
562
; Variable-amount lshr on i16 in a regular function.
; Per the checks below, GFX6 masks both operands with 0xffff and uses a 32-bit
; shift, while GFX8 and later select a 16-bit shift (v_lshrrev_b16) directly.
563define i16 @v_lshr_i16(i16 %value, i16 %amount) {
564; GFX6-LABEL: v_lshr_i16:
565; GFX6:       ; %bb.0:
566; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
567; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
568; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
569; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
570; GFX6-NEXT:    s_setpc_b64 s[30:31]
571;
572; GFX8-LABEL: v_lshr_i16:
573; GFX8:       ; %bb.0:
574; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
575; GFX8-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
576; GFX8-NEXT:    s_setpc_b64 s[30:31]
577;
578; GFX9-LABEL: v_lshr_i16:
579; GFX9:       ; %bb.0:
580; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
581; GFX9-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
582; GFX9-NEXT:    s_setpc_b64 s[30:31]
583;
584; GFX10PLUS-LABEL: v_lshr_i16:
585; GFX10PLUS:       ; %bb.0:
586; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
587; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, v1, v0
588; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
589  %result = lshr i16 %value, %amount
590  ret i16 %result
591}
592
; lshr of i16 by the constant 15 (the largest in-range amount for i16).
; Per the checks below, GFX6 emits a single v_bfe_u32 (operands 15, 1), while
; GFX8 and later use a 16-bit shift with the 15 as an immediate operand.
593define i16 @v_lshr_i16_15(i16 %value) {
594; GFX6-LABEL: v_lshr_i16_15:
595; GFX6:       ; %bb.0:
596; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
597; GFX6-NEXT:    v_bfe_u32 v0, v0, 15, 1
598; GFX6-NEXT:    s_setpc_b64 s[30:31]
599;
600; GFX8-LABEL: v_lshr_i16_15:
601; GFX8:       ; %bb.0:
602; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
603; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 15, v0
604; GFX8-NEXT:    s_setpc_b64 s[30:31]
605;
606; GFX9-LABEL: v_lshr_i16_15:
607; GFX9:       ; %bb.0:
608; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
609; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 15, v0
610; GFX9-NEXT:    s_setpc_b64 s[30:31]
611;
612; GFX10PLUS-LABEL: v_lshr_i16_15:
613; GFX10PLUS:       ; %bb.0:
614; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
615; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 15, v0
616; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
617  %result = lshr i16 %value, 15
618  ret i16 %result
619}
620
; Scalar (inreg) variable-amount lshr on i16.
; Only the value (s0) is masked with 0xffff; the amount in s1 feeds a 32-bit
; s_lshr_b32 directly on every target.
621define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) {
622; GCN-LABEL: s_lshr_i16:
623; GCN:       ; %bb.0:
624; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
625; GCN-NEXT:    s_lshr_b32 s0, s0, s1
626; GCN-NEXT:    ; return to shader part epilog
627;
628; GFX10PLUS-LABEL: s_lshr_i16:
629; GFX10PLUS:       ; %bb.0:
630; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
631; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
632; GFX10PLUS-NEXT:    ; return to shader part epilog
633  %result = lshr i16 %value, %amount
634  ret i16 %result
635}
636
; Scalar (inreg) lshr of i16 by the constant 15.
; All targets are expected to emit one s_bfe_u32 with immediate 0x1000f.
637define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) {
638; GCN-LABEL: s_lshr_i16_15:
639; GCN:       ; %bb.0:
640; GCN-NEXT:    s_bfe_u32 s0, s0, 0x1000f
641; GCN-NEXT:    ; return to shader part epilog
642;
643; GFX10PLUS-LABEL: s_lshr_i16_15:
644; GFX10PLUS:       ; %bb.0:
645; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x1000f
646; GFX10PLUS-NEXT:    ; return to shader part epilog
647  %result = lshr i16 %value, 15
648  ret i16 %result
649}
650
; Mixed operands on i16: the value is inreg (s0) and the amount is a regular
; argument (v0). GFX6 masks both with 0xffff and uses the non-reversed 32-bit
; v_lshr_b32_e32; GFX8 and later use a 16-bit shift with s0 as the last source.
; The i16 result is bitcast to half for the return.
651define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
652; GFX6-LABEL: lshr_i16_sv:
653; GFX6:       ; %bb.0:
654; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
655; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
656; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
657; GFX6-NEXT:    ; return to shader part epilog
658;
659; GFX8-LABEL: lshr_i16_sv:
660; GFX8:       ; %bb.0:
661; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
662; GFX8-NEXT:    ; return to shader part epilog
663;
664; GFX9-LABEL: lshr_i16_sv:
665; GFX9:       ; %bb.0:
666; GFX9-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
667; GFX9-NEXT:    ; return to shader part epilog
668;
669; GFX10PLUS-LABEL: lshr_i16_sv:
670; GFX10PLUS:       ; %bb.0:
671; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, v0, s0
672; GFX10PLUS-NEXT:    ; return to shader part epilog
673  %result = lshr i16 %value, %amount
674  %cast = bitcast i16 %result to half
675  ret half %cast
676}
677
; Mixed operands on i16, mirrored: the value is a regular argument (v0) and the
; amount is inreg (s0). GFX6 masks both with 0xffff and uses a 32-bit
; v_lshrrev_b32_e32; GFX8 and later take s0 directly as the first operand of a
; 16-bit shift. The i16 result is bitcast to half for the return.
678define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) {
679; GFX6-LABEL: lshr_i16_vs:
680; GFX6:       ; %bb.0:
681; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
682; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
683; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
684; GFX6-NEXT:    ; return to shader part epilog
685;
686; GFX8-LABEL: lshr_i16_vs:
687; GFX8:       ; %bb.0:
688; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s0, v0
689; GFX8-NEXT:    ; return to shader part epilog
690;
691; GFX9-LABEL: lshr_i16_vs:
692; GFX9:       ; %bb.0:
693; GFX9-NEXT:    v_lshrrev_b16_e32 v0, s0, v0
694; GFX9-NEXT:    ; return to shader part epilog
695;
696; GFX10PLUS-LABEL: lshr_i16_vs:
697; GFX10PLUS:       ; %bb.0:
698; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, s0, v0
699; GFX10PLUS-NEXT:    ; return to shader part epilog
700  %result = lshr i16 %value, %amount
701  %cast = bitcast i16 %result to half
702  ret half %cast
703}
704
; <2 x i16> variable-amount lshr in a regular function.
; Per the checks below:
;  - GFX6 has each element in its own register (values v0-v1, amounts v2-v3),
;    masks every operand with 0xffff and shifts with 32-bit instructions.
;  - GFX8 has both elements in one register; it shifts the low half with
;    v_lshrrev_b16_e32, the high half with an SDWA WORD_1 shift, and ORs them.
;  - GFX9 and later use a single packed v_pk_lshrrev_b16.
705define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
706; GFX6-LABEL: v_lshr_v2i16:
707; GFX6:       ; %bb.0:
708; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
709; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
710; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
711; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
712; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
713; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
714; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
715; GFX6-NEXT:    s_setpc_b64 s[30:31]
716;
717; GFX8-LABEL: v_lshr_v2i16:
718; GFX8:       ; %bb.0:
719; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
720; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v1, v0
721; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
722; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
723; GFX8-NEXT:    s_setpc_b64 s[30:31]
724;
725; GFX9-LABEL: v_lshr_v2i16:
726; GFX9:       ; %bb.0:
727; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
728; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
729; GFX9-NEXT:    s_setpc_b64 s[30:31]
730;
731; GFX10PLUS-LABEL: v_lshr_v2i16:
732; GFX10PLUS:       ; %bb.0:
733; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
734; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
735; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
736  %result = lshr <2 x i16> %value, %amount
737  ret <2 x i16> %result
738}
739
; <2 x i16> lshr by a splat of the constant 15.
; Per the checks below:
;  - GFX6 emits one v_bfe_u32 (operands 15, 1) per element register.
;  - GFX8 shifts the low half with an immediate 15, materializes 15 in v2 for
;    the SDWA shift of the high half, and ORs the two halves.
;  - GFX9 and later use one v_pk_lshrrev_b16 with an immediate 15 and
;    op_sel_hi:[0,1].
740define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
741; GFX6-LABEL: v_lshr_v2i16_15:
742; GFX6:       ; %bb.0:
743; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
744; GFX6-NEXT:    v_bfe_u32 v0, v0, 15, 1
745; GFX6-NEXT:    v_bfe_u32 v1, v1, 15, 1
746; GFX6-NEXT:    s_setpc_b64 s[30:31]
747;
748; GFX8-LABEL: v_lshr_v2i16_15:
749; GFX8:       ; %bb.0:
750; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
751; GFX8-NEXT:    v_mov_b32_e32 v2, 15
752; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 15, v0
753; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
754; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
755; GFX8-NEXT:    s_setpc_b64 s[30:31]
756;
757; GFX9-LABEL: v_lshr_v2i16_15:
758; GFX9:       ; %bb.0:
759; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
760; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1]
761; GFX9-NEXT:    s_setpc_b64 s[30:31]
762;
763; GFX10PLUS-LABEL: v_lshr_v2i16_15:
764; GFX10PLUS:       ; %bb.0:
765; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
766; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1]
767; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
768  %result = lshr <2 x i16> %value, <i16 15, i16 15>
769  ret <2 x i16> %result
770}
771
; Scalar (inreg) <2 x i16> variable-amount lshr, returned as i32 via bitcast.
; No target uses a packed scalar shift; each half is shifted with s_lshr_b32.
; Per the checks below:
;  - GFX6 has the elements in separate registers (values s0-s1, amounts s2-s3),
;    masks the values, shifts, then repacks with s_lshl_b32 by 16 and s_or_b32.
;  - GFX8 splits the packed registers with s_lshr_b32 by 16 and s_and_b32,
;    shifts each half, and repacks with shift, mask and or.
;  - GFX9 and later split the same way but repack with s_pack_ll_b32_b16.
772define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
773; GFX6-LABEL: s_lshr_v2i16:
774; GFX6:       ; %bb.0:
775; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
776; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
777; GFX6-NEXT:    s_lshr_b32 s1, s1, s3
778; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
779; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
780; GFX6-NEXT:    s_or_b32 s0, s0, s1
781; GFX6-NEXT:    ; return to shader part epilog
782;
783; GFX8-LABEL: s_lshr_v2i16:
784; GFX8:       ; %bb.0:
785; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
786; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
787; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
788; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
789; GFX8-NEXT:    s_lshr_b32 s1, s2, s3
790; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
791; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
792; GFX8-NEXT:    s_or_b32 s0, s1, s0
793; GFX8-NEXT:    ; return to shader part epilog
794;
795; GFX9-LABEL: s_lshr_v2i16:
796; GFX9:       ; %bb.0:
797; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
798; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
799; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
800; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
801; GFX9-NEXT:    s_lshr_b32 s1, s2, s3
802; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
803; GFX9-NEXT:    ; return to shader part epilog
804;
805; GFX10PLUS-LABEL: s_lshr_v2i16:
806; GFX10PLUS:       ; %bb.0:
807; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
808; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
809; GFX10PLUS-NEXT:    s_lshr_b32 s3, s1, 16
810; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
811; GFX10PLUS-NEXT:    s_lshr_b32 s1, s2, s3
812; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
813; GFX10PLUS-NEXT:    ; return to shader part epilog
814  %result = lshr <2 x i16> %value, %amount
815  %cast = bitcast <2 x i16> %result to i32
816  ret i32 %cast
817}
818
; Mixed operands on <2 x i16>: the value is inreg and the amount is a regular
; argument; the result is bitcast to float for the return.
; Per the checks below:
;  - GFX6 has the elements in separate registers (values s0-s1, amounts v0-v1),
;    masks all operands, shifts with v_lshr_b32_e32, and packs with
;    v_lshlrev_b32 by 16 plus v_or_b32.
;  - GFX8 extracts the high half of s0 into s1, copies it to v2 for the SDWA
;    shift of the high half, and ORs it with the low-half result.
;  - GFX9 and later use one v_pk_lshrrev_b16 with s0 as the last source.
819define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
820; GFX6-LABEL: lshr_v2i16_sv:
821; GFX6:       ; %bb.0:
822; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
823; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
824; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
825; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
826; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
827; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
828; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
829; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
830; GFX6-NEXT:    ; return to shader part epilog
831;
832; GFX8-LABEL: lshr_v2i16_sv:
833; GFX8:       ; %bb.0:
834; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
835; GFX8-NEXT:    v_mov_b32_e32 v2, s1
836; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v0, s0
837; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
838; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
839; GFX8-NEXT:    ; return to shader part epilog
840;
841; GFX9-LABEL: lshr_v2i16_sv:
842; GFX9:       ; %bb.0:
843; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v0, s0
844; GFX9-NEXT:    ; return to shader part epilog
845;
846; GFX10PLUS-LABEL: lshr_v2i16_sv:
847; GFX10PLUS:       ; %bb.0:
848; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v0, s0
849; GFX10PLUS-NEXT:    ; return to shader part epilog
850  %result = lshr <2 x i16> %value, %amount
851  %cast = bitcast <2 x i16> %result to float
852  ret float %cast
853}
854
; lshr of <2 x i16> in an amdgpu_ps function with a plain %value and an inreg
; %amount (the mirror image of lshr_v2i16_sv); the checks show the value in
; VGPRs and the amount in SGPRs. The result is bitcast to float for the return.
; Expected lowering per the checks below: GFX6 masks and shifts each element
; with 32-bit ops and repacks; GFX8 copies the high-half amount to a VGPR and
; pairs a plain 16-bit shift with an SDWA shift; GFX9 and GFX10PLUS use a
; single v_pk_lshrrev_b16 with the SGPR as the shift amount operand.
855define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
856; GFX6-LABEL: lshr_v2i16_vs:
857; GFX6:       ; %bb.0:
858; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
859; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
860; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
861; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
862; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
863; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s0, v1
864; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
865; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
866; GFX6-NEXT:    ; return to shader part epilog
867;
868; GFX8-LABEL: lshr_v2i16_vs:
869; GFX8:       ; %bb.0:
870; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
871; GFX8-NEXT:    v_mov_b32_e32 v2, s1
872; GFX8-NEXT:    v_lshrrev_b16_e32 v1, s0, v0
873; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
874; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
875; GFX8-NEXT:    ; return to shader part epilog
876;
877; GFX9-LABEL: lshr_v2i16_vs:
878; GFX9:       ; %bb.0:
879; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s0, v0
880; GFX9-NEXT:    ; return to shader part epilog
881;
882; GFX10PLUS-LABEL: lshr_v2i16_vs:
883; GFX10PLUS:       ; %bb.0:
884; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, s0, v0
885; GFX10PLUS-NEXT:    ; return to shader part epilog
886  %result = lshr <2 x i16> %value, %amount
887  %cast = bitcast <2 x i16> %result to float
888  ret float %cast
889}
890
891; FIXME: the <3 x i16> tests below are commented out (disabled).
892; define <3 x i16> @v_lshr_v3i16(<3 x i16> %value, <3 x i16> %amount) {
893;   %result = lshr <3 x i16> %value, %amount
894;   ret <3 x i16> %result
895; }
896
897; define amdgpu_ps <3 x i16> @s_lshr_v3i16(<3 x i16> inreg %value, <3 x i16> inreg %amount) {
898;   %result = lshr <3 x i16> %value, %amount
899;   ret <3 x i16> %result
900; }
901
; lshr of <4 x i16> with both operands as plain (VGPR) arguments in a regular
; function; the result is bitcast to <2 x float> for the return.
; Expected lowering per the checks below: GFX6 performs four masked 32-bit
; shifts (one register per element) and repacks pairs with shift-left-16 + or;
; GFX8 uses one plain 16-bit shift plus one SDWA shift per 32-bit register;
; GFX9 and GFX10PLUS use two v_pk_lshrrev_b16.
902define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
903; GFX6-LABEL: v_lshr_v4i16:
904; GFX6:       ; %bb.0:
905; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
906; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
907; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
908; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
909; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
910; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
911; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
912; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
913; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
914; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
915; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
916; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
917; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
918; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
919; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
920; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
921; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
922; GFX6-NEXT:    s_setpc_b64 s[30:31]
923;
924; GFX8-LABEL: v_lshr_v4i16:
925; GFX8:       ; %bb.0:
926; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
927; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v2, v0
928; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
929; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v3, v1
930; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
931; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
932; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
933; GFX8-NEXT:    s_setpc_b64 s[30:31]
934;
935; GFX9-LABEL: v_lshr_v4i16:
936; GFX9:       ; %bb.0:
937; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
938; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
939; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
940; GFX9-NEXT:    s_setpc_b64 s[30:31]
941;
942; GFX10PLUS-LABEL: v_lshr_v4i16:
943; GFX10PLUS:       ; %bb.0:
944; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
945; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
946; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
947; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
948  %result = lshr <4 x i16> %value, %amount
949  %cast = bitcast <4 x i16> %result to <2 x float>
950  ret <2 x float> %cast
951}
952
; lshr of <4 x i16> in an amdgpu_ps function with both operands inreg; the
; result is bitcast to <2 x i32> for the return. Every expected instruction in
; the checks below is a scalar (s_*) op.
; GFX6 masks each element (one SGPR per element), shifts it, and repacks with
; s_lshl_b32 + s_or_b32; GFX8 first splits the packed halves with s_lshr_b32 by
; 16 / s_and_b32 and repacks the same way; GFX9 and GFX10PLUS repack with
; s_pack_ll_b32_b16 instead.
953define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
954; GFX6-LABEL: s_lshr_v4i16:
955; GFX6:       ; %bb.0:
956; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
957; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
958; GFX6-NEXT:    s_lshr_b32 s1, s1, s5
959; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
960; GFX6-NEXT:    s_lshr_b32 s0, s0, s4
961; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
962; GFX6-NEXT:    s_lshr_b32 s3, s3, s7
963; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
964; GFX6-NEXT:    s_lshr_b32 s2, s2, s6
965; GFX6-NEXT:    s_or_b32 s0, s0, s1
966; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
967; GFX6-NEXT:    s_or_b32 s1, s2, s1
968; GFX6-NEXT:    ; return to shader part epilog
969;
970; GFX8-LABEL: s_lshr_v4i16:
971; GFX8:       ; %bb.0:
972; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
973; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
974; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
975; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
976; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
977; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
978; GFX8-NEXT:    s_lshr_b32 s0, s0, s2
979; GFX8-NEXT:    s_lshr_b32 s2, s4, s6
980; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
981; GFX8-NEXT:    s_lshr_b32 s3, s5, s7
982; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
983; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
984; GFX8-NEXT:    s_or_b32 s0, s2, s0
985; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
986; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
987; GFX8-NEXT:    s_or_b32 s1, s2, s1
988; GFX8-NEXT:    ; return to shader part epilog
989;
990; GFX9-LABEL: s_lshr_v4i16:
991; GFX9:       ; %bb.0:
992; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
993; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
994; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
995; GFX9-NEXT:    s_lshr_b32 s0, s0, s2
996; GFX9-NEXT:    s_lshr_b32 s2, s4, s5
997; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
998; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
999; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
1000; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
1001; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
1002; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
1003; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
1004; GFX9-NEXT:    ; return to shader part epilog
1005;
1006; GFX10PLUS-LABEL: s_lshr_v4i16:
1007; GFX10PLUS:       ; %bb.0:
1008; GFX10PLUS-NEXT:    s_lshr_b32 s4, s0, 16
1009; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
1010; GFX10PLUS-NEXT:    s_lshr_b32 s5, s2, 16
1011; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s2
1012; GFX10PLUS-NEXT:    s_lshr_b32 s2, s4, s5
1013; GFX10PLUS-NEXT:    s_lshr_b32 s4, s1, 16
1014; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
1015; GFX10PLUS-NEXT:    s_lshr_b32 s5, s3, 16
1016; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s3
1017; GFX10PLUS-NEXT:    s_lshr_b32 s3, s4, s5
1018; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
1019; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
1020; GFX10PLUS-NEXT:    ; return to shader part epilog
1021  %result = lshr <4 x i16> %value, %amount
1022  %cast = bitcast <4 x i16> %result to <2 x i32>
1023  ret <2 x i32> %cast
1024}
1025
1026; FIXME: the <5 x i16> and <6 x i16> tests below are commented out (disabled).
1027; define <5 x i16> @v_lshr_v5i16(<5 x i16> %value, <5 x i16> %amount) {
1028;   %result = lshr <5 x i16> %value, %amount
1029;   ret <5 x i16> %result
1030; }
1031
1032; define amdgpu_ps <5 x i16> @s_lshr_v5i16(<5 x i16> inreg %value, <5 x i16> inreg %amount) {
1033;   %result = lshr <5 x i16> %value, %amount
1034;   ret <5 x i16> %result
1035; }
1036
1037; define <3 x float> @v_lshr_v6i16(<6 x i16> %value, <6 x i16> %amount) {
1038;   %result = lshr <6 x i16> %value, %amount
1039;   %cast = bitcast <6 x i16> %result to <3 x float>
1040;   ret <3 x float> %cast
1041; }
1042
1043; define amdgpu_ps <3 x i32> @s_lshr_v6i16(<6 x i16> inreg %value, <6 x i16> inreg %amount) {
1044;   %result = lshr <6 x i16> %value, %amount
1045;   %cast = bitcast <6 x i16> %result to <3 x i32>
1046;   ret <3 x i32> %cast
1047; }
1048
; lshr of <8 x i16> with both operands as plain (VGPR) arguments in a regular
; function; the result is bitcast to <4 x float> for the return.
; Expected lowering per the checks below: GFX6 performs eight masked 32-bit
; shifts (one register per element) and repacks pairs with shift-left-16 + or;
; GFX8 uses one plain 16-bit shift plus one SDWA shift per 32-bit register;
; GFX9 and GFX10PLUS use four v_pk_lshrrev_b16.
1049define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
1050; GFX6-LABEL: v_lshr_v8i16:
1051; GFX6:       ; %bb.0:
1052; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1053; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v8
1054; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1055; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v8, v0
1056; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v9
1057; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
1058; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v8, v1
1059; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v10
1060; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1061; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v8, v2
1062; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v11
1063; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
1064; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v8, v3
1065; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v12
1066; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
1067; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v8, v4
1068; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v13
1069; GFX6-NEXT:    v_and_b32_e32 v5, 0xffff, v5
1070; GFX6-NEXT:    v_lshrrev_b32_e32 v5, v8, v5
1071; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v14
1072; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
1073; GFX6-NEXT:    v_lshrrev_b32_e32 v6, v8, v6
1074; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v15
1075; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff, v7
1076; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1077; GFX6-NEXT:    v_lshrrev_b32_e32 v7, v8, v7
1078; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
1079; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
1080; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
1081; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
1082; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
1083; GFX6-NEXT:    v_or_b32_e32 v2, v4, v2
1084; GFX6-NEXT:    v_or_b32_e32 v3, v6, v3
1085; GFX6-NEXT:    s_setpc_b64 s[30:31]
1086;
1087; GFX8-LABEL: v_lshr_v8i16:
1088; GFX8:       ; %bb.0:
1089; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1090; GFX8-NEXT:    v_lshrrev_b16_e32 v8, v4, v0
1091; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1092; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v5, v1
1093; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1094; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
1095; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v6, v2
1096; GFX8-NEXT:    v_lshrrev_b16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1097; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
1098; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v7, v3
1099; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
1100; GFX8-NEXT:    v_or_b32_e32 v0, v8, v0
1101; GFX8-NEXT:    v_or_b32_e32 v3, v4, v3
1102; GFX8-NEXT:    s_setpc_b64 s[30:31]
1103;
1104; GFX9-LABEL: v_lshr_v8i16:
1105; GFX9:       ; %bb.0:
1106; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1107; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v4, v0
1108; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v5, v1
1109; GFX9-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
1110; GFX9-NEXT:    v_pk_lshrrev_b16 v3, v7, v3
1111; GFX9-NEXT:    s_setpc_b64 s[30:31]
1112;
1113; GFX10PLUS-LABEL: v_lshr_v8i16:
1114; GFX10PLUS:       ; %bb.0:
1115; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1116; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v4, v0
1117; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v1, v5, v1
1118; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v2, v6, v2
1119; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v3, v7, v3
1120; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1121  %result = lshr <8 x i16> %value, %amount
1122  %cast = bitcast <8 x i16> %result to <4 x float>
1123  ret <4 x float> %cast
1124}
1125
; lshr of <8 x i16> in an amdgpu_ps function with both operands inreg; the
; result is bitcast to <4 x i32> for the return. Every expected instruction in
; the checks below is a scalar (s_*) op.
; GFX6 masks each element (one SGPR per element), shifts it, and repacks with
; s_lshl_b32 + s_or_b32; GFX8 first splits the packed halves with s_lshr_b32 by
; 16 / s_and_b32 and repacks the same way; GFX9 and GFX10PLUS repack with
; s_pack_ll_b32_b16 instead.
1126define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) {
1127; GFX6-LABEL: s_lshr_v8i16:
1128; GFX6:       ; %bb.0:
1129; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
1130; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
1131; GFX6-NEXT:    s_lshr_b32 s1, s1, s9
1132; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
1133; GFX6-NEXT:    s_lshr_b32 s0, s0, s8
1134; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
1135; GFX6-NEXT:    s_lshr_b32 s3, s3, s11
1136; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
1137; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
1138; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
1139; GFX6-NEXT:    s_lshr_b32 s2, s2, s10
1140; GFX6-NEXT:    s_and_b32 s4, s4, 0xffff
1141; GFX6-NEXT:    s_lshr_b32 s5, s5, s13
1142; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
1143; GFX6-NEXT:    s_lshr_b32 s7, s7, s15
1144; GFX6-NEXT:    s_or_b32 s0, s0, s1
1145; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
1146; GFX6-NEXT:    s_lshr_b32 s4, s4, s12
1147; GFX6-NEXT:    s_lshr_b32 s6, s6, s14
1148; GFX6-NEXT:    s_or_b32 s1, s2, s1
1149; GFX6-NEXT:    s_lshl_b32 s2, s5, 16
1150; GFX6-NEXT:    s_lshl_b32 s3, s7, 16
1151; GFX6-NEXT:    s_or_b32 s2, s4, s2
1152; GFX6-NEXT:    s_or_b32 s3, s6, s3
1153; GFX6-NEXT:    ; return to shader part epilog
1154;
1155; GFX8-LABEL: s_lshr_v8i16:
1156; GFX8:       ; %bb.0:
1157; GFX8-NEXT:    s_lshr_b32 s8, s0, 16
1158; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
1159; GFX8-NEXT:    s_lshr_b32 s12, s4, 16
1160; GFX8-NEXT:    s_lshr_b32 s9, s1, 16
1161; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
1162; GFX8-NEXT:    s_lshr_b32 s13, s5, 16
1163; GFX8-NEXT:    s_lshr_b32 s0, s0, s4
1164; GFX8-NEXT:    s_lshr_b32 s4, s8, s12
1165; GFX8-NEXT:    s_lshr_b32 s10, s2, 16
1166; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
1167; GFX8-NEXT:    s_lshr_b32 s14, s6, 16
1168; GFX8-NEXT:    s_lshr_b32 s1, s1, s5
1169; GFX8-NEXT:    s_lshr_b32 s5, s9, s13
1170; GFX8-NEXT:    s_lshl_b32 s4, s4, 16
1171; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
1172; GFX8-NEXT:    s_lshr_b32 s11, s3, 16
1173; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
1174; GFX8-NEXT:    s_lshr_b32 s15, s7, 16
1175; GFX8-NEXT:    s_lshr_b32 s2, s2, s6
1176; GFX8-NEXT:    s_lshr_b32 s6, s10, s14
1177; GFX8-NEXT:    s_or_b32 s0, s4, s0
1178; GFX8-NEXT:    s_lshl_b32 s4, s5, 16
1179; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
1180; GFX8-NEXT:    s_lshr_b32 s3, s3, s7
1181; GFX8-NEXT:    s_lshr_b32 s7, s11, s15
1182; GFX8-NEXT:    s_or_b32 s1, s4, s1
1183; GFX8-NEXT:    s_lshl_b32 s4, s6, 16
1184; GFX8-NEXT:    s_and_b32 s2, s2, 0xffff
1185; GFX8-NEXT:    s_or_b32 s2, s4, s2
1186; GFX8-NEXT:    s_lshl_b32 s4, s7, 16
1187; GFX8-NEXT:    s_and_b32 s3, s3, 0xffff
1188; GFX8-NEXT:    s_or_b32 s3, s4, s3
1189; GFX8-NEXT:    ; return to shader part epilog
1190;
1191; GFX9-LABEL: s_lshr_v8i16:
1192; GFX9:       ; %bb.0:
1193; GFX9-NEXT:    s_lshr_b32 s8, s0, 16
1194; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
1195; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
1196; GFX9-NEXT:    s_lshr_b32 s0, s0, s4
1197; GFX9-NEXT:    s_lshr_b32 s4, s8, s9
1198; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
1199; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
1200; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
1201; GFX9-NEXT:    s_lshr_b32 s8, s5, 16
1202; GFX9-NEXT:    s_lshr_b32 s1, s1, s5
1203; GFX9-NEXT:    s_lshr_b32 s4, s4, s8
1204; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
1205; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
1206; GFX9-NEXT:    s_and_b32 s2, s2, 0xffff
1207; GFX9-NEXT:    s_lshr_b32 s5, s6, 16
1208; GFX9-NEXT:    s_lshr_b32 s2, s2, s6
1209; GFX9-NEXT:    s_lshr_b32 s4, s4, s5
1210; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
1211; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
1212; GFX9-NEXT:    s_and_b32 s3, s3, 0xffff
1213; GFX9-NEXT:    s_lshr_b32 s5, s7, 16
1214; GFX9-NEXT:    s_lshr_b32 s3, s3, s7
1215; GFX9-NEXT:    s_lshr_b32 s4, s4, s5
1216; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
1217; GFX9-NEXT:    ; return to shader part epilog
1218;
1219; GFX10PLUS-LABEL: s_lshr_v8i16:
1220; GFX10PLUS:       ; %bb.0:
1221; GFX10PLUS-NEXT:    s_lshr_b32 s8, s0, 16
1222; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
1223; GFX10PLUS-NEXT:    s_lshr_b32 s9, s4, 16
1224; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s4
1225; GFX10PLUS-NEXT:    s_lshr_b32 s4, s8, s9
1226; GFX10PLUS-NEXT:    s_lshr_b32 s8, s1, 16
1227; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
1228; GFX10PLUS-NEXT:    s_lshr_b32 s9, s5, 16
1229; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s5
1230; GFX10PLUS-NEXT:    s_lshr_b32 s5, s8, s9
1231; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
1232; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
1233; GFX10PLUS-NEXT:    s_lshr_b32 s4, s2, 16
1234; GFX10PLUS-NEXT:    s_and_b32 s2, s2, 0xffff
1235; GFX10PLUS-NEXT:    s_lshr_b32 s5, s6, 16
1236; GFX10PLUS-NEXT:    s_lshr_b32 s2, s2, s6
1237; GFX10PLUS-NEXT:    s_lshr_b32 s4, s4, s5
1238; GFX10PLUS-NEXT:    s_lshr_b32 s5, s3, 16
1239; GFX10PLUS-NEXT:    s_and_b32 s3, s3, 0xffff
1240; GFX10PLUS-NEXT:    s_lshr_b32 s6, s7, 16
1241; GFX10PLUS-NEXT:    s_lshr_b32 s3, s3, s7
1242; GFX10PLUS-NEXT:    s_lshr_b32 s5, s5, s6
1243; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
1244; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
1245; GFX10PLUS-NEXT:    ; return to shader part epilog
1246  %result = lshr <8 x i16> %value, %amount
1247  %cast = bitcast <8 x i16> %result to <4 x i32>
1248  ret <4 x i32> %cast
1249}
1250
; lshr of i64 by a variable i64 amount, both plain (VGPR) arguments.
; The checks expect a single 64-bit shift that reads only v2 of the amount:
; GFX6 uses v_lshr_b64 (value operand first), the other targets use
; v_lshrrev_b64 (amount operand first).
1251define i64 @v_lshr_i64(i64 %value, i64 %amount) {
1252; GFX6-LABEL: v_lshr_i64:
1253; GFX6:       ; %bb.0:
1254; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1255; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], v2
1256; GFX6-NEXT:    s_setpc_b64 s[30:31]
1257;
1258; GFX8-LABEL: v_lshr_i64:
1259; GFX8:       ; %bb.0:
1260; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1261; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
1262; GFX8-NEXT:    s_setpc_b64 s[30:31]
1263;
1264; GFX9-LABEL: v_lshr_i64:
1265; GFX9:       ; %bb.0:
1266; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1267; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
1268; GFX9-NEXT:    s_setpc_b64 s[30:31]
1269;
1270; GFX10PLUS-LABEL: v_lshr_i64:
1271; GFX10PLUS:       ; %bb.0:
1272; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1273; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], v2, v[0:1]
1274; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1275  %result = lshr i64 %value, %amount
1276  ret i64 %result
1277}
1278
; lshr of a VGPR i64 by the constant 63.
; The checks expect the 64-bit shift to be narrowed: v1 is shifted right by 31
; with a 32-bit op into v0, and v1 is then set to 0. The expected sequence is
; the same under both the GCN and GFX10PLUS prefixes.
1279define i64 @v_lshr_i64_63(i64 %value) {
1280; GCN-LABEL: v_lshr_i64_63:
1281; GCN:       ; %bb.0:
1282; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1283; GCN-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
1284; GCN-NEXT:    v_mov_b32_e32 v1, 0
1285; GCN-NEXT:    s_setpc_b64 s[30:31]
1286;
1287; GFX10PLUS-LABEL: v_lshr_i64_63:
1288; GFX10PLUS:       ; %bb.0:
1289; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1290; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 31, v1
1291; GFX10PLUS-NEXT:    v_mov_b32_e32 v1, 0
1292; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1293  %result = lshr i64 %value, 63
1294  ret i64 %result
1295}
1296
; lshr of a VGPR i64 by the constant 33.
; The checks expect the 64-bit shift to be narrowed: v1 is shifted right by 1
; with a 32-bit op into v0, and v1 is then set to 0. The expected sequence is
; the same under both the GCN and GFX10PLUS prefixes.
1297define i64 @v_lshr_i64_33(i64 %value) {
1298; GCN-LABEL: v_lshr_i64_33:
1299; GCN:       ; %bb.0:
1300; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1301; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v1
1302; GCN-NEXT:    v_mov_b32_e32 v1, 0
1303; GCN-NEXT:    s_setpc_b64 s[30:31]
1304;
1305; GFX10PLUS-LABEL: v_lshr_i64_33:
1306; GFX10PLUS:       ; %bb.0:
1307; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1308; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 1, v1
1309; GFX10PLUS-NEXT:    v_mov_b32_e32 v1, 0
1310; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1311  %result = lshr i64 %value, 33
1312  ret i64 %result
1313}
1314
; lshr of a VGPR i64 by the constant 32.
; The checks expect no shift instruction at all: v1 is copied into v0 and v1
; is set to 0. GFX11 has its own check block because the two moves are
; expected as a single v_dual_mov_b32 there.
1315define i64 @v_lshr_i64_32(i64 %value) {
1316; GCN-LABEL: v_lshr_i64_32:
1317; GCN:       ; %bb.0:
1318; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1319; GCN-NEXT:    v_mov_b32_e32 v0, v1
1320; GCN-NEXT:    v_mov_b32_e32 v1, 0
1321; GCN-NEXT:    s_setpc_b64 s[30:31]
1322;
1323; GFX10-LABEL: v_lshr_i64_32:
1324; GFX10:       ; %bb.0:
1325; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1326; GFX10-NEXT:    v_mov_b32_e32 v0, v1
1327; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1328; GFX10-NEXT:    s_setpc_b64 s[30:31]
1329;
1330; GFX11-LABEL: v_lshr_i64_32:
1331; GFX11:       ; %bb.0:
1332; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1333; GFX11-NEXT:    v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, 0
1334; GFX11-NEXT:    s_setpc_b64 s[30:31]
1335  %result = lshr i64 %value, 32
1336  ret i64 %result
1337}
1338
; lshr of a VGPR i64 by the constant 31.
; Unlike the 32/33/63 cases, the checks expect a full 64-bit shift with the
; amount as an inline constant: v_lshr_b64 on GFX6, v_lshrrev_b64 elsewhere.
1339define i64 @v_lshr_i64_31(i64 %value) {
1340; GFX6-LABEL: v_lshr_i64_31:
1341; GFX6:       ; %bb.0:
1342; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1343; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 31
1344; GFX6-NEXT:    s_setpc_b64 s[30:31]
1345;
1346; GFX8-LABEL: v_lshr_i64_31:
1347; GFX8:       ; %bb.0:
1348; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1349; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
1350; GFX8-NEXT:    s_setpc_b64 s[30:31]
1351;
1352; GFX9-LABEL: v_lshr_i64_31:
1353; GFX9:       ; %bb.0:
1354; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1355; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
1356; GFX9-NEXT:    s_setpc_b64 s[30:31]
1357;
1358; GFX10PLUS-LABEL: v_lshr_i64_31:
1359; GFX10PLUS:       ; %bb.0:
1360; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1361; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
1362; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1363  %result = lshr i64 %value, 31
1364  ret i64 %result
1365}
1366
; lshr of i64 by a variable i64 amount in an amdgpu_ps function with both
; operands inreg.
; The checks expect a single s_lshr_b64 that reads only s2 of the amount, with
; the same sequence under both the GCN and GFX10PLUS prefixes.
1367define amdgpu_ps i64 @s_lshr_i64(i64 inreg %value, i64 inreg %amount) {
1368; GCN-LABEL: s_lshr_i64:
1369; GCN:       ; %bb.0:
1370; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
1371; GCN-NEXT:    ; return to shader part epilog
1372;
1373; GFX10PLUS-LABEL: s_lshr_i64:
1374; GFX10PLUS:       ; %bb.0:
1375; GFX10PLUS-NEXT:    s_lshr_b64 s[0:1], s[0:1], s2
1376; GFX10PLUS-NEXT:    ; return to shader part epilog
1377  %result = lshr i64 %value, %amount
1378  ret i64 %result
1379}
1380
; lshr of an inreg i64 by the constant 63 in an amdgpu_ps function.
; The checks expect the shift to be narrowed to 32 bits: s1 is shifted right
; by 31 into s0, and s1 is then set to 0.
1381define amdgpu_ps i64 @s_lshr_i64_63(i64 inreg %value) {
1382; GCN-LABEL: s_lshr_i64_63:
1383; GCN:       ; %bb.0:
1384; GCN-NEXT:    s_lshr_b32 s0, s1, 31
1385; GCN-NEXT:    s_mov_b32 s1, 0
1386; GCN-NEXT:    ; return to shader part epilog
1387;
1388; GFX10PLUS-LABEL: s_lshr_i64_63:
1389; GFX10PLUS:       ; %bb.0:
1390; GFX10PLUS-NEXT:    s_lshr_b32 s0, s1, 31
1391; GFX10PLUS-NEXT:    s_mov_b32 s1, 0
1392; GFX10PLUS-NEXT:    ; return to shader part epilog
1393  %result = lshr i64 %value, 63
1394  ret i64 %result
1395}
1396
; lshr of an inreg i64 by the constant 33 in an amdgpu_ps function.
; The checks expect the shift to be narrowed to 32 bits: s1 is shifted right
; by 1 into s0, and s1 is then set to 0.
1397define amdgpu_ps i64 @s_lshr_i64_33(i64 inreg %value) {
1398; GCN-LABEL: s_lshr_i64_33:
1399; GCN:       ; %bb.0:
1400; GCN-NEXT:    s_lshr_b32 s0, s1, 1
1401; GCN-NEXT:    s_mov_b32 s1, 0
1402; GCN-NEXT:    ; return to shader part epilog
1403;
1404; GFX10PLUS-LABEL: s_lshr_i64_33:
1405; GFX10PLUS:       ; %bb.0:
1406; GFX10PLUS-NEXT:    s_lshr_b32 s0, s1, 1
1407; GFX10PLUS-NEXT:    s_mov_b32 s1, 0
1408; GFX10PLUS-NEXT:    ; return to shader part epilog
1409  %result = lshr i64 %value, 33
1410  ret i64 %result
1411}
1412
; lshr of an inreg i64 by the constant 32 in an amdgpu_ps function.
; The checks expect no shift instruction: s1 is copied into s0 and s1 is set
; to 0.
1413define amdgpu_ps i64 @s_lshr_i64_32(i64 inreg %value) {
1414; GCN-LABEL: s_lshr_i64_32:
1415; GCN:       ; %bb.0:
1416; GCN-NEXT:    s_mov_b32 s0, s1
1417; GCN-NEXT:    s_mov_b32 s1, 0
1418; GCN-NEXT:    ; return to shader part epilog
1419;
1420; GFX10PLUS-LABEL: s_lshr_i64_32:
1421; GFX10PLUS:       ; %bb.0:
1422; GFX10PLUS-NEXT:    s_mov_b32 s0, s1
1423; GFX10PLUS-NEXT:    s_mov_b32 s1, 0
1424; GFX10PLUS-NEXT:    ; return to shader part epilog
1425  %result = lshr i64 %value, 32
1426  ret i64 %result
1427}
1428
; lshr of an inreg i64 by the constant 31 in an amdgpu_ps function.
; Unlike the 32/33/63 cases, the checks expect a full 64-bit s_lshr_b64 with
; the amount as an immediate.
1429define amdgpu_ps i64 @s_lshr_i64_31(i64 inreg %value) {
1430; GCN-LABEL: s_lshr_i64_31:
1431; GCN:       ; %bb.0:
1432; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], 31
1433; GCN-NEXT:    ; return to shader part epilog
1434;
1435; GFX10PLUS-LABEL: s_lshr_i64_31:
1436; GFX10PLUS:       ; %bb.0:
1437; GFX10PLUS-NEXT:    s_lshr_b64 s[0:1], s[0:1], 31
1438; GFX10PLUS-NEXT:    ; return to shader part epilog
1439  %result = lshr i64 %value, 31
1440  ret i64 %result
1441}
1442
; lshr of i64 in an amdgpu_ps function with an inreg %value and a plain
; %amount; the checks show the value in s[0:1] and the amount in v0. The
; result is bitcast to <2 x float> for the return.
; The checks expect one vector 64-bit shift taking the SGPR pair directly as
; the value operand: v_lshr_b64 on GFX6, v_lshrrev_b64 elsewhere.
1443define amdgpu_ps <2 x float> @lshr_i64_sv(i64 inreg %value, i64 %amount) {
1444; GFX6-LABEL: lshr_i64_sv:
1445; GFX6:       ; %bb.0:
1446; GFX6-NEXT:    v_lshr_b64 v[0:1], s[0:1], v0
1447; GFX6-NEXT:    ; return to shader part epilog
1448;
1449; GFX8-LABEL: lshr_i64_sv:
1450; GFX8:       ; %bb.0:
1451; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
1452; GFX8-NEXT:    ; return to shader part epilog
1453;
1454; GFX9-LABEL: lshr_i64_sv:
1455; GFX9:       ; %bb.0:
1456; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
1457; GFX9-NEXT:    ; return to shader part epilog
1458;
1459; GFX10PLUS-LABEL: lshr_i64_sv:
1460; GFX10PLUS:       ; %bb.0:
1461; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], v0, s[0:1]
1462; GFX10PLUS-NEXT:    ; return to shader part epilog
1463  %result = lshr i64 %value, %amount
1464  %cast = bitcast i64 %result to <2 x float>
1465  ret <2 x float> %cast
1466}
1467
; lshr of i64 in an amdgpu_ps function with a plain %value and an inreg
; %amount (the mirror image of lshr_i64_sv); the checks show the value in
; v[0:1] and the amount in s0. The result is bitcast to <2 x float> for the
; return.
; The checks expect one vector 64-bit shift taking s0 directly as the amount
; operand: v_lshr_b64 on GFX6, v_lshrrev_b64 elsewhere.
1468define amdgpu_ps <2 x float> @lshr_i64_vs(i64 %value, i64 inreg %amount) {
1469; GFX6-LABEL: lshr_i64_vs:
1470; GFX6:       ; %bb.0:
1471; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], s0
1472; GFX6-NEXT:    ; return to shader part epilog
1473;
1474; GFX8-LABEL: lshr_i64_vs:
1475; GFX8:       ; %bb.0:
1476; GFX8-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
1477; GFX8-NEXT:    ; return to shader part epilog
1478;
1479; GFX9-LABEL: lshr_i64_vs:
1480; GFX9:       ; %bb.0:
1481; GFX9-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
1482; GFX9-NEXT:    ; return to shader part epilog
1483;
1484; GFX10PLUS-LABEL: lshr_i64_vs:
1485; GFX10PLUS:       ; %bb.0:
1486; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
1487; GFX10PLUS-NEXT:    ; return to shader part epilog
1488  %result = lshr i64 %value, %amount
1489  %cast = bitcast i64 %result to <2 x float>
1490  ret <2 x float> %cast
1491}
1492
; lshr of <2 x i64> by a variable <2 x i64> amount, both plain (VGPR)
; arguments.
; The checks expect one 64-bit shift per element, reading only v4 and v6 of
; the amount vector: v_lshr_b64 on GFX6, v_lshrrev_b64 elsewhere.
1493define <2 x i64> @v_lshr_v2i64(<2 x i64> %value, <2 x i64> %amount) {
1494; GFX6-LABEL: v_lshr_v2i64:
1495; GFX6:       ; %bb.0:
1496; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1497; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], v4
1498; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], v6
1499; GFX6-NEXT:    s_setpc_b64 s[30:31]
1500;
1501; GFX8-LABEL: v_lshr_v2i64:
1502; GFX8:       ; %bb.0:
1503; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1504; GFX8-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
1505; GFX8-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
1506; GFX8-NEXT:    s_setpc_b64 s[30:31]
1507;
1508; GFX9-LABEL: v_lshr_v2i64:
1509; GFX9:       ; %bb.0:
1510; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1511; GFX9-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
1512; GFX9-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
1513; GFX9-NEXT:    s_setpc_b64 s[30:31]
1514;
1515; GFX10PLUS-LABEL: v_lshr_v2i64:
1516; GFX10PLUS:       ; %bb.0:
1517; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1518; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], v4, v[0:1]
1519; GFX10PLUS-NEXT:    v_lshrrev_b64 v[2:3], v6, v[2:3]
1520; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1521  %result = lshr <2 x i64> %value, %amount
1522  ret <2 x i64> %result
1523}
1524
; lshr of a VGPR <2 x i64> by the constant splat <31, 31>.
; The checks expect one 64-bit shift per element with the amount as an inline
; constant: v_lshr_b64 on GFX6, v_lshrrev_b64 elsewhere.
1525define <2 x i64> @v_lshr_v2i64_31(<2 x i64> %value) {
1526; GFX6-LABEL: v_lshr_v2i64_31:
1527; GFX6:       ; %bb.0:
1528; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1529; GFX6-NEXT:    v_lshr_b64 v[0:1], v[0:1], 31
1530; GFX6-NEXT:    v_lshr_b64 v[2:3], v[2:3], 31
1531; GFX6-NEXT:    s_setpc_b64 s[30:31]
1532;
1533; GFX8-LABEL: v_lshr_v2i64_31:
1534; GFX8:       ; %bb.0:
1535; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1536; GFX8-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
1537; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 31, v[2:3]
1538; GFX8-NEXT:    s_setpc_b64 s[30:31]
1539;
1540; GFX9-LABEL: v_lshr_v2i64_31:
1541; GFX9:       ; %bb.0:
1542; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1543; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
1544; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 31, v[2:3]
1545; GFX9-NEXT:    s_setpc_b64 s[30:31]
1546;
1547; GFX10PLUS-LABEL: v_lshr_v2i64_31:
1548; GFX10PLUS:       ; %bb.0:
1549; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1550; GFX10PLUS-NEXT:    v_lshrrev_b64 v[0:1], 31, v[0:1]
1551; GFX10PLUS-NEXT:    v_lshrrev_b64 v[2:3], 31, v[2:3]
1552; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
1553  %result = lshr <2 x i64> %value, <i64 31, i64 31>
1554  ret <2 x i64> %result
1555}
1556
; lshr of <2 x i64> by a variable <2 x i64> amount in an amdgpu_ps function
; with both operands inreg.
; The checks expect one s_lshr_b64 per element, reading only s4 and s6 of the
; amount vector, with the same sequence under both the GCN and GFX10PLUS
; prefixes.
1557define amdgpu_ps <2 x i64> @s_lshr_v2i64(<2 x i64> inreg %value, <2 x i64> inreg %amount) {
1558; GCN-LABEL: s_lshr_v2i64:
1559; GCN:       ; %bb.0:
1560; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], s4
1561; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s6
1562; GCN-NEXT:    ; return to shader part epilog
1563;
1564; GFX10PLUS-LABEL: s_lshr_v2i64:
1565; GFX10PLUS:       ; %bb.0:
1566; GFX10PLUS-NEXT:    s_lshr_b64 s[0:1], s[0:1], s4
1567; GFX10PLUS-NEXT:    s_lshr_b64 s[2:3], s[2:3], s6
1568; GFX10PLUS-NEXT:    ; return to shader part epilog
1569  %result = lshr <2 x i64> %value, %amount
1570  ret <2 x i64> %result
1571}
1572
1573define i65 @v_lshr_i65(i65 %value, i65 %amount) {
1574; GFX6-LABEL: v_lshr_i65:
1575; GFX6:       ; %bb.0:
1576; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1577; GFX6-NEXT:    v_and_b32_e32 v4, 1, v2
1578; GFX6-NEXT:    v_mov_b32_e32 v5, 0
1579; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 64, v3
1580; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffc0, v3
1581; GFX6-NEXT:    v_lshr_b64 v[6:7], v[0:1], v3
1582; GFX6-NEXT:    v_lshl_b64 v[8:9], v[4:5], v8
1583; GFX6-NEXT:    v_lshr_b64 v[10:11], v[4:5], v3
1584; GFX6-NEXT:    v_lshr_b64 v[4:5], v[4:5], v2
1585; GFX6-NEXT:    v_or_b32_e32 v6, v6, v8
1586; GFX6-NEXT:    v_or_b32_e32 v7, v7, v9
1587; GFX6-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
1588; GFX6-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
1589; GFX6-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
1590; GFX6-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
1591; GFX6-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
1592; GFX6-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1593; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
1594; GFX6-NEXT:    s_setpc_b64 s[30:31]
1595;
1596; GFX8-LABEL: v_lshr_i65:
1597; GFX8:       ; %bb.0:
1598; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1599; GFX8-NEXT:    v_and_b32_e32 v4, 1, v2
1600; GFX8-NEXT:    v_mov_b32_e32 v5, 0
1601; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 64, v3
1602; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xffffffc0, v3
1603; GFX8-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
1604; GFX8-NEXT:    v_lshlrev_b64 v[8:9], v8, v[4:5]
1605; GFX8-NEXT:    v_lshrrev_b64 v[10:11], v3, v[4:5]
1606; GFX8-NEXT:    v_lshrrev_b64 v[4:5], v2, v[4:5]
1607; GFX8-NEXT:    v_or_b32_e32 v6, v6, v8
1608; GFX8-NEXT:    v_or_b32_e32 v7, v7, v9
1609; GFX8-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
1610; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
1611; GFX8-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
1612; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
1613; GFX8-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
1614; GFX8-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1615; GFX8-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
1616; GFX8-NEXT:    s_setpc_b64 s[30:31]
1617;
1618; GFX9-LABEL: v_lshr_i65:
1619; GFX9:       ; %bb.0:
1620; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1621; GFX9-NEXT:    v_and_b32_e32 v4, 1, v2
1622; GFX9-NEXT:    v_mov_b32_e32 v5, 0
1623; GFX9-NEXT:    v_sub_u32_e32 v8, 64, v3
1624; GFX9-NEXT:    v_add_u32_e32 v2, 0xffffffc0, v3
1625; GFX9-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
1626; GFX9-NEXT:    v_lshlrev_b64 v[8:9], v8, v[4:5]
1627; GFX9-NEXT:    v_lshrrev_b64 v[10:11], v3, v[4:5]
1628; GFX9-NEXT:    v_lshrrev_b64 v[4:5], v2, v[4:5]
1629; GFX9-NEXT:    v_or_b32_e32 v6, v6, v8
1630; GFX9-NEXT:    v_or_b32_e32 v7, v7, v9
1631; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v3
1632; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc
1633; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v7, vcc
1634; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
1635; GFX9-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
1636; GFX9-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
1637; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v10, vcc
1638; GFX9-NEXT:    s_setpc_b64 s[30:31]
1639;
1640; GFX10-LABEL: v_lshr_i65:
1641; GFX10:       ; %bb.0:
1642; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1643; GFX10-NEXT:    v_mov_b32_e32 v5, 0
1644; GFX10-NEXT:    v_and_b32_e32 v4, 1, v2
1645; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
1646; GFX10-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v3
1647; GFX10-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
1648; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
1649; GFX10-NEXT:    v_cmp_eq_u32_e64 s4, 0, v3
1650; GFX10-NEXT:    v_lshlrev_b64 v[8:9], v2, v[4:5]
1651; GFX10-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
1652; GFX10-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
1653; GFX10-NEXT:    v_or_b32_e32 v2, v6, v8
1654; GFX10-NEXT:    v_or_b32_e32 v6, v7, v9
1655; GFX10-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
1656; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v6, vcc_lo
1657; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s4
1658; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s4
1659; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc_lo
1660; GFX10-NEXT:    s_setpc_b64 s[30:31]
1661;
1662; GFX11-LABEL: v_lshr_i65:
1663; GFX11:       ; %bb.0:
1664; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1665; GFX11-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2
1666; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 64, v3
1667; GFX11-NEXT:    v_lshrrev_b64 v[6:7], v3, v[0:1]
1668; GFX11-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 64, v3
1669; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 0, v3
1670; GFX11-NEXT:    v_lshlrev_b64 v[8:9], v2, v[4:5]
1671; GFX11-NEXT:    v_or_b32_e32 v2, v6, v8
1672; GFX11-NEXT:    v_or_b32_e32 v6, v7, v9
1673; GFX11-NEXT:    v_add_nc_u32_e32 v10, 0xffffffc0, v3
1674; GFX11-NEXT:    v_lshrrev_b64 v[10:11], v10, v[4:5]
1675; GFX11-NEXT:    v_lshrrev_b64 v[4:5], v3, v[4:5]
1676; GFX11-NEXT:    v_cndmask_b32_e32 v5, v11, v6, vcc_lo
1677; GFX11-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc_lo
1678; GFX11-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s0
1679; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s0
1680; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc_lo
1681; GFX11-NEXT:    s_setpc_b64 s[30:31]
1682  %result = lshr i65 %value, %amount
1683  ret i65 %result
1684}
1685
; Logical shift right of an i65 by the constant 33 (non-inreg argument, so the
; expected code works on VGPRs v0-v2).
; Every target's checks expect the same computation, differing only in
; encoding and scheduling: v2 is masked to its low bit (bit 64 of the input),
; that bit is moved to bit 31 by a 64-bit shift left by 31, and it is ORed with
; v1 >> 1 to form the low result dword in v0. v1 is left holding the upper
; dword of that 64-bit shift, and v2 is cleared.
1686define i65 @v_lshr_i65_33(i65 %value) {
1687; GFX6-LABEL: v_lshr_i65_33:
1688; GFX6:       ; %bb.0:
1689; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1690; GFX6-NEXT:    v_mov_b32_e32 v3, v1
1691; GFX6-NEXT:    v_and_b32_e32 v0, 1, v2
1692; GFX6-NEXT:    v_mov_b32_e32 v1, 0
1693; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 31
1694; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
1695; GFX6-NEXT:    v_or_b32_e32 v0, v2, v0
1696; GFX6-NEXT:    v_mov_b32_e32 v2, 0
1697; GFX6-NEXT:    s_setpc_b64 s[30:31]
1698;
1699; GFX8-LABEL: v_lshr_i65_33:
1700; GFX8:       ; %bb.0:
1701; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1702; GFX8-NEXT:    v_mov_b32_e32 v3, v1
1703; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
1704; GFX8-NEXT:    v_mov_b32_e32 v1, 0
1705; GFX8-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
1706; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
1707; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
1708; GFX8-NEXT:    v_mov_b32_e32 v2, 0
1709; GFX8-NEXT:    s_setpc_b64 s[30:31]
1710;
1711; GFX9-LABEL: v_lshr_i65_33:
1712; GFX9:       ; %bb.0:
1713; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1714; GFX9-NEXT:    v_mov_b32_e32 v3, v1
1715; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
1716; GFX9-NEXT:    v_mov_b32_e32 v1, 0
1717; GFX9-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
1718; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
1719; GFX9-NEXT:    v_or_b32_e32 v0, v2, v0
1720; GFX9-NEXT:    v_mov_b32_e32 v2, 0
1721; GFX9-NEXT:    s_setpc_b64 s[30:31]
1722;
1723; GFX10-LABEL: v_lshr_i65_33:
1724; GFX10:       ; %bb.0:
1725; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1726; GFX10-NEXT:    v_mov_b32_e32 v3, v1
1727; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1728; GFX10-NEXT:    v_and_b32_e32 v0, 1, v2
1729; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
1730; GFX10-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
1731; GFX10-NEXT:    v_or_b32_e32 v0, v2, v0
1732; GFX10-NEXT:    v_mov_b32_e32 v2, 0
1733; GFX10-NEXT:    s_setpc_b64 s[30:31]
1734;
1735; GFX11-LABEL: v_lshr_i65_33:
1736; GFX11:       ; %bb.0:
1737; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1738; GFX11-NEXT:    v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v2
1739; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1740; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
1741; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 31, v[0:1]
1742; GFX11-NEXT:    v_or_b32_e32 v0, v2, v0
1743; GFX11-NEXT:    v_mov_b32_e32 v2, 0
1744; GFX11-NEXT:    s_setpc_b64 s[30:31]
1745  %result = lshr i65 %value, 33
1746  ret i65 %result
1747}
1748
; Scalar variant (amdgpu_ps, inreg arguments): logical shift right of an i65 by
; a variable i65 amount. In the expected code only s3 is read as the amount.
; With bit64 = s[2:3] & 1 and amt = s3, the checks select the low 64 result
; bits in s[0:1] as follows:
;   amt == 0  -> the unshifted input s[0:1]
;   amt <  64 -> (s[0:1] >> amt) | (bit64 << (64 - amt))
;   otherwise -> bit64 >> (amt - 64)
; The returned s2 is the low dword of bit64 >> amt when amt < 64, and 0
; otherwise. The two check groups contain the same operations and differ in
; register assignment and instruction order.
1749define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) {
1750; GCN-LABEL: s_lshr_i65:
1751; GCN:       ; %bb.0:
1752; GCN-NEXT:    s_and_b64 s[4:5], s[2:3], 1
1753; GCN-NEXT:    s_sub_i32 s10, s3, 64
1754; GCN-NEXT:    s_sub_i32 s8, 64, s3
1755; GCN-NEXT:    s_cmp_lt_u32 s3, 64
1756; GCN-NEXT:    s_cselect_b32 s11, 1, 0
1757; GCN-NEXT:    s_cmp_eq_u32 s3, 0
1758; GCN-NEXT:    s_cselect_b32 s12, 1, 0
1759; GCN-NEXT:    s_lshr_b64 s[6:7], s[4:5], s3
1760; GCN-NEXT:    s_lshr_b64 s[2:3], s[0:1], s3
1761; GCN-NEXT:    s_lshl_b64 s[8:9], s[4:5], s8
1762; GCN-NEXT:    s_or_b64 s[2:3], s[2:3], s[8:9]
1763; GCN-NEXT:    s_lshr_b64 s[4:5], s[4:5], s10
1764; GCN-NEXT:    s_cmp_lg_u32 s11, 0
1765; GCN-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
1766; GCN-NEXT:    s_cmp_lg_u32 s12, 0
1767; GCN-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[2:3]
1768; GCN-NEXT:    s_cmp_lg_u32 s11, 0
1769; GCN-NEXT:    s_cselect_b32 s2, s6, 0
1770; GCN-NEXT:    ; return to shader part epilog
1771;
1772; GFX10PLUS-LABEL: s_lshr_i65:
1773; GFX10PLUS:       ; %bb.0:
1774; GFX10PLUS-NEXT:    s_and_b64 s[4:5], s[2:3], 1
1775; GFX10PLUS-NEXT:    s_sub_i32 s10, s3, 64
1776; GFX10PLUS-NEXT:    s_sub_i32 s2, 64, s3
1777; GFX10PLUS-NEXT:    s_cmp_lt_u32 s3, 64
1778; GFX10PLUS-NEXT:    s_cselect_b32 s11, 1, 0
1779; GFX10PLUS-NEXT:    s_cmp_eq_u32 s3, 0
1780; GFX10PLUS-NEXT:    s_cselect_b32 s12, 1, 0
1781; GFX10PLUS-NEXT:    s_lshr_b64 s[6:7], s[0:1], s3
1782; GFX10PLUS-NEXT:    s_lshl_b64 s[8:9], s[4:5], s2
1783; GFX10PLUS-NEXT:    s_lshr_b64 s[2:3], s[4:5], s3
1784; GFX10PLUS-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
1785; GFX10PLUS-NEXT:    s_lshr_b64 s[4:5], s[4:5], s10
1786; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
1787; GFX10PLUS-NEXT:    s_cselect_b64 s[4:5], s[6:7], s[4:5]
1788; GFX10PLUS-NEXT:    s_cmp_lg_u32 s12, 0
1789; GFX10PLUS-NEXT:    s_cselect_b64 s[0:1], s[0:1], s[4:5]
1790; GFX10PLUS-NEXT:    s_cmp_lg_u32 s11, 0
1791; GFX10PLUS-NEXT:    s_cselect_b32 s2, s2, 0
1792; GFX10PLUS-NEXT:    ; return to shader part epilog
1793  %result = lshr i65 %value, %amount
1794  ret i65 %result
1795}
1796
; Scalar variant (amdgpu_ps, inreg argument): logical shift right of an i65 by
; the constant 33.
; Both check groups expect the same instruction sequence: s[2:3] is masked to
; its low bit (bit 64 of the input), s1 >> 1 is placed in s0 with s1 cleared,
; the masked bit is shifted left by 31 and ORed into s[0:1], and s2 is set to 0.
1797define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) {
1798; GCN-LABEL: s_lshr_i65_33:
1799; GCN:       ; %bb.0:
1800; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], 1
1801; GCN-NEXT:    s_lshr_b32 s0, s1, 1
1802; GCN-NEXT:    s_mov_b32 s1, 0
1803; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], 31
1804; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1805; GCN-NEXT:    s_mov_b32 s2, 0
1806; GCN-NEXT:    ; return to shader part epilog
1807;
1808; GFX10PLUS-LABEL: s_lshr_i65_33:
1809; GFX10PLUS:       ; %bb.0:
1810; GFX10PLUS-NEXT:    s_and_b64 s[2:3], s[2:3], 1
1811; GFX10PLUS-NEXT:    s_lshr_b32 s0, s1, 1
1812; GFX10PLUS-NEXT:    s_mov_b32 s1, 0
1813; GFX10PLUS-NEXT:    s_lshl_b64 s[2:3], s[2:3], 31
1814; GFX10PLUS-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
1815; GFX10PLUS-NEXT:    s_mov_b32 s2, 0
1816; GFX10PLUS-NEXT:    ; return to shader part epilog
1817  %result = lshr i65 %value, 33
1818  ret i65 %result
1819}
1820
1821; FIXME: Argument lowering asserts, so the <2 x i65> tests below are left commented out.
1822; define <2 x i65> @v_lshr_v2i65(<2 x i65> %value, <2 x i65> %amount) {
1823;   %result = lshr <2 x i65> %value, %amount
1824;   ret <2 x i65> %result
1825; }
1826
1827; define amdgpu_ps <2 x i65> @s_lshr_v2i65(<2 x i65> inreg %value, <2 x i65> inreg %amount) {
1828;   %result = lshr <2 x i65> %value, %amount
1829;   ret <2 x i65> %result
1830; }
1831