; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/shift-i128.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

; i128 shl with variable amount, all operands in VGPRs.
define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_shl_i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
; GCN-NEXT:    v_lshl_b64 v[5:6], v[2:3], v4
; GCN-NEXT:    v_lshr_b64 v[7:8], v[0:1], v7
; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
; GCN-NEXT:    v_or_b32_e32 v7, v5, v7
; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 64, v4
; GCN-NEXT:    v_or_b32_e32 v8, v6, v8
; GCN-NEXT:    v_lshl_b64 v[5:6], v[0:1], v5
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
; GCN-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl i128 %lhs, %rhs
  ret i128 %shl
}

; i128 lshr with variable amount, all operands in VGPRs.
define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_lshr_i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
; GCN-NEXT:    v_lshr_b64 v[5:6], v[0:1], v4
; GCN-NEXT:    v_lshl_b64 v[7:8], v[2:3], v7
; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
; GCN-NEXT:    v_or_b32_e32 v7, v5, v7
; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 64, v4
; GCN-NEXT:    v_or_b32_e32 v8, v6, v8
; GCN-NEXT:    v_lshr_b64 v[5:6], v[2:3], v5
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v4
; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, v0, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]

  %shl = lshr i128 %lhs, %rhs
  ret i128 %shl
}

; i128 ashr with variable amount, all operands in VGPRs.
define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
; GCN-LABEL: v_ashr_i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v7, vcc, 64, v4
; GCN-NEXT:    v_lshr_b64 v[5:6], v[0:1], v4
; GCN-NEXT:    v_lshl_b64 v[7:8], v[2:3], v7
; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
; GCN-NEXT:    v_or_b32_e32 v7, v5, v7
; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 64, v4
; GCN-NEXT:    v_or_b32_e32 v8, v6, v8
; GCN-NEXT:    v_ashr_i64 v[5:6], v[2:3], v5
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
; GCN-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v0, v5, v0, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v5, v6, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[4:5]
; GCN-NEXT:    v_ashr_i64 v[4:5], v[2:3], v4
; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = ashr i128 %lhs, %rhs
  ret i128 %shl
}


; i128 shl by a constant amount below 64 (straddles the 64-bit halves).
define i128 @v_shl_i128_vk(i128 %lhs) {
; GCN-LABEL: v_shl_i128_vk:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshl_b64 v[2:3], v[2:3], 17
; GCN-NEXT:    v_lshrrev_b32_e32 v4, 15, v1
; GCN-NEXT:    v_or_b32_e32 v2, v2, v4
; GCN-NEXT:    v_alignbit_b32 v1, v1, v0, 15
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl i128 %lhs, 17
  ret i128 %shl
}

; i128 lshr by a constant amount of 64 or more (high half zeroed).
define i128 @v_lshr_i128_vk(i128 %lhs) {
; GCN-LABEL: v_lshr_i128_vk:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_alignbit_b32 v0, v3, v2, 1
; GCN-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = lshr i128 %lhs, 65
  ret i128 %shl
}

; i128 ashr by a constant amount below 64.
define i128 @v_ashr_i128_vk(i128 %lhs) {
; GCN-LABEL: v_ashr_i128_vk:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v4, v1
; GCN-NEXT:    v_lshl_b64 v[0:1], v[2:3], 31
; GCN-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
; GCN-NEXT:    v_ashr_i64 v[2:3], v[2:3], 33
; GCN-NEXT:    v_or_b32_e32 v0, v4, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = ashr i128 %lhs, 33
  ret i128 %shl
}

; i128 shl of a constant LHS by a variable amount.
define i128 @v_shl_i128_kv(i128 %rhs) {
; GCN-LABEL: v_shl_i128_kv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v1, vcc, 64, v0
; GCN-NEXT:    v_lshr_b64 v[2:3], 17, v1
; GCN-NEXT:    v_subrev_i32_e32 v1, vcc, 64, v0
; GCN-NEXT:    v_lshl_b64 v[4:5], 17, v1
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
; GCN-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v0
; GCN-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s[4:5]
; GCN-NEXT:    v_lshl_b64 v[0:1], 17, v0
; GCN-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl i128 17, %rhs
  ret i128 %shl
}

; i128 lshr of a constant LHS by a variable amount.
define i128 @v_lshr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_lshr_i128_kv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    s_mov_b64 s[4:5], 0x41
; GCN-NEXT:    v_lshr_b64 v[1:2], s[4:5], v0
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v2, 0x41
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = lshr i128 65, %rhs
  ret i128 %shl
}

; i128 ashr of a constant LHS by a variable amount.
define i128 @v_ashr_i128_kv(i128 %rhs) {
; GCN-LABEL: v_ashr_i128_kv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshr_b64 v[1:2], 33, v0
; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
; GCN-NEXT:    v_cndmask_b32_e32 v0, 33, v1, vcc
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    v_mov_b32_e32 v2, 0
; GCN-NEXT:    v_mov_b32_e32 v3, 0
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = ashr i128 33, %rhs
  ret i128 %shl
}

; Kernel: i128 shl with both operands uniform (SGPRs); result stored to null.
define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_shl_i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_sub_i32 s5, s4, 64
; GCN-NEXT:    s_sub_i32 s12, 64, s4
; GCN-NEXT:    s_lshl_b64 s[6:7], s[2:3], s4
; GCN-NEXT:    s_lshl_b64 s[8:9], s[0:1], s4
; GCN-NEXT:    s_lshl_b64 s[10:11], s[0:1], s5
; GCN-NEXT:    s_lshr_b64 s[0:1], s[0:1], s12
; GCN-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
; GCN-NEXT:    s_cmp_lt_u32 s4, 64
; GCN-NEXT:    s_cselect_b32 s0, s0, s10
; GCN-NEXT:    s_cselect_b32 s1, s1, s11
; GCN-NEXT:    s_cselect_b32 s5, s9, 0
; GCN-NEXT:    s_cselect_b32 s6, s8, 0
; GCN-NEXT:    s_cmp_eq_u32 s4, 0
; GCN-NEXT:    s_cselect_b32 s1, s3, s1
; GCN-NEXT:    s_cselect_b32 s0, s2, s0
; GCN-NEXT:    v_mov_b32_e32 v0, s6
; GCN-NEXT:    v_mov_b32_e32 v1, s5
; GCN-NEXT:    v_mov_b32_e32 v2, s0
; GCN-NEXT:    v_mov_b32_e32 v3, s1
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = shl i128 %lhs, %rhs
  store i128 %shift, ptr addrspace(1) null
  ret void
}

; Kernel: i128 lshr with both operands uniform (SGPRs); result stored to null.
define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_lshr_i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_sub_i32 s5, s4, 64
; GCN-NEXT:    s_sub_i32 s12, 64, s4
; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
; GCN-NEXT:    s_lshr_b64 s[8:9], s[2:3], s4
; GCN-NEXT:    s_lshr_b64 s[10:11], s[2:3], s5
; GCN-NEXT:    s_lshl_b64 s[2:3], s[2:3], s12
; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
; GCN-NEXT:    s_cmp_lt_u32 s4, 64
; GCN-NEXT:    s_cselect_b32 s2, s2, s10
; GCN-NEXT:    s_cselect_b32 s3, s3, s11
; GCN-NEXT:    s_cselect_b32 s5, s9, 0
; GCN-NEXT:    s_cselect_b32 s6, s8, 0
; GCN-NEXT:    s_cmp_eq_u32 s4, 0
; GCN-NEXT:    s_cselect_b32 s1, s1, s3
; GCN-NEXT:    s_cselect_b32 s0, s0, s2
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s6
; GCN-NEXT:    v_mov_b32_e32 v3, s5
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = lshr i128 %lhs, %rhs
  store i128 %shift, ptr addrspace(1) null
  ret void
}

; Kernel: i128 ashr with both operands uniform (SGPRs); result stored to null.
define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_ashr_i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx8 s[0:7], s[8:9], 0x0
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_sub_i32 s5, 64, s4
; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
; GCN-NEXT:    s_sub_i32 s10, s4, 64
; GCN-NEXT:    s_lshl_b64 s[8:9], s[2:3], s5
; GCN-NEXT:    s_ashr_i32 s12, s3, 31
; GCN-NEXT:    s_ashr_i64 s[10:11], s[2:3], s10
; GCN-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9]
; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], s4
; GCN-NEXT:    s_cmp_lt_u32 s4, 64
; GCN-NEXT:    s_cselect_b32 s3, s3, s12
; GCN-NEXT:    s_cselect_b32 s2, s2, s12
; GCN-NEXT:    s_cselect_b32 s5, s6, s10
; GCN-NEXT:    s_cselect_b32 s6, s7, s11
; GCN-NEXT:    s_cmp_eq_u32 s4, 0
; GCN-NEXT:    s_cselect_b32 s1, s1, s6
; GCN-NEXT:    s_cselect_b32 s0, s0, s5
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s2
; GCN-NEXT:    v_mov_b32_e32 v3, s3
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = ashr i128 %lhs, %rhs
  store i128 %shift, ptr addrspace(1) null
  ret void
}

; Vector <2 x i128> shl with variable amounts, all operands in VGPRs.
define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_shl_v2i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
; GCN-NEXT:    v_lshr_b64 v[16:17], v[0:1], v16
; GCN-NEXT:    v_lshl_b64 v[18:19], v[2:3], v8
; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
; GCN-NEXT:    v_subrev_i32_e64 v9, s[6:7], 64, v8
; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
; GCN-NEXT:    v_lshl_b64 v[16:17], v[0:1], v9
; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e32 v9, v16, v18, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v2, v9, v2, s[4:5]
; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT:    v_lshr_b64 v[9:10], v[4:5], v9
; GCN-NEXT:    v_lshl_b64 v[16:17], v[6:7], v12
; GCN-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[4:5]
; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_subrev_i32_e64 v9, s[8:9], 64, v12
; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
; GCN-NEXT:    v_lshl_b64 v[9:10], v[4:5], v9
; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[4:5]
; GCN-NEXT:    v_lshl_b64 v[0:1], v[0:1], v8
; GCN-NEXT:    v_lshl_b64 v[4:5], v[4:5], v12
; GCN-NEXT:    v_cndmask_b32_e64 v6, v9, v6, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v9, v10, v11, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = shl <2 x i128> %lhs, %rhs
  ret <2 x i128> %shl
}

; Vector <2 x i128> lshr with variable amounts, all operands in VGPRs.
define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_lshr_v2i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
; GCN-NEXT:    v_subrev_i32_e64 v9, s[6:7], 64, v8
; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
; GCN-NEXT:    v_lshr_b64 v[16:17], v[2:3], v9
; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e32 v9, v16, v18, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v0, v9, v0, s[4:5]
; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12
; GCN-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_subrev_i32_e64 v9, s[8:9], 64, v12
; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
; GCN-NEXT:    v_lshr_b64 v[9:10], v[6:7], v9
; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[4:5]
; GCN-NEXT:    v_lshr_b64 v[2:3], v[2:3], v8
; GCN-NEXT:    v_lshr_b64 v[6:7], v[6:7], v12
; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v4, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v9, v10, v11, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, v7, s[4:5]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = lshr <2 x i128> %lhs, %rhs
  ret <2 x i128> %shl
}

; Vector <2 x i128> ashr with variable amounts, all operands in VGPRs.
define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: v_ashr_v2i128_vv:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_sub_i32_e32 v16, vcc, 64, v8
; GCN-NEXT:    v_lshl_b64 v[16:17], v[2:3], v16
; GCN-NEXT:    v_lshr_b64 v[18:19], v[0:1], v8
; GCN-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT:    v_or_b32_e32 v11, v9, v11
; GCN-NEXT:    v_subrev_i32_e64 v9, s[6:7], 64, v8
; GCN-NEXT:    v_or_b32_e32 v19, v19, v17
; GCN-NEXT:    v_or_b32_e32 v18, v18, v16
; GCN-NEXT:    v_or_b32_e32 v10, v8, v10
; GCN-NEXT:    v_ashr_i64 v[16:17], v[2:3], v9
; GCN-NEXT:    s_and_b64 vcc, s[4:5], vcc
; GCN-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
; GCN-NEXT:    v_cndmask_b32_e32 v9, v16, v18, vcc
; GCN-NEXT:    v_cndmask_b32_e64 v0, v9, v0, s[4:5]
; GCN-NEXT:    v_sub_i32_e64 v9, s[6:7], 64, v12
; GCN-NEXT:    v_cndmask_b32_e32 v11, v17, v19, vcc
; GCN-NEXT:    v_lshl_b64 v[9:10], v[6:7], v9
; GCN-NEXT:    v_lshr_b64 v[16:17], v[4:5], v12
; GCN-NEXT:    v_cndmask_b32_e64 v1, v11, v1, s[4:5]
; GCN-NEXT:    v_or_b32_e32 v16, v16, v9
; GCN-NEXT:    v_cmp_gt_u64_e64 s[4:5], 64, v[12:13]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_subrev_i32_e64 v9, s[8:9], 64, v12
; GCN-NEXT:    v_or_b32_e32 v11, v17, v10
; GCN-NEXT:    v_ashr_i64 v[9:10], v[6:7], v9
; GCN-NEXT:    v_or_b32_e32 v15, v13, v15
; GCN-NEXT:    v_or_b32_e32 v14, v12, v14
; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
; GCN-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v4, v9, v4, s[6:7]
; GCN-NEXT:    v_cndmask_b32_e64 v9, v10, v11, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[6:7]
; GCN-NEXT:    v_ashr_i64 v[8:9], v[2:3], v8
; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v8, vcc
; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v9, vcc
; GCN-NEXT:    v_ashr_i64 v[8:9], v[6:7], v12
; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v7
; GCN-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s[4:5]
; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
; GCN-NEXT:    s_setpc_b64 s[30:31]
  %shl = ashr <2 x i128> %lhs, %rhs
  ret <2 x i128> %shl
}

; Kernel: <2 x i128> shl with uniform operands; result stored to null.
define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_shl_v2i128ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT:    v_mov_b32_e32 v6, 16
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v7, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
; GCN-NEXT:    s_sub_i32 s22, 64, s8
; GCN-NEXT:    s_sub_i32 s20, s8, 64
; GCN-NEXT:    s_lshr_b64 s[22:23], s[0:1], s22
; GCN-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
; GCN-NEXT:    s_lshl_b64 s[18:19], s[2:3], s8
; GCN-NEXT:    s_lshl_b64 s[20:21], s[0:1], s20
; GCN-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
; GCN-NEXT:    s_cselect_b32 s19, s19, s21
; GCN-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], 0
; GCN-NEXT:    s_and_b64 s[22:23], s[10:11], exec
; GCN-NEXT:    s_cselect_b32 s9, s3, s19
; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
; GCN-NEXT:    s_cselect_b32 s3, s18, s20
; GCN-NEXT:    s_and_b64 s[10:11], s[10:11], exec
; GCN-NEXT:    v_cmp_lt_u64_e64 s[10:11], s[12:13], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
; GCN-NEXT:    s_cselect_b32 s22, s2, s3
; GCN-NEXT:    s_and_b64 s[2:3], s[18:19], s[10:11]
; GCN-NEXT:    s_sub_i32 s18, 64, s12
; GCN-NEXT:    s_sub_i32 s10, s12, 64
; GCN-NEXT:    s_lshr_b64 s[18:19], s[4:5], s18
; GCN-NEXT:    s_lshl_b64 s[20:21], s[6:7], s12
; GCN-NEXT:    s_lshl_b64 s[10:11], s[4:5], s10
; GCN-NEXT:    s_or_b64 s[18:19], s[20:21], s[18:19]
; GCN-NEXT:    s_and_b64 s[20:21], s[2:3], exec
; GCN-NEXT:    s_cselect_b32 s11, s19, s11
; GCN-NEXT:    s_or_b64 s[14:15], s[12:13], s[14:15]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[14:15], 0
; GCN-NEXT:    s_and_b64 s[20:21], s[14:15], exec
; GCN-NEXT:    s_cselect_b32 s13, s7, s11
; GCN-NEXT:    s_and_b64 s[20:21], s[2:3], exec
; GCN-NEXT:    s_cselect_b32 s7, s18, s10
; GCN-NEXT:    s_and_b64 s[10:11], s[14:15], exec
; GCN-NEXT:    s_cselect_b32 s10, s6, s7
; GCN-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
; GCN-NEXT:    s_and_b64 s[6:7], s[16:17], exec
; GCN-NEXT:    s_cselect_b32 s6, s1, 0
; GCN-NEXT:    s_cselect_b32 s7, s0, 0
; GCN-NEXT:    s_lshl_b64 s[0:1], s[4:5], s12
; GCN-NEXT:    s_and_b64 s[2:3], s[2:3], exec
; GCN-NEXT:    s_cselect_b32 s1, s1, 0
; GCN-NEXT:    s_cselect_b32 s0, s0, 0
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_mov_b32_e32 v2, s10
; GCN-NEXT:    v_mov_b32_e32 v3, s13
; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_mov_b32_e32 v0, s7
; GCN-NEXT:    v_mov_b32_e32 v1, s6
; GCN-NEXT:    v_mov_b32_e32 v2, s22
; GCN-NEXT:    v_mov_b32_e32 v3, s9
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = shl <2 x i128> %lhs, %rhs
  store <2 x i128> %shift, ptr addrspace(1) null
  ret void
}

; Kernel: <2 x i128> lshr with uniform operands; result stored to null.
define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_lshr_v2i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT:    v_mov_b32_e32 v6, 16
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v7, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
; GCN-NEXT:    s_sub_i32 s22, 64, s8
; GCN-NEXT:    s_sub_i32 s20, s8, 64
; GCN-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
; GCN-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
; GCN-NEXT:    s_lshr_b64 s[18:19], s[0:1], s8
; GCN-NEXT:    s_lshr_b64 s[20:21], s[2:3], s20
; GCN-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
; GCN-NEXT:    s_cselect_b32 s19, s19, s21
; GCN-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], 0
; GCN-NEXT:    s_and_b64 s[22:23], s[10:11], exec
; GCN-NEXT:    s_cselect_b32 s9, s1, s19
; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
; GCN-NEXT:    s_cselect_b32 s1, s18, s20
; GCN-NEXT:    s_and_b64 s[10:11], s[10:11], exec
; GCN-NEXT:    v_cmp_lt_u64_e64 s[10:11], s[12:13], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
; GCN-NEXT:    s_cselect_b32 s22, s0, s1
; GCN-NEXT:    s_and_b64 s[0:1], s[18:19], s[10:11]
; GCN-NEXT:    s_sub_i32 s18, 64, s12
; GCN-NEXT:    s_sub_i32 s10, s12, 64
; GCN-NEXT:    s_lshl_b64 s[18:19], s[6:7], s18
; GCN-NEXT:    s_lshr_b64 s[20:21], s[4:5], s12
; GCN-NEXT:    s_lshr_b64 s[10:11], s[6:7], s10
; GCN-NEXT:    s_or_b64 s[18:19], s[20:21], s[18:19]
; GCN-NEXT:    s_and_b64 s[20:21], s[0:1], exec
; GCN-NEXT:    s_cselect_b32 s11, s19, s11
; GCN-NEXT:    s_or_b64 s[14:15], s[12:13], s[14:15]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[14:15], 0
; GCN-NEXT:    s_and_b64 s[20:21], s[14:15], exec
; GCN-NEXT:    s_cselect_b32 s13, s5, s11
; GCN-NEXT:    s_and_b64 s[20:21], s[0:1], exec
; GCN-NEXT:    s_cselect_b32 s5, s18, s10
; GCN-NEXT:    s_and_b64 s[10:11], s[14:15], exec
; GCN-NEXT:    s_cselect_b32 s10, s4, s5
; GCN-NEXT:    s_lshr_b64 s[2:3], s[2:3], s8
; GCN-NEXT:    s_and_b64 s[4:5], s[16:17], exec
; GCN-NEXT:    s_cselect_b32 s4, s3, 0
; GCN-NEXT:    s_cselect_b32 s5, s2, 0
; GCN-NEXT:    s_lshr_b64 s[2:3], s[6:7], s12
; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; GCN-NEXT:    s_cselect_b32 s0, s3, 0
; GCN-NEXT:    s_cselect_b32 s1, s2, 0
; GCN-NEXT:    v_mov_b32_e32 v0, s10
; GCN-NEXT:    v_mov_b32_e32 v1, s13
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    v_mov_b32_e32 v3, s0
; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_mov_b32_e32 v0, s22
; GCN-NEXT:    v_mov_b32_e32 v1, s9
; GCN-NEXT:    v_mov_b32_e32 v2, s5
; GCN-NEXT:    v_mov_b32_e32 v3, s4
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = lshr <2 x i128> %lhs, %rhs
  store <2 x i128> %shift, ptr addrspace(1) null
  ret void
}

; Kernel: <2 x i128> ashr with uniform operands; result stored to null.
define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_ashr_v2i128_ss:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT:    v_mov_b32_e32 v6, 16
; GCN-NEXT:    v_mov_b32_e32 v4, 0
; GCN-NEXT:    v_mov_b32_e32 v7, 0
; GCN-NEXT:    v_mov_b32_e32 v5, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cmp_lt_u64_e64 s[16:17], s[8:9], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[10:11], 0
; GCN-NEXT:    s_sub_i32 s22, 64, s8
; GCN-NEXT:    s_sub_i32 s20, s8, 64
; GCN-NEXT:    s_lshl_b64 s[22:23], s[2:3], s22
; GCN-NEXT:    s_and_b64 s[16:17], s[18:19], s[16:17]
; GCN-NEXT:    s_lshr_b64 s[18:19], s[0:1], s8
; GCN-NEXT:    s_ashr_i64 s[20:21], s[2:3], s20
; GCN-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
; GCN-NEXT:    s_cselect_b32 s19, s19, s21
; GCN-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[10:11], s[10:11], 0
; GCN-NEXT:    s_and_b64 s[22:23], s[10:11], exec
; GCN-NEXT:    s_cselect_b32 s9, s1, s19
; GCN-NEXT:    s_and_b64 s[22:23], s[16:17], exec
; GCN-NEXT:    s_cselect_b32 s1, s18, s20
; GCN-NEXT:    s_and_b64 s[10:11], s[10:11], exec
; GCN-NEXT:    v_cmp_lt_u64_e64 s[10:11], s[12:13], 64
; GCN-NEXT:    v_cmp_eq_u64_e64 s[18:19], s[14:15], 0
; GCN-NEXT:    s_cselect_b32 s22, s0, s1
; GCN-NEXT:    s_and_b64 s[0:1], s[18:19], s[10:11]
; GCN-NEXT:    s_sub_i32 s18, 64, s12
; GCN-NEXT:    s_sub_i32 s10, s12, 64
; GCN-NEXT:    s_lshl_b64 s[18:19], s[6:7], s18
; GCN-NEXT:    s_lshr_b64 s[20:21], s[4:5], s12
; GCN-NEXT:    s_ashr_i64 s[10:11], s[6:7], s10
; GCN-NEXT:    s_or_b64 s[18:19], s[20:21], s[18:19]
; GCN-NEXT:    s_and_b64 s[20:21], s[0:1], exec
; GCN-NEXT:    s_cselect_b32 s11, s19, s11
; GCN-NEXT:    s_or_b64 s[14:15], s[12:13], s[14:15]
; GCN-NEXT:    v_cmp_eq_u64_e64 s[14:15], s[14:15], 0
; GCN-NEXT:    s_and_b64 s[20:21], s[14:15], exec
; GCN-NEXT:    s_cselect_b32 s13, s5, s11
; GCN-NEXT:    s_and_b64 s[20:21], s[0:1], exec
; GCN-NEXT:    s_cselect_b32 s5, s18, s10
; GCN-NEXT:    s_and_b64 s[10:11], s[14:15], exec
; GCN-NEXT:    s_cselect_b32 s10, s4, s5
; GCN-NEXT:    s_ashr_i32 s11, s3, 31
; GCN-NEXT:    s_ashr_i64 s[2:3], s[2:3], s8
; GCN-NEXT:    s_and_b64 s[4:5], s[16:17], exec
; GCN-NEXT:    s_cselect_b32 s4, s3, s11
; GCN-NEXT:    s_cselect_b32 s5, s2, s11
; GCN-NEXT:    s_ashr_i32 s8, s7, 31
; GCN-NEXT:    s_ashr_i64 s[2:3], s[6:7], s12
; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], exec
; GCN-NEXT:    s_cselect_b32 s0, s3, s8
; GCN-NEXT:    s_cselect_b32 s1, s2, s8
; GCN-NEXT:    v_mov_b32_e32 v0, s10
; GCN-NEXT:    v_mov_b32_e32 v1, s13
; GCN-NEXT:    v_mov_b32_e32 v2, s1
; GCN-NEXT:    v_mov_b32_e32 v3, s0
; GCN-NEXT:    flat_store_dwordx4 v[6:7], v[0:3]
; GCN-NEXT:    s_nop 0
; GCN-NEXT:    v_mov_b32_e32 v0, s22
; GCN-NEXT:    v_mov_b32_e32 v1, s9
; GCN-NEXT:    v_mov_b32_e32 v2, s5
; GCN-NEXT:    v_mov_b32_e32 v3, s4
; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT:    s_endpgm
  %shift = ashr <2 x i128> %lhs, %rhs
  store <2 x i128> %shift, ptr addrspace(1) null
  ret void
}

