; xref: /llvm-project/llvm/test/CodeGen/AMDGPU/sra.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs < %s | FileCheck %s -check-prefixes=SI
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=VI
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s -check-prefixes=EG

; Workitem-id intrinsic declaration (attribute group #0 is defined elsewhere in the file).
declare i32 @llvm.amdgcn.workitem.id.x() #0

; Vector ashr of <2 x i32> with both operands loaded from global memory.
define amdgpu_kernel void @ashr_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v2i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v3
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v1, v3, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v2, v0
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     ASHR * T0.Y, T0.Y, T0.W,
; EG-NEXT:     ASHR T0.X, T0.X, T0.Z,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <2 x i32>, ptr addrspace(1) %in
  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
  %result = ashr <2 x i32> %a, %b
  store <2 x i32> %result, ptr addrspace(1) %out
  ret void
}

; Vector ashr of <4 x i32> with both operands loaded from global memory.
define amdgpu_kernel void @ashr_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i32:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i32_e32 v3, v3, v7
; SI-NEXT:    v_ashr_i32_e32 v2, v2, v6
; SI-NEXT:    v_ashr_i32_e32 v1, v1, v5
; SI-NEXT:    v_ashr_i32_e32 v0, v0, v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i32:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i32_e32 v3, v7, v3
; VI-NEXT:    v_ashrrev_i32_e32 v2, v6, v2
; VI-NEXT:    v_ashrrev_i32_e32 v1, v5, v1
; VI-NEXT:    v_ashrrev_i32_e32 v0, v4, v0
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i32:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     ASHR * T0.W, T0.W, T1.W,
; EG-NEXT:     ASHR * T0.Z, T0.Z, T1.Z,
; EG-NEXT:     ASHR * T0.Y, T0.Y, T1.Y,
; EG-NEXT:     ASHR T0.X, T0.X, T1.X,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
  %a = load <4 x i32>, ptr addrspace(1) %in
  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
  %result = ashr <4 x i32> %a, %b
  store <4 x i32> %result, ptr addrspace(1) %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; <2 x i16> ashr; on SI/VI the 16-bit lanes are widened (sext/lshr) and shifted
; with scalar 32-bit ops, then repacked with lshl/and/or.
define amdgpu_kernel void @ashr_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v2i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_readfirstlane_b32 s0, v0
; SI-NEXT:    v_readfirstlane_b32 s1, v1
; SI-NEXT:    s_sext_i32_i16 s2, s0
; SI-NEXT:    s_ashr_i32 s0, s0, 16
; SI-NEXT:    s_lshr_b32 s3, s1, 16
; SI-NEXT:    s_ashr_i32 s0, s0, s3
; SI-NEXT:    s_ashr_i32 s1, s2, s1
; SI-NEXT:    s_lshl_b32 s0, s0, 16
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_or_b32 s0, s1, s0
; SI-NEXT:    v_mov_b32_e32 v0, s0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v1
; VI-NEXT:    v_readfirstlane_b32 s1, v0
; VI-NEXT:    s_ashr_i32 s2, s1, 16
; VI-NEXT:    s_sext_i32_i16 s1, s1
; VI-NEXT:    s_ashr_i32 s3, s0, 16
; VI-NEXT:    s_sext_i32_i16 s0, s0
; VI-NEXT:    s_ashr_i32 s0, s1, s0
; VI-NEXT:    s_ashr_i32 s1, s2, s3
; VI-NEXT:    s_lshl_b32 s1, s1, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s0, s0, s1
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T6.X, T7.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_64 T6.XY, T6.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T6.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     LSHR * T0.W, T6.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T0.Y, PV.W, 0.0, literal.x,
; EG-NEXT:     LSHR T0.Z, T6.Y, literal.x,
; EG-NEXT:     BFE_INT T0.W, T6.X, 0.0, literal.x,
; EG-NEXT:     AND_INT * T1.W, T6.Y, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     ASHR * T1.W, PV.Y, PV.Z,
; EG-NEXT:     LSHL T1.W, PS, literal.x,
; EG-NEXT:     AND_INT * T0.W, PV.W, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     OR_INT T6.X, PS, PV.W,
; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %in, i16 1
  %a = load <2 x i16>, ptr addrspace(1) %in
  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
  %result = ashr <2 x i16> %a, %b
  store <2 x i16> %result, ptr addrspace(1) %out
  ret void
}

; FIXME: The ashr operation is uniform, but because its operands come from a
; global load we end up with the vector instructions rather than scalar.
; <4 x i16> ashr; widened to scalar 32-bit shifts per lane on SI/VI and repacked.
define amdgpu_kernel void @ashr_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_readfirstlane_b32 s0, v3
; SI-NEXT:    v_readfirstlane_b32 s1, v2
; SI-NEXT:    v_readfirstlane_b32 s2, v1
; SI-NEXT:    v_readfirstlane_b32 s3, v0
; SI-NEXT:    s_sext_i32_i16 s8, s3
; SI-NEXT:    s_ashr_i32 s3, s3, 16
; SI-NEXT:    s_sext_i32_i16 s9, s2
; SI-NEXT:    s_ashr_i32 s2, s2, 16
; SI-NEXT:    s_lshr_b32 s10, s1, 16
; SI-NEXT:    s_lshr_b32 s11, s0, 16
; SI-NEXT:    s_ashr_i32 s2, s2, s11
; SI-NEXT:    s_ashr_i32 s0, s9, s0
; SI-NEXT:    s_ashr_i32 s3, s3, s10
; SI-NEXT:    s_ashr_i32 s1, s8, s1
; SI-NEXT:    s_lshl_b32 s2, s2, 16
; SI-NEXT:    s_and_b32 s0, s0, 0xffff
; SI-NEXT:    s_lshl_b32 s3, s3, 16
; SI-NEXT:    s_and_b32 s1, s1, 0xffff
; SI-NEXT:    s_or_b32 s0, s0, s2
; SI-NEXT:    s_or_b32 s1, s1, s3
; SI-NEXT:    v_mov_b32_e32 v0, s1
; SI-NEXT:    v_mov_b32_e32 v1, s0
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_readfirstlane_b32 s0, v2
; VI-NEXT:    v_readfirstlane_b32 s1, v3
; VI-NEXT:    v_readfirstlane_b32 s2, v0
; VI-NEXT:    v_readfirstlane_b32 s3, v1
; VI-NEXT:    s_ashr_i32 s8, s3, 16
; VI-NEXT:    s_sext_i32_i16 s3, s3
; VI-NEXT:    s_ashr_i32 s9, s2, 16
; VI-NEXT:    s_sext_i32_i16 s2, s2
; VI-NEXT:    s_ashr_i32 s10, s1, 16
; VI-NEXT:    s_sext_i32_i16 s1, s1
; VI-NEXT:    s_ashr_i32 s11, s0, 16
; VI-NEXT:    s_sext_i32_i16 s0, s0
; VI-NEXT:    s_ashr_i32 s0, s2, s0
; VI-NEXT:    s_ashr_i32 s2, s9, s11
; VI-NEXT:    s_ashr_i32 s1, s3, s1
; VI-NEXT:    s_ashr_i32 s3, s8, s10
; VI-NEXT:    s_lshl_b32 s3, s3, 16
; VI-NEXT:    s_and_b32 s1, s1, 0xffff
; VI-NEXT:    s_lshl_b32 s2, s2, 16
; VI-NEXT:    s_and_b32 s0, s0, 0xffff
; VI-NEXT:    s_or_b32 s1, s1, s3
; VI-NEXT:    s_or_b32 s0, s0, s2
; VI-NEXT:    v_mov_b32_e32 v0, s0
; VI-NEXT:    v_mov_b32_e32 v1, s1
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i16:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 48, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T9.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T9.XYZW, T9.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.Y, T6.X,
; EG-NEXT:     MOV * T9.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     BFE_INT T0.W, T9.X, 0.0, literal.x,
; EG-NEXT:     AND_INT * T1.W, T9.Z, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     ASHR * T0.W, PV.W, PS,
; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
; EG-NEXT:    65535(9.183409e-41), -65536(nan)
; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
; EG-NEXT:     MOV * T6.X, PV.W,
; EG-NEXT:     MOV T0.Y, PV.X,
; EG-NEXT:     LSHR * T0.W, T9.X, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT:     LSHR * T1.W, T9.Z, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:     MOV T6.X, PV.W,
; EG-NEXT:     MOV T0.Y, T7.X,
; EG-NEXT:     BFE_INT T0.W, T9.Y, 0.0, literal.x,
; EG-NEXT:     AND_INT * T1.W, T9.W, literal.y,
; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     AND_INT * T1.W, PV.Y, literal.x,
; EG-NEXT:    -65536(nan), 0(0.000000e+00)
; EG-NEXT:     AND_INT * T0.W, PV.W, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
; EG-NEXT:     MOV * T7.X, PV.W,
; EG-NEXT:     MOV T0.Y, PV.X,
; EG-NEXT:     LSHR * T0.W, T9.Y, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     BFE_INT T0.W, PV.W, 0.0, literal.x,
; EG-NEXT:     LSHR * T1.W, T9.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T0.W, PV.W, PS,
; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
; EG-NEXT:     LSHR T9.X, KC0[2].Y, literal.x,
; EG-NEXT:     OR_INT * T10.Y, T1.W, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; EG-NEXT:     MOV T7.X, PV.Y,
; EG-NEXT:     MOV * T10.X, T6.X,
  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %in, i16 1
  %a = load <4 x i16>, ptr addrspace(1) %in
  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
  %result = ashr <4 x i16> %a, %b
  store <4 x i16> %result, ptr addrspace(1) %out
  ret void
}

; ashr of a sign-extended i32 by a constant 8; selected as s_ashr_i64 on SI/VI.
define amdgpu_kernel void @s_ashr_i64(ptr addrspace(1) %out, i32 %in) {
; SI-LABEL: s_ashr_i64:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s7, s6, 31
; SI-NEXT:    s_ashr_i64 s[4:5], s[6:7], 8
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_i64:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s7, s6, 31
; VI-NEXT:    s_ashr_i64 s[4:5], s[6:7], 8
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_i64:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     ASHR * T0.Y, KC0[2].Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     BIT_ALIGN_INT T0.X, PV.Y, KC0[2].Z, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    8(1.121039e-44), 2(2.802597e-45)
entry:
  %in.ext = sext i32 %in to i64
  %ashr = ashr i64 %in.ext, 8
  store i64 %ashr, ptr addrspace(1) %out
  ret void
}

; Variable i64 ashr with both operands loaded from global memory.
define amdgpu_kernel void @ashr_i64_2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_i64_2:
; SI:       ; %bb.0: ; %entry
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v2
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_i64_2:
; VI:       ; %bb.0: ; %entry
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[0:1], v2, v[0:1]
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_i64_2:
; EG:       ; %bb.0: ; %entry
; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 0 @6
; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 8:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 9:
; EG-NEXT:     AND_INT * T0.W, T0.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.Z, T0.Y, PV.W,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T0.Z,
; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Z,
; EG-NEXT:     ASHR T0.W, T0.Y, literal.x,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
; EG-NEXT:    31(4.344025e-44), 2(2.802597e-45)
; EG-NEXT:     CNDE_INT * T0.Y, T1.W, T1.Z, PV.W,
entry:
  %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1
  %a = load i64, ptr addrspace(1) %in
  %b = load i64, ptr addrspace(1) %b_ptr
  %result = ashr i64 %a, %b
  store i64 %result, ptr addrspace(1) %out
  ret void
}

; Variable <2 x i64> ashr with both operands loaded from global memory.
define amdgpu_kernel void @ashr_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v2i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v2i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v2i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 1 @6
; EG-NEXT:    ALU 19, @11, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
; EG-NEXT:    ALU clause starting at 10:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 11:
; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.Y, T0.W, PV.W,
; EG-NEXT:     AND_INT T2.Z, T1.Z, literal.x,
; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.W, T0.Z, T1.Z,
; EG-NEXT:     AND_INT * T2.W, T1.X, literal.y,
; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
; EG-NEXT:     ASHR T2.Y, T0.Y, PS,
; EG-NEXT:     CNDE_INT T0.Z, PV.Z, PV.W, PV.Y,
; EG-NEXT:     BIT_ALIGN_INT T1.W, T0.Y, T0.X, T1.X,
; EG-NEXT:     AND_INT * T2.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:     ASHR T0.W, T0.W, literal.x,
; EG-NEXT:     ASHR * T1.W, T0.Y, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT * T0.W, T2.Z, T1.Y, PV.W,
; EG-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
; EG-NEXT:     CNDE_INT * T0.Y, T2.W, T2.Y, T1.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1
  %a = load <2 x i64>, ptr addrspace(1) %in
  %b = load <2 x i64>, ptr addrspace(1) %b_ptr
  %result = ashr <2 x i64> %a, %b
  store <2 x i64> %result, ptr addrspace(1) %out
  ret void
}

; FIXME: Broken on r600
; Variable <4 x i64> ashr with both operands loaded from global memory.
define amdgpu_kernel void @ashr_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; SI-LABEL: ashr_v4i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_mov_b32 s10, s2
; SI-NEXT:    s_mov_b32 s11, s3
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s6
; SI-NEXT:    s_mov_b32 s9, s7
; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; SI-NEXT:    s_mov_b32 s0, s4
; SI-NEXT:    s_mov_b32 s1, s5
; SI-NEXT:    s_waitcnt vmcnt(2)
; SI-NEXT:    v_ashr_i64 v[2:3], v[2:3], v6
; SI-NEXT:    v_ashr_i64 v[0:1], v[0:1], v4
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_ashr_i64 v[9:10], v[9:10], v13
; SI-NEXT:    v_ashr_i64 v[7:8], v[7:8], v11
; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: ashr_v4i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_mov_b32 s10, s2
; VI-NEXT:    s_mov_b32 s11, s3
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s6
; VI-NEXT:    s_mov_b32 s9, s7
; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
; VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
; VI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
; VI-NEXT:    s_mov_b32 s0, s4
; VI-NEXT:    s_mov_b32 s1, s5
; VI-NEXT:    s_waitcnt vmcnt(2)
; VI-NEXT:    v_ashrrev_i64 v[2:3], v6, v[2:3]
; VI-NEXT:    v_ashrrev_i64 v[0:1], v4, v[0:1]
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_ashrrev_i64 v[9:10], v13, v[9:10]
; VI-NEXT:    v_ashrrev_i64 v[7:8], v11, v[7:8]
; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
; VI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: ashr_v4i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
; EG-NEXT:    TEX 3 @6
; EG-NEXT:    ALU 39, @15, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    Fetch clause starting at 6:
; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 48, #1
; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 0, #1
; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 16, #1
; EG-NEXT:    ALU clause starting at 14:
; EG-NEXT:     MOV * T0.X, KC0[2].Z,
; EG-NEXT:    ALU clause starting at 15:
; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ASHR T1.Y, T0.W, literal.x,
; EG-NEXT:     ASHR T4.Z, T3.W, PV.W, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT T1.W, T1.Z, literal.y,
; EG-NEXT:     AND_INT * T2.W, T2.Z, literal.x,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:     BIT_ALIGN_INT T4.X, T3.W, T3.Z, T1.Z,
; EG-NEXT:     ASHR T2.Y, T0.W, PS, BS:VEC_120/SCL_212
; EG-NEXT:     AND_INT * T1.Z, T2.Z, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, T2.Z,
; EG-NEXT:     AND_INT * T2.W, T2.X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     AND_INT T5.X, T1.X, literal.x,
; EG-NEXT:     ASHR T4.Y, T0.Y, PS,
; EG-NEXT:     CNDE_INT T0.Z, T1.Z, PV.W, T2.Y,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, T2.X,
; EG-NEXT:     AND_INT * T2.W, T2.X, literal.y,
; EG-NEXT:    31(4.344025e-44), 32(4.484155e-44)
; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, PV.Y,
; EG-NEXT:     ASHR T5.Y, T3.Y, PV.X,
; EG-NEXT:     CNDE_INT T2.Z, T1.W, T4.X, T4.Z,
; EG-NEXT:     BIT_ALIGN_INT T0.W, T3.Y, T3.X, T1.X, BS:VEC_102/SCL_221
; EG-NEXT:     AND_INT * T4.W, T1.X, literal.x,
; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT:     CNDE_INT T2.X, PS, PV.W, PV.Y,
; EG-NEXT:     ASHR T6.Y, T3.W, literal.x,
; EG-NEXT:     ASHR T3.Z, T0.Y, literal.x, BS:VEC_201
; EG-NEXT:     ADD_INT T3.W, KC0[2].Y, literal.y,
; EG-NEXT:     CNDE_INT * T0.W, T1.Z, T2.Y, T1.Y,
; EG-NEXT:    31(4.344025e-44), 16(2.242078e-44)
; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
; EG-NEXT:     CNDE_INT T0.Y, T2.W, T4.Y, PV.Z,
; EG-NEXT:     ASHR T3.W, T3.Y, literal.y,
; EG-NEXT:     CNDE_INT * T2.W, T1.W, T4.Z, PV.Y,
; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
; EG-NEXT:     LSHR T3.X, KC0[2].Y, literal.x,
; EG-NEXT:     CNDE_INT * T2.Y, T4.W, T5.Y, PV.W,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
  %a = load <4 x i64>, ptr addrspace(1) %in
  %b = load <4 x i64>, ptr addrspace(1) %b_ptr
  %result = ashr <4 x i64> %a, %b
  store <4 x i64> %result, ptr addrspace(1) %out
  ret void
}

; ashr i64 by exactly 32: folds to broadcasting the sign bit (s_ashr_i32 by 31)
; plus a 64-bit add of %b -- no 64-bit shift instruction is emitted.
define amdgpu_kernel void @s_ashr_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
; SI-LABEL: s_ashr_32_i64:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s8, s[4:5], 0x14
; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x1d
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_ashr_i32 s5, s8, 31
; SI-NEXT:    s_add_u32 s4, s8, s6
; SI-NEXT:    s_addc_u32 s5, s5, s7
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    v_mov_b32_e32 v1, s5
; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: s_ashr_32_i64:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s8, s[4:5], 0x50
; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x74
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_ashr_i32 s5, s8, 31
; VI-NEXT:    s_add_u32 s4, s8, s6
; VI-NEXT:    s_addc_u32 s5, s5, s7
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    v_mov_b32_e32 v1, s5
; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT:    s_endpgm
;
; EG-LABEL: s_ashr_32_i64:
; EG:       ; %bb.0:
; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
; EG-NEXT:    CF_END
; EG-NEXT:    PAD
; EG-NEXT:    ALU clause starting at 4:
; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
; EG-NEXT:     ADD_INT * T0.W, PV.W, KC0[7].Z,
; EG-NEXT:     ADDC_UINT * T1.W, KC0[5].X, KC0[7].Y,
; EG-NEXT:     ADD_INT * T0.Y, T0.W, PV.W,
; EG-NEXT:     ADD_INT * T0.X, KC0[5].X, KC0[7].Y,
; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  %result = ashr i64 %a, 32
  %add = add i64 %result, %b
  store i64 %add, ptr addrspace(1) %out
  ret void
}

768define amdgpu_kernel void @v_ashr_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
769; SI-LABEL: v_ashr_32_i64:
770; SI:       ; %bb.0:
771; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
772; SI-NEXT:    s_mov_b32 s7, 0xf000
773; SI-NEXT:    s_mov_b32 s6, 0
774; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
775; SI-NEXT:    v_mov_b32_e32 v1, 0
776; SI-NEXT:    s_waitcnt lgkmcnt(0)
777; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
778; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
779; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
780; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
781; SI-NEXT:    s_waitcnt vmcnt(0)
782; SI-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
783; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
784; SI-NEXT:    s_endpgm
785;
786; VI-LABEL: v_ashr_32_i64:
787; VI:       ; %bb.0:
788; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
789; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
790; VI-NEXT:    s_waitcnt lgkmcnt(0)
791; VI-NEXT:    v_mov_b32_e32 v0, s3
792; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
793; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
794; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
795; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
796; VI-NEXT:    flat_load_dword v0, v[0:1]
797; VI-NEXT:    v_mov_b32_e32 v1, s1
798; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
799; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
800; VI-NEXT:    s_waitcnt vmcnt(0)
801; VI-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
802; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
803; VI-NEXT:    s_endpgm
804;
805; EG-LABEL: v_ashr_32_i64:
806; EG:       ; %bb.0:
807; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
808; EG-NEXT:    TEX 0 @6
809; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
810; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
811; EG-NEXT:    CF_END
812; EG-NEXT:    PAD
813; EG-NEXT:    Fetch clause starting at 6:
814; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
815; EG-NEXT:    ALU clause starting at 8:
816; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
817; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
818; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
819; EG-NEXT:    ALU clause starting at 11:
820; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
821; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
822; EG-NEXT:     ASHR * T0.Y, T0.X, literal.y,
823; EG-NEXT:    2(2.802597e-45), 31(4.344025e-44)
824  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
825  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
826  %gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid
827  %a = load i64, ptr addrspace(1) %gep.in
828  %result = ashr i64 %a, 32
829  store i64 %result, ptr addrspace(1) %gep.out
830  ret void
831}
832
; s_ashr_63_i64: uniform (SGPR) ashr of an i64 kernel argument by 63, folded
; into a following add. Shifting by 63 broadcasts the sign bit to all 64 bits,
; so both result words are produced by one s_ashr_i32 of the high source word
; by 31 (checked below for SI/VI; EG builds the same value with ASHR by 31).
; The [8 x i32] padding arguments push %a and %b to fixed, non-adjacent
; constant-buffer offsets so each is fetched with its own s_load.
833define amdgpu_kernel void @s_ashr_63_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
834; SI-LABEL: s_ashr_63_i64:
835; SI:       ; %bb.0:
836; SI-NEXT:    s_load_dword s8, s[4:5], 0x14
837; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x1d
838; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
839; SI-NEXT:    s_mov_b32 s3, 0xf000
840; SI-NEXT:    s_mov_b32 s2, -1
841; SI-NEXT:    s_waitcnt lgkmcnt(0)
842; SI-NEXT:    s_ashr_i32 s5, s8, 31
843; SI-NEXT:    s_add_u32 s4, s5, s6
844; SI-NEXT:    s_addc_u32 s5, s5, s7
845; SI-NEXT:    v_mov_b32_e32 v0, s4
846; SI-NEXT:    v_mov_b32_e32 v1, s5
847; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
848; SI-NEXT:    s_endpgm
849;
850; VI-LABEL: s_ashr_63_i64:
851; VI:       ; %bb.0:
852; VI-NEXT:    s_load_dword s8, s[4:5], 0x50
853; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x74
854; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
855; VI-NEXT:    s_mov_b32 s3, 0xf000
856; VI-NEXT:    s_mov_b32 s2, -1
857; VI-NEXT:    s_waitcnt lgkmcnt(0)
858; VI-NEXT:    s_ashr_i32 s5, s8, 31
859; VI-NEXT:    s_add_u32 s4, s5, s6
860; VI-NEXT:    s_addc_u32 s5, s5, s7
861; VI-NEXT:    v_mov_b32_e32 v0, s4
862; VI-NEXT:    v_mov_b32_e32 v1, s5
863; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
864; VI-NEXT:    s_endpgm
865;
866; EG-LABEL: s_ashr_63_i64:
867; EG:       ; %bb.0:
868; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
869; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
870; EG-NEXT:    CF_END
871; EG-NEXT:    PAD
872; EG-NEXT:    ALU clause starting at 4:
873; EG-NEXT:     ASHR * T0.W, KC0[5].X, literal.x,
874; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
875; EG-NEXT:     ADD_INT T1.W, PV.W, KC0[7].Z,
876; EG-NEXT:     ADDC_UINT * T2.W, PV.W, KC0[7].Y,
877; EG-NEXT:     ADD_INT * T0.Y, PV.W, PS,
878; EG-NEXT:     ADD_INT T0.X, T0.W, KC0[7].Y,
879; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
880; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR body: the add keeps the shift from being folded away entirely and checks
; that the sign-broadcast result feeds a 64-bit add correctly.
881  %result = ashr i64 %a, 63
882  %add = add i64 %result, %b
883  store i64 %add, ptr addrspace(1) %out
884  ret void
885}
886
; v_ashr_63_i64: divergent (per-lane) ashr of a loaded i64 by 63. As with the
; scalar variant, the result is the sign bit replicated to both words, so the
; checks show a single dword load of the high source word, one v_ashrrev_i32
; by 31, and a register copy (v_mov / MOV) to duplicate it into the second
; result word -- no 64-bit shift and no load of the low source dword.
887define amdgpu_kernel void @v_ashr_63_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
888; SI-LABEL: v_ashr_63_i64:
889; SI:       ; %bb.0:
890; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
891; SI-NEXT:    s_mov_b32 s7, 0xf000
892; SI-NEXT:    s_mov_b32 s6, 0
893; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
894; SI-NEXT:    v_mov_b32_e32 v1, 0
895; SI-NEXT:    s_waitcnt lgkmcnt(0)
896; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
897; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
898; SI-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 offset:4
899; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
900; SI-NEXT:    s_waitcnt vmcnt(0)
901; SI-NEXT:    v_ashrrev_i32_e32 v2, 31, v2
902; SI-NEXT:    v_mov_b32_e32 v3, v2
903; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
904; SI-NEXT:    s_endpgm
905;
906; VI-LABEL: v_ashr_63_i64:
907; VI:       ; %bb.0:
908; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
909; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
910; VI-NEXT:    s_waitcnt lgkmcnt(0)
911; VI-NEXT:    v_mov_b32_e32 v0, s3
912; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v2
913; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v0, vcc
914; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v1
915; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
916; VI-NEXT:    flat_load_dword v3, v[0:1]
917; VI-NEXT:    v_mov_b32_e32 v1, s1
918; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
919; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
920; VI-NEXT:    s_waitcnt vmcnt(0)
921; VI-NEXT:    v_ashrrev_i32_e32 v2, 31, v3
922; VI-NEXT:    v_mov_b32_e32 v3, v2
923; VI-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
924; VI-NEXT:    s_endpgm
925;
926; EG-LABEL: v_ashr_63_i64:
927; EG:       ; %bb.0:
928; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
929; EG-NEXT:    TEX 0 @6
930; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
931; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
932; EG-NEXT:    CF_END
933; EG-NEXT:    PAD
934; EG-NEXT:    Fetch clause starting at 6:
935; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
936; EG-NEXT:    ALU clause starting at 8:
937; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
938; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
939; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
940; EG-NEXT:    ALU clause starting at 11:
941; EG-NEXT:     ASHR T0.X, T0.X, literal.x,
942; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
943; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
944; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
945; EG-NEXT:     MOV * T0.Y, PV.X,
946; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
; IR body: per-workitem addressing keeps the shifted value divergent (VGPR),
; mirroring @v_ashr_32_i64 but with the maximum in-range shift amount.
947  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
948  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
949  %gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid
950  %a = load i64, ptr addrspace(1) %gep.in
951  %result = ashr i64 %a, 63
952  store i64 %result, ptr addrspace(1) %gep.out
953  ret void
954}
955
956attributes #0 = { nounwind readnone }
957