xref: /llvm-project/llvm/test/CodeGen/AMDGPU/shl.ll (revision 6206f5444fc0732e6495703c75a67f1f90f5b418)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-- -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefixes=SI
3; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=VI
4; RUN: llc < %s -amdgpu-scalarize-global-loads=false  -mtriple=r600-- -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefixes=EG
5
6declare i32 @llvm.amdgcn.workitem.id.x() #0
7
8declare i32 @llvm.amdgcn.workgroup.id.x() #0
9
; Kernel shl_v2i32: loads two consecutive <2 x i32> vectors starting at %in
; (%a at element 0, %b at element 1), computes the element-wise shl of %a by
; %b, and stores the <2 x i32> result to %out.
; The autogenerated check lines below pin the llc output for the three RUN
; configurations at the top of the file (verde, tonga and redwood). In the
; tonga checks the operands are fetched with scalar loads and shifted with
; s_lshl_b32; the verde checks use a buffer load and v_lshl_b32.
10define amdgpu_kernel void @shl_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
11; SI-LABEL: shl_v2i32:
12; SI:       ; %bb.0:
13; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
14; SI-NEXT:    s_mov_b32 s7, 0xf000
15; SI-NEXT:    s_mov_b32 s6, -1
16; SI-NEXT:    s_mov_b32 s10, s6
17; SI-NEXT:    s_mov_b32 s11, s7
18; SI-NEXT:    s_waitcnt lgkmcnt(0)
19; SI-NEXT:    s_mov_b32 s8, s2
20; SI-NEXT:    s_mov_b32 s9, s3
21; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
22; SI-NEXT:    s_mov_b32 s4, s0
23; SI-NEXT:    s_mov_b32 s5, s1
24; SI-NEXT:    s_waitcnt vmcnt(0)
25; SI-NEXT:    v_lshl_b32_e32 v1, v1, v3
26; SI-NEXT:    v_lshl_b32_e32 v0, v0, v2
27; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
28; SI-NEXT:    s_endpgm
29;
30; VI-LABEL: shl_v2i32:
31; VI:       ; %bb.0:
32; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
33; VI-NEXT:    s_waitcnt lgkmcnt(0)
34; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
35; VI-NEXT:    s_mov_b32 s3, 0xf000
36; VI-NEXT:    s_mov_b32 s2, -1
37; VI-NEXT:    s_waitcnt lgkmcnt(0)
38; VI-NEXT:    s_lshl_b32 s5, s5, s7
39; VI-NEXT:    s_lshl_b32 s4, s4, s6
40; VI-NEXT:    v_mov_b32_e32 v0, s4
41; VI-NEXT:    v_mov_b32_e32 v1, s5
42; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
43; VI-NEXT:    s_endpgm
44;
45; EG-LABEL: shl_v2i32:
46; EG:       ; %bb.0:
47; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
48; EG-NEXT:    TEX 0 @6
49; EG-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
50; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
51; EG-NEXT:    CF_END
52; EG-NEXT:    PAD
53; EG-NEXT:    Fetch clause starting at 6:
54; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
55; EG-NEXT:    ALU clause starting at 8:
56; EG-NEXT:     MOV * T0.X, KC0[2].Z,
57; EG-NEXT:    ALU clause starting at 9:
58; EG-NEXT:     LSHL * T0.Y, T0.Y, T0.W,
59; EG-NEXT:     LSHL T0.X, T0.X, T0.Z,
60; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
61; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr addresses the <2 x i32> that immediately follows the one at %in.
62  %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1
63  %a = load <2 x i32>, ptr addrspace(1) %in
64  %b = load <2 x i32>, ptr addrspace(1) %b_ptr
65  %result = shl <2 x i32> %a, %b
66  store <2 x i32> %result, ptr addrspace(1) %out
67  ret void
68}
69
; Kernel shl_v4i32: loads two consecutive <4 x i32> vectors starting at %in
; (%a at element 0, %b at element 1), computes the element-wise shl of %a by
; %b, and stores the <4 x i32> result to %out.
; The autogenerated check lines pin one 32-bit shift per vector lane for each
; of the three configurations (four v_lshl_b32, four s_lshl_b32, four LSHL).
70define amdgpu_kernel void @shl_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
71; SI-LABEL: shl_v4i32:
72; SI:       ; %bb.0:
73; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
74; SI-NEXT:    s_mov_b32 s7, 0xf000
75; SI-NEXT:    s_mov_b32 s6, -1
76; SI-NEXT:    s_mov_b32 s10, s6
77; SI-NEXT:    s_mov_b32 s11, s7
78; SI-NEXT:    s_waitcnt lgkmcnt(0)
79; SI-NEXT:    s_mov_b32 s8, s2
80; SI-NEXT:    s_mov_b32 s9, s3
81; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
82; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
83; SI-NEXT:    s_mov_b32 s4, s0
84; SI-NEXT:    s_mov_b32 s5, s1
85; SI-NEXT:    s_waitcnt vmcnt(0)
86; SI-NEXT:    v_lshl_b32_e32 v3, v3, v7
87; SI-NEXT:    v_lshl_b32_e32 v2, v2, v6
88; SI-NEXT:    v_lshl_b32_e32 v1, v1, v5
89; SI-NEXT:    v_lshl_b32_e32 v0, v0, v4
90; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
91; SI-NEXT:    s_endpgm
92;
93; VI-LABEL: shl_v4i32:
94; VI:       ; %bb.0:
95; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
96; VI-NEXT:    s_waitcnt lgkmcnt(0)
97; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
98; VI-NEXT:    s_mov_b32 s11, 0xf000
99; VI-NEXT:    s_mov_b32 s10, -1
100; VI-NEXT:    s_waitcnt lgkmcnt(0)
101; VI-NEXT:    s_lshl_b32 s3, s3, s7
102; VI-NEXT:    s_lshl_b32 s2, s2, s6
103; VI-NEXT:    s_lshl_b32 s1, s1, s5
104; VI-NEXT:    s_lshl_b32 s0, s0, s4
105; VI-NEXT:    v_mov_b32_e32 v0, s0
106; VI-NEXT:    v_mov_b32_e32 v1, s1
107; VI-NEXT:    v_mov_b32_e32 v2, s2
108; VI-NEXT:    v_mov_b32_e32 v3, s3
109; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
110; VI-NEXT:    s_endpgm
111;
112; EG-LABEL: shl_v4i32:
113; EG:       ; %bb.0:
114; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
115; EG-NEXT:    TEX 1 @6
116; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
117; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
118; EG-NEXT:    CF_END
119; EG-NEXT:    PAD
120; EG-NEXT:    Fetch clause starting at 6:
121; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
122; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
123; EG-NEXT:    ALU clause starting at 10:
124; EG-NEXT:     MOV * T0.X, KC0[2].Z,
125; EG-NEXT:    ALU clause starting at 11:
126; EG-NEXT:     LSHL * T0.W, T0.W, T1.W,
127; EG-NEXT:     LSHL * T0.Z, T0.Z, T1.Z,
128; EG-NEXT:     LSHL * T0.Y, T0.Y, T1.Y,
129; EG-NEXT:     LSHL T0.X, T0.X, T1.X,
130; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
131; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr addresses the <4 x i32> that immediately follows the one at %in.
132  %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1
133  %a = load <4 x i32>, ptr addrspace(1) %in
134  %b = load <4 x i32>, ptr addrspace(1) %b_ptr
135  %result = shl <4 x i32> %a, %b
136  store <4 x i32> %result, ptr addrspace(1) %out
137  ret void
138}
139
; Kernel shl_i16: loads two consecutive i16 values starting at %in (%a at
; element 0, %b at element 1), computes shl of %a by %b, and stores the i16
; result to %out.
; In the verde and tonga checks both operands are fetched with
; buffer_load_ushort, shifted with a 32-bit shift instruction, and the low
; 16 bits are written with buffer_store_short.
140define amdgpu_kernel void @shl_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
141; SI-LABEL: shl_i16:
142; SI:       ; %bb.0:
143; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
144; SI-NEXT:    s_mov_b32 s7, 0xf000
145; SI-NEXT:    s_mov_b32 s6, -1
146; SI-NEXT:    s_mov_b32 s10, s6
147; SI-NEXT:    s_mov_b32 s11, s7
148; SI-NEXT:    s_waitcnt lgkmcnt(0)
149; SI-NEXT:    s_mov_b32 s8, s2
150; SI-NEXT:    s_mov_b32 s9, s3
151; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
152; SI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
153; SI-NEXT:    s_mov_b32 s4, s0
154; SI-NEXT:    s_mov_b32 s5, s1
155; SI-NEXT:    s_waitcnt vmcnt(0)
156; SI-NEXT:    v_lshl_b32_e32 v0, v0, v1
157; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
158; SI-NEXT:    s_endpgm
159;
160; VI-LABEL: shl_i16:
161; VI:       ; %bb.0:
162; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
163; VI-NEXT:    s_mov_b32 s7, 0xf000
164; VI-NEXT:    s_mov_b32 s6, -1
165; VI-NEXT:    s_mov_b32 s10, s6
166; VI-NEXT:    s_mov_b32 s11, s7
167; VI-NEXT:    s_waitcnt lgkmcnt(0)
168; VI-NEXT:    s_mov_b32 s8, s2
169; VI-NEXT:    s_mov_b32 s9, s3
170; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
171; VI-NEXT:    buffer_load_ushort v1, off, s[8:11], 0 offset:2
172; VI-NEXT:    s_mov_b32 s4, s0
173; VI-NEXT:    s_mov_b32 s5, s1
174; VI-NEXT:    s_waitcnt vmcnt(0)
175; VI-NEXT:    v_lshlrev_b32_e32 v0, v1, v0
176; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
177; VI-NEXT:    s_endpgm
178;
179; EG-LABEL: shl_i16:
180; EG:       ; %bb.0:
181; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
182; EG-NEXT:    TEX 1 @6
183; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
184; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
185; EG-NEXT:    CF_END
186; EG-NEXT:    PAD
187; EG-NEXT:    Fetch clause starting at 6:
188; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
189; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
190; EG-NEXT:    ALU clause starting at 10:
191; EG-NEXT:     MOV * T0.X, KC0[2].Z,
192; EG-NEXT:    ALU clause starting at 11:
193; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
194; EG-NEXT:     LSHL * T1.W, T0.X, T1.X,
195; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
196; EG-NEXT:     AND_INT T1.W, PS, literal.x,
197; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
198; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
199; EG-NEXT:     LSHL T0.X, PV.W, PS,
200; EG-NEXT:     LSHL * T0.W, literal.x, PS,
201; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
202; EG-NEXT:     MOV T0.Y, 0.0,
203; EG-NEXT:     MOV * T0.Z, 0.0,
204; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
205; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr addresses the i16 that immediately follows the one at %in.
206  %b_ptr = getelementptr i16, ptr addrspace(1) %in, i16 1
207  %a = load i16, ptr addrspace(1) %in
208  %b = load i16, ptr addrspace(1) %b_ptr
209  %result = shl i16 %a, %b
210  store i16 %result, ptr addrspace(1) %out
211  ret void
212}
213
; Kernel shl_i16_v_s: the shifted value %a is loaded from memory at %in while
; the shift amount %b is passed directly as an i16 kernel argument. The i16
; result of shl %a, %b is stored to %out.
; In the verde and tonga checks the shift amount arrives through a separate
; s_load_dword and is used as the scalar operand of v_lshlrev_b32.
214define amdgpu_kernel void @shl_i16_v_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) {
215; SI-LABEL: shl_i16_v_s:
216; SI:       ; %bb.0:
217; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
218; SI-NEXT:    s_load_dword s12, s[4:5], 0xd
219; SI-NEXT:    s_mov_b32 s7, 0xf000
220; SI-NEXT:    s_mov_b32 s6, -1
221; SI-NEXT:    s_mov_b32 s10, s6
222; SI-NEXT:    s_waitcnt lgkmcnt(0)
223; SI-NEXT:    s_mov_b32 s8, s2
224; SI-NEXT:    s_mov_b32 s9, s3
225; SI-NEXT:    s_mov_b32 s11, s7
226; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
227; SI-NEXT:    s_mov_b32 s4, s0
228; SI-NEXT:    s_mov_b32 s5, s1
229; SI-NEXT:    s_waitcnt vmcnt(0)
230; SI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
231; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
232; SI-NEXT:    s_endpgm
233;
234; VI-LABEL: shl_i16_v_s:
235; VI:       ; %bb.0:
236; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
237; VI-NEXT:    s_load_dword s12, s[4:5], 0x34
238; VI-NEXT:    s_mov_b32 s7, 0xf000
239; VI-NEXT:    s_mov_b32 s6, -1
240; VI-NEXT:    s_mov_b32 s10, s6
241; VI-NEXT:    s_waitcnt lgkmcnt(0)
242; VI-NEXT:    s_mov_b32 s8, s2
243; VI-NEXT:    s_mov_b32 s9, s3
244; VI-NEXT:    s_mov_b32 s11, s7
245; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
246; VI-NEXT:    s_mov_b32 s4, s0
247; VI-NEXT:    s_mov_b32 s5, s1
248; VI-NEXT:    s_waitcnt vmcnt(0)
249; VI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
250; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
251; VI-NEXT:    s_endpgm
252;
253; EG-LABEL: shl_i16_v_s:
254; EG:       ; %bb.0:
255; EG-NEXT:    ALU 1, @10, KC0[CB0:0-32], KC1[]
256; EG-NEXT:    TEX 1 @6
257; EG-NEXT:    ALU 12, @12, KC0[CB0:0-32], KC1[]
258; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
259; EG-NEXT:    CF_END
260; EG-NEXT:    PAD
261; EG-NEXT:    Fetch clause starting at 6:
262; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
263; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
264; EG-NEXT:    ALU clause starting at 10:
265; EG-NEXT:     MOV T0.X, 0.0,
266; EG-NEXT:     MOV * T1.X, KC0[2].Z,
267; EG-NEXT:    ALU clause starting at 12:
268; EG-NEXT:     AND_INT T0.W, KC0[2].Y, literal.x,
269; EG-NEXT:     LSHL * T1.W, T1.X, T0.X,
270; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
271; EG-NEXT:     AND_INT T1.W, PS, literal.x,
272; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
273; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
274; EG-NEXT:     LSHL T0.X, PV.W, PS,
275; EG-NEXT:     LSHL * T0.W, literal.x, PS,
276; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
277; EG-NEXT:     MOV T0.Y, 0.0,
278; EG-NEXT:     MOV * T0.Z, 0.0,
279; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
280; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Only the shifted value comes from memory; %b is the kernel argument.
281  %a = load i16, ptr addrspace(1) %in
282  %result = shl i16 %a, %b
283  store i16 %result, ptr addrspace(1) %out
284  ret void
285}
286
; Kernel shl_i16_v_compute_s: like shl_i16_v_s, but the shift amount is
; derived from the i16 kernel argument %b by adding 3 before the shift. The
; shifted value %a is loaded from %in and the i16 result is stored to %out.
; In the verde and tonga checks the +3 is performed with s_add_i32 on the
; loaded argument and the sum feeds v_lshlrev_b32 as its scalar operand.
287define amdgpu_kernel void @shl_i16_v_compute_s(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %b) {
288; SI-LABEL: shl_i16_v_compute_s:
289; SI:       ; %bb.0:
290; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
291; SI-NEXT:    s_load_dword s12, s[4:5], 0xd
292; SI-NEXT:    s_mov_b32 s7, 0xf000
293; SI-NEXT:    s_mov_b32 s6, -1
294; SI-NEXT:    s_mov_b32 s10, s6
295; SI-NEXT:    s_waitcnt lgkmcnt(0)
296; SI-NEXT:    s_mov_b32 s8, s2
297; SI-NEXT:    s_mov_b32 s9, s3
298; SI-NEXT:    s_mov_b32 s11, s7
299; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
300; SI-NEXT:    s_add_i32 s12, s12, 3
301; SI-NEXT:    s_mov_b32 s4, s0
302; SI-NEXT:    s_mov_b32 s5, s1
303; SI-NEXT:    s_waitcnt vmcnt(0)
304; SI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
305; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
306; SI-NEXT:    s_endpgm
307;
308; VI-LABEL: shl_i16_v_compute_s:
309; VI:       ; %bb.0:
310; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
311; VI-NEXT:    s_load_dword s12, s[4:5], 0x34
312; VI-NEXT:    s_mov_b32 s7, 0xf000
313; VI-NEXT:    s_mov_b32 s6, -1
314; VI-NEXT:    s_mov_b32 s10, s6
315; VI-NEXT:    s_waitcnt lgkmcnt(0)
316; VI-NEXT:    s_mov_b32 s8, s2
317; VI-NEXT:    s_mov_b32 s9, s3
318; VI-NEXT:    s_mov_b32 s11, s7
319; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
320; VI-NEXT:    s_add_i32 s12, s12, 3
321; VI-NEXT:    s_mov_b32 s4, s0
322; VI-NEXT:    s_mov_b32 s5, s1
323; VI-NEXT:    s_waitcnt vmcnt(0)
324; VI-NEXT:    v_lshlrev_b32_e32 v0, s12, v0
325; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
326; VI-NEXT:    s_endpgm
327;
328; EG-LABEL: shl_i16_v_compute_s:
329; EG:       ; %bb.0:
330; EG-NEXT:    ALU 0, @12, KC0[], KC1[]
331; EG-NEXT:    TEX 0 @8
332; EG-NEXT:    ALU 0, @13, KC0[CB0:0-32], KC1[]
333; EG-NEXT:    TEX 0 @10
334; EG-NEXT:    ALU 15, @14, KC0[CB0:0-32], KC1[]
335; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
336; EG-NEXT:    CF_END
337; EG-NEXT:    PAD
338; EG-NEXT:    Fetch clause starting at 8:
339; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 44, #3
340; EG-NEXT:    Fetch clause starting at 10:
341; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
342; EG-NEXT:    ALU clause starting at 12:
343; EG-NEXT:     MOV * T0.X, 0.0,
344; EG-NEXT:    ALU clause starting at 13:
345; EG-NEXT:     MOV * T1.X, KC0[2].Z,
346; EG-NEXT:    ALU clause starting at 14:
347; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
348; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
349; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
350; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
351; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
352; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
353; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
354; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
355; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
356; EG-NEXT:     LSHL T0.X, PV.W, PS,
357; EG-NEXT:     LSHL * T0.W, literal.x, PS,
358; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
359; EG-NEXT:     MOV T0.Y, 0.0,
360; EG-NEXT:     MOV * T0.Z, 0.0,
361; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
362; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; The shift amount is the kernel argument plus the constant 3.
363  %a = load i16, ptr addrspace(1) %in
364  %b.add = add i16 %b, 3
365  %result = shl i16 %a, %b.add
366  store i16 %result, ptr addrspace(1) %out
367  ret void
368}
369
; Kernel shl_i16_computed_amount: the shift amount is loaded from memory at a
; per-workitem address and then incremented by 3 before the shift.
;   %a  - volatile i16 load from %in itself (not indexed by the workitem id)
;   %b  - volatile i16 load from %in indexed by workitem.id.x, plus one element
;   result = shl %a, (%b + 3), stored to %out
; In the verde and tonga checks both loads carry the glc modifier (presumably
; a consequence of the volatile loads) and each is followed by its own
; s_waitcnt. The tonga checks perform the add and shift with 16-bit
; instructions (v_add_u16, v_lshlrev_b16); the verde checks use 32-bit ones.
; NOTE(review): %gep.out is computed but never used; the store goes to %out and
; %a is read from %in rather than %gep. Altering this would change the
; generated code and require regenerating the check lines.
370define amdgpu_kernel void @shl_i16_computed_amount(ptr addrspace(1) %out, ptr addrspace(1) %in) {
371; SI-LABEL: shl_i16_computed_amount:
372; SI:       ; %bb.0:
373; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
374; SI-NEXT:    s_mov_b32 s7, 0xf000
375; SI-NEXT:    s_mov_b32 s6, -1
376; SI-NEXT:    s_mov_b32 s10, s6
377; SI-NEXT:    s_mov_b32 s11, s7
378; SI-NEXT:    s_waitcnt lgkmcnt(0)
379; SI-NEXT:    s_mov_b32 s8, s2
380; SI-NEXT:    s_mov_b32 s9, s3
381; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
382; SI-NEXT:    v_mov_b32_e32 v1, 0
383; SI-NEXT:    s_mov_b32 s14, 0
384; SI-NEXT:    s_mov_b32 s15, s7
385; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
386; SI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
387; SI-NEXT:    s_waitcnt vmcnt(0)
388; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[12:15], 0 addr64 offset:2 glc
389; SI-NEXT:    s_waitcnt vmcnt(0)
390; SI-NEXT:    s_mov_b32 s4, s0
391; SI-NEXT:    s_mov_b32 s5, s1
392; SI-NEXT:    v_add_i32_e32 v0, vcc, 3, v0
393; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
394; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
395; SI-NEXT:    s_endpgm
396;
397; VI-LABEL: shl_i16_computed_amount:
398; VI:       ; %bb.0:
399; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
400; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
401; VI-NEXT:    s_mov_b32 s7, 0xf000
402; VI-NEXT:    s_mov_b32 s6, -1
403; VI-NEXT:    s_mov_b32 s10, s6
404; VI-NEXT:    s_waitcnt lgkmcnt(0)
405; VI-NEXT:    v_mov_b32_e32 v1, s3
406; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
407; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
408; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
409; VI-NEXT:    s_mov_b32 s8, s2
410; VI-NEXT:    s_mov_b32 s9, s3
411; VI-NEXT:    s_mov_b32 s11, s7
412; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
413; VI-NEXT:    buffer_load_ushort v2, off, s[8:11], 0 glc
414; VI-NEXT:    s_waitcnt vmcnt(0)
415; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
416; VI-NEXT:    s_waitcnt vmcnt(0)
417; VI-NEXT:    s_mov_b32 s4, s0
418; VI-NEXT:    s_mov_b32 s5, s1
419; VI-NEXT:    v_add_u16_e32 v0, 3, v0
420; VI-NEXT:    v_lshlrev_b16_e32 v0, v0, v2
421; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
422; VI-NEXT:    s_endpgm
423;
424; EG-LABEL: shl_i16_computed_amount:
425; EG:       ; %bb.0:
426; EG-NEXT:    ALU 0, @12, KC0[CB0:0-32], KC1[]
427; EG-NEXT:    TEX 0 @8
428; EG-NEXT:    ALU 1, @13, KC0[CB0:0-32], KC1[]
429; EG-NEXT:    TEX 0 @10
430; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
431; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
432; EG-NEXT:    CF_END
433; EG-NEXT:    PAD
434; EG-NEXT:    Fetch clause starting at 8:
435; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
436; EG-NEXT:    Fetch clause starting at 10:
437; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 2, #1
438; EG-NEXT:    ALU clause starting at 12:
439; EG-NEXT:     MOV * T1.X, KC0[2].Z,
440; EG-NEXT:    ALU clause starting at 13:
441; EG-NEXT:     LSHL * T0.W, T0.X, 1,
442; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
443; EG-NEXT:    ALU clause starting at 15:
444; EG-NEXT:     ADD_INT * T0.W, T0.X, literal.x,
445; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
446; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
447; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
448; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
449; EG-NEXT:     LSHL * T0.W, T1.X, PV.W,
450; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
451; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
452; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
453; EG-NEXT:     LSHL T0.X, PV.W, PS,
454; EG-NEXT:     LSHL * T0.W, literal.x, PS,
455; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
456; EG-NEXT:     MOV T0.Y, 0.0,
457; EG-NEXT:     MOV * T0.Z, 0.0,
458; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
459; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Per-workitem addressing: %gep indexes %in by the workitem id; %b_ptr is
  ; one i16 element past %gep. %gep.out is not referenced again below.
460  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
461  %gep = getelementptr inbounds i16, ptr addrspace(1) %in, i32 %tid
462  %gep.out = getelementptr inbounds i16, ptr addrspace(1) %out, i32 %tid
463  %b_ptr = getelementptr i16, ptr addrspace(1) %gep, i16 1
  ; Both operand loads are volatile; %a comes from the base pointer %in.
464  %a = load volatile i16, ptr addrspace(1) %in
465  %b = load volatile i16, ptr addrspace(1) %b_ptr
466  %b.add = add i16 %b, 3
467  %result = shl i16 %a, %b.add
468  store i16 %result, ptr addrspace(1) %out
469  ret void
470}
471
; Kernel shl_i16_i_s: shifts the zeroext i16 kernel argument %a left by the
; constant 12 and stores the i16 result to %out. No operand is loaded from
; global memory by the IR; both inputs are the argument and an immediate.
; In the verde and tonga checks this becomes a single s_lshl_b32 by 12 on the
; loaded argument dword followed by buffer_store_short.
472define amdgpu_kernel void @shl_i16_i_s(ptr addrspace(1) %out, i16 zeroext %a) {
473; SI-LABEL: shl_i16_i_s:
474; SI:       ; %bb.0:
475; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
476; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
477; SI-NEXT:    s_mov_b32 s3, 0xf000
478; SI-NEXT:    s_mov_b32 s2, -1
479; SI-NEXT:    s_waitcnt lgkmcnt(0)
480; SI-NEXT:    s_lshl_b32 s4, s6, 12
481; SI-NEXT:    v_mov_b32_e32 v0, s4
482; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
483; SI-NEXT:    s_endpgm
484;
485; VI-LABEL: shl_i16_i_s:
486; VI:       ; %bb.0:
487; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
488; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
489; VI-NEXT:    s_mov_b32 s3, 0xf000
490; VI-NEXT:    s_mov_b32 s2, -1
491; VI-NEXT:    s_waitcnt lgkmcnt(0)
492; VI-NEXT:    s_lshl_b32 s4, s6, 12
493; VI-NEXT:    v_mov_b32_e32 v0, s4
494; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
495; VI-NEXT:    s_endpgm
496;
497; EG-LABEL: shl_i16_i_s:
498; EG:       ; %bb.0:
499; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
500; EG-NEXT:    TEX 0 @6
501; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
502; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
503; EG-NEXT:    CF_END
504; EG-NEXT:    PAD
505; EG-NEXT:    Fetch clause starting at 6:
506; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
507; EG-NEXT:    ALU clause starting at 8:
508; EG-NEXT:     MOV * T0.X, 0.0,
509; EG-NEXT:    ALU clause starting at 9:
510; EG-NEXT:     BFE_INT T0.W, T0.X, 0.0, literal.x,
511; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
512; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
513; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
514; EG-NEXT:    12(1.681558e-44), 0(0.000000e+00)
515; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
516; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
517; EG-NEXT:    61440(8.609578e-41), 3(4.203895e-45)
518; EG-NEXT:     LSHL T0.X, PV.W, PS,
519; EG-NEXT:     LSHL * T0.W, literal.x, PS,
520; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
521; EG-NEXT:     MOV T0.Y, 0.0,
522; EG-NEXT:     MOV * T0.Z, 0.0,
523; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
524; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Constant shift amount of 12 applied to the argument value.
525  %result = shl i16 %a, 12
526  store i16 %result, ptr addrspace(1) %out
527  ret void
528}
529
; Kernel shl_v2i16: element-wise shl of two <2 x i16> vectors.
;   %a  - loaded from %in itself (not indexed by the workitem id)
;   %b  - loaded from %in indexed by workitem.id.x, plus one vector element
;   result = shl %a, %b, stored to %out
; In the verde checks each 16-bit half is extracted with a shift right by 16,
; shifted with a 32-bit instruction, and the halves are re-packed with
; and/shift/or. The tonga checks use v_lshlrev_b16 for the low half and an
; SDWA form of v_lshlrev_b16 writing WORD_1 for the high half.
; NOTE(review): %gep.out is computed but never used; the store goes to %out and
; %a is read from %in rather than %gep. Altering this would change the
; generated code and require regenerating the check lines.
530define amdgpu_kernel void @shl_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
531; SI-LABEL: shl_v2i16:
532; SI:       ; %bb.0:
533; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
534; SI-NEXT:    s_mov_b32 s7, 0xf000
535; SI-NEXT:    s_mov_b32 s6, -1
536; SI-NEXT:    s_mov_b32 s10, s6
537; SI-NEXT:    s_mov_b32 s11, s7
538; SI-NEXT:    s_waitcnt lgkmcnt(0)
539; SI-NEXT:    s_mov_b32 s8, s2
540; SI-NEXT:    s_mov_b32 s9, s3
541; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
542; SI-NEXT:    v_mov_b32_e32 v1, 0
543; SI-NEXT:    s_mov_b32 s14, 0
544; SI-NEXT:    s_mov_b32 s15, s7
545; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
546; SI-NEXT:    buffer_load_dword v2, off, s[8:11], 0
547; SI-NEXT:    buffer_load_dword v0, v[0:1], s[12:15], 0 addr64 offset:4
548; SI-NEXT:    s_mov_b32 s4, s0
549; SI-NEXT:    s_mov_b32 s5, s1
550; SI-NEXT:    s_waitcnt vmcnt(1)
551; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
552; SI-NEXT:    s_waitcnt vmcnt(0)
553; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
554; SI-NEXT:    v_lshlrev_b32_e32 v0, v0, v2
555; SI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
556; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
557; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
558; SI-NEXT:    v_or_b32_e32 v0, v0, v1
559; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
560; SI-NEXT:    s_endpgm
561;
562; VI-LABEL: shl_v2i16:
563; VI:       ; %bb.0:
564; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
565; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
566; VI-NEXT:    s_waitcnt lgkmcnt(0)
567; VI-NEXT:    v_mov_b32_e32 v1, s3
568; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
569; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
570; VI-NEXT:    v_add_u32_e32 v0, vcc, 4, v0
571; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
572; VI-NEXT:    flat_load_dword v0, v[0:1]
573; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
574; VI-NEXT:    s_mov_b32 s3, 0xf000
575; VI-NEXT:    s_mov_b32 s2, -1
576; VI-NEXT:    s_waitcnt lgkmcnt(0)
577; VI-NEXT:    s_lshr_b32 s5, s4, 16
578; VI-NEXT:    v_mov_b32_e32 v1, s5
579; VI-NEXT:    s_waitcnt vmcnt(0)
580; VI-NEXT:    v_lshlrev_b16_e64 v2, v0, s4
581; VI-NEXT:    v_lshlrev_b16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
582; VI-NEXT:    v_or_b32_e32 v0, v2, v0
583; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
584; VI-NEXT:    s_endpgm
585;
586; EG-LABEL: shl_v2i16:
587; EG:       ; %bb.0:
588; EG-NEXT:    ALU 2, @12, KC0[CB0:0-32], KC1[]
589; EG-NEXT:    TEX 0 @8
590; EG-NEXT:    ALU 0, @15, KC0[CB0:0-32], KC1[]
591; EG-NEXT:    TEX 0 @10
592; EG-NEXT:    ALU 11, @16, KC0[CB0:0-32], KC1[]
593; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T7.X, 1
594; EG-NEXT:    CF_END
595; EG-NEXT:    PAD
596; EG-NEXT:    Fetch clause starting at 8:
597; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 4, #1
598; EG-NEXT:    Fetch clause starting at 10:
599; EG-NEXT:     VTX_READ_32 T7.X, T7.X, 0, #1
600; EG-NEXT:    ALU clause starting at 12:
601; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
602; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
603; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
604; EG-NEXT:    ALU clause starting at 15:
605; EG-NEXT:     MOV * T7.X, KC0[2].Z,
606; EG-NEXT:    ALU clause starting at 16:
607; EG-NEXT:     AND_INT T0.Z, T0.X, literal.x,
608; EG-NEXT:     LSHR T0.W, T0.X, literal.y,
609; EG-NEXT:     LSHR * T1.W, T7.X, literal.y,
610; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
611; EG-NEXT:     LSHL T0.W, PS, PV.W,
612; EG-NEXT:     LSHL * T1.W, T7.X, PV.Z,
613; EG-NEXT:     AND_INT T1.W, PS, literal.x,
614; EG-NEXT:     LSHL * T0.W, PV.W, literal.y,
615; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
616; EG-NEXT:     OR_INT T0.X, PV.W, PS,
617; EG-NEXT:     LSHR * T7.X, KC0[2].Y, literal.x,
618; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; Per-workitem addressing: %gep indexes %in by the workitem id; %b_ptr is
  ; one <2 x i16> element past %gep. %gep.out is not referenced again below.
619  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
620  %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i32 %tid
621  %gep.out = getelementptr inbounds <2 x i16>, ptr addrspace(1) %out, i32 %tid
622  %b_ptr = getelementptr <2 x i16>, ptr addrspace(1) %gep, i16 1
  ; %a comes from the base pointer %in; only %b uses the per-workitem address.
623  %a = load <2 x i16>, ptr addrspace(1) %in
624  %b = load <2 x i16>, ptr addrspace(1) %b_ptr
625  %result = shl <2 x i16> %a, %b
626  store <2 x i16> %result, ptr addrspace(1) %out
627  ret void
628}
629
; Kernel shl_v4i16: element-wise shl of two <4 x i16> vectors using
; per-workitem addresses for both inputs and the output.
;   %a  - loaded from %in indexed by workitem.id.x
;   %b  - loaded from the <4 x i16> element immediately after %a
;   result = shl %a, %b, stored to %out indexed by workitem.id.x
; In the verde and tonga checks the two adjacent operands are fetched by a
; single 128-bit load (buffer_load_dwordx4 / flat_load_dwordx4) and the packed
; result is written with a 64-bit store.
630define amdgpu_kernel void @shl_v4i16(ptr addrspace(1) %out, ptr addrspace(1) %in) {
631; SI-LABEL: shl_v4i16:
632; SI:       ; %bb.0:
633; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
634; SI-NEXT:    s_mov_b32 s7, 0xf000
635; SI-NEXT:    s_mov_b32 s6, 0
636; SI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
637; SI-NEXT:    v_mov_b32_e32 v5, 0
638; SI-NEXT:    s_waitcnt lgkmcnt(0)
639; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
640; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[4:7], 0 addr64
641; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
642; SI-NEXT:    s_waitcnt vmcnt(0)
643; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
644; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
645; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v2
646; SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
647; SI-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
648; SI-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
649; SI-NEXT:    v_lshlrev_b32_e32 v2, v9, v7
650; SI-NEXT:    v_lshlrev_b32_e32 v3, v8, v6
651; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
652; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
653; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
654; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
655; SI-NEXT:    v_or_b32_e32 v1, v1, v2
656; SI-NEXT:    v_or_b32_e32 v0, v0, v3
657; SI-NEXT:    buffer_store_dwordx2 v[0:1], v[4:5], s[0:3], 0 addr64
658; SI-NEXT:    s_endpgm
659;
660; VI-LABEL: shl_v4i16:
661; VI:       ; %bb.0:
662; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
663; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
664; VI-NEXT:    s_waitcnt lgkmcnt(0)
665; VI-NEXT:    v_mov_b32_e32 v1, s3
666; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
667; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
668; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
669; VI-NEXT:    v_mov_b32_e32 v5, s1
670; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
671; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
672; VI-NEXT:    s_waitcnt vmcnt(0)
673; VI-NEXT:    v_lshlrev_b16_e32 v6, v3, v1
674; VI-NEXT:    v_lshlrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
675; VI-NEXT:    v_lshlrev_b16_e32 v3, v2, v0
676; VI-NEXT:    v_lshlrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
677; VI-NEXT:    v_or_b32_e32 v1, v6, v1
678; VI-NEXT:    v_or_b32_e32 v0, v3, v0
679; VI-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
680; VI-NEXT:    s_endpgm
681;
682; EG-LABEL: shl_v4i16:
683; EG:       ; %bb.0:
684; EG-NEXT:    ALU 3, @8, KC0[CB0:0-32], KC1[]
685; EG-NEXT:    TEX 0 @6
686; EG-NEXT:    ALU 42, @12, KC0[CB0:0-32], KC1[]
687; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T10.XY, T0.X, 1
688; EG-NEXT:    CF_END
689; EG-NEXT:    PAD
690; EG-NEXT:    Fetch clause starting at 6:
691; EG-NEXT:     VTX_READ_128 T10.XYZW, T0.X, 0, #1
692; EG-NEXT:    ALU clause starting at 8:
693; EG-NEXT:     MOV T0.Y, T6.X,
694; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
695; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
696; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
697; EG-NEXT:    ALU clause starting at 12:
698; EG-NEXT:     AND_INT * T1.W, T10.Z, literal.x,
699; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
700; EG-NEXT:     LSHL * T1.W, T10.X, PV.W,
701; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
702; EG-NEXT:     AND_INT * T2.W, T0.Y, literal.y,
703; EG-NEXT:    65535(9.183409e-41), -65536(nan)
704; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
705; EG-NEXT:     MOV * T6.X, PV.W,
706; EG-NEXT:     MOV T0.X, PV.X,
707; EG-NEXT:     LSHR T1.W, T10.Z, literal.x,
708; EG-NEXT:     LSHR * T2.W, T10.X, literal.x,
709; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
710; EG-NEXT:     LSHL T1.W, PS, PV.W,
711; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
712; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
713; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
714; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
715; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
716; EG-NEXT:     MOV T6.X, PV.W,
717; EG-NEXT:     MOV * T0.X, T7.X,
718; EG-NEXT:     AND_INT * T1.W, T10.W, literal.x,
719; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
720; EG-NEXT:     LSHL T1.W, T10.Y, PV.W,
721; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
722; EG-NEXT:    -65536(nan), 0(0.000000e+00)
723; EG-NEXT:     AND_INT * T1.W, PV.W, literal.x,
724; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
725; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
726; EG-NEXT:     MOV * T7.X, PV.W,
727; EG-NEXT:     MOV T0.X, PV.X,
728; EG-NEXT:     LSHR T1.W, T10.W, literal.x,
729; EG-NEXT:     LSHR * T2.W, T10.Y, literal.x,
730; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
731; EG-NEXT:     LSHL * T1.W, PS, PV.W,
732; EG-NEXT:     AND_INT T0.Z, T0.X, literal.x,
733; EG-NEXT:     LSHL T1.W, PV.W, literal.y,
734; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
735; EG-NEXT:    65535(9.183409e-41), 16(2.242078e-44)
736; EG-NEXT:     LSHR T0.X, PS, literal.x,
737; EG-NEXT:     OR_INT * T10.Y, PV.Z, PV.W,
738; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
739; EG-NEXT:     MOV T7.X, PV.Y,
740; EG-NEXT:     MOV * T10.X, T6.X,
  ; Per-workitem addressing: %gep / %gep.out index %in / %out by the workitem
  ; id; %b_ptr is one <4 x i16> element past %gep.
741  %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
742  %gep = getelementptr inbounds <4 x i16>, ptr addrspace(1) %in, i32 %tid
743  %gep.out = getelementptr inbounds <4 x i16>, ptr addrspace(1) %out, i32 %tid
744  %b_ptr = getelementptr <4 x i16>, ptr addrspace(1) %gep, i16 1
745  %a = load <4 x i16>, ptr addrspace(1) %gep
746  %b = load <4 x i16>, ptr addrspace(1) %b_ptr
747  %result = shl <4 x i16> %a, %b
748  store <4 x i16> %result, ptr addrspace(1) %gep.out
749  ret void
750}
751
; Kernel shl_i64: loads two consecutive i64 values starting at %in (%a at
; element 0, %b at element 1), computes shl of %a by %b, and stores the i64
; result to %out.
; In the verde and tonga checks a single 64-bit shift instruction is used
; (v_lshl_b64 / s_lshl_b64) and only the low dword of the loaded %b is passed
; as the shift amount. The redwood checks build the 64-bit shift from 32-bit
; operations (LSHL, BIT_ALIGN_INT, CNDE_INT).
752define amdgpu_kernel void @shl_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
753; SI-LABEL: shl_i64:
754; SI:       ; %bb.0:
755; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
756; SI-NEXT:    s_mov_b32 s7, 0xf000
757; SI-NEXT:    s_mov_b32 s6, -1
758; SI-NEXT:    s_mov_b32 s10, s6
759; SI-NEXT:    s_mov_b32 s11, s7
760; SI-NEXT:    s_waitcnt lgkmcnt(0)
761; SI-NEXT:    s_mov_b32 s8, s2
762; SI-NEXT:    s_mov_b32 s9, s3
763; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
764; SI-NEXT:    s_mov_b32 s4, s0
765; SI-NEXT:    s_mov_b32 s5, s1
766; SI-NEXT:    s_waitcnt vmcnt(0)
767; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v2
768; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
769; SI-NEXT:    s_endpgm
770;
771; VI-LABEL: shl_i64:
772; VI:       ; %bb.0:
773; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
774; VI-NEXT:    s_waitcnt lgkmcnt(0)
775; VI-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
776; VI-NEXT:    s_mov_b32 s3, 0xf000
777; VI-NEXT:    s_mov_b32 s2, -1
778; VI-NEXT:    s_waitcnt lgkmcnt(0)
779; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
780; VI-NEXT:    v_mov_b32_e32 v0, s4
781; VI-NEXT:    v_mov_b32_e32 v1, s5
782; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
783; VI-NEXT:    s_endpgm
784;
785; EG-LABEL: shl_i64:
786; EG:       ; %bb.0:
787; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
788; EG-NEXT:    TEX 0 @6
789; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
790; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
791; EG-NEXT:    CF_END
792; EG-NEXT:    PAD
793; EG-NEXT:    Fetch clause starting at 6:
794; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
795; EG-NEXT:    ALU clause starting at 8:
796; EG-NEXT:     MOV * T0.X, KC0[2].Z,
797; EG-NEXT:    ALU clause starting at 9:
798; EG-NEXT:     LSHR T1.Y, T0.Y, 1,
799; EG-NEXT:     NOT_INT T1.Z, T0.Z,
800; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
801; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
802; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
803; EG-NEXT:     LSHL T2.Z, T0.X, PS,
804; EG-NEXT:     BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z,
805; EG-NEXT:     AND_INT * T1.W, T0.Z, literal.x,
806; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
807; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
808; EG-NEXT:     CNDE_INT T0.X, T1.W, T2.Z, 0.0,
809; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
810; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
  ; %b_ptr addresses the i64 that immediately follows the one at %in.
811  %b_ptr = getelementptr i64, ptr addrspace(1) %in, i64 1
812  %a = load i64, ptr addrspace(1) %in
813  %b = load i64, ptr addrspace(1) %b_ptr
814  %result = shl i64 %a, %b
815  store i64 %result, ptr addrspace(1) %out
816  ret void
817}
818
; <2 x i64> shift-left with both vector operands loaded from memory.
; %a is the vector at %in, %b is the vector element right after it (16 bytes
; further), and the element-wise result is stored to %out.
; Expected code per run line: the verde run issues two 128-bit buffer loads
; (offsets 0 and 16) and one v_lshl_b64 per element, each fed by the low dword
; of the matching shift amount (v4, v6); the tonga run issues one
; s_load_dwordx8 and two s_lshl_b64; the redwood run checks the 32-bit
; LSHL / BIT_ALIGN_INT / CNDE_INT expansion once per element.
819define amdgpu_kernel void @shl_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
820; SI-LABEL: shl_v2i64:
821; SI:       ; %bb.0:
822; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
823; SI-NEXT:    s_mov_b32 s7, 0xf000
824; SI-NEXT:    s_mov_b32 s6, -1
825; SI-NEXT:    s_mov_b32 s10, s6
826; SI-NEXT:    s_mov_b32 s11, s7
827; SI-NEXT:    s_waitcnt lgkmcnt(0)
828; SI-NEXT:    s_mov_b32 s8, s2
829; SI-NEXT:    s_mov_b32 s9, s3
830; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
831; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
832; SI-NEXT:    s_mov_b32 s4, s0
833; SI-NEXT:    s_mov_b32 s5, s1
834; SI-NEXT:    s_waitcnt vmcnt(0)
835; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
836; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
837; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
838; SI-NEXT:    s_endpgm
839;
840; VI-LABEL: shl_v2i64:
841; VI:       ; %bb.0:
842; VI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
843; VI-NEXT:    s_waitcnt lgkmcnt(0)
844; VI-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
845; VI-NEXT:    s_mov_b32 s11, 0xf000
846; VI-NEXT:    s_mov_b32 s10, -1
847; VI-NEXT:    s_waitcnt lgkmcnt(0)
848; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s6
849; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s4
850; VI-NEXT:    v_mov_b32_e32 v0, s0
851; VI-NEXT:    v_mov_b32_e32 v1, s1
852; VI-NEXT:    v_mov_b32_e32 v2, s2
853; VI-NEXT:    v_mov_b32_e32 v3, s3
854; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
855; VI-NEXT:    s_endpgm
856;
857; EG-LABEL: shl_v2i64:
858; EG:       ; %bb.0:
859; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
860; EG-NEXT:    TEX 1 @6
861; EG-NEXT:    ALU 23, @11, KC0[CB0:0-32], KC1[]
862; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 1
863; EG-NEXT:    CF_END
864; EG-NEXT:    PAD
865; EG-NEXT:    Fetch clause starting at 6:
866; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 16, #1
867; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
868; EG-NEXT:    ALU clause starting at 10:
869; EG-NEXT:     MOV * T0.X, KC0[2].Z,
870; EG-NEXT:    ALU clause starting at 11:
871; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
872; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
873; EG-NEXT:     LSHL T2.X, T0.Z, PV.W,
874; EG-NEXT:     AND_INT T1.Y, T1.Z, literal.x, BS:VEC_120/SCL_212
875; EG-NEXT:     LSHR T2.Z, T0.W, 1,
876; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1, BS:VEC_102/SCL_221
877; EG-NEXT:     NOT_INT * T1.W, T1.Z,
878; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
879; EG-NEXT:     BIT_ALIGN_INT T3.X, PV.Z, PV.W, PS,
880; EG-NEXT:     LSHR T2.Y, T0.Y, 1,
881; EG-NEXT:     NOT_INT T0.Z, T1.X,
882; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
883; EG-NEXT:     AND_INT * T1.W, T1.X, literal.x,
884; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
885; EG-NEXT:     LSHL T0.Y, T0.X, PS, BS:VEC_120/SCL_212
886; EG-NEXT:     AND_INT T1.Z, T1.X, literal.x, BS:VEC_201
887; EG-NEXT:     BIT_ALIGN_INT T0.W, PV.Y, PV.W, PV.Z,
888; EG-NEXT:     CNDE_INT * T2.W, T1.Y, PV.X, T2.X,
889; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
890; EG-NEXT:     CNDE_INT T2.Y, PV.Z, PV.W, PV.Y,
891; EG-NEXT:     CNDE_INT * T2.Z, T1.Y, T2.X, 0.0,
892; EG-NEXT:     CNDE_INT T2.X, T1.Z, T0.Y, 0.0,
893; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
894; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
895  %b_ptr = getelementptr <2 x i64>, ptr addrspace(1) %in, i64 1
896  %a = load <2 x i64>, ptr addrspace(1) %in
897  %b = load <2 x i64>, ptr addrspace(1) %b_ptr
898  %result = shl <2 x i64> %a, %b
899  store <2 x i64> %result, ptr addrspace(1) %out
900  ret void
901}
902
; <4 x i64> shift-left with both vector operands loaded from memory.
; %a is the vector at %in, %b is the vector element right after it (32 bytes
; further), and the element-wise result is stored to %out.
; Expected code per run line: the verde run issues four 128-bit buffer loads,
; four v_lshl_b64 and two 128-bit stores; the tonga run issues one
; s_load_dwordx16, four s_lshl_b64 and two 128-bit stores; the redwood run
; checks the 32-bit expansion per element and two STORE_RAW writes.
; NOTE(review): in the verde output the second and third loads target
; v[4:7] and v[7:10], which overlap at v7. Only v4 and v6 of the second load
; are read afterwards, so this is presumably intentional register reuse --
; confirm when regenerating the assertions.
903define amdgpu_kernel void @shl_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
904; SI-LABEL: shl_v4i64:
905; SI:       ; %bb.0:
906; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
907; SI-NEXT:    s_mov_b32 s3, 0xf000
908; SI-NEXT:    s_mov_b32 s2, -1
909; SI-NEXT:    s_mov_b32 s10, s2
910; SI-NEXT:    s_mov_b32 s11, s3
911; SI-NEXT:    s_waitcnt lgkmcnt(0)
912; SI-NEXT:    s_mov_b32 s8, s6
913; SI-NEXT:    s_mov_b32 s9, s7
914; SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0 offset:16
915; SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:48
916; SI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
917; SI-NEXT:    buffer_load_dwordx4 v[11:14], off, s[8:11], 0 offset:32
918; SI-NEXT:    s_mov_b32 s0, s4
919; SI-NEXT:    s_mov_b32 s1, s5
920; SI-NEXT:    s_waitcnt vmcnt(2)
921; SI-NEXT:    v_lshl_b64 v[2:3], v[2:3], v6
922; SI-NEXT:    v_lshl_b64 v[0:1], v[0:1], v4
923; SI-NEXT:    s_waitcnt vmcnt(0)
924; SI-NEXT:    v_lshl_b64 v[9:10], v[9:10], v13
925; SI-NEXT:    v_lshl_b64 v[7:8], v[7:8], v11
926; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
927; SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0
928; SI-NEXT:    s_endpgm
929;
930; VI-LABEL: shl_v4i64:
931; VI:       ; %bb.0:
932; VI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x24
933; VI-NEXT:    s_waitcnt lgkmcnt(0)
934; VI-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
935; VI-NEXT:    s_mov_b32 s19, 0xf000
936; VI-NEXT:    s_mov_b32 s18, -1
937; VI-NEXT:    s_waitcnt lgkmcnt(0)
938; VI-NEXT:    s_lshl_b64 s[6:7], s[6:7], s14
939; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s12
940; VI-NEXT:    s_lshl_b64 s[2:3], s[2:3], s10
941; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s8
942; VI-NEXT:    v_mov_b32_e32 v0, s4
943; VI-NEXT:    v_mov_b32_e32 v1, s5
944; VI-NEXT:    v_mov_b32_e32 v2, s6
945; VI-NEXT:    v_mov_b32_e32 v3, s7
946; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0 offset:16
947; VI-NEXT:    s_nop 0
948; VI-NEXT:    v_mov_b32_e32 v0, s0
949; VI-NEXT:    v_mov_b32_e32 v1, s1
950; VI-NEXT:    v_mov_b32_e32 v2, s2
951; VI-NEXT:    v_mov_b32_e32 v3, s3
952; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
953; VI-NEXT:    s_endpgm
954;
955; EG-LABEL: shl_v4i64:
956; EG:       ; %bb.0:
957; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
958; EG-NEXT:    TEX 3 @6
959; EG-NEXT:    ALU 48, @15, KC0[CB0:0-32], KC1[]
960; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T0.X, 0
961; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T3.XYZW, T1.X, 1
962; EG-NEXT:    CF_END
963; EG-NEXT:    Fetch clause starting at 6:
964; EG-NEXT:     VTX_READ_128 T1.XYZW, T0.X, 32, #1
965; EG-NEXT:     VTX_READ_128 T2.XYZW, T0.X, 48, #1
966; EG-NEXT:     VTX_READ_128 T3.XYZW, T0.X, 16, #1
967; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
968; EG-NEXT:    ALU clause starting at 14:
969; EG-NEXT:     MOV * T0.X, KC0[2].Z,
970; EG-NEXT:    ALU clause starting at 15:
971; EG-NEXT:     AND_INT * T1.W, T1.Z, literal.x,
972; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
973; EG-NEXT:     LSHL * T1.W, T0.Z, PV.W,
974; EG-NEXT:     AND_INT T4.X, T1.Z, literal.x,
975; EG-NEXT:     LSHR T1.Y, T3.W, 1,
976; EG-NEXT:     NOT_INT T4.Z, T2.Z, BS:VEC_201
977; EG-NEXT:     BIT_ALIGN_INT T2.W, T3.W, T3.Z, 1,
978; EG-NEXT:     AND_INT * T3.W, T2.Z, literal.y,
979; EG-NEXT:    32(4.484155e-44), 31(4.344025e-44)
980; EG-NEXT:     LSHL T5.X, T3.Z, PS,
981; EG-NEXT:     AND_INT T2.Y, T2.Z, literal.x, BS:VEC_120/SCL_212
982; EG-NEXT:     BIT_ALIGN_INT T2.Z, PV.Y, PV.W, PV.Z,
983; EG-NEXT:     LSHR T2.W, T3.Y, 1,
984; EG-NEXT:     NOT_INT * T3.W, T2.X,
985; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
986; EG-NEXT:     BIT_ALIGN_INT T6.X, T3.Y, T3.X, 1,
987; EG-NEXT:     AND_INT T1.Y, T2.X, literal.x,
988; EG-NEXT:     LSHR T3.Z, T0.W, 1,
989; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.W, T0.Z, 1,
990; EG-NEXT:     NOT_INT * T4.W, T1.Z,
991; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
992; EG-NEXT:     BIT_ALIGN_INT T7.X, PV.Z, PV.W, PS,
993; EG-NEXT:     LSHL T1.Y, T3.X, PV.Y, BS:VEC_120/SCL_212
994; EG-NEXT:     AND_INT T0.Z, T2.X, literal.x, BS:VEC_201
995; EG-NEXT:     BIT_ALIGN_INT T0.W, T2.W, PV.X, T3.W,
996; EG-NEXT:     CNDE_INT * T3.W, T2.Y, T2.Z, T5.X,
997; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
998; EG-NEXT:     LSHR T2.X, T0.Y, 1,
999; EG-NEXT:     CNDE_INT T3.Y, PV.Z, PV.W, PV.Y,
1000; EG-NEXT:     NOT_INT T1.Z, T1.X,
1001; EG-NEXT:     BIT_ALIGN_INT T0.W, T0.Y, T0.X, 1,
1002; EG-NEXT:     AND_INT * T2.W, T1.X, literal.x,
1003; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1004; EG-NEXT:     LSHL T0.X, T0.X, PS,
1005; EG-NEXT:     AND_INT T0.Y, T1.X, literal.x, BS:VEC_120/SCL_212
1006; EG-NEXT:     CNDE_INT T3.Z, T2.Y, T5.X, 0.0, BS:VEC_021/SCL_122
1007; EG-NEXT:     BIT_ALIGN_INT * T0.W, PV.X, PV.W, PV.Z,
1008; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1009; EG-NEXT:     CNDE_INT * T2.W, T4.X, T7.X, T1.W,
1010; EG-NEXT:     CNDE_INT T3.X, T0.Z, T1.Y, 0.0,
1011; EG-NEXT:     CNDE_INT T2.Y, T0.Y, T0.W, T0.X,
1012; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, literal.x,
1013; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1014; EG-NEXT:     LSHR T1.X, PV.W, literal.x,
1015; EG-NEXT:     CNDE_INT T2.Z, T4.X, T1.W, 0.0,
1016; EG-NEXT:     CNDE_INT * T2.X, T0.Y, T0.X, 0.0,
1017; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1018; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
1019; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1020  %b_ptr = getelementptr <4 x i64>, ptr addrspace(1) %in, i64 1
1021  %a = load <4 x i64>, ptr addrspace(1) %in
1022  %b = load <4 x i64>, ptr addrspace(1) %b_ptr
1023  %result = shl <4 x i64> %a, %b
1024  store <4 x i64> %result, ptr addrspace(1) %out
1025  ret void
1026}
1027
1028; Make sure load width gets reduced to i32 load.
; The i64 kernel argument %a (placed after an unnamed [8 x i32] argument) is
; shifted left by the constant 32 and stored to %out.
; All three run lines expect only one dword of %a to be read; it is written as
; the high half of the 64-bit store while the low half is a materialized 0,
; and no shift instruction is emitted.
1029define amdgpu_kernel void @s_shl_32_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
1030; SI-LABEL: s_shl_32_i64:
1031; SI:       ; %bb.0:
1032; SI-NEXT:    s_load_dword s6, s[4:5], 0x13
1033; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1034; SI-NEXT:    s_mov_b32 s3, 0xf000
1035; SI-NEXT:    s_mov_b32 s2, -1
1036; SI-NEXT:    v_mov_b32_e32 v0, 0
1037; SI-NEXT:    s_waitcnt lgkmcnt(0)
1038; SI-NEXT:    v_mov_b32_e32 v1, s6
1039; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1040; SI-NEXT:    s_endpgm
1041;
1042; VI-LABEL: s_shl_32_i64:
1043; VI:       ; %bb.0:
1044; VI-NEXT:    s_load_dword s6, s[4:5], 0x4c
1045; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1046; VI-NEXT:    s_mov_b32 s3, 0xf000
1047; VI-NEXT:    s_mov_b32 s2, -1
1048; VI-NEXT:    v_mov_b32_e32 v0, 0
1049; VI-NEXT:    s_waitcnt lgkmcnt(0)
1050; VI-NEXT:    v_mov_b32_e32 v1, s6
1051; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1052; VI-NEXT:    s_endpgm
1053;
1054; EG-LABEL: s_shl_32_i64:
1055; EG:       ; %bb.0:
1056; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
1057; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1058; EG-NEXT:    CF_END
1059; EG-NEXT:    PAD
1060; EG-NEXT:    ALU clause starting at 4:
1061; EG-NEXT:     MOV * T0.Y, KC0[4].W,
1062; EG-NEXT:     MOV T0.X, 0.0,
1063; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1064; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1065  %result = shl i64 %a, 32
1066  store i64 %result, ptr addrspace(1) %out
1067  ret void
1068}
1069
; Memory-operand variant of the shift-by-32 test.
; The workgroup id (held in the value named %tid) indexes both %in and %out as
; i64 arrays; the loaded i64 is shifted left by the constant 32 and stored.
; The verde and tonga runs expect only a 32-bit load (buffer_load_dword /
; s_load_dword); the loaded dword becomes the high half of the 64-bit store
; and the low half is a materialized 0. The redwood run likewise expects a
; 32-bit VTX_READ_32 and a zeroed low component.
1070define amdgpu_kernel void @v_shl_32_i64(ptr addrspace(1) %out, ptr addrspace(1) %in) {
1071; SI-LABEL: v_shl_32_i64:
1072; SI:       ; %bb.0:
1073; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1074; SI-NEXT:    s_ashr_i32 s9, s8, 31
1075; SI-NEXT:    s_mov_b32 s7, 0xf000
1076; SI-NEXT:    s_mov_b32 s6, 0
1077; SI-NEXT:    v_mov_b32_e32 v2, 0
1078; SI-NEXT:    s_waitcnt lgkmcnt(0)
1079; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1080; SI-NEXT:    s_lshl_b64 s[2:3], s[8:9], 3
1081; SI-NEXT:    v_mov_b32_e32 v0, s2
1082; SI-NEXT:    v_mov_b32_e32 v1, s3
1083; SI-NEXT:    buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
1084; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1085; SI-NEXT:    s_waitcnt vmcnt(0)
1086; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
1087; SI-NEXT:    s_endpgm
1088;
1089; VI-LABEL: v_shl_32_i64:
1090; VI:       ; %bb.0:
1091; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1092; VI-NEXT:    s_ashr_i32 s9, s8, 31
1093; VI-NEXT:    s_lshl_b64 s[4:5], s[8:9], 3
1094; VI-NEXT:    v_mov_b32_e32 v0, 0
1095; VI-NEXT:    s_waitcnt lgkmcnt(0)
1096; VI-NEXT:    s_add_u32 s2, s2, s4
1097; VI-NEXT:    s_addc_u32 s3, s3, s5
1098; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1099; VI-NEXT:    s_add_u32 s0, s0, s4
1100; VI-NEXT:    s_addc_u32 s1, s1, s5
1101; VI-NEXT:    v_mov_b32_e32 v3, s1
1102; VI-NEXT:    v_mov_b32_e32 v2, s0
1103; VI-NEXT:    s_waitcnt lgkmcnt(0)
1104; VI-NEXT:    v_mov_b32_e32 v1, s2
1105; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1106; VI-NEXT:    s_endpgm
1107;
1108; EG-LABEL: v_shl_32_i64:
1109; EG:       ; %bb.0:
1110; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1111; EG-NEXT:    TEX 0 @6
1112; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
1113; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.XY, T2.X, 1
1114; EG-NEXT:    CF_END
1115; EG-NEXT:    PAD
1116; EG-NEXT:    Fetch clause starting at 6:
1117; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1118; EG-NEXT:    ALU clause starting at 8:
1119; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1120; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1121; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1122; EG-NEXT:    ALU clause starting at 11:
1123; EG-NEXT:     MOV T1.X, 0.0,
1124; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1125; EG-NEXT:     LSHR T2.X, PV.W, literal.x,
1126; EG-NEXT:     MOV * T1.Y, T0.X,
1127; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1128  %tid = call i32 @llvm.amdgcn.workgroup.id.x() #0
1129  %gep.in = getelementptr i64, ptr addrspace(1) %in, i32 %tid
1130  %gep.out = getelementptr i64, ptr addrspace(1) %out, i32 %tid
1131  %a = load i64, ptr addrspace(1) %gep.in
1132  %result = shl i64 %a, 32
1133  store i64 %result, ptr addrspace(1) %gep.out
1134  ret void
1135}
1136
; Shifts a 64-bit constant left by the kernel argument %a and stores it to %out.
; The shifted value is 281474976710655 (2^48 - 1): the verde and tonga runs
; expect it to be built as the register pair high = 0xffff, low = -1 and fed
; to a single s_lshl_b64 whose amount is the low dword of %a (s2).
; The redwood run checks the 32-bit expansion with the constant folded into
; literals (-1 and 32767, the latter being 0xffff >> 1).
1137define amdgpu_kernel void @s_shl_constant_i64(ptr addrspace(1) %out, i64 %a) {
1138; SI-LABEL: s_shl_constant_i64:
1139; SI:       ; %bb.0:
1140; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1141; SI-NEXT:    s_mov_b32 s6, -1
1142; SI-NEXT:    s_mov_b32 s9, 0xffff
1143; SI-NEXT:    s_mov_b32 s8, s6
1144; SI-NEXT:    s_mov_b32 s7, 0xf000
1145; SI-NEXT:    s_waitcnt lgkmcnt(0)
1146; SI-NEXT:    s_mov_b32 s4, s0
1147; SI-NEXT:    s_mov_b32 s5, s1
1148; SI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
1149; SI-NEXT:    v_mov_b32_e32 v0, s0
1150; SI-NEXT:    v_mov_b32_e32 v1, s1
1151; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1152; SI-NEXT:    s_endpgm
1153;
1154; VI-LABEL: s_shl_constant_i64:
1155; VI:       ; %bb.0:
1156; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1157; VI-NEXT:    s_mov_b32 s6, -1
1158; VI-NEXT:    s_mov_b32 s9, 0xffff
1159; VI-NEXT:    s_mov_b32 s8, s6
1160; VI-NEXT:    s_mov_b32 s7, 0xf000
1161; VI-NEXT:    s_waitcnt lgkmcnt(0)
1162; VI-NEXT:    s_mov_b32 s4, s0
1163; VI-NEXT:    s_mov_b32 s5, s1
1164; VI-NEXT:    s_lshl_b64 s[0:1], s[8:9], s2
1165; VI-NEXT:    v_mov_b32_e32 v0, s0
1166; VI-NEXT:    v_mov_b32_e32 v1, s1
1167; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1168; VI-NEXT:    s_endpgm
1169;
1170; EG-LABEL: s_shl_constant_i64:
1171; EG:       ; %bb.0:
1172; EG-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
1173; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1174; EG-NEXT:    CF_END
1175; EG-NEXT:    PAD
1176; EG-NEXT:    ALU clause starting at 4:
1177; EG-NEXT:     MOV T0.Z, literal.x,
1178; EG-NEXT:     NOT_INT T0.W, KC0[2].W,
1179; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1180; EG-NEXT:    -1(nan), 31(4.344025e-44)
1181; EG-NEXT:     LSHL T1.Z, literal.x, PS,
1182; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.y, PV.Z, PV.W,
1183; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
1184; EG-NEXT:    -1(nan), 32767(4.591635e-41)
1185; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1186; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1187; EG-NEXT:     CNDE_INT T0.X, T1.W, T1.Z, 0.0,
1188; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1189; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1190  %shl = shl i64 281474976710655, %a
1191  store i64 %shl, ptr addrspace(1) %out, align 8
1192  ret void
1193}
1194
; Shifts a 64-bit constant left by an i64 amount loaded from %aptr and stores
; the result to %out.
; The shifted value is 1231231234567, which needs both halves materialized:
; the verde and tonga runs expect low = 0xab19b207 (s_mov_b32) and
; high = 0x11e (s_movk_i32). The verde run loads only one dword of the amount
; and shifts with v_lshl_b64; the tonga run uses s_load_dword and s_lshl_b64.
; The redwood run checks the 32-bit expansion with the constant split into
; literals.
1195define amdgpu_kernel void @v_shl_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
1196; SI-LABEL: v_shl_constant_i64:
1197; SI:       ; %bb.0:
1198; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1199; SI-NEXT:    s_mov_b32 s7, 0xf000
1200; SI-NEXT:    s_mov_b32 s6, -1
1201; SI-NEXT:    s_mov_b32 s10, s6
1202; SI-NEXT:    s_mov_b32 s11, s7
1203; SI-NEXT:    s_waitcnt lgkmcnt(0)
1204; SI-NEXT:    s_mov_b32 s8, s2
1205; SI-NEXT:    s_mov_b32 s9, s3
1206; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1207; SI-NEXT:    s_mov_b32 s2, 0xab19b207
1208; SI-NEXT:    s_movk_i32 s3, 0x11e
1209; SI-NEXT:    s_mov_b32 s4, s0
1210; SI-NEXT:    s_mov_b32 s5, s1
1211; SI-NEXT:    s_waitcnt vmcnt(0)
1212; SI-NEXT:    v_lshl_b64 v[0:1], s[2:3], v0
1213; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1214; SI-NEXT:    s_endpgm
1215;
1216; VI-LABEL: v_shl_constant_i64:
1217; VI:       ; %bb.0:
1218; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1219; VI-NEXT:    s_mov_b32 s7, 0xf000
1220; VI-NEXT:    s_mov_b32 s6, -1
1221; VI-NEXT:    s_waitcnt lgkmcnt(0)
1222; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
1223; VI-NEXT:    s_mov_b32 s4, s0
1224; VI-NEXT:    s_mov_b32 s5, s1
1225; VI-NEXT:    s_mov_b32 s0, 0xab19b207
1226; VI-NEXT:    s_movk_i32 s1, 0x11e
1227; VI-NEXT:    s_waitcnt lgkmcnt(0)
1228; VI-NEXT:    s_lshl_b64 s[0:1], s[0:1], s2
1229; VI-NEXT:    v_mov_b32_e32 v0, s0
1230; VI-NEXT:    v_mov_b32_e32 v1, s1
1231; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1232; VI-NEXT:    s_endpgm
1233;
1234; EG-LABEL: v_shl_constant_i64:
1235; EG:       ; %bb.0:
1236; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1237; EG-NEXT:    TEX 0 @6
1238; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1239; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1240; EG-NEXT:    CF_END
1241; EG-NEXT:    PAD
1242; EG-NEXT:    Fetch clause starting at 6:
1243; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1244; EG-NEXT:    ALU clause starting at 8:
1245; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1246; EG-NEXT:    ALU clause starting at 9:
1247; EG-NEXT:     NOT_INT T0.Z, T0.X,
1248; EG-NEXT:     MOV T0.W, literal.x,
1249; EG-NEXT:     AND_INT * T1.W, T0.X, literal.y,
1250; EG-NEXT:    1435293955(1.935796e+13), 31(4.344025e-44)
1251; EG-NEXT:     LSHL T1.Z, literal.x, PS,
1252; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.y, PV.W, PV.Z,
1253; EG-NEXT:     AND_INT * T1.W, T0.X, literal.z,
1254; EG-NEXT:    -1424379385(-5.460358e-13), 143(2.003857e-43)
1255; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1256; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1257; EG-NEXT:     CNDE_INT T0.X, T1.W, T1.Z, 0.0,
1258; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1259; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1260  %a = load i64, ptr addrspace(1) %aptr, align 8
1261  %shl = shl i64 1231231234567, %a
1262  store i64 %shl, ptr addrspace(1) %out, align 8
1263  ret void
1264}
1265
; Shifts a 64-bit constant that fits in 32 bits (1234567 = 0x12d687) left by an
; i64 amount loaded from %aptr and stores the result to %out.
; The verde run expects the constant to be materialized with one s_mov_b64 and
; shifted with v_lshl_b64; the tonga run expects 0x12d687 to appear directly as
; the source operand of s_lshl_b64. Both load only one dword of the amount.
; The redwood run checks the 32-bit expansion with literals 1234567 and
; 617283 (1234567 >> 1).
1266define amdgpu_kernel void @v_shl_i64_32_bit_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
1267; SI-LABEL: v_shl_i64_32_bit_constant:
1268; SI:       ; %bb.0:
1269; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1270; SI-NEXT:    s_mov_b32 s7, 0xf000
1271; SI-NEXT:    s_mov_b32 s6, -1
1272; SI-NEXT:    s_mov_b32 s10, s6
1273; SI-NEXT:    s_mov_b32 s11, s7
1274; SI-NEXT:    s_waitcnt lgkmcnt(0)
1275; SI-NEXT:    s_mov_b32 s8, s2
1276; SI-NEXT:    s_mov_b32 s9, s3
1277; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1278; SI-NEXT:    s_mov_b64 s[2:3], 0x12d687
1279; SI-NEXT:    s_mov_b32 s4, s0
1280; SI-NEXT:    s_mov_b32 s5, s1
1281; SI-NEXT:    s_waitcnt vmcnt(0)
1282; SI-NEXT:    v_lshl_b64 v[0:1], s[2:3], v0
1283; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1284; SI-NEXT:    s_endpgm
1285;
1286; VI-LABEL: v_shl_i64_32_bit_constant:
1287; VI:       ; %bb.0:
1288; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1289; VI-NEXT:    s_waitcnt lgkmcnt(0)
1290; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
1291; VI-NEXT:    s_mov_b32 s3, 0xf000
1292; VI-NEXT:    s_mov_b32 s2, -1
1293; VI-NEXT:    s_waitcnt lgkmcnt(0)
1294; VI-NEXT:    s_lshl_b64 s[4:5], 0x12d687, s4
1295; VI-NEXT:    v_mov_b32_e32 v0, s4
1296; VI-NEXT:    v_mov_b32_e32 v1, s5
1297; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1298; VI-NEXT:    s_endpgm
1299;
1300; EG-LABEL: v_shl_i64_32_bit_constant:
1301; EG:       ; %bb.0:
1302; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1303; EG-NEXT:    TEX 0 @6
1304; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
1305; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1306; EG-NEXT:    CF_END
1307; EG-NEXT:    PAD
1308; EG-NEXT:    Fetch clause starting at 6:
1309; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1310; EG-NEXT:    ALU clause starting at 8:
1311; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1312; EG-NEXT:    ALU clause starting at 9:
1313; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
1314; EG-NEXT:     NOT_INT * T1.W, T0.X,
1315; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1316; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1317; EG-NEXT:     LSHL T0.W, literal.y, PV.W,
1318; EG-NEXT:     AND_INT * T1.W, T0.X, literal.z,
1319; EG-NEXT:    617283(8.649977e-40), 1234567(1.729997e-39)
1320; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1321; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1322; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1323; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1324; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1325  %a = load i64, ptr addrspace(1) %aptr, align 8
1326  %shl = shl i64 1234567, %a
1327  store i64 %shl, ptr addrspace(1) %out, align 8
1328  ret void
1329}
1330
; Shifts the constant 64 left by an i64 amount loaded from %aptr and stores the
; result to %out.
; The verde and tonga runs expect 64 to be encoded directly as the first source
; operand of the shift (v_lshl_b64 / s_lshl_b64) with no separate register
; materialization, and only one dword of the amount to be loaded.
; The redwood run checks the 32-bit expansion with literals 64 and 32.
1331define amdgpu_kernel void @v_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) {
1332; SI-LABEL: v_shl_inline_imm_64_i64:
1333; SI:       ; %bb.0:
1334; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1335; SI-NEXT:    s_mov_b32 s7, 0xf000
1336; SI-NEXT:    s_mov_b32 s6, -1
1337; SI-NEXT:    s_mov_b32 s10, s6
1338; SI-NEXT:    s_mov_b32 s11, s7
1339; SI-NEXT:    s_waitcnt lgkmcnt(0)
1340; SI-NEXT:    s_mov_b32 s8, s2
1341; SI-NEXT:    s_mov_b32 s9, s3
1342; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1343; SI-NEXT:    s_mov_b32 s4, s0
1344; SI-NEXT:    s_mov_b32 s5, s1
1345; SI-NEXT:    s_waitcnt vmcnt(0)
1346; SI-NEXT:    v_lshl_b64 v[0:1], 64, v0
1347; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1348; SI-NEXT:    s_endpgm
1349;
1350; VI-LABEL: v_shl_inline_imm_64_i64:
1351; VI:       ; %bb.0:
1352; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1353; VI-NEXT:    s_waitcnt lgkmcnt(0)
1354; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
1355; VI-NEXT:    s_mov_b32 s3, 0xf000
1356; VI-NEXT:    s_mov_b32 s2, -1
1357; VI-NEXT:    s_waitcnt lgkmcnt(0)
1358; VI-NEXT:    s_lshl_b64 s[4:5], 64, s4
1359; VI-NEXT:    v_mov_b32_e32 v0, s4
1360; VI-NEXT:    v_mov_b32_e32 v1, s5
1361; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1362; VI-NEXT:    s_endpgm
1363;
1364; EG-LABEL: v_shl_inline_imm_64_i64:
1365; EG:       ; %bb.0:
1366; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1367; EG-NEXT:    TEX 0 @6
1368; EG-NEXT:    ALU 10, @9, KC0[CB0:0-32], KC1[]
1369; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1370; EG-NEXT:    CF_END
1371; EG-NEXT:    PAD
1372; EG-NEXT:    Fetch clause starting at 6:
1373; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1374; EG-NEXT:    ALU clause starting at 8:
1375; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1376; EG-NEXT:    ALU clause starting at 9:
1377; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
1378; EG-NEXT:     NOT_INT * T1.W, T0.X,
1379; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1380; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1381; EG-NEXT:     LSHL T0.W, literal.y, PV.W,
1382; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
1383; EG-NEXT:    32(4.484155e-44), 64(8.968310e-44)
1384; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.Z, PV.W,
1385; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1386; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1387; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1388  %a = load i64, ptr addrspace(1) %aptr, align 8
1389  %shl = shl i64 64, %a
1390  store i64 %shl, ptr addrspace(1) %out, align 8
1391  ret void
1392}
1393
; Shifts the constant 64 left by the kernel argument %a and stores the result
; to %out. The %aptr argument is not used by the body.
; The verde and tonga runs expect a single s_lshl_b64 with 64 encoded directly
; as its first source operand, and only one dword of %a to be loaded (the
; s_load_dword at offset 0xd / 0x34).
; The redwood run checks the 32-bit expansion with literals 64 and 32.
1394define amdgpu_kernel void @s_shl_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1395; SI-LABEL: s_shl_inline_imm_64_i64:
1396; SI:       ; %bb.0:
1397; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1398; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1399; SI-NEXT:    s_mov_b32 s3, 0xf000
1400; SI-NEXT:    s_mov_b32 s2, -1
1401; SI-NEXT:    s_waitcnt lgkmcnt(0)
1402; SI-NEXT:    s_lshl_b64 s[4:5], 64, s6
1403; SI-NEXT:    v_mov_b32_e32 v0, s4
1404; SI-NEXT:    v_mov_b32_e32 v1, s5
1405; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1406; SI-NEXT:    s_endpgm
1407;
1408; VI-LABEL: s_shl_inline_imm_64_i64:
1409; VI:       ; %bb.0:
1410; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1411; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1412; VI-NEXT:    s_mov_b32 s3, 0xf000
1413; VI-NEXT:    s_mov_b32 s2, -1
1414; VI-NEXT:    s_waitcnt lgkmcnt(0)
1415; VI-NEXT:    s_lshl_b64 s[4:5], 64, s6
1416; VI-NEXT:    v_mov_b32_e32 v0, s4
1417; VI-NEXT:    v_mov_b32_e32 v1, s5
1418; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1419; VI-NEXT:    s_endpgm
1420;
1421; EG-LABEL: s_shl_inline_imm_64_i64:
1422; EG:       ; %bb.0:
1423; EG-NEXT:    ALU 10, @4, KC0[CB0:0-32], KC1[]
1424; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1425; EG-NEXT:    CF_END
1426; EG-NEXT:    PAD
1427; EG-NEXT:    ALU clause starting at 4:
1428; EG-NEXT:     AND_INT T0.W, KC0[2].W, literal.x,
1429; EG-NEXT:     NOT_INT * T1.W, KC0[2].W,
1430; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1431; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1432; EG-NEXT:     AND_INT T1.W, KC0[2].W, literal.x,
1433; EG-NEXT:     LSHL * T0.W, literal.y, PV.W,
1434; EG-NEXT:    32(4.484155e-44), 64(8.968310e-44)
1435; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PV.Z, PS,
1436; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1437; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1438; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1439  %shl = shl i64 64, %a
1440  store i64 %shl, ptr addrspace(1) %out, align 8
1441  ret void
1442}
1443
; Shifts the constant 1 left by the kernel argument %a and stores the result to
; %out. The %aptr argument is not used by the body.
; The verde and tonga runs expect a single s_lshl_b64 with 1 encoded directly
; as its first source operand and only one dword of %a loaded.
; The redwood run checks a shorter expansion than the other constants: a
; 32-bit LSHL of 1, a mask derived from bit 5 of the amount (LSHL by 26 then
; ASHR by 31) for the high half, and a CNDE_INT on (amount & 32) for the low
; half.
1444define amdgpu_kernel void @s_shl_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1445; SI-LABEL: s_shl_inline_imm_1_i64:
1446; SI:       ; %bb.0:
1447; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1448; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1449; SI-NEXT:    s_mov_b32 s3, 0xf000
1450; SI-NEXT:    s_mov_b32 s2, -1
1451; SI-NEXT:    s_waitcnt lgkmcnt(0)
1452; SI-NEXT:    s_lshl_b64 s[4:5], 1, s6
1453; SI-NEXT:    v_mov_b32_e32 v0, s4
1454; SI-NEXT:    v_mov_b32_e32 v1, s5
1455; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1456; SI-NEXT:    s_endpgm
1457;
1458; VI-LABEL: s_shl_inline_imm_1_i64:
1459; VI:       ; %bb.0:
1460; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1461; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1462; VI-NEXT:    s_mov_b32 s3, 0xf000
1463; VI-NEXT:    s_mov_b32 s2, -1
1464; VI-NEXT:    s_waitcnt lgkmcnt(0)
1465; VI-NEXT:    s_lshl_b64 s[4:5], 1, s6
1466; VI-NEXT:    v_mov_b32_e32 v0, s4
1467; VI-NEXT:    v_mov_b32_e32 v1, s5
1468; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1469; VI-NEXT:    s_endpgm
1470;
1471; EG-LABEL: s_shl_inline_imm_1_i64:
1472; EG:       ; %bb.0:
1473; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
1474; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1475; EG-NEXT:    CF_END
1476; EG-NEXT:    PAD
1477; EG-NEXT:    ALU clause starting at 4:
1478; EG-NEXT:     AND_INT T0.W, KC0[2].W, literal.x,
1479; EG-NEXT:     LSHL * T1.W, KC0[2].W, literal.y,
1480; EG-NEXT:    31(4.344025e-44), 26(3.643376e-44)
1481; EG-NEXT:     ASHR T1.W, PS, literal.x,
1482; EG-NEXT:     LSHL * T0.W, 1, PV.W,
1483; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1484; EG-NEXT:     AND_INT T0.Y, PV.W, PS,
1485; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.x,
1486; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1487; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, 0.0,
1488; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1489; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1490  %shl = shl i64 1, %a
1491  store i64 %shl, ptr addrspace(1) %out, align 8
1492  ret void
1493}
1494
; Shifts the constant 4607182418800017408 (0x3FF0000000000000) left by the
; kernel argument %a and stores the result to %out. The %aptr argument is not
; used by the body.
; The verde and tonga runs expect that constant to be printed as the operand
; 1.0 of a single s_lshl_b64, i.e. no register materialization of the value.
; The redwood run checks an expansion in which the low result half is a plain
; 0 and only the high half is computed, from the literal 536346624
; (0x3FF00000 >> 1).
1495define amdgpu_kernel void @s_shl_inline_imm_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1496; SI-LABEL: s_shl_inline_imm_1_0_i64:
1497; SI:       ; %bb.0:
1498; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1499; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1500; SI-NEXT:    s_mov_b32 s3, 0xf000
1501; SI-NEXT:    s_mov_b32 s2, -1
1502; SI-NEXT:    s_waitcnt lgkmcnt(0)
1503; SI-NEXT:    s_lshl_b64 s[4:5], 1.0, s6
1504; SI-NEXT:    v_mov_b32_e32 v0, s4
1505; SI-NEXT:    v_mov_b32_e32 v1, s5
1506; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1507; SI-NEXT:    s_endpgm
1508;
1509; VI-LABEL: s_shl_inline_imm_1_0_i64:
1510; VI:       ; %bb.0:
1511; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1512; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1513; VI-NEXT:    s_mov_b32 s3, 0xf000
1514; VI-NEXT:    s_mov_b32 s2, -1
1515; VI-NEXT:    s_waitcnt lgkmcnt(0)
1516; VI-NEXT:    s_lshl_b64 s[4:5], 1.0, s6
1517; VI-NEXT:    v_mov_b32_e32 v0, s4
1518; VI-NEXT:    v_mov_b32_e32 v1, s5
1519; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1520; VI-NEXT:    s_endpgm
1521;
1522; EG-LABEL: s_shl_inline_imm_1_0_i64:
1523; EG:       ; %bb.0:
1524; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1525; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1526; EG-NEXT:    CF_END
1527; EG-NEXT:    PAD
1528; EG-NEXT:    ALU clause starting at 4:
1529; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1530; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1531; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1532; EG-NEXT:    536346624(1.050321e-19), 32(4.484155e-44)
1533; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1534; EG-NEXT:     MOV T0.X, 0.0,
1535; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1536; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1537  %shl = shl i64 4607182418800017408, %a
1538  store i64 %shl, ptr addrspace(1) %out, align 8
1539  ret void
1540}
1541
; Shifts the constant 13830554455654793216 (0xBFF0000000000000) left by the
; kernel argument %a and stores the result to %out. The %aptr argument is not
; used by the body.
; The verde and tonga runs expect that constant to be printed as the operand
; -1.0 of a single s_lshl_b64, i.e. no register materialization of the value.
; The redwood run checks an expansion in which the low result half is a plain
; 0 and only the high half is computed, from the literal 1610088448
; (0xBFF00000 >> 1).
1542define amdgpu_kernel void @s_shl_inline_imm_neg_1_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1543; SI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1544; SI:       ; %bb.0:
1545; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1546; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1547; SI-NEXT:    s_mov_b32 s3, 0xf000
1548; SI-NEXT:    s_mov_b32 s2, -1
1549; SI-NEXT:    s_waitcnt lgkmcnt(0)
1550; SI-NEXT:    s_lshl_b64 s[4:5], -1.0, s6
1551; SI-NEXT:    v_mov_b32_e32 v0, s4
1552; SI-NEXT:    v_mov_b32_e32 v1, s5
1553; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1554; SI-NEXT:    s_endpgm
1555;
1556; VI-LABEL: s_shl_inline_imm_neg_1_0_i64:
1557; VI:       ; %bb.0:
1558; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1559; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1560; VI-NEXT:    s_mov_b32 s3, 0xf000
1561; VI-NEXT:    s_mov_b32 s2, -1
1562; VI-NEXT:    s_waitcnt lgkmcnt(0)
1563; VI-NEXT:    s_lshl_b64 s[4:5], -1.0, s6
1564; VI-NEXT:    v_mov_b32_e32 v0, s4
1565; VI-NEXT:    v_mov_b32_e32 v1, s5
1566; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1567; VI-NEXT:    s_endpgm
1568;
1569; EG-LABEL: s_shl_inline_imm_neg_1_0_i64:
1570; EG:       ; %bb.0:
1571; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1572; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1573; EG-NEXT:    CF_END
1574; EG-NEXT:    PAD
1575; EG-NEXT:    ALU clause starting at 4:
1576; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1577; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1578; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1579; EG-NEXT:    1610088448(3.574057e+19), 32(4.484155e-44)
1580; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1581; EG-NEXT:     MOV T0.X, 0.0,
1582; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1583; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1584  %shl = shl i64 13830554455654793216, %a
1585  store i64 %shl, ptr addrspace(1) %out, align 8
1586  ret void
1587}
1588
; Shifts the i64 constant 4602678819172646912 (0x3FE0000000000000, the bit
; pattern of double 0.5) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to appear directly as the
; 0.5 operand of s_lshl_b64. %aptr is not used by the body.
1589define amdgpu_kernel void @s_shl_inline_imm_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1590; SI-LABEL: s_shl_inline_imm_0_5_i64:
1591; SI:       ; %bb.0:
1592; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1593; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1594; SI-NEXT:    s_mov_b32 s3, 0xf000
1595; SI-NEXT:    s_mov_b32 s2, -1
1596; SI-NEXT:    s_waitcnt lgkmcnt(0)
1597; SI-NEXT:    s_lshl_b64 s[4:5], 0.5, s6
1598; SI-NEXT:    v_mov_b32_e32 v0, s4
1599; SI-NEXT:    v_mov_b32_e32 v1, s5
1600; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1601; SI-NEXT:    s_endpgm
1602;
1603; VI-LABEL: s_shl_inline_imm_0_5_i64:
1604; VI:       ; %bb.0:
1605; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1606; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1607; VI-NEXT:    s_mov_b32 s3, 0xf000
1608; VI-NEXT:    s_mov_b32 s2, -1
1609; VI-NEXT:    s_waitcnt lgkmcnt(0)
1610; VI-NEXT:    s_lshl_b64 s[4:5], 0.5, s6
1611; VI-NEXT:    v_mov_b32_e32 v0, s4
1612; VI-NEXT:    v_mov_b32_e32 v1, s5
1613; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1614; VI-NEXT:    s_endpgm
1615;
1616; EG-LABEL: s_shl_inline_imm_0_5_i64:
1617; EG:       ; %bb.0:
1618; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1619; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1620; EG-NEXT:    CF_END
1621; EG-NEXT:    PAD
1622; EG-NEXT:    ALU clause starting at 4:
1623; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1624; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1625; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1626; EG-NEXT:    535822336(1.016440e-19), 32(4.484155e-44)
1627; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1628; EG-NEXT:     MOV T0.X, 0.0,
1629; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1630; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1631  %shl = shl i64 4602678819172646912, %a
1632  store i64 %shl, ptr addrspace(1) %out, align 8
1633  ret void
1634}
1635
; Shifts the i64 constant 13826050856027422720 (0xBFE0000000000000, the bit
; pattern of double -0.5) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to appear directly as the
; -0.5 operand of s_lshl_b64. %aptr is not used by the body.
1636define amdgpu_kernel void @s_shl_inline_imm_neg_0_5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1637; SI-LABEL: s_shl_inline_imm_neg_0_5_i64:
1638; SI:       ; %bb.0:
1639; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1640; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1641; SI-NEXT:    s_mov_b32 s3, 0xf000
1642; SI-NEXT:    s_mov_b32 s2, -1
1643; SI-NEXT:    s_waitcnt lgkmcnt(0)
1644; SI-NEXT:    s_lshl_b64 s[4:5], -0.5, s6
1645; SI-NEXT:    v_mov_b32_e32 v0, s4
1646; SI-NEXT:    v_mov_b32_e32 v1, s5
1647; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1648; SI-NEXT:    s_endpgm
1649;
1650; VI-LABEL: s_shl_inline_imm_neg_0_5_i64:
1651; VI:       ; %bb.0:
1652; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1653; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1654; VI-NEXT:    s_mov_b32 s3, 0xf000
1655; VI-NEXT:    s_mov_b32 s2, -1
1656; VI-NEXT:    s_waitcnt lgkmcnt(0)
1657; VI-NEXT:    s_lshl_b64 s[4:5], -0.5, s6
1658; VI-NEXT:    v_mov_b32_e32 v0, s4
1659; VI-NEXT:    v_mov_b32_e32 v1, s5
1660; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1661; VI-NEXT:    s_endpgm
1662;
1663; EG-LABEL: s_shl_inline_imm_neg_0_5_i64:
1664; EG:       ; %bb.0:
1665; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1666; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1667; EG-NEXT:    CF_END
1668; EG-NEXT:    PAD
1669; EG-NEXT:    ALU clause starting at 4:
1670; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1671; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1672; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1673; EG-NEXT:    1609564160(3.458765e+19), 32(4.484155e-44)
1674; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1675; EG-NEXT:     MOV T0.X, 0.0,
1676; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1677; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1678  %shl = shl i64 13826050856027422720, %a
1679  store i64 %shl, ptr addrspace(1) %out, align 8
1680  ret void
1681}
1682
; Shifts the i64 constant 4611686018427387904 (0x4000000000000000, the bit
; pattern of double 2.0) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to appear directly as the
; 2.0 operand of s_lshl_b64. %aptr is not used by the body.
1683define amdgpu_kernel void @s_shl_inline_imm_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1684; SI-LABEL: s_shl_inline_imm_2_0_i64:
1685; SI:       ; %bb.0:
1686; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1687; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1688; SI-NEXT:    s_mov_b32 s3, 0xf000
1689; SI-NEXT:    s_mov_b32 s2, -1
1690; SI-NEXT:    s_waitcnt lgkmcnt(0)
1691; SI-NEXT:    s_lshl_b64 s[4:5], 2.0, s6
1692; SI-NEXT:    v_mov_b32_e32 v0, s4
1693; SI-NEXT:    v_mov_b32_e32 v1, s5
1694; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1695; SI-NEXT:    s_endpgm
1696;
1697; VI-LABEL: s_shl_inline_imm_2_0_i64:
1698; VI:       ; %bb.0:
1699; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1700; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1701; VI-NEXT:    s_mov_b32 s3, 0xf000
1702; VI-NEXT:    s_mov_b32 s2, -1
1703; VI-NEXT:    s_waitcnt lgkmcnt(0)
1704; VI-NEXT:    s_lshl_b64 s[4:5], 2.0, s6
1705; VI-NEXT:    v_mov_b32_e32 v0, s4
1706; VI-NEXT:    v_mov_b32_e32 v1, s5
1707; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1708; VI-NEXT:    s_endpgm
1709;
1710; EG-LABEL: s_shl_inline_imm_2_0_i64:
1711; EG:       ; %bb.0:
1712; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1713; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1714; EG-NEXT:    CF_END
1715; EG-NEXT:    PAD
1716; EG-NEXT:    ALU clause starting at 4:
1717; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1718; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1719; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1720; EG-NEXT:    536870912(1.084202e-19), 32(4.484155e-44)
1721; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1722; EG-NEXT:     MOV T0.X, 0.0,
1723; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1724; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1725  %shl = shl i64 4611686018427387904, %a
1726  store i64 %shl, ptr addrspace(1) %out, align 8
1727  ret void
1728}
1729
; Shifts the i64 constant 13835058055282163712 (0xC000000000000000, the bit
; pattern of double -2.0) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to appear directly as the
; -2.0 operand of s_lshl_b64. %aptr is not used by the body.
1730define amdgpu_kernel void @s_shl_inline_imm_neg_2_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1731; SI-LABEL: s_shl_inline_imm_neg_2_0_i64:
1732; SI:       ; %bb.0:
1733; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1734; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1735; SI-NEXT:    s_mov_b32 s3, 0xf000
1736; SI-NEXT:    s_mov_b32 s2, -1
1737; SI-NEXT:    s_waitcnt lgkmcnt(0)
1738; SI-NEXT:    s_lshl_b64 s[4:5], -2.0, s6
1739; SI-NEXT:    v_mov_b32_e32 v0, s4
1740; SI-NEXT:    v_mov_b32_e32 v1, s5
1741; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1742; SI-NEXT:    s_endpgm
1743;
1744; VI-LABEL: s_shl_inline_imm_neg_2_0_i64:
1745; VI:       ; %bb.0:
1746; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1747; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1748; VI-NEXT:    s_mov_b32 s3, 0xf000
1749; VI-NEXT:    s_mov_b32 s2, -1
1750; VI-NEXT:    s_waitcnt lgkmcnt(0)
1751; VI-NEXT:    s_lshl_b64 s[4:5], -2.0, s6
1752; VI-NEXT:    v_mov_b32_e32 v0, s4
1753; VI-NEXT:    v_mov_b32_e32 v1, s5
1754; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1755; VI-NEXT:    s_endpgm
1756;
1757; EG-LABEL: s_shl_inline_imm_neg_2_0_i64:
1758; EG:       ; %bb.0:
1759; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1760; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1761; EG-NEXT:    CF_END
1762; EG-NEXT:    PAD
1763; EG-NEXT:    ALU clause starting at 4:
1764; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1765; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1766; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1767; EG-NEXT:    1610612736(3.689349e+19), 32(4.484155e-44)
1768; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1769; EG-NEXT:     MOV T0.X, 0.0,
1770; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1771; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1772  %shl = shl i64 13835058055282163712, %a
1773  store i64 %shl, ptr addrspace(1) %out, align 8
1774  ret void
1775}
1776
; Shifts the i64 constant 4616189618054758400 (0x4010000000000000, the bit
; pattern of double 4.0) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to appear directly as the
; 4.0 operand of s_lshl_b64. %aptr is not used by the body.
1777define amdgpu_kernel void @s_shl_inline_imm_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1778; SI-LABEL: s_shl_inline_imm_4_0_i64:
1779; SI:       ; %bb.0:
1780; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1781; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1782; SI-NEXT:    s_mov_b32 s3, 0xf000
1783; SI-NEXT:    s_mov_b32 s2, -1
1784; SI-NEXT:    s_waitcnt lgkmcnt(0)
1785; SI-NEXT:    s_lshl_b64 s[4:5], 4.0, s6
1786; SI-NEXT:    v_mov_b32_e32 v0, s4
1787; SI-NEXT:    v_mov_b32_e32 v1, s5
1788; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1789; SI-NEXT:    s_endpgm
1790;
1791; VI-LABEL: s_shl_inline_imm_4_0_i64:
1792; VI:       ; %bb.0:
1793; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1794; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1795; VI-NEXT:    s_mov_b32 s3, 0xf000
1796; VI-NEXT:    s_mov_b32 s2, -1
1797; VI-NEXT:    s_waitcnt lgkmcnt(0)
1798; VI-NEXT:    s_lshl_b64 s[4:5], 4.0, s6
1799; VI-NEXT:    v_mov_b32_e32 v0, s4
1800; VI-NEXT:    v_mov_b32_e32 v1, s5
1801; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1802; VI-NEXT:    s_endpgm
1803;
1804; EG-LABEL: s_shl_inline_imm_4_0_i64:
1805; EG:       ; %bb.0:
1806; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1807; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1808; EG-NEXT:    CF_END
1809; EG-NEXT:    PAD
1810; EG-NEXT:    ALU clause starting at 4:
1811; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1812; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1813; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1814; EG-NEXT:    537395200(1.151965e-19), 32(4.484155e-44)
1815; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1816; EG-NEXT:     MOV T0.X, 0.0,
1817; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1818; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1819  %shl = shl i64 4616189618054758400, %a
1820  store i64 %shl, ptr addrspace(1) %out, align 8
1821  ret void
1822}
1823
; Shifts the i64 constant 13839561654909534208 (0xC010000000000000, the bit
; pattern of double -4.0) left by the kernel argument %a and stores the result
; to %out. The SI and VI checks expect the constant to appear directly as the
; -4.0 operand of s_lshl_b64. %aptr is not used by the body.
1824define amdgpu_kernel void @s_shl_inline_imm_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1825; SI-LABEL: s_shl_inline_imm_neg_4_0_i64:
1826; SI:       ; %bb.0:
1827; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1828; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1829; SI-NEXT:    s_mov_b32 s3, 0xf000
1830; SI-NEXT:    s_mov_b32 s2, -1
1831; SI-NEXT:    s_waitcnt lgkmcnt(0)
1832; SI-NEXT:    s_lshl_b64 s[4:5], -4.0, s6
1833; SI-NEXT:    v_mov_b32_e32 v0, s4
1834; SI-NEXT:    v_mov_b32_e32 v1, s5
1835; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1836; SI-NEXT:    s_endpgm
1837;
1838; VI-LABEL: s_shl_inline_imm_neg_4_0_i64:
1839; VI:       ; %bb.0:
1840; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1841; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1842; VI-NEXT:    s_mov_b32 s3, 0xf000
1843; VI-NEXT:    s_mov_b32 s2, -1
1844; VI-NEXT:    s_waitcnt lgkmcnt(0)
1845; VI-NEXT:    s_lshl_b64 s[4:5], -4.0, s6
1846; VI-NEXT:    v_mov_b32_e32 v0, s4
1847; VI-NEXT:    v_mov_b32_e32 v1, s5
1848; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1849; VI-NEXT:    s_endpgm
1850;
1851; EG-LABEL: s_shl_inline_imm_neg_4_0_i64:
1852; EG:       ; %bb.0:
1853; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1854; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1855; EG-NEXT:    CF_END
1856; EG-NEXT:    PAD
1857; EG-NEXT:    ALU clause starting at 4:
1858; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
1859; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
1860; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1861; EG-NEXT:    1611137024(3.919933e+19), 32(4.484155e-44)
1862; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
1863; EG-NEXT:     MOV T0.X, 0.0,
1864; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1865; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1866  %shl = shl i64 13839561654909534208, %a
1867  store i64 %shl, ptr addrspace(1) %out, align 8
1868  ret void
1869}
1870
1871
1872; Test with the 64-bit integer bit pattern for a 32-bit float in the
1873; low 32 bits, which is not a valid 64-bit inline immediate.
; The shifted constant 1082130432 is 0x40800000, i.e. the f32 4.0 bit pattern
; zero-extended to i64. The SI and VI checks expect it as the 32-bit literal
; operand 0x40800000 of s_lshl_b64 rather than as a floating-point inline
; operand. %aptr is not used by the body.
1874define amdgpu_kernel void @s_shl_inline_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1875; SI-LABEL: s_shl_inline_imm_f32_4_0_i64:
1876; SI:       ; %bb.0:
1877; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1878; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1879; SI-NEXT:    s_mov_b32 s3, 0xf000
1880; SI-NEXT:    s_mov_b32 s2, -1
1881; SI-NEXT:    s_waitcnt lgkmcnt(0)
1882; SI-NEXT:    s_lshl_b64 s[4:5], 0x40800000, s6
1883; SI-NEXT:    v_mov_b32_e32 v0, s4
1884; SI-NEXT:    v_mov_b32_e32 v1, s5
1885; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1886; SI-NEXT:    s_endpgm
1887;
1888; VI-LABEL: s_shl_inline_imm_f32_4_0_i64:
1889; VI:       ; %bb.0:
1890; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1891; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1892; VI-NEXT:    s_mov_b32 s3, 0xf000
1893; VI-NEXT:    s_mov_b32 s2, -1
1894; VI-NEXT:    s_waitcnt lgkmcnt(0)
1895; VI-NEXT:    s_lshl_b64 s[4:5], 0x40800000, s6
1896; VI-NEXT:    v_mov_b32_e32 v0, s4
1897; VI-NEXT:    v_mov_b32_e32 v1, s5
1898; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1899; VI-NEXT:    s_endpgm
1900;
1901; EG-LABEL: s_shl_inline_imm_f32_4_0_i64:
1902; EG:       ; %bb.0:
1903; EG-NEXT:    ALU 11, @4, KC0[CB0:0-32], KC1[]
1904; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1905; EG-NEXT:    CF_END
1906; EG-NEXT:    PAD
1907; EG-NEXT:    ALU clause starting at 4:
1908; EG-NEXT:     AND_INT T0.W, KC0[2].W, literal.x,
1909; EG-NEXT:     NOT_INT * T1.W, KC0[2].W,
1910; EG-NEXT:    31(4.344025e-44), 0(0.000000e+00)
1911; EG-NEXT:     BIT_ALIGN_INT T0.Z, 0.0, literal.x, PS,
1912; EG-NEXT:     AND_INT T1.W, KC0[2].W, literal.y,
1913; EG-NEXT:     LSHL * T0.W, literal.z, PV.W,
1914; EG-NEXT:    541065216(1.626303e-19), 32(4.484155e-44)
1915; EG-NEXT:    1082130432(4.000000e+00), 0(0.000000e+00)
1916; EG-NEXT:     CNDE_INT * T0.Y, PV.W, PV.Z, PS,
1917; EG-NEXT:     CNDE_INT T0.X, T1.W, T0.W, 0.0,
1918; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1919; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1920  %shl = shl i64 1082130432, %a
1921  store i64 %shl, ptr addrspace(1) %out, align 8
1922  ret void
1923}
1924
1925; FIXME: Copy of -1 register
; The shifted constant -1065353216 is the f32 -4.0 bit pattern (0xC0800000)
; sign-extended to i64, so its high dword is all ones. The SI and VI checks
; expect the value to be built in s[4:5] with two s_mov_b32 (-4.0 into s4,
; -1 into s5) and then shifted with a register-operand s_lshl_b64.
; %aptr is not used by the body.
1926define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1927; SI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1928; SI:       ; %bb.0:
1929; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1930; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1931; SI-NEXT:    s_mov_b32 s4, -4.0
1932; SI-NEXT:    s_mov_b32 s5, -1
1933; SI-NEXT:    s_mov_b32 s3, 0xf000
1934; SI-NEXT:    s_mov_b32 s2, -1
1935; SI-NEXT:    s_waitcnt lgkmcnt(0)
1936; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
1937; SI-NEXT:    v_mov_b32_e32 v0, s4
1938; SI-NEXT:    v_mov_b32_e32 v1, s5
1939; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1940; SI-NEXT:    s_endpgm
1941;
1942; VI-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1943; VI:       ; %bb.0:
1944; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1945; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1946; VI-NEXT:    s_mov_b32 s4, -4.0
1947; VI-NEXT:    s_mov_b32 s5, -1
1948; VI-NEXT:    s_mov_b32 s3, 0xf000
1949; VI-NEXT:    s_mov_b32 s2, -1
1950; VI-NEXT:    s_waitcnt lgkmcnt(0)
1951; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
1952; VI-NEXT:    v_mov_b32_e32 v0, s4
1953; VI-NEXT:    v_mov_b32_e32 v1, s5
1954; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1955; VI-NEXT:    s_endpgm
1956;
1957; EG-LABEL: s_shl_inline_imm_f32_neg_4_0_i64:
1958; EG:       ; %bb.0:
1959; EG-NEXT:    ALU 12, @4, KC0[CB0:0-32], KC1[]
1960; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1961; EG-NEXT:    CF_END
1962; EG-NEXT:    PAD
1963; EG-NEXT:    ALU clause starting at 4:
1964; EG-NEXT:     MOV T0.Z, literal.x,
1965; EG-NEXT:     NOT_INT T0.W, KC0[2].W,
1966; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
1967; EG-NEXT:    -532676608(-5.534023e+19), 31(4.344025e-44)
1968; EG-NEXT:     LSHL T1.Z, literal.x, PS,
1969; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.y, PV.Z, PV.W,
1970; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.z,
1971; EG-NEXT:    -1065353216(-4.000000e+00), 2147483647(nan)
1972; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1973; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, PV.Z,
1974; EG-NEXT:     CNDE_INT T0.X, T1.W, T1.Z, 0.0,
1975; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1976; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1977  %shl = shl i64 -1065353216, %a
1978  store i64 %shl, ptr addrspace(1) %out, align 8
1979  ret void
1980}
1981
; The shifted constant 4647714815446351872 is 0x4080000000000000: the f32 4.0
; bit pattern (0x40800000) in the high dword and zero in the low dword. The SI
; and VI checks expect the value to be built in s[4:5] with two s_mov_b32
; (0 into s4, 4.0 into s5) and then shifted with a register-operand
; s_lshl_b64. %aptr is not used by the body.
1982define amdgpu_kernel void @s_shl_inline_high_imm_f32_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
1983; SI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
1984; SI:       ; %bb.0:
1985; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1986; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1987; SI-NEXT:    s_mov_b32 s4, 0
1988; SI-NEXT:    s_mov_b32 s5, 4.0
1989; SI-NEXT:    s_mov_b32 s3, 0xf000
1990; SI-NEXT:    s_mov_b32 s2, -1
1991; SI-NEXT:    s_waitcnt lgkmcnt(0)
1992; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
1993; SI-NEXT:    v_mov_b32_e32 v0, s4
1994; SI-NEXT:    v_mov_b32_e32 v1, s5
1995; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1996; SI-NEXT:    s_endpgm
1997;
1998; VI-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
1999; VI:       ; %bb.0:
2000; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2001; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
2002; VI-NEXT:    s_mov_b32 s4, 0
2003; VI-NEXT:    s_mov_b32 s5, 4.0
2004; VI-NEXT:    s_mov_b32 s3, 0xf000
2005; VI-NEXT:    s_mov_b32 s2, -1
2006; VI-NEXT:    s_waitcnt lgkmcnt(0)
2007; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
2008; VI-NEXT:    v_mov_b32_e32 v0, s4
2009; VI-NEXT:    v_mov_b32_e32 v1, s5
2010; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2011; VI-NEXT:    s_endpgm
2012;
2013; EG-LABEL: s_shl_inline_high_imm_f32_4_0_i64:
2014; EG:       ; %bb.0:
2015; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
2016; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2017; EG-NEXT:    CF_END
2018; EG-NEXT:    PAD
2019; EG-NEXT:    ALU clause starting at 4:
2020; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
2021; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
2022; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
2023; EG-NEXT:    541065216(1.626303e-19), 32(4.484155e-44)
2024; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
2025; EG-NEXT:     MOV T0.X, 0.0,
2026; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2027; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2028  %shl = shl i64 4647714815446351872, %a
2029  store i64 %shl, ptr addrspace(1) %out, align 8
2030  ret void
2031}
2032
; The shifted constant 13871086852301127680 is 0xC080000000000000: the f32 -4.0
; bit pattern (0xC0800000) in the high dword and zero in the low dword. The SI
; and VI checks expect the value to be built in s[4:5] with two s_mov_b32
; (0 into s4, -4.0 into s5) and then shifted with a register-operand
; s_lshl_b64. %aptr is not used by the body.
2033define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4_0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
2034; SI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2035; SI:       ; %bb.0:
2036; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2037; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
2038; SI-NEXT:    s_mov_b32 s4, 0
2039; SI-NEXT:    s_mov_b32 s5, -4.0
2040; SI-NEXT:    s_mov_b32 s3, 0xf000
2041; SI-NEXT:    s_mov_b32 s2, -1
2042; SI-NEXT:    s_waitcnt lgkmcnt(0)
2043; SI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
2044; SI-NEXT:    v_mov_b32_e32 v0, s4
2045; SI-NEXT:    v_mov_b32_e32 v1, s5
2046; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2047; SI-NEXT:    s_endpgm
2048;
2049; VI-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2050; VI:       ; %bb.0:
2051; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2052; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
2053; VI-NEXT:    s_mov_b32 s4, 0
2054; VI-NEXT:    s_mov_b32 s5, -4.0
2055; VI-NEXT:    s_mov_b32 s3, 0xf000
2056; VI-NEXT:    s_mov_b32 s2, -1
2057; VI-NEXT:    s_waitcnt lgkmcnt(0)
2058; VI-NEXT:    s_lshl_b64 s[4:5], s[4:5], s6
2059; VI-NEXT:    v_mov_b32_e32 v0, s4
2060; VI-NEXT:    v_mov_b32_e32 v1, s5
2061; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2062; VI-NEXT:    s_endpgm
2063;
2064; EG-LABEL: s_shl_inline_high_imm_f32_neg_4_0_i64:
2065; EG:       ; %bb.0:
2066; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
2067; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
2068; EG-NEXT:    CF_END
2069; EG-NEXT:    PAD
2070; EG-NEXT:    ALU clause starting at 4:
2071; EG-NEXT:     NOT_INT * T0.W, KC0[2].W,
2072; EG-NEXT:     BIT_ALIGN_INT T0.W, literal.x, 0.0, PV.W,
2073; EG-NEXT:     AND_INT * T1.W, KC0[2].W, literal.y,
2074; EG-NEXT:    1614807040(5.534023e+19), 32(4.484155e-44)
2075; EG-NEXT:     CNDE_INT * T0.Y, PS, PV.W, 0.0,
2076; EG-NEXT:     MOV T0.X, 0.0,
2077; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2078; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2079  %shl = shl i64 13871086852301127680, %a
2080  store i64 %shl, ptr addrspace(1) %out, align 8
2081  ret void
2082}
2083
; Multiplies the i32 kernel argument %p by 2. All three check blocks expect the
; multiply to be emitted as a left shift by 1 (s_lshl_b32 on SI and VI, LSHL on
; EG). The result is written with a volatile store to an undef address,
; presumably only to keep the computation alive.
; NOTE(review): the undef store address could be replaced with poison, but the
; checks would have to be regenerated to confirm the output is unchanged.
2084define amdgpu_kernel void @test_mul2(i32 %p) {
2085; SI-LABEL: test_mul2:
2086; SI:       ; %bb.0:
2087; SI-NEXT:    s_load_dword s0, s[4:5], 0x9
2088; SI-NEXT:    s_mov_b32 s3, 0xf000
2089; SI-NEXT:    s_mov_b32 s2, -1
2090; SI-NEXT:    s_waitcnt lgkmcnt(0)
2091; SI-NEXT:    s_lshl_b32 s0, s0, 1
2092; SI-NEXT:    v_mov_b32_e32 v0, s0
2093; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2094; SI-NEXT:    s_waitcnt vmcnt(0)
2095; SI-NEXT:    s_endpgm
2096;
2097; VI-LABEL: test_mul2:
2098; VI:       ; %bb.0:
2099; VI-NEXT:    s_load_dword s0, s[4:5], 0x24
2100; VI-NEXT:    s_mov_b32 s3, 0xf000
2101; VI-NEXT:    s_mov_b32 s2, -1
2102; VI-NEXT:    s_waitcnt lgkmcnt(0)
2103; VI-NEXT:    s_lshl_b32 s0, s0, 1
2104; VI-NEXT:    v_mov_b32_e32 v0, s0
2105; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
2106; VI-NEXT:    s_waitcnt vmcnt(0)
2107; VI-NEXT:    s_endpgm
2108;
2109; EG-LABEL: test_mul2:
2110; EG:       ; %bb.0:
2111; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
2112; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2113; EG-NEXT:    CF_END
2114; EG-NEXT:    PAD
2115; EG-NEXT:    ALU clause starting at 4:
2116; EG-NEXT:     MOV T0.X, literal.x,
2117; EG-NEXT:     LSHL * T1.X, KC0[2].Y, 1,
2118; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
2119   %i = mul i32 %p, 2
2120   store volatile i32 %i, ptr addrspace(1) undef
2121   ret void
2122}
2123
; Computes (%in | 1) << 2 and stores it to %out. The or result has a single
; use, and every check block expects the shift to be applied first and the
; constant to be pre-shifted: (%in << 2) | 4.
2124define void @shl_or_k(ptr addrspace(1) %out, i32 %in) {
2125; SI-LABEL: shl_or_k:
2126; SI:       ; %bb.0:
2127; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2128; SI-NEXT:    s_mov_b32 s6, 0
2129; SI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
2130; SI-NEXT:    s_mov_b32 s7, 0xf000
2131; SI-NEXT:    s_mov_b32 s4, s6
2132; SI-NEXT:    s_mov_b32 s5, s6
2133; SI-NEXT:    v_or_b32_e32 v2, 4, v2
2134; SI-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
2135; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2136; SI-NEXT:    s_setpc_b64 s[30:31]
2137;
2138; VI-LABEL: shl_or_k:
2139; VI:       ; %bb.0:
2140; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2141; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
2142; VI-NEXT:    v_or_b32_e32 v2, 4, v2
2143; VI-NEXT:    flat_store_dword v[0:1], v2
2144; VI-NEXT:    s_waitcnt vmcnt(0)
2145; VI-NEXT:    s_setpc_b64 s[30:31]
2146;
2147; EG-LABEL: shl_or_k:
2148; EG:       ; %bb.0:
2149; EG-NEXT:    ALU 4, @4, KC0[CB0:0-32], KC1[]
2150; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2151; EG-NEXT:    CF_END
2152; EG-NEXT:    PAD
2153; EG-NEXT:    ALU clause starting at 4:
2154; EG-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
2155; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2156; EG-NEXT:     OR_INT T0.X, PV.W, literal.x,
2157; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
2158; EG-NEXT:    4(5.605194e-45), 2(2.802597e-45)
2159  %tmp0 = or i32 %in, 1
2160  %tmp2 = shl i32 %tmp0, 2
2161  store i32 %tmp2, ptr addrspace(1) %out
2162  ret void
2163}
2164
; Same (%in | 1) << 2 computation as shl_or_k, but the or result is also
; stored to %out1, giving it a second use. Every check block expects the
; original order to be kept here: or with 1 first, then shift left by 2.
2165define void @shl_or_k_two_uses(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i32 %in) {
2166; SI-LABEL: shl_or_k_two_uses:
2167; SI:       ; %bb.0:
2168; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2169; SI-NEXT:    s_mov_b32 s6, 0
2170; SI-NEXT:    v_or_b32_e32 v4, 1, v4
2171; SI-NEXT:    s_mov_b32 s7, 0xf000
2172; SI-NEXT:    s_mov_b32 s4, s6
2173; SI-NEXT:    s_mov_b32 s5, s6
2174; SI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
2175; SI-NEXT:    buffer_store_dword v5, v[0:1], s[4:7], 0 addr64
2176; SI-NEXT:    buffer_store_dword v4, v[2:3], s[4:7], 0 addr64
2177; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
2178; SI-NEXT:    s_setpc_b64 s[30:31]
2179;
2180; VI-LABEL: shl_or_k_two_uses:
2181; VI:       ; %bb.0:
2182; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2183; VI-NEXT:    v_or_b32_e32 v4, 1, v4
2184; VI-NEXT:    v_lshlrev_b32_e32 v5, 2, v4
2185; VI-NEXT:    flat_store_dword v[0:1], v5
2186; VI-NEXT:    flat_store_dword v[2:3], v4
2187; VI-NEXT:    s_waitcnt vmcnt(0)
2188; VI-NEXT:    s_setpc_b64 s[30:31]
2189;
2190; EG-LABEL: shl_or_k_two_uses:
2191; EG:       ; %bb.0:
2192; EG-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
2193; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0
2194; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
2195; EG-NEXT:    CF_END
2196; EG-NEXT:    ALU clause starting at 4:
2197; EG-NEXT:     LSHR T0.X, KC0[2].Z, literal.x,
2198; EG-NEXT:     OR_INT * T1.X, KC0[2].W, 1,
2199; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2200; EG-NEXT:     LSHL T2.X, PS, literal.x,
2201; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2202; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2203  %tmp0 = or i32 %in, 1
2204  %tmp2 = shl i32 %tmp0, 2
2205  store i32 %tmp2, ptr addrspace(1) %out0
2206  store i32 %tmp0, ptr addrspace(1) %out1
2207  ret void
2208}
2209
2210attributes #0 = { nounwind readnone }
2211