xref: /llvm-project/llvm/test/CodeGen/AMDGPU/fshl.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga  -verify-machineinstrs | FileCheck %s --check-prefix=VI
4; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s --check-prefix=GFX9
5; RUN: llc < %s -mtriple=r600 -mcpu=redwood  -verify-machineinstrs | FileCheck %s --check-prefix=R600
6; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s --check-prefix=GFX10
7; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefix=GFX11
8
; Declarations of the funnel-shift-left intrinsic for the i32, <2 x i32> and
; <4 x i32> forms called by the kernels in this file.
9declare i32 @llvm.fshl.i32(i32, i32, i32) nounwind readnone
10declare <2 x i32> @llvm.fshl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
11declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
12
; Scalar i32 funnel shift left where all three operands (%x, %y, %z) are kernel
; arguments, so the shift amount is not a compile-time constant. The result is
; stored to %in.
; The autogenerated expectations for the amdgcn targets show an alignbit by 1,
; a logical shift right by 1 and a bitwise NOT feeding a second alignbit; the
; r600 expectations show the same shape using BIT_ALIGN_INT, LSHR and NOT_INT.
13define amdgpu_kernel void @fshl_i32(ptr addrspace(1) %in, i32 %x, i32 %y, i32 %z) {
14; SI-LABEL: fshl_i32:
15; SI:       ; %bb.0: ; %entry
16; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
17; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
18; SI-NEXT:    s_mov_b32 s7, 0xf000
19; SI-NEXT:    s_mov_b32 s6, -1
20; SI-NEXT:    s_waitcnt lgkmcnt(0)
21; SI-NEXT:    v_mov_b32_e32 v0, s1
22; SI-NEXT:    s_lshr_b32 s1, s0, 1
23; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 1
24; SI-NEXT:    s_not_b32 s0, s2
25; SI-NEXT:    v_mov_b32_e32 v1, s0
26; SI-NEXT:    v_alignbit_b32 v0, s1, v0, v1
27; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
28; SI-NEXT:    s_endpgm
29;
30; VI-LABEL: fshl_i32:
31; VI:       ; %bb.0: ; %entry
32; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
33; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
34; VI-NEXT:    s_waitcnt lgkmcnt(0)
35; VI-NEXT:    v_mov_b32_e32 v0, s1
36; VI-NEXT:    s_not_b32 s2, s2
37; VI-NEXT:    s_lshr_b32 s1, s0, 1
38; VI-NEXT:    v_alignbit_b32 v0, s0, v0, 1
39; VI-NEXT:    v_mov_b32_e32 v1, s2
40; VI-NEXT:    v_alignbit_b32 v2, s1, v0, v1
41; VI-NEXT:    v_mov_b32_e32 v0, s4
42; VI-NEXT:    v_mov_b32_e32 v1, s5
43; VI-NEXT:    flat_store_dword v[0:1], v2
44; VI-NEXT:    s_endpgm
45;
46; GFX9-LABEL: fshl_i32:
47; GFX9:       ; %bb.0: ; %entry
48; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
49; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
50; GFX9-NEXT:    v_mov_b32_e32 v0, 0
51; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
52; GFX9-NEXT:    v_mov_b32_e32 v1, s1
53; GFX9-NEXT:    s_not_b32 s2, s2
54; GFX9-NEXT:    s_lshr_b32 s1, s0, 1
55; GFX9-NEXT:    v_alignbit_b32 v1, s0, v1, 1
56; GFX9-NEXT:    v_mov_b32_e32 v2, s2
57; GFX9-NEXT:    v_alignbit_b32 v1, s1, v1, v2
58; GFX9-NEXT:    global_store_dword v0, v1, s[6:7]
59; GFX9-NEXT:    s_endpgm
60;
61; R600-LABEL: fshl_i32:
62; R600:       ; %bb.0: ; %entry
63; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
64; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
65; R600-NEXT:    CF_END
66; R600-NEXT:    PAD
67; R600-NEXT:    ALU clause starting at 4:
68; R600-NEXT:     LSHR T0.Z, KC0[2].Z, 1,
69; R600-NEXT:     BIT_ALIGN_INT T0.W, KC0[2].Z, KC0[2].W, 1,
70; R600-NEXT:     NOT_INT * T1.W, KC0[3].X,
71; R600-NEXT:     BIT_ALIGN_INT T0.X, PV.Z, PV.W, PS,
72; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
73; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
74;
75; GFX10-LABEL: fshl_i32:
76; GFX10:       ; %bb.0: ; %entry
77; GFX10-NEXT:    s_clause 0x1
78; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
79; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
80; GFX10-NEXT:    v_mov_b32_e32 v1, 0
81; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX10-NEXT:    v_alignbit_b32 v0, s0, s1, 1
83; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
84; GFX10-NEXT:    s_not_b32 s1, s2
85; GFX10-NEXT:    v_alignbit_b32 v0, s0, v0, s1
86; GFX10-NEXT:    global_store_dword v1, v0, s[6:7]
87; GFX10-NEXT:    s_endpgm
88;
89; GFX11-LABEL: fshl_i32:
90; GFX11:       ; %bb.0: ; %entry
91; GFX11-NEXT:    s_clause 0x1
92; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
93; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
94; GFX11-NEXT:    v_mov_b32_e32 v1, 0
95; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
96; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, 1
97; GFX11-NEXT:    s_lshr_b32 s0, s0, 1
98; GFX11-NEXT:    s_not_b32 s1, s2
99; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
100; GFX11-NEXT:    v_alignbit_b32 v0, s0, v0, s1
101; GFX11-NEXT:    global_store_b32 v1, v0, s[4:5]
102; GFX11-NEXT:    s_endpgm
103entry:
104  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
105  store i32 %0, ptr addrspace(1) %in
106  ret void
107}
108
; Scalar i32 funnel shift left of %x and %y by the constant 7; the result is
; stored to %in.
; Every target's expectations contain a single alignbit (BIT_ALIGN_INT on r600)
; with the immediate 25, i.e. 32 - 7, and none of the extra shift-by-1 / NOT
; instructions that appear in the fshl_i32 expectations.
109define amdgpu_kernel void @fshl_i32_imm(ptr addrspace(1) %in, i32 %x, i32 %y) {
110; SI-LABEL: fshl_i32_imm:
111; SI:       ; %bb.0: ; %entry
112; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
113; SI-NEXT:    s_mov_b32 s7, 0xf000
114; SI-NEXT:    s_mov_b32 s6, -1
115; SI-NEXT:    s_waitcnt lgkmcnt(0)
116; SI-NEXT:    v_mov_b32_e32 v0, s3
117; SI-NEXT:    s_mov_b32 s4, s0
118; SI-NEXT:    s_mov_b32 s5, s1
119; SI-NEXT:    v_alignbit_b32 v0, s2, v0, 25
120; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
121; SI-NEXT:    s_endpgm
122;
123; VI-LABEL: fshl_i32_imm:
124; VI:       ; %bb.0: ; %entry
125; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
126; VI-NEXT:    s_waitcnt lgkmcnt(0)
127; VI-NEXT:    v_mov_b32_e32 v0, s3
128; VI-NEXT:    v_alignbit_b32 v2, s2, v0, 25
129; VI-NEXT:    v_mov_b32_e32 v0, s0
130; VI-NEXT:    v_mov_b32_e32 v1, s1
131; VI-NEXT:    flat_store_dword v[0:1], v2
132; VI-NEXT:    s_endpgm
133;
134; GFX9-LABEL: fshl_i32_imm:
135; GFX9:       ; %bb.0: ; %entry
136; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
137; GFX9-NEXT:    v_mov_b32_e32 v0, 0
138; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
139; GFX9-NEXT:    v_mov_b32_e32 v1, s3
140; GFX9-NEXT:    v_alignbit_b32 v1, s2, v1, 25
141; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
142; GFX9-NEXT:    s_endpgm
143;
144; R600-LABEL: fshl_i32_imm:
145; R600:       ; %bb.0: ; %entry
146; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
147; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
148; R600-NEXT:    CF_END
149; R600-NEXT:    PAD
150; R600-NEXT:    ALU clause starting at 4:
151; R600-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
152; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
153; R600-NEXT:     BIT_ALIGN_INT * T1.X, KC0[2].Z, KC0[2].W, literal.x,
154; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
155;
156; GFX10-LABEL: fshl_i32_imm:
157; GFX10:       ; %bb.0: ; %entry
158; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
159; GFX10-NEXT:    v_mov_b32_e32 v0, 0
160; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
161; GFX10-NEXT:    v_alignbit_b32 v1, s2, s3, 25
162; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
163; GFX10-NEXT:    s_endpgm
164;
165; GFX11-LABEL: fshl_i32_imm:
166; GFX11:       ; %bb.0: ; %entry
167; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
168; GFX11-NEXT:    v_mov_b32_e32 v0, 0
169; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
170; GFX11-NEXT:    v_alignbit_b32 v1, s2, s3, 25
171; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
172; GFX11-NEXT:    s_endpgm
173entry:
174  %0 = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 7)
175  store i32 %0, ptr addrspace(1) %in
176  ret void
177}
178
; <2 x i32> funnel shift left with all three vector operands (%x, %y, %z) passed
; as kernel arguments; the two-element result is stored to %in.
; The expectations repeat the scalar variable-amount shape once per element
; (alignbit by 1, shift right by 1, NOT, second alignbit) and finish with one
; 64-bit store.
179define amdgpu_kernel void @fshl_v2i32(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y, <2 x i32> %z) {
180; SI-LABEL: fshl_v2i32:
181; SI:       ; %bb.0: ; %entry
182; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
183; SI-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x9
184; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xf
185; SI-NEXT:    s_mov_b32 s11, 0xf000
186; SI-NEXT:    s_mov_b32 s10, -1
187; SI-NEXT:    s_waitcnt lgkmcnt(0)
188; SI-NEXT:    v_mov_b32_e32 v0, s3
189; SI-NEXT:    v_alignbit_b32 v0, s1, v0, 1
190; SI-NEXT:    s_not_b32 s3, s5
191; SI-NEXT:    s_lshr_b32 s1, s1, 1
192; SI-NEXT:    v_mov_b32_e32 v1, s3
193; SI-NEXT:    v_alignbit_b32 v1, s1, v0, v1
194; SI-NEXT:    v_mov_b32_e32 v0, s2
195; SI-NEXT:    s_not_b32 s1, s4
196; SI-NEXT:    v_alignbit_b32 v0, s0, v0, 1
197; SI-NEXT:    s_lshr_b32 s0, s0, 1
198; SI-NEXT:    v_mov_b32_e32 v2, s1
199; SI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
200; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
201; SI-NEXT:    s_endpgm
202;
203; VI-LABEL: fshl_v2i32:
204; VI:       ; %bb.0: ; %entry
205; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
206; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
207; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
208; VI-NEXT:    s_waitcnt lgkmcnt(0)
209; VI-NEXT:    v_mov_b32_e32 v0, s3
210; VI-NEXT:    s_not_b32 s7, s7
211; VI-NEXT:    s_lshr_b32 s3, s1, 1
212; VI-NEXT:    v_alignbit_b32 v0, s1, v0, 1
213; VI-NEXT:    v_mov_b32_e32 v1, s7
214; VI-NEXT:    v_alignbit_b32 v1, s3, v0, v1
215; VI-NEXT:    v_mov_b32_e32 v0, s2
216; VI-NEXT:    s_not_b32 s1, s6
217; VI-NEXT:    v_alignbit_b32 v0, s0, v0, 1
218; VI-NEXT:    s_lshr_b32 s0, s0, 1
219; VI-NEXT:    v_mov_b32_e32 v2, s1
220; VI-NEXT:    v_alignbit_b32 v0, s0, v0, v2
221; VI-NEXT:    v_mov_b32_e32 v2, s4
222; VI-NEXT:    v_mov_b32_e32 v3, s5
223; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
224; VI-NEXT:    s_endpgm
225;
226; GFX9-LABEL: fshl_v2i32:
227; GFX9:       ; %bb.0: ; %entry
228; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
229; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
230; GFX9-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x3c
231; GFX9-NEXT:    v_mov_b32_e32 v2, 0
232; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
233; GFX9-NEXT:    v_mov_b32_e32 v0, s3
234; GFX9-NEXT:    s_lshr_b32 s3, s1, 1
235; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, 1
236; GFX9-NEXT:    s_not_b32 s1, s9
237; GFX9-NEXT:    v_mov_b32_e32 v1, s1
238; GFX9-NEXT:    v_alignbit_b32 v1, s3, v0, v1
239; GFX9-NEXT:    v_mov_b32_e32 v0, s2
240; GFX9-NEXT:    s_not_b32 s1, s8
241; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, 1
242; GFX9-NEXT:    s_lshr_b32 s0, s0, 1
243; GFX9-NEXT:    v_mov_b32_e32 v3, s1
244; GFX9-NEXT:    v_alignbit_b32 v0, s0, v0, v3
245; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
246; GFX9-NEXT:    s_endpgm
247;
248; R600-LABEL: fshl_v2i32:
249; R600:       ; %bb.0: ; %entry
250; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
251; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
252; R600-NEXT:    CF_END
253; R600-NEXT:    PAD
254; R600-NEXT:    ALU clause starting at 4:
255; R600-NEXT:     LSHR T0.Z, KC0[3].X, 1,
256; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[3].X, KC0[3].Z, 1,
257; R600-NEXT:     NOT_INT * T1.W, KC0[4].X,
258; R600-NEXT:     BIT_ALIGN_INT T0.Y, T0.Z, T0.W, PV.W,
259; R600-NEXT:     LSHR T0.Z, KC0[2].W, 1,
260; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[2].W, KC0[3].Y, 1,
261; R600-NEXT:     NOT_INT * T1.W, KC0[3].W,
262; R600-NEXT:     BIT_ALIGN_INT T0.X, T0.Z, T0.W, PV.W,
263; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
264; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
265;
266; GFX10-LABEL: fshl_v2i32:
267; GFX10:       ; %bb.0: ; %entry
268; GFX10-NEXT:    s_clause 0x2
269; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
270; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x3c
271; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x24
272; GFX10-NEXT:    v_mov_b32_e32 v2, 0
273; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
274; GFX10-NEXT:    v_alignbit_b32 v0, s1, s3, 1
275; GFX10-NEXT:    v_alignbit_b32 v3, s0, s2, 1
276; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
277; GFX10-NEXT:    s_not_b32 s2, s7
278; GFX10-NEXT:    s_lshr_b32 s0, s0, 1
279; GFX10-NEXT:    s_not_b32 s3, s6
280; GFX10-NEXT:    v_alignbit_b32 v1, s1, v0, s2
281; GFX10-NEXT:    v_alignbit_b32 v0, s0, v3, s3
282; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
283; GFX10-NEXT:    s_endpgm
284;
285; GFX11-LABEL: fshl_v2i32:
286; GFX11:       ; %bb.0: ; %entry
287; GFX11-NEXT:    s_clause 0x2
288; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
289; GFX11-NEXT:    s_load_b64 s[6:7], s[4:5], 0x3c
290; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
291; GFX11-NEXT:    v_mov_b32_e32 v2, 0
292; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
293; GFX11-NEXT:    v_alignbit_b32 v0, s1, s3, 1
294; GFX11-NEXT:    v_alignbit_b32 v3, s0, s2, 1
295; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
296; GFX11-NEXT:    s_not_b32 s2, s7
297; GFX11-NEXT:    s_lshr_b32 s0, s0, 1
298; GFX11-NEXT:    s_not_b32 s3, s6
299; GFX11-NEXT:    v_alignbit_b32 v1, s1, v0, s2
300; GFX11-NEXT:    v_alignbit_b32 v0, s0, v3, s3
301; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
302; GFX11-NEXT:    s_endpgm
303entry:
304  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %z)
305  store <2 x i32> %0, ptr addrspace(1) %in
306  ret void
307}
308
; <2 x i32> funnel shift left of %x and %y by the constant vector <7, 9>; the
; result is stored to %in.
; The expectations contain one alignbit (BIT_ALIGN_INT on r600) per element with
; immediates 25 and 23, i.e. 32 - 7 and 32 - 9.
309define amdgpu_kernel void @fshl_v2i32_imm(ptr addrspace(1) %in, <2 x i32> %x, <2 x i32> %y) {
310; SI-LABEL: fshl_v2i32_imm:
311; SI:       ; %bb.0: ; %entry
312; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
313; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
314; SI-NEXT:    s_mov_b32 s7, 0xf000
315; SI-NEXT:    s_mov_b32 s6, -1
316; SI-NEXT:    s_waitcnt lgkmcnt(0)
317; SI-NEXT:    v_mov_b32_e32 v0, s3
318; SI-NEXT:    v_mov_b32_e32 v2, s2
319; SI-NEXT:    v_alignbit_b32 v1, s1, v0, 23
320; SI-NEXT:    v_alignbit_b32 v0, s0, v2, 25
321; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
322; SI-NEXT:    s_endpgm
323;
324; VI-LABEL: fshl_v2i32_imm:
325; VI:       ; %bb.0: ; %entry
326; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
327; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
328; VI-NEXT:    s_waitcnt lgkmcnt(0)
329; VI-NEXT:    v_mov_b32_e32 v0, s3
330; VI-NEXT:    v_mov_b32_e32 v2, s2
331; VI-NEXT:    v_alignbit_b32 v1, s1, v0, 23
332; VI-NEXT:    v_alignbit_b32 v0, s0, v2, 25
333; VI-NEXT:    v_mov_b32_e32 v2, s4
334; VI-NEXT:    v_mov_b32_e32 v3, s5
335; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
336; VI-NEXT:    s_endpgm
337;
338; GFX9-LABEL: fshl_v2i32_imm:
339; GFX9:       ; %bb.0: ; %entry
340; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
341; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
342; GFX9-NEXT:    v_mov_b32_e32 v2, 0
343; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
344; GFX9-NEXT:    v_mov_b32_e32 v0, s3
345; GFX9-NEXT:    v_mov_b32_e32 v3, s2
346; GFX9-NEXT:    v_alignbit_b32 v1, s1, v0, 23
347; GFX9-NEXT:    v_alignbit_b32 v0, s0, v3, 25
348; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
349; GFX9-NEXT:    s_endpgm
350;
351; R600-LABEL: fshl_v2i32_imm:
352; R600:       ; %bb.0: ; %entry
353; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
354; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
355; R600-NEXT:    CF_END
356; R600-NEXT:    PAD
357; R600-NEXT:    ALU clause starting at 4:
358; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].X, KC0[3].Z, literal.x,
359; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
360; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[2].W, KC0[3].Y, literal.x,
361; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
362; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
363; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
364;
365; GFX10-LABEL: fshl_v2i32_imm:
366; GFX10:       ; %bb.0: ; %entry
367; GFX10-NEXT:    s_clause 0x1
368; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
369; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
370; GFX10-NEXT:    v_mov_b32_e32 v2, 0
371; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
372; GFX10-NEXT:    v_alignbit_b32 v1, s1, s3, 23
373; GFX10-NEXT:    v_alignbit_b32 v0, s0, s2, 25
374; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
375; GFX10-NEXT:    s_endpgm
376;
377; GFX11-LABEL: fshl_v2i32_imm:
378; GFX11:       ; %bb.0: ; %entry
379; GFX11-NEXT:    s_clause 0x1
380; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
381; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
382; GFX11-NEXT:    v_mov_b32_e32 v2, 0
383; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
384; GFX11-NEXT:    v_alignbit_b32 v1, s1, s3, 23
385; GFX11-NEXT:    v_alignbit_b32 v0, s0, s2, 25
386; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
387; GFX11-NEXT:    s_endpgm
388entry:
389  %0 = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> <i32 7, i32 9>)
390  store <2 x i32> %0, ptr addrspace(1) %in
391  ret void
392}
393
; <4 x i32> funnel shift left with all three vector operands (%x, %y, %z) passed
; as kernel arguments; the four-element result is stored to %in.
; The expectations repeat the scalar variable-amount shape four times (alignbit
; by 1, shift right by 1, NOT, second alignbit) and finish with one 128-bit
; store.
394define amdgpu_kernel void @fshl_v4i32(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
395; SI-LABEL: fshl_v4i32:
396; SI:       ; %bb.0: ; %entry
397; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
398; SI-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x15
399; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
400; SI-NEXT:    s_mov_b32 s3, 0xf000
401; SI-NEXT:    s_mov_b32 s2, -1
402; SI-NEXT:    s_waitcnt lgkmcnt(0)
403; SI-NEXT:    s_not_b32 s5, s19
404; SI-NEXT:    v_mov_b32_e32 v0, s15
405; SI-NEXT:    v_alignbit_b32 v0, s11, v0, 1
406; SI-NEXT:    s_lshr_b32 s4, s11, 1
407; SI-NEXT:    v_mov_b32_e32 v1, s5
408; SI-NEXT:    v_alignbit_b32 v3, s4, v0, v1
409; SI-NEXT:    v_mov_b32_e32 v0, s14
410; SI-NEXT:    s_not_b32 s5, s18
411; SI-NEXT:    v_alignbit_b32 v0, s10, v0, 1
412; SI-NEXT:    s_lshr_b32 s4, s10, 1
413; SI-NEXT:    v_mov_b32_e32 v1, s5
414; SI-NEXT:    v_alignbit_b32 v2, s4, v0, v1
415; SI-NEXT:    v_mov_b32_e32 v0, s13
416; SI-NEXT:    s_not_b32 s5, s17
417; SI-NEXT:    v_alignbit_b32 v0, s9, v0, 1
418; SI-NEXT:    s_lshr_b32 s4, s9, 1
419; SI-NEXT:    v_mov_b32_e32 v1, s5
420; SI-NEXT:    v_alignbit_b32 v1, s4, v0, v1
421; SI-NEXT:    v_mov_b32_e32 v0, s12
422; SI-NEXT:    s_not_b32 s5, s16
423; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
424; SI-NEXT:    s_lshr_b32 s4, s8, 1
425; SI-NEXT:    v_mov_b32_e32 v4, s5
426; SI-NEXT:    v_alignbit_b32 v0, s4, v0, v4
427; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
428; SI-NEXT:    s_endpgm
429;
430; VI-LABEL: fshl_v4i32:
431; VI:       ; %bb.0: ; %entry
432; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
433; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
434; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
435; VI-NEXT:    s_waitcnt lgkmcnt(0)
436; VI-NEXT:    v_mov_b32_e32 v0, s15
437; VI-NEXT:    s_not_b32 s3, s3
438; VI-NEXT:    s_lshr_b32 s6, s11, 1
439; VI-NEXT:    v_alignbit_b32 v0, s11, v0, 1
440; VI-NEXT:    v_mov_b32_e32 v1, s3
441; VI-NEXT:    v_alignbit_b32 v3, s6, v0, v1
442; VI-NEXT:    v_mov_b32_e32 v0, s14
443; VI-NEXT:    s_not_b32 s2, s2
444; VI-NEXT:    v_alignbit_b32 v0, s10, v0, 1
445; VI-NEXT:    s_lshr_b32 s3, s10, 1
446; VI-NEXT:    v_mov_b32_e32 v1, s2
447; VI-NEXT:    v_alignbit_b32 v2, s3, v0, v1
448; VI-NEXT:    v_mov_b32_e32 v0, s13
449; VI-NEXT:    s_not_b32 s1, s1
450; VI-NEXT:    v_alignbit_b32 v0, s9, v0, 1
451; VI-NEXT:    s_lshr_b32 s2, s9, 1
452; VI-NEXT:    v_mov_b32_e32 v1, s1
453; VI-NEXT:    v_alignbit_b32 v1, s2, v0, v1
454; VI-NEXT:    v_mov_b32_e32 v0, s12
455; VI-NEXT:    s_not_b32 s0, s0
456; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 1
457; VI-NEXT:    s_lshr_b32 s1, s8, 1
458; VI-NEXT:    v_mov_b32_e32 v4, s0
459; VI-NEXT:    v_alignbit_b32 v0, s1, v0, v4
460; VI-NEXT:    v_mov_b32_e32 v4, s4
461; VI-NEXT:    v_mov_b32_e32 v5, s5
462; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
463; VI-NEXT:    s_endpgm
464;
465; GFX9-LABEL: fshl_v4i32:
466; GFX9:       ; %bb.0: ; %entry
467; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
468; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
469; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
470; GFX9-NEXT:    v_mov_b32_e32 v4, 0
471; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
472; GFX9-NEXT:    s_not_b32 s3, s3
473; GFX9-NEXT:    v_mov_b32_e32 v0, s15
474; GFX9-NEXT:    s_lshr_b32 s4, s11, 1
475; GFX9-NEXT:    v_alignbit_b32 v0, s11, v0, 1
476; GFX9-NEXT:    v_mov_b32_e32 v1, s3
477; GFX9-NEXT:    v_alignbit_b32 v3, s4, v0, v1
478; GFX9-NEXT:    v_mov_b32_e32 v0, s14
479; GFX9-NEXT:    s_not_b32 s2, s2
480; GFX9-NEXT:    v_alignbit_b32 v0, s10, v0, 1
481; GFX9-NEXT:    s_lshr_b32 s3, s10, 1
482; GFX9-NEXT:    v_mov_b32_e32 v1, s2
483; GFX9-NEXT:    v_alignbit_b32 v2, s3, v0, v1
484; GFX9-NEXT:    v_mov_b32_e32 v0, s13
485; GFX9-NEXT:    s_not_b32 s1, s1
486; GFX9-NEXT:    v_alignbit_b32 v0, s9, v0, 1
487; GFX9-NEXT:    s_lshr_b32 s2, s9, 1
488; GFX9-NEXT:    v_mov_b32_e32 v1, s1
489; GFX9-NEXT:    v_alignbit_b32 v1, s2, v0, v1
490; GFX9-NEXT:    v_mov_b32_e32 v0, s12
491; GFX9-NEXT:    s_not_b32 s0, s0
492; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 1
493; GFX9-NEXT:    s_lshr_b32 s1, s8, 1
494; GFX9-NEXT:    v_mov_b32_e32 v5, s0
495; GFX9-NEXT:    v_alignbit_b32 v0, s1, v0, v5
496; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
497; GFX9-NEXT:    s_endpgm
498;
499; R600-LABEL: fshl_v4i32:
500; R600:       ; %bb.0: ; %entry
501; R600-NEXT:    ALU 17, @4, KC0[CB0:0-32], KC1[]
502; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
503; R600-NEXT:    CF_END
504; R600-NEXT:    PAD
505; R600-NEXT:    ALU clause starting at 4:
506; R600-NEXT:     LSHR T0.Z, KC0[4].X, 1,
507; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, 1,
508; R600-NEXT:     NOT_INT * T1.W, KC0[6].X,
509; R600-NEXT:     LSHR T0.Y, KC0[3].W, 1,
510; R600-NEXT:     BIT_ALIGN_INT T1.Z, KC0[3].W, KC0[4].W, 1,
511; R600-NEXT:     BIT_ALIGN_INT * T0.W, T0.Z, T0.W, PV.W,
512; R600-NEXT:     NOT_INT * T1.W, KC0[5].W,
513; R600-NEXT:     LSHR T1.Y, KC0[3].Z, 1,
514; R600-NEXT:     BIT_ALIGN_INT T0.Z, T0.Y, T1.Z, PV.W,
515; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Z, KC0[4].Z, 1,
516; R600-NEXT:     NOT_INT * T2.W, KC0[5].Z,
517; R600-NEXT:     BIT_ALIGN_INT T0.Y, T1.Y, T1.W, PV.W,
518; R600-NEXT:     LSHR T1.Z, KC0[3].Y, 1,
519; R600-NEXT:     BIT_ALIGN_INT * T1.W, KC0[3].Y, KC0[4].Y, 1,
520; R600-NEXT:     NOT_INT * T2.W, KC0[5].Y,
521; R600-NEXT:     BIT_ALIGN_INT T0.X, T1.Z, T1.W, PV.W,
522; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
523; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
524;
525; GFX10-LABEL: fshl_v4i32:
526; GFX10:       ; %bb.0: ; %entry
527; GFX10-NEXT:    s_clause 0x2
528; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
529; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x54
530; GFX10-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
531; GFX10-NEXT:    v_mov_b32_e32 v4, 0
532; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
533; GFX10-NEXT:    v_alignbit_b32 v0, s11, s15, 1
534; GFX10-NEXT:    v_alignbit_b32 v1, s10, s14, 1
535; GFX10-NEXT:    v_alignbit_b32 v5, s9, s13, 1
536; GFX10-NEXT:    v_alignbit_b32 v6, s8, s12, 1
537; GFX10-NEXT:    s_lshr_b32 s4, s11, 1
538; GFX10-NEXT:    s_not_b32 s3, s3
539; GFX10-NEXT:    s_lshr_b32 s5, s10, 1
540; GFX10-NEXT:    s_not_b32 s2, s2
541; GFX10-NEXT:    s_lshr_b32 s9, s9, 1
542; GFX10-NEXT:    s_not_b32 s1, s1
543; GFX10-NEXT:    s_lshr_b32 s8, s8, 1
544; GFX10-NEXT:    s_not_b32 s0, s0
545; GFX10-NEXT:    v_alignbit_b32 v3, s4, v0, s3
546; GFX10-NEXT:    v_alignbit_b32 v2, s5, v1, s2
547; GFX10-NEXT:    v_alignbit_b32 v1, s9, v5, s1
548; GFX10-NEXT:    v_alignbit_b32 v0, s8, v6, s0
549; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
550; GFX10-NEXT:    s_endpgm
551;
552; GFX11-LABEL: fshl_v4i32:
553; GFX11:       ; %bb.0: ; %entry
554; GFX11-NEXT:    s_clause 0x2
555; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
556; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x54
557; GFX11-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
558; GFX11-NEXT:    v_mov_b32_e32 v4, 0
559; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
560; GFX11-NEXT:    v_alignbit_b32 v0, s11, s15, 1
561; GFX11-NEXT:    v_alignbit_b32 v1, s10, s14, 1
562; GFX11-NEXT:    v_alignbit_b32 v5, s9, s13, 1
563; GFX11-NEXT:    v_alignbit_b32 v6, s8, s12, 1
564; GFX11-NEXT:    s_lshr_b32 s6, s11, 1
565; GFX11-NEXT:    s_not_b32 s3, s3
566; GFX11-NEXT:    s_lshr_b32 s7, s10, 1
567; GFX11-NEXT:    s_not_b32 s2, s2
568; GFX11-NEXT:    s_lshr_b32 s9, s9, 1
569; GFX11-NEXT:    s_not_b32 s1, s1
570; GFX11-NEXT:    s_lshr_b32 s8, s8, 1
571; GFX11-NEXT:    s_not_b32 s0, s0
572; GFX11-NEXT:    v_alignbit_b32 v3, s6, v0, s3
573; GFX11-NEXT:    v_alignbit_b32 v2, s7, v1, s2
574; GFX11-NEXT:    v_alignbit_b32 v1, s9, v5, s1
575; GFX11-NEXT:    v_alignbit_b32 v0, s8, v6, s0
576; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
577; GFX11-NEXT:    s_endpgm
578entry:
579  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z)
580  store <4 x i32> %0, ptr addrspace(1) %in
581  ret void
582}
583
; <4 x i32> funnel shift left of %x and %y by the constant vector
; <1, 7, 9, 33>; the result is stored to %in.
; The expectations contain one alignbit (BIT_ALIGN_INT on r600) per element with
; immediates 31, 25, 23 and 31 for elements 0..3. The last amount, 33, exceeds
; the 32-bit element width and is expected to produce the same immediate as the
; amount 1, which matches the shift amount being taken modulo the element width.
584define amdgpu_kernel void @fshl_v4i32_imm(ptr addrspace(1) %in, <4 x i32> %x, <4 x i32> %y) {
585; SI-LABEL: fshl_v4i32_imm:
586; SI:       ; %bb.0: ; %entry
587; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0xd
588; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
589; SI-NEXT:    s_mov_b32 s3, 0xf000
590; SI-NEXT:    s_mov_b32 s2, -1
591; SI-NEXT:    s_waitcnt lgkmcnt(0)
592; SI-NEXT:    v_mov_b32_e32 v0, s15
593; SI-NEXT:    v_mov_b32_e32 v1, s14
594; SI-NEXT:    v_alignbit_b32 v3, s11, v0, 31
595; SI-NEXT:    v_mov_b32_e32 v0, s13
596; SI-NEXT:    v_alignbit_b32 v2, s10, v1, 23
597; SI-NEXT:    v_alignbit_b32 v1, s9, v0, 25
598; SI-NEXT:    v_mov_b32_e32 v0, s12
599; SI-NEXT:    v_alignbit_b32 v0, s8, v0, 31
600; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
601; SI-NEXT:    s_endpgm
602;
603; VI-LABEL: fshl_v4i32_imm:
604; VI:       ; %bb.0: ; %entry
605; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
606; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
607; VI-NEXT:    s_waitcnt lgkmcnt(0)
608; VI-NEXT:    v_mov_b32_e32 v0, s15
609; VI-NEXT:    v_mov_b32_e32 v1, s14
610; VI-NEXT:    v_mov_b32_e32 v4, s13
611; VI-NEXT:    v_alignbit_b32 v3, s11, v0, 31
612; VI-NEXT:    v_alignbit_b32 v2, s10, v1, 23
613; VI-NEXT:    v_alignbit_b32 v1, s9, v4, 25
614; VI-NEXT:    v_mov_b32_e32 v0, s12
615; VI-NEXT:    v_mov_b32_e32 v5, s1
616; VI-NEXT:    v_alignbit_b32 v0, s8, v0, 31
617; VI-NEXT:    v_mov_b32_e32 v4, s0
618; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
619; VI-NEXT:    s_endpgm
620;
621; GFX9-LABEL: fshl_v4i32_imm:
622; GFX9:       ; %bb.0: ; %entry
623; GFX9-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
624; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
625; GFX9-NEXT:    v_mov_b32_e32 v4, 0
626; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
627; GFX9-NEXT:    v_mov_b32_e32 v0, s15
628; GFX9-NEXT:    v_mov_b32_e32 v1, s14
629; GFX9-NEXT:    v_alignbit_b32 v3, s11, v0, 31
630; GFX9-NEXT:    v_mov_b32_e32 v0, s13
631; GFX9-NEXT:    v_alignbit_b32 v2, s10, v1, 23
632; GFX9-NEXT:    v_alignbit_b32 v1, s9, v0, 25
633; GFX9-NEXT:    v_mov_b32_e32 v0, s12
634; GFX9-NEXT:    v_alignbit_b32 v0, s8, v0, 31
635; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
636; GFX9-NEXT:    s_endpgm
637;
638; R600-LABEL: fshl_v4i32_imm:
639; R600:       ; %bb.0: ; %entry
640; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
641; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
642; R600-NEXT:    CF_END
643; R600-NEXT:    PAD
644; R600-NEXT:    ALU clause starting at 4:
645; R600-NEXT:     BIT_ALIGN_INT * T0.W, KC0[4].X, KC0[5].X, literal.x,
646; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
647; R600-NEXT:     BIT_ALIGN_INT * T0.Z, KC0[3].W, KC0[4].W, literal.x,
648; R600-NEXT:    23(3.222986e-44), 0(0.000000e+00)
649; R600-NEXT:     BIT_ALIGN_INT * T0.Y, KC0[3].Z, KC0[4].Z, literal.x,
650; R600-NEXT:    25(3.503246e-44), 0(0.000000e+00)
651; R600-NEXT:     BIT_ALIGN_INT * T0.X, KC0[3].Y, KC0[4].Y, literal.x,
652; R600-NEXT:    31(4.344025e-44), 0(0.000000e+00)
653; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
654; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
655;
656; GFX10-LABEL: fshl_v4i32_imm:
657; GFX10:       ; %bb.0: ; %entry
658; GFX10-NEXT:    s_clause 0x1
659; GFX10-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
660; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
661; GFX10-NEXT:    v_mov_b32_e32 v4, 0
662; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
663; GFX10-NEXT:    v_alignbit_b32 v3, s11, s15, 31
664; GFX10-NEXT:    v_alignbit_b32 v2, s10, s14, 23
665; GFX10-NEXT:    v_alignbit_b32 v1, s9, s13, 25
666; GFX10-NEXT:    v_alignbit_b32 v0, s8, s12, 31
667; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
668; GFX10-NEXT:    s_endpgm
669;
670; GFX11-LABEL: fshl_v4i32_imm:
671; GFX11:       ; %bb.0: ; %entry
672; GFX11-NEXT:    s_clause 0x1
673; GFX11-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
674; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
675; GFX11-NEXT:    v_mov_b32_e32 v4, 0
676; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
677; GFX11-NEXT:    v_alignbit_b32 v3, s11, s15, 31
678; GFX11-NEXT:    v_alignbit_b32 v2, s10, s14, 23
679; GFX11-NEXT:    v_alignbit_b32 v1, s9, s13, 25
680; GFX11-NEXT:    v_alignbit_b32 v0, s8, s12, 31
681; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
682; GFX11-NEXT:    s_endpgm
683entry:
684  %0 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 7, i32 9, i32 33>)
685  store <4 x i32> %0, ptr addrspace(1) %in
686  ret void
687}
688
689; (a ^ b) | a --> a | b
; The kernel computes fshl(%a | %xor, %xor, 7) with %xor = (%a << 7) ^ %b,
; compares the result against zero and stores %a when it is zero, %b otherwise.
; None of the expected sequences contains an alignbit / BIT_ALIGN_INT or an xor.
; Each one shifts left by 7, ORs, tests the OR result against zero
; (s_cmp_eq_u32, or CNDE_INT on r600) and selects, so the funnel shift is
; expected to be folded into a plain shift-and-or test.
; NOTE(review): the fold in the line above appears to apply with its "a" being
; the shifted value %shl rather than the argument %a - confirm against the
; combine this test was added for.
690define amdgpu_kernel void @orxor2or1(ptr addrspace(1) %in, i32 %a, i32 %b) {
691; SI-LABEL: orxor2or1:
692; SI:       ; %bb.0:
693; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
694; SI-NEXT:    s_mov_b32 s7, 0xf000
695; SI-NEXT:    s_mov_b32 s6, -1
696; SI-NEXT:    s_waitcnt lgkmcnt(0)
697; SI-NEXT:    s_mov_b32 s4, s0
698; SI-NEXT:    s_lshl_b32 s0, s2, 7
699; SI-NEXT:    s_or_b32 s0, s3, s0
700; SI-NEXT:    s_cmp_eq_u32 s0, 0
701; SI-NEXT:    s_cselect_b32 s0, s2, s3
702; SI-NEXT:    s_mov_b32 s5, s1
703; SI-NEXT:    v_mov_b32_e32 v0, s0
704; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
705; SI-NEXT:    s_endpgm
706;
707; VI-LABEL: orxor2or1:
708; VI:       ; %bb.0:
709; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
710; VI-NEXT:    s_waitcnt lgkmcnt(0)
711; VI-NEXT:    s_lshl_b32 s4, s2, 7
712; VI-NEXT:    s_or_b32 s4, s3, s4
713; VI-NEXT:    s_cmp_eq_u32 s4, 0
714; VI-NEXT:    s_cselect_b32 s2, s2, s3
715; VI-NEXT:    v_mov_b32_e32 v0, s0
716; VI-NEXT:    v_mov_b32_e32 v1, s1
717; VI-NEXT:    v_mov_b32_e32 v2, s2
718; VI-NEXT:    flat_store_dword v[0:1], v2
719; VI-NEXT:    s_endpgm
720;
721; GFX9-LABEL: orxor2or1:
722; GFX9:       ; %bb.0:
723; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
724; GFX9-NEXT:    v_mov_b32_e32 v0, 0
725; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
726; GFX9-NEXT:    s_lshl_b32 s4, s2, 7
727; GFX9-NEXT:    s_or_b32 s4, s3, s4
728; GFX9-NEXT:    s_cmp_eq_u32 s4, 0
729; GFX9-NEXT:    s_cselect_b32 s2, s2, s3
730; GFX9-NEXT:    v_mov_b32_e32 v1, s2
731; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
732; GFX9-NEXT:    s_endpgm
733;
734; R600-LABEL: orxor2or1:
735; R600:       ; %bb.0:
736; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
737; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
738; R600-NEXT:    CF_END
739; R600-NEXT:    PAD
740; R600-NEXT:    ALU clause starting at 4:
741; R600-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
742; R600-NEXT:    7(9.809089e-45), 0(0.000000e+00)
743; R600-NEXT:     OR_INT * T0.W, KC0[2].W, PV.W,
744; R600-NEXT:     CNDE_INT T0.X, PV.W, KC0[2].Z, KC0[2].W,
745; R600-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
746; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
747;
748; GFX10-LABEL: orxor2or1:
749; GFX10:       ; %bb.0:
750; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
751; GFX10-NEXT:    v_mov_b32_e32 v0, 0
752; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX10-NEXT:    s_lshl_b32 s4, s2, 7
754; GFX10-NEXT:    s_or_b32 s4, s3, s4
755; GFX10-NEXT:    s_cmp_eq_u32 s4, 0
756; GFX10-NEXT:    s_cselect_b32 s2, s2, s3
757; GFX10-NEXT:    v_mov_b32_e32 v1, s2
758; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
759; GFX10-NEXT:    s_endpgm
760;
761; GFX11-LABEL: orxor2or1:
762; GFX11:       ; %bb.0:
763; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
764; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
765; GFX11-NEXT:    s_lshl_b32 s4, s2, 7
766; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
767; GFX11-NEXT:    s_or_b32 s4, s3, s4
768; GFX11-NEXT:    s_cmp_eq_u32 s4, 0
769; GFX11-NEXT:    s_cselect_b32 s2, s2, s3
770; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
771; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
772; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
773; GFX11-NEXT:    s_endpgm
774  %shl = shl i32 %a, 7
775  %xor = xor i32 %shl, %b
776  %or = or i32 %a, %xor
777  %fshl = call i32 @llvm.fshl.i32(i32 %or, i32 %xor, i32 7)
778  %cond = icmp eq i32 %fshl, 0
779  %r = select i1 %cond, i32 %a, i32 %b
780  store i32 %r, ptr addrspace(1) %in
781  ret void
782}
783