xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s

; ubfe with source, offset, and width all taken from kernel arguments
; (%src1 is reused as both the offset and the width operand).
define amdgpu_kernel void @bfe_u32_arg_arg_arg(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) #0 {
; SI-LABEL: bfe_u32_arg_arg_arg:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_bfe_u32 v0, v0, s3, s3
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_arg_arg_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    v_bfe_u32 v0, v0, s3, s3
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1)
  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
  ret void
}

; ubfe with argument source/offset and an immediate width of 123
; (the wide immediate is materialized in a VGPR).
define amdgpu_kernel void @bfe_u32_arg_arg_imm(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_arg_imm:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0x7b
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_bfe_u32 v0, s2, v1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_arg_arg_imm:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v1, 0x7b
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    v_bfe_u32 v0, s2, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123)
  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
  ret void
}

; ubfe with an immediate offset of 123 and argument source/width.
define amdgpu_kernel void @bfe_u32_arg_imm_arg(ptr addrspace(1) %out, i32 %src0, i32 %src2) #0 {
; SI-LABEL: bfe_u32_arg_imm_arg:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0x7b
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_bfe_u32 v0, s2, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_arg_imm_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    v_bfe_u32 v0, s2, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2)
  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
  ret void
}

; ubfe with an immediate source value of 123 and argument offset/width.
define amdgpu_kernel void @bfe_u32_imm_arg_arg(ptr addrspace(1) %out, i32 %src1, i32 %src2) #0 {
; SI-LABEL: bfe_u32_imm_arg_arg:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_movk_i32 s8, 0x7b
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    v_mov_b32_e32 v0, s2
; SI-NEXT:    v_mov_b32_e32 v1, s3
; SI-NEXT:    v_bfe_u32 v0, s8, v0, v1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_imm_arg_arg:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_movk_i32 s8, 0x7b
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    v_mov_b32_e32 v0, s2
; VI-NEXT:    v_mov_b32_e32 v1, s3
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    v_bfe_u32 v0, s8, v0, v1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2)
  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
  ret void
}

; Width 0 with a register offset: the extract folds to a constant 0 store.
define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_0_width_reg_offset:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_arg_0_width_reg_offset:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
  ret void
}

; Width 0 with an immediate offset: the extract folds to a constant 0 store.
define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(ptr addrspace(1) %out, i32 %src0, i32 %src1) #0 {
; SI-LABEL: bfe_u32_arg_0_width_imm_offset:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_arg_0_width_imm_offset:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
  ret void
}

; ubfe (offset 0, width 8) of a zext'd i8 load is redundant and folds away;
; only the zero-extending byte load and the store remain.
define amdgpu_kernel void @bfe_u32_zextload_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zextload_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_zextload_i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %load = load i8, ptr addrspace(1) %in
  %ext = zext i8 %load to i32
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; FIXME: Should be using s_add_i32
; ubfe (offset 0, width 8) of a value already masked to 8 bits folds to
; just the 0xff mask.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT:    v_and_b32_e32 v0, 0xff, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %load = load i32, ptr addrspace(1) %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 255
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; ubfe (offset 0, width 16) of a value already masked to 16 bits folds to
; just the 0xffff mask.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i16:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i16:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %load = load i32, ptr addrspace(1) %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 65535
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 16)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; 8-bit masked value extracted at offset 1: the 0xff mask is narrowed to
; 0xfe (bit 0 is never selected) and a bfe remains.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xfe, v0
; SI-NEXT:    v_bfe_u32 v0, v0, 1, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT:    v_and_b32_e32 v0, 0xfe, v0
; VI-NEXT:    v_bfe_u32 v0, v0, 1, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %load = load i32, ptr addrspace(1) %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 255
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 1, i32 8)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; 8-bit masked value extracted at offset 3: the mask is narrowed to 0xf8
; (low three bits are never selected) and a bfe remains.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT:    v_and_b32_e32 v0, 0xf8, v0
; SI-NEXT:    v_bfe_u32 v0, v0, 3, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT:    v_and_b32_e32 v0, 0xf8, v0
; VI-NEXT:    v_bfe_u32 v0, v0, 3, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %load = load i32, ptr addrspace(1) %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 255
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 3, i32 8)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; 8-bit masked value extracted at offset 7: only bit 7 can be selected, so
; the mask is narrowed to 0x80 and a bfe remains.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT:    v_and_b32_e32 v0, 0x80, v0
; SI-NEXT:    v_bfe_u32 v0, v0, 7, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i8_offset_7:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT:    v_and_b32_e32 v0, 0x80, v0
; VI-NEXT:    v_bfe_u32 v0, v0, 7, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %load = load i32, ptr addrspace(1) %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 255
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 7, i32 8)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; 16-bit masked value extracted at offset 8, width 8: the 0xffff mask is
; subsumed by the bfe, leaving a single extract.
define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
; SI-NEXT:    v_bfe_u32 v0, v0, 8, 8
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_zext_in_reg_i16_offset_8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
; VI-NEXT:    v_bfe_u32 v0, v0, 8, 8
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %load = load i32, ptr addrspace(1) %in, align 4
  %add = add i32 %load, 1
  %ext = and i32 %add, 65535
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 8, i32 8)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; Extracting bit 0 (offset 0, width 1) folds to an AND with 1.
define amdgpu_kernel void @bfe_u32_test_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_1:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_test_1:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 1, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %x = load i32, ptr addrspace(1) %in, align 4
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; (x << 31) has no bits in the extracted range [0, 8), so the result
; folds to a constant 0.
define amdgpu_kernel void @bfe_u32_test_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_test_2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %x = load i32, ptr addrspace(1) %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; (x << 31) has no bits in the extracted range [0, 1), so the result
; folds to a constant 0.
define amdgpu_kernel void @bfe_u32_test_3(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_3:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_test_3:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %x = load i32, ptr addrspace(1) %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; ((x << 31) >> 31) leaves only bit 0, so extracting bit 31 of it folds
; to a constant 0.
define amdgpu_kernel void @bfe_u32_test_4(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_4:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, 0
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_test_4:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    v_mov_b32_e32 v0, 0
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %x = load i32, ptr addrspace(1) %in, align 4
  %shl = shl i32 %x, 31
  %shr = lshr i32 %shl, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; ubfe (offset 0, width 1) of (ashr (shl x, 31), 31) combines into a single
; signed bitfield extract of bit 0.
define amdgpu_kernel void @bfe_u32_test_5(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_5:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_bfe_i32 v0, v0, 0, 1
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_test_5:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_bfe_i32 v0, v0, 0, 1
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %x = load i32, ptr addrspace(1) %in, align 4
  %shl = shl i32 %x, 31
  %shr = ashr i32 %shl, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 0, i32 1)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; ubfe (offset 1, width 31) of (x << 31) folds to a shift by 30 plus a mask
; (the 2.0 inline constant encodes 0x40000000).
define amdgpu_kernel void @bfe_u32_test_6(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_6:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 30, v0
; SI-NEXT:    v_and_b32_e32 v0, 2.0, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_test_6:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v0, 30, v0
; VI-NEXT:    v_and_b32_e32 v0, 2.0, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %x = load i32, ptr addrspace(1) %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; ubfe (offset 0, width 31) of (x << 31): the extract is folded away and
; only the shift remains.
define amdgpu_kernel void @bfe_u32_test_7(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_7:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_test_7:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshlrev_b32_e32 v0, 31, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %x = load i32, ptr addrspace(1) %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; Extracting bit 31 of (x << 31) is just bit 0 of x: the shift cancels and
; the whole expression folds to an AND with 1.
define amdgpu_kernel void @bfe_u32_test_8(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_8:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_and_b32_e32 v0, 1, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_test_8:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_and_b32_e32 v0, 1, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %x = load i32, ptr addrspace(1) %in, align 4
  %shl = shl i32 %x, 31
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; Extracting the top bit (offset 31, width 1) folds to a logical shift
; right by 31.
define amdgpu_kernel void @bfe_u32_test_9(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; SI-LABEL: bfe_u32_test_9:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s7, 0xf000
; SI-NEXT:    s_mov_b32 s6, -1
; SI-NEXT:    s_mov_b32 s10, s6
; SI-NEXT:    s_mov_b32 s11, s7
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_mov_b32 s8, s2
; SI-NEXT:    s_mov_b32 s9, s3
; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; SI-NEXT:    s_mov_b32 s4, s0
; SI-NEXT:    s_mov_b32 s5, s1
; SI-NEXT:    s_waitcnt vmcnt(0)
; SI-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: bfe_u32_test_9:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s7, 0xf000
; VI-NEXT:    s_mov_b32 s6, -1
; VI-NEXT:    s_mov_b32 s10, s6
; VI-NEXT:    s_mov_b32 s11, s7
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_mov_b32 s8, s2
; VI-NEXT:    s_mov_b32 s9, s3
; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
; VI-NEXT:    s_mov_b32 s4, s0
; VI-NEXT:    s_mov_b32 s5, s1
; VI-NEXT:    s_waitcnt vmcnt(0)
; VI-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT:    s_endpgm
  %x = load i32, ptr addrspace(1) %in, align 4
  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
}

; ubfe(x, 1, 31): the width reaches bit 31, so the extract is just
; x >> 1 (a single v_lshrrev_b32 by 1 in the checks).
852define amdgpu_kernel void @bfe_u32_test_10(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
853; SI-LABEL: bfe_u32_test_10:
854; SI:       ; %bb.0:
855; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
856; SI-NEXT:    s_mov_b32 s7, 0xf000
857; SI-NEXT:    s_mov_b32 s6, -1
858; SI-NEXT:    s_mov_b32 s10, s6
859; SI-NEXT:    s_mov_b32 s11, s7
860; SI-NEXT:    s_waitcnt lgkmcnt(0)
861; SI-NEXT:    s_mov_b32 s8, s2
862; SI-NEXT:    s_mov_b32 s9, s3
863; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
864; SI-NEXT:    s_mov_b32 s4, s0
865; SI-NEXT:    s_mov_b32 s5, s1
866; SI-NEXT:    s_waitcnt vmcnt(0)
867; SI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
868; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
869; SI-NEXT:    s_endpgm
870;
871; VI-LABEL: bfe_u32_test_10:
872; VI:       ; %bb.0:
873; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
874; VI-NEXT:    s_mov_b32 s7, 0xf000
875; VI-NEXT:    s_mov_b32 s6, -1
876; VI-NEXT:    s_mov_b32 s10, s6
877; VI-NEXT:    s_mov_b32 s11, s7
878; VI-NEXT:    s_waitcnt lgkmcnt(0)
879; VI-NEXT:    s_mov_b32 s8, s2
880; VI-NEXT:    s_mov_b32 s9, s3
881; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
882; VI-NEXT:    s_mov_b32 s4, s0
883; VI-NEXT:    s_mov_b32 s5, s1
884; VI-NEXT:    s_waitcnt vmcnt(0)
885; VI-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
886; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
887; VI-NEXT:    s_endpgm
888  %x = load i32, ptr addrspace(1) %in, align 4
889  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
890  store i32 %bfe, ptr addrspace(1) %out, align 4
891  ret void
892}
893
; ubfe(x, 8, 24): offset + width == 32, so the extract simplifies to
; a plain logical shift right by 8.
894define amdgpu_kernel void @bfe_u32_test_11(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
895; SI-LABEL: bfe_u32_test_11:
896; SI:       ; %bb.0:
897; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
898; SI-NEXT:    s_mov_b32 s7, 0xf000
899; SI-NEXT:    s_mov_b32 s6, -1
900; SI-NEXT:    s_mov_b32 s10, s6
901; SI-NEXT:    s_mov_b32 s11, s7
902; SI-NEXT:    s_waitcnt lgkmcnt(0)
903; SI-NEXT:    s_mov_b32 s8, s2
904; SI-NEXT:    s_mov_b32 s9, s3
905; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
906; SI-NEXT:    s_mov_b32 s4, s0
907; SI-NEXT:    s_mov_b32 s5, s1
908; SI-NEXT:    s_waitcnt vmcnt(0)
909; SI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
910; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
911; SI-NEXT:    s_endpgm
912;
913; VI-LABEL: bfe_u32_test_11:
914; VI:       ; %bb.0:
915; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
916; VI-NEXT:    s_mov_b32 s7, 0xf000
917; VI-NEXT:    s_mov_b32 s6, -1
918; VI-NEXT:    s_mov_b32 s10, s6
919; VI-NEXT:    s_mov_b32 s11, s7
920; VI-NEXT:    s_waitcnt lgkmcnt(0)
921; VI-NEXT:    s_mov_b32 s8, s2
922; VI-NEXT:    s_mov_b32 s9, s3
923; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
924; VI-NEXT:    s_mov_b32 s4, s0
925; VI-NEXT:    s_mov_b32 s5, s1
926; VI-NEXT:    s_waitcnt vmcnt(0)
927; VI-NEXT:    v_lshrrev_b32_e32 v0, 8, v0
928; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
929; VI-NEXT:    s_endpgm
930  %x = load i32, ptr addrspace(1) %in, align 4
931  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
932  store i32 %bfe, ptr addrspace(1) %out, align 4
933  ret void
934}
935
; ubfe(x, 24, 8): offset + width == 32, so the extract simplifies to
; a plain logical shift right by 24.
936define amdgpu_kernel void @bfe_u32_test_12(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
937; SI-LABEL: bfe_u32_test_12:
938; SI:       ; %bb.0:
939; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
940; SI-NEXT:    s_mov_b32 s7, 0xf000
941; SI-NEXT:    s_mov_b32 s6, -1
942; SI-NEXT:    s_mov_b32 s10, s6
943; SI-NEXT:    s_mov_b32 s11, s7
944; SI-NEXT:    s_waitcnt lgkmcnt(0)
945; SI-NEXT:    s_mov_b32 s8, s2
946; SI-NEXT:    s_mov_b32 s9, s3
947; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
948; SI-NEXT:    s_mov_b32 s4, s0
949; SI-NEXT:    s_mov_b32 s5, s1
950; SI-NEXT:    s_waitcnt vmcnt(0)
951; SI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
952; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
953; SI-NEXT:    s_endpgm
954;
955; VI-LABEL: bfe_u32_test_12:
956; VI:       ; %bb.0:
957; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
958; VI-NEXT:    s_mov_b32 s7, 0xf000
959; VI-NEXT:    s_mov_b32 s6, -1
960; VI-NEXT:    s_mov_b32 s10, s6
961; VI-NEXT:    s_mov_b32 s11, s7
962; VI-NEXT:    s_waitcnt lgkmcnt(0)
963; VI-NEXT:    s_mov_b32 s8, s2
964; VI-NEXT:    s_mov_b32 s9, s3
965; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
966; VI-NEXT:    s_mov_b32 s4, s0
967; VI-NEXT:    s_mov_b32 s5, s1
968; VI-NEXT:    s_waitcnt vmcnt(0)
969; VI-NEXT:    v_lshrrev_b32_e32 v0, 24, v0
970; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
971; VI-NEXT:    s_endpgm
972  %x = load i32, ptr addrspace(1) %in, align 4
973  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
974  store i32 %bfe, ptr addrspace(1) %out, align 4
975  ret void
976}
977
978; Stale hand-written check fragment (no such mnemonic as V_ASHRREV_U32):
978; the intent was to note that no arithmetic shift survives here — the
978; autogenerated checks below show the ashr+ubfe folding to v_lshrrev_b32 by 31.
; ubfe(ashr(x, 31), 31, 1) extracts bit 31 of the ashr result, which
; equals bit 31 of x, so the checks show only a logical shift right by 31.
979define amdgpu_kernel void @bfe_u32_test_13(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
980; SI-LABEL: bfe_u32_test_13:
981; SI:       ; %bb.0:
982; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
983; SI-NEXT:    s_mov_b32 s7, 0xf000
984; SI-NEXT:    s_mov_b32 s6, -1
985; SI-NEXT:    s_mov_b32 s10, s6
986; SI-NEXT:    s_mov_b32 s11, s7
987; SI-NEXT:    s_waitcnt lgkmcnt(0)
988; SI-NEXT:    s_mov_b32 s8, s2
989; SI-NEXT:    s_mov_b32 s9, s3
990; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
991; SI-NEXT:    s_mov_b32 s4, s0
992; SI-NEXT:    s_mov_b32 s5, s1
993; SI-NEXT:    s_waitcnt vmcnt(0)
994; SI-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
995; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
996; SI-NEXT:    s_endpgm
997;
998; VI-LABEL: bfe_u32_test_13:
999; VI:       ; %bb.0:
1000; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1001; VI-NEXT:    s_mov_b32 s7, 0xf000
1002; VI-NEXT:    s_mov_b32 s6, -1
1003; VI-NEXT:    s_mov_b32 s10, s6
1004; VI-NEXT:    s_mov_b32 s11, s7
1005; VI-NEXT:    s_waitcnt lgkmcnt(0)
1006; VI-NEXT:    s_mov_b32 s8, s2
1007; VI-NEXT:    s_mov_b32 s9, s3
1008; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
1009; VI-NEXT:    s_mov_b32 s4, s0
1010; VI-NEXT:    s_mov_b32 s5, s1
1011; VI-NEXT:    s_waitcnt vmcnt(0)
1012; VI-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
1013; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1014; VI-NEXT:    s_endpgm
1015  %x = load i32, ptr addrspace(1) %in, align 4
1016  %shl = ashr i32 %x, 31
1017  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
  ; Fix: 'store' and 'ret void' were fused onto one source line; split
  ; them (whitespace-only to the IR parser, matching the rest of the file).
1018  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
1019}
1020
; ubfe(lshr(x, 31), 31, 1): bit 31 of (x >> 31) is always 0, so the
; whole expression folds to the constant 0 stored in the checks.
1021define amdgpu_kernel void @bfe_u32_test_14(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
1022; SI-LABEL: bfe_u32_test_14:
1023; SI:       ; %bb.0:
1024; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1025; SI-NEXT:    s_waitcnt lgkmcnt(0)
1026; SI-NEXT:    s_mov_b32 s3, 0xf000
1027; SI-NEXT:    s_mov_b32 s2, -1
1028; SI-NEXT:    v_mov_b32_e32 v0, 0
1029; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1030; SI-NEXT:    s_endpgm
1031;
1032; VI-LABEL: bfe_u32_test_14:
1033; VI:       ; %bb.0:
1034; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1035; VI-NEXT:    s_waitcnt lgkmcnt(0)
1036; VI-NEXT:    s_mov_b32 s3, 0xf000
1037; VI-NEXT:    s_mov_b32 s2, -1
1038; VI-NEXT:    v_mov_b32_e32 v0, 0
1039; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1040; VI-NEXT:    s_endpgm
1041  %x = load i32, ptr addrspace(1) %in, align 4
1042  %shl = lshr i32 %x, 31
1043  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
  ; Fix: 'store' and 'ret void' were fused onto one source line; split
  ; them (whitespace-only to the IR parser, matching the rest of the file).
1044  store i32 %bfe, ptr addrspace(1) %out, align 4
  ret void
1045}
1046
; ubfe(0, 0, 0) constant-folds to 0; the kernel just stores an immediate.
1047define amdgpu_kernel void @bfe_u32_constant_fold_test_0(ptr addrspace(1) %out) #0 {
1048; SI-LABEL: bfe_u32_constant_fold_test_0:
1049; SI:       ; %bb.0:
1050; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1051; SI-NEXT:    s_mov_b32 s3, 0xf000
1052; SI-NEXT:    s_mov_b32 s2, -1
1053; SI-NEXT:    v_mov_b32_e32 v0, 0
1054; SI-NEXT:    s_waitcnt lgkmcnt(0)
1055; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1056; SI-NEXT:    s_endpgm
1057;
1058; VI-LABEL: bfe_u32_constant_fold_test_0:
1059; VI:       ; %bb.0:
1060; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1061; VI-NEXT:    s_mov_b32 s3, 0xf000
1062; VI-NEXT:    s_mov_b32 s2, -1
1063; VI-NEXT:    v_mov_b32_e32 v0, 0
1064; VI-NEXT:    s_waitcnt lgkmcnt(0)
1065; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1066; VI-NEXT:    s_endpgm
1067  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
1068  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1069  ret void
1070}
1071
; Width of 0 yields 0 regardless of the source value (12334 here).
1072define amdgpu_kernel void @bfe_u32_constant_fold_test_1(ptr addrspace(1) %out) #0 {
1073; SI-LABEL: bfe_u32_constant_fold_test_1:
1074; SI:       ; %bb.0:
1075; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1076; SI-NEXT:    s_mov_b32 s3, 0xf000
1077; SI-NEXT:    s_mov_b32 s2, -1
1078; SI-NEXT:    v_mov_b32_e32 v0, 0
1079; SI-NEXT:    s_waitcnt lgkmcnt(0)
1080; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1081; SI-NEXT:    s_endpgm
1082;
1083; VI-LABEL: bfe_u32_constant_fold_test_1:
1084; VI:       ; %bb.0:
1085; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1086; VI-NEXT:    s_mov_b32 s3, 0xf000
1087; VI-NEXT:    s_mov_b32 s2, -1
1088; VI-NEXT:    v_mov_b32_e32 v0, 0
1089; VI-NEXT:    s_waitcnt lgkmcnt(0)
1090; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1091; VI-NEXT:    s_endpgm
1092  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0)
1093  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1094  ret void
1095}
1096
; ubfe(0, 0, 1) constant-folds to 0.
1097define amdgpu_kernel void @bfe_u32_constant_fold_test_2(ptr addrspace(1) %out) #0 {
1098; SI-LABEL: bfe_u32_constant_fold_test_2:
1099; SI:       ; %bb.0:
1100; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1101; SI-NEXT:    s_mov_b32 s3, 0xf000
1102; SI-NEXT:    s_mov_b32 s2, -1
1103; SI-NEXT:    v_mov_b32_e32 v0, 0
1104; SI-NEXT:    s_waitcnt lgkmcnt(0)
1105; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1106; SI-NEXT:    s_endpgm
1107;
1108; VI-LABEL: bfe_u32_constant_fold_test_2:
1109; VI:       ; %bb.0:
1110; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1111; VI-NEXT:    s_mov_b32 s3, 0xf000
1112; VI-NEXT:    s_mov_b32 s2, -1
1113; VI-NEXT:    v_mov_b32_e32 v0, 0
1114; VI-NEXT:    s_waitcnt lgkmcnt(0)
1115; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1116; VI-NEXT:    s_endpgm
1117  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1)
1118  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1119  ret void
1120}
1121
; ubfe(1, 0, 1) constant-folds to 1.
1122define amdgpu_kernel void @bfe_u32_constant_fold_test_3(ptr addrspace(1) %out) #0 {
1123; SI-LABEL: bfe_u32_constant_fold_test_3:
1124; SI:       ; %bb.0:
1125; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1126; SI-NEXT:    s_mov_b32 s3, 0xf000
1127; SI-NEXT:    s_mov_b32 s2, -1
1128; SI-NEXT:    v_mov_b32_e32 v0, 1
1129; SI-NEXT:    s_waitcnt lgkmcnt(0)
1130; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1131; SI-NEXT:    s_endpgm
1132;
1133; VI-LABEL: bfe_u32_constant_fold_test_3:
1134; VI:       ; %bb.0:
1135; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1136; VI-NEXT:    s_mov_b32 s3, 0xf000
1137; VI-NEXT:    s_mov_b32 s2, -1
1138; VI-NEXT:    v_mov_b32_e32 v0, 1
1139; VI-NEXT:    s_waitcnt lgkmcnt(0)
1140; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1141; VI-NEXT:    s_endpgm
1142  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1)
1143  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1144  ret void
1145}
1146
; All-ones source with offset 0, width 1.
; NOTE(review): the generated checks store -1, not 1 — this reflects the
; target's constant-folding of ubfe for an all-ones source; confirm
; against AMDGPUISelLowering if the checks ever need regeneration.
1147define amdgpu_kernel void @bfe_u32_constant_fold_test_4(ptr addrspace(1) %out) #0 {
1148; SI-LABEL: bfe_u32_constant_fold_test_4:
1149; SI:       ; %bb.0:
1150; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1151; SI-NEXT:    s_mov_b32 s3, 0xf000
1152; SI-NEXT:    s_mov_b32 s2, -1
1153; SI-NEXT:    v_mov_b32_e32 v0, -1
1154; SI-NEXT:    s_waitcnt lgkmcnt(0)
1155; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1156; SI-NEXT:    s_endpgm
1157;
1158; VI-LABEL: bfe_u32_constant_fold_test_4:
1159; VI:       ; %bb.0:
1160; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1161; VI-NEXT:    s_mov_b32 s3, 0xf000
1162; VI-NEXT:    s_mov_b32 s2, -1
1163; VI-NEXT:    v_mov_b32_e32 v0, -1
1164; VI-NEXT:    s_waitcnt lgkmcnt(0)
1165; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1166; VI-NEXT:    s_endpgm
1167  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1)
1168  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1169  ret void
1170}
1171
; ubfe(128, 7, 1) constant-folds to 1 (bit 7 of 0x80).
1172define amdgpu_kernel void @bfe_u32_constant_fold_test_5(ptr addrspace(1) %out) #0 {
1173; SI-LABEL: bfe_u32_constant_fold_test_5:
1174; SI:       ; %bb.0:
1175; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1176; SI-NEXT:    s_mov_b32 s3, 0xf000
1177; SI-NEXT:    s_mov_b32 s2, -1
1178; SI-NEXT:    v_mov_b32_e32 v0, 1
1179; SI-NEXT:    s_waitcnt lgkmcnt(0)
1180; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1181; SI-NEXT:    s_endpgm
1182;
1183; VI-LABEL: bfe_u32_constant_fold_test_5:
1184; VI:       ; %bb.0:
1185; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1186; VI-NEXT:    s_mov_b32 s3, 0xf000
1187; VI-NEXT:    s_mov_b32 s2, -1
1188; VI-NEXT:    v_mov_b32_e32 v0, 1
1189; VI-NEXT:    s_waitcnt lgkmcnt(0)
1190; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1191; VI-NEXT:    s_endpgm
1192  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1)
1193  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1194  ret void
1195}
1196
; ubfe(128, 0, 8) constant-folds to 128 (0x80).
1197define amdgpu_kernel void @bfe_u32_constant_fold_test_6(ptr addrspace(1) %out) #0 {
1198; SI-LABEL: bfe_u32_constant_fold_test_6:
1199; SI:       ; %bb.0:
1200; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1201; SI-NEXT:    s_mov_b32 s3, 0xf000
1202; SI-NEXT:    s_mov_b32 s2, -1
1203; SI-NEXT:    v_mov_b32_e32 v0, 0x80
1204; SI-NEXT:    s_waitcnt lgkmcnt(0)
1205; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1206; SI-NEXT:    s_endpgm
1207;
1208; VI-LABEL: bfe_u32_constant_fold_test_6:
1209; VI:       ; %bb.0:
1210; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1211; VI-NEXT:    s_mov_b32 s3, 0xf000
1212; VI-NEXT:    s_mov_b32 s2, -1
1213; VI-NEXT:    v_mov_b32_e32 v0, 0x80
1214; VI-NEXT:    s_waitcnt lgkmcnt(0)
1215; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1216; VI-NEXT:    s_endpgm
1217  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8)
1218  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1219  ret void
1220}
1221
; ubfe(127, 0, 8) constant-folds to 127 (0x7f).
1222define amdgpu_kernel void @bfe_u32_constant_fold_test_7(ptr addrspace(1) %out) #0 {
1223; SI-LABEL: bfe_u32_constant_fold_test_7:
1224; SI:       ; %bb.0:
1225; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1226; SI-NEXT:    s_mov_b32 s3, 0xf000
1227; SI-NEXT:    s_mov_b32 s2, -1
1228; SI-NEXT:    v_mov_b32_e32 v0, 0x7f
1229; SI-NEXT:    s_waitcnt lgkmcnt(0)
1230; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1231; SI-NEXT:    s_endpgm
1232;
1233; VI-LABEL: bfe_u32_constant_fold_test_7:
1234; VI:       ; %bb.0:
1235; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1236; VI-NEXT:    s_mov_b32 s3, 0xf000
1237; VI-NEXT:    s_mov_b32 s2, -1
1238; VI-NEXT:    v_mov_b32_e32 v0, 0x7f
1239; VI-NEXT:    s_waitcnt lgkmcnt(0)
1240; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1241; VI-NEXT:    s_endpgm
1242  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8)
1243  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1244  ret void
1245}
1246
; ubfe(127, 6, 8) constant-folds to 1 (127 >> 6, masked to 8 bits).
1247define amdgpu_kernel void @bfe_u32_constant_fold_test_8(ptr addrspace(1) %out) #0 {
1248; SI-LABEL: bfe_u32_constant_fold_test_8:
1249; SI:       ; %bb.0:
1250; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1251; SI-NEXT:    s_mov_b32 s3, 0xf000
1252; SI-NEXT:    s_mov_b32 s2, -1
1253; SI-NEXT:    v_mov_b32_e32 v0, 1
1254; SI-NEXT:    s_waitcnt lgkmcnt(0)
1255; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1256; SI-NEXT:    s_endpgm
1257;
1258; VI-LABEL: bfe_u32_constant_fold_test_8:
1259; VI:       ; %bb.0:
1260; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1261; VI-NEXT:    s_mov_b32 s3, 0xf000
1262; VI-NEXT:    s_mov_b32 s2, -1
1263; VI-NEXT:    v_mov_b32_e32 v0, 1
1264; VI-NEXT:    s_waitcnt lgkmcnt(0)
1265; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1266; VI-NEXT:    s_endpgm
1267  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8)
1268  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1269  ret void
1270}
1271
; ubfe(65536, 16, 8) constant-folds to 1 (bit 16 of 0x10000).
1272define amdgpu_kernel void @bfe_u32_constant_fold_test_9(ptr addrspace(1) %out) #0 {
1273; SI-LABEL: bfe_u32_constant_fold_test_9:
1274; SI:       ; %bb.0:
1275; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1276; SI-NEXT:    s_mov_b32 s3, 0xf000
1277; SI-NEXT:    s_mov_b32 s2, -1
1278; SI-NEXT:    v_mov_b32_e32 v0, 1
1279; SI-NEXT:    s_waitcnt lgkmcnt(0)
1280; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1281; SI-NEXT:    s_endpgm
1282;
1283; VI-LABEL: bfe_u32_constant_fold_test_9:
1284; VI:       ; %bb.0:
1285; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1286; VI-NEXT:    s_mov_b32 s3, 0xf000
1287; VI-NEXT:    s_mov_b32 s2, -1
1288; VI-NEXT:    v_mov_b32_e32 v0, 1
1289; VI-NEXT:    s_waitcnt lgkmcnt(0)
1290; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1291; VI-NEXT:    s_endpgm
1292  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8)
1293  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1294  ret void
1295}
1296
; ubfe(65535, 16, 16) constant-folds to 0 (no bits above bit 15 set).
1297define amdgpu_kernel void @bfe_u32_constant_fold_test_10(ptr addrspace(1) %out) #0 {
1298; SI-LABEL: bfe_u32_constant_fold_test_10:
1299; SI:       ; %bb.0:
1300; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1301; SI-NEXT:    s_mov_b32 s3, 0xf000
1302; SI-NEXT:    s_mov_b32 s2, -1
1303; SI-NEXT:    v_mov_b32_e32 v0, 0
1304; SI-NEXT:    s_waitcnt lgkmcnt(0)
1305; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1306; SI-NEXT:    s_endpgm
1307;
1308; VI-LABEL: bfe_u32_constant_fold_test_10:
1309; VI:       ; %bb.0:
1310; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1311; VI-NEXT:    s_mov_b32 s3, 0xf000
1312; VI-NEXT:    s_mov_b32 s2, -1
1313; VI-NEXT:    v_mov_b32_e32 v0, 0
1314; VI-NEXT:    s_waitcnt lgkmcnt(0)
1315; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1316; VI-NEXT:    s_endpgm
1317  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16)
1318  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1319  ret void
1320}
1321
; ubfe(160, 4, 4) constant-folds to 10 ((0xa0 >> 4) & 0xf).
1322define amdgpu_kernel void @bfe_u32_constant_fold_test_11(ptr addrspace(1) %out) #0 {
1323; SI-LABEL: bfe_u32_constant_fold_test_11:
1324; SI:       ; %bb.0:
1325; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1326; SI-NEXT:    s_mov_b32 s3, 0xf000
1327; SI-NEXT:    s_mov_b32 s2, -1
1328; SI-NEXT:    v_mov_b32_e32 v0, 10
1329; SI-NEXT:    s_waitcnt lgkmcnt(0)
1330; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1331; SI-NEXT:    s_endpgm
1332;
1333; VI-LABEL: bfe_u32_constant_fold_test_11:
1334; VI:       ; %bb.0:
1335; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1336; VI-NEXT:    s_mov_b32 s3, 0xf000
1337; VI-NEXT:    s_mov_b32 s2, -1
1338; VI-NEXT:    v_mov_b32_e32 v0, 10
1339; VI-NEXT:    s_waitcnt lgkmcnt(0)
1340; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1341; VI-NEXT:    s_endpgm
1342  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4)
1343  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1344  ret void
1345}
1346
; ubfe(160, 31, 1) constant-folds to 0 (bit 31 of 0xa0 is clear).
1347define amdgpu_kernel void @bfe_u32_constant_fold_test_12(ptr addrspace(1) %out) #0 {
1348; SI-LABEL: bfe_u32_constant_fold_test_12:
1349; SI:       ; %bb.0:
1350; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1351; SI-NEXT:    s_mov_b32 s3, 0xf000
1352; SI-NEXT:    s_mov_b32 s2, -1
1353; SI-NEXT:    v_mov_b32_e32 v0, 0
1354; SI-NEXT:    s_waitcnt lgkmcnt(0)
1355; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1356; SI-NEXT:    s_endpgm
1357;
1358; VI-LABEL: bfe_u32_constant_fold_test_12:
1359; VI:       ; %bb.0:
1360; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1361; VI-NEXT:    s_mov_b32 s3, 0xf000
1362; VI-NEXT:    s_mov_b32 s2, -1
1363; VI-NEXT:    v_mov_b32_e32 v0, 0
1364; VI-NEXT:    s_waitcnt lgkmcnt(0)
1365; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1366; VI-NEXT:    s_endpgm
1367  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1)
1368  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1369  ret void
1370}
1371
; ubfe(131070, 16, 16) constant-folds to 1 (0x1fffe >> 16).
1372define amdgpu_kernel void @bfe_u32_constant_fold_test_13(ptr addrspace(1) %out) #0 {
1373; SI-LABEL: bfe_u32_constant_fold_test_13:
1374; SI:       ; %bb.0:
1375; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1376; SI-NEXT:    s_mov_b32 s3, 0xf000
1377; SI-NEXT:    s_mov_b32 s2, -1
1378; SI-NEXT:    v_mov_b32_e32 v0, 1
1379; SI-NEXT:    s_waitcnt lgkmcnt(0)
1380; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1381; SI-NEXT:    s_endpgm
1382;
1383; VI-LABEL: bfe_u32_constant_fold_test_13:
1384; VI:       ; %bb.0:
1385; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1386; VI-NEXT:    s_mov_b32 s3, 0xf000
1387; VI-NEXT:    s_mov_b32 s2, -1
1388; VI-NEXT:    v_mov_b32_e32 v0, 1
1389; VI-NEXT:    s_waitcnt lgkmcnt(0)
1390; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1391; VI-NEXT:    s_endpgm
1392  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16)
1393  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1394  ret void
1395}
1396
; ubfe(160, 2, 30) constant-folds to 40 (0xa0 >> 2).
1397define amdgpu_kernel void @bfe_u32_constant_fold_test_14(ptr addrspace(1) %out) #0 {
1398; SI-LABEL: bfe_u32_constant_fold_test_14:
1399; SI:       ; %bb.0:
1400; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1401; SI-NEXT:    s_mov_b32 s3, 0xf000
1402; SI-NEXT:    s_mov_b32 s2, -1
1403; SI-NEXT:    v_mov_b32_e32 v0, 40
1404; SI-NEXT:    s_waitcnt lgkmcnt(0)
1405; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1406; SI-NEXT:    s_endpgm
1407;
1408; VI-LABEL: bfe_u32_constant_fold_test_14:
1409; VI:       ; %bb.0:
1410; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1411; VI-NEXT:    s_mov_b32 s3, 0xf000
1412; VI-NEXT:    s_mov_b32 s2, -1
1413; VI-NEXT:    v_mov_b32_e32 v0, 40
1414; VI-NEXT:    s_waitcnt lgkmcnt(0)
1415; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1416; VI-NEXT:    s_endpgm
1417  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30)
1418  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1419  ret void
1420}
1421
; ubfe(160, 4, 28) constant-folds to 10 (0xa0 >> 4).
1422define amdgpu_kernel void @bfe_u32_constant_fold_test_15(ptr addrspace(1) %out) #0 {
1423; SI-LABEL: bfe_u32_constant_fold_test_15:
1424; SI:       ; %bb.0:
1425; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1426; SI-NEXT:    s_mov_b32 s3, 0xf000
1427; SI-NEXT:    s_mov_b32 s2, -1
1428; SI-NEXT:    v_mov_b32_e32 v0, 10
1429; SI-NEXT:    s_waitcnt lgkmcnt(0)
1430; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1431; SI-NEXT:    s_endpgm
1432;
1433; VI-LABEL: bfe_u32_constant_fold_test_15:
1434; VI:       ; %bb.0:
1435; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1436; VI-NEXT:    s_mov_b32 s3, 0xf000
1437; VI-NEXT:    s_mov_b32 s2, -1
1438; VI-NEXT:    v_mov_b32_e32 v0, 10
1439; VI-NEXT:    s_waitcnt lgkmcnt(0)
1440; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1441; VI-NEXT:    s_endpgm
1442  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28)
1443  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1444  ret void
1445}
1446
; ubfe(0xffffffff, 1, 7) constant-folds to 0x7f (7 ones).
1447define amdgpu_kernel void @bfe_u32_constant_fold_test_16(ptr addrspace(1) %out) #0 {
1448; SI-LABEL: bfe_u32_constant_fold_test_16:
1449; SI:       ; %bb.0:
1450; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1451; SI-NEXT:    s_mov_b32 s3, 0xf000
1452; SI-NEXT:    s_mov_b32 s2, -1
1453; SI-NEXT:    v_mov_b32_e32 v0, 0x7f
1454; SI-NEXT:    s_waitcnt lgkmcnt(0)
1455; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1456; SI-NEXT:    s_endpgm
1457;
1458; VI-LABEL: bfe_u32_constant_fold_test_16:
1459; VI:       ; %bb.0:
1460; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1461; VI-NEXT:    s_mov_b32 s3, 0xf000
1462; VI-NEXT:    s_mov_b32 s2, -1
1463; VI-NEXT:    v_mov_b32_e32 v0, 0x7f
1464; VI-NEXT:    s_waitcnt lgkmcnt(0)
1465; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1466; VI-NEXT:    s_endpgm
1467  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7)
1468  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1469  ret void
1470}
1471
; ubfe(255, 1, 31) constant-folds to 0x7f (0xff >> 1).
1472define amdgpu_kernel void @bfe_u32_constant_fold_test_17(ptr addrspace(1) %out) #0 {
1473; SI-LABEL: bfe_u32_constant_fold_test_17:
1474; SI:       ; %bb.0:
1475; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1476; SI-NEXT:    s_mov_b32 s3, 0xf000
1477; SI-NEXT:    s_mov_b32 s2, -1
1478; SI-NEXT:    v_mov_b32_e32 v0, 0x7f
1479; SI-NEXT:    s_waitcnt lgkmcnt(0)
1480; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1481; SI-NEXT:    s_endpgm
1482;
1483; VI-LABEL: bfe_u32_constant_fold_test_17:
1484; VI:       ; %bb.0:
1485; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1486; VI-NEXT:    s_mov_b32 s3, 0xf000
1487; VI-NEXT:    s_mov_b32 s2, -1
1488; VI-NEXT:    v_mov_b32_e32 v0, 0x7f
1489; VI-NEXT:    s_waitcnt lgkmcnt(0)
1490; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1491; VI-NEXT:    s_endpgm
1492  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31)
1493  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1494  ret void
1495}
1496
; ubfe(255, 31, 1) constant-folds to 0 (bit 31 of 0xff is clear).
1497define amdgpu_kernel void @bfe_u32_constant_fold_test_18(ptr addrspace(1) %out) #0 {
1498; SI-LABEL: bfe_u32_constant_fold_test_18:
1499; SI:       ; %bb.0:
1500; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1501; SI-NEXT:    s_mov_b32 s3, 0xf000
1502; SI-NEXT:    s_mov_b32 s2, -1
1503; SI-NEXT:    v_mov_b32_e32 v0, 0
1504; SI-NEXT:    s_waitcnt lgkmcnt(0)
1505; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1506; SI-NEXT:    s_endpgm
1507;
1508; VI-LABEL: bfe_u32_constant_fold_test_18:
1509; VI:       ; %bb.0:
1510; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1511; VI-NEXT:    s_mov_b32 s3, 0xf000
1512; VI-NEXT:    s_mov_b32 s2, -1
1513; VI-NEXT:    v_mov_b32_e32 v0, 0
1514; VI-NEXT:    s_waitcnt lgkmcnt(0)
1515; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1516; VI-NEXT:    s_endpgm
1517  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1)
1518  store i32 %bfe_u32, ptr addrspace(1) %out, align 4
1519  ret void
1520}
1521
1522; Make sure that SimplifyDemandedBits doesn't cause the and to be
1523; reduced to the bits demanded by the bfe.
1524
1525; XXX: The operand to v_bfe_u32 could also just directly be the load register.
; The 'and' has two uses (the bfe and its own store), so it must keep the
; full 63 mask: the checks show v_and_b32 with 63 followed by v_bfe_u32 2,2,
; and both values are stored.
1526define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(ptr addrspace(1) %out0,
1527; SI-LABEL: simplify_bfe_u32_multi_use_arg:
1528; SI:       ; %bb.0:
1529; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0xd
1530; SI-NEXT:    s_mov_b32 s7, 0xf000
1531; SI-NEXT:    s_mov_b32 s6, -1
1532; SI-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
1533; SI-NEXT:    s_mov_b32 s2, s6
1534; SI-NEXT:    s_mov_b32 s3, s7
1535; SI-NEXT:    s_waitcnt lgkmcnt(0)
1536; SI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
1537; SI-NEXT:    s_mov_b32 s4, s8
1538; SI-NEXT:    s_mov_b32 s5, s9
1539; SI-NEXT:    s_mov_b32 s0, s10
1540; SI-NEXT:    s_mov_b32 s1, s11
1541; SI-NEXT:    s_waitcnt vmcnt(0)
1542; SI-NEXT:    v_and_b32_e32 v0, 63, v0
1543; SI-NEXT:    v_bfe_u32 v1, v0, 2, 2
1544; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
1545; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1546; SI-NEXT:    s_endpgm
1547;
1548; VI-LABEL: simplify_bfe_u32_multi_use_arg:
1549; VI:       ; %bb.0:
1550; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
1551; VI-NEXT:    s_mov_b32 s7, 0xf000
1552; VI-NEXT:    s_mov_b32 s6, -1
1553; VI-NEXT:    s_mov_b32 s2, s6
1554; VI-NEXT:    s_mov_b32 s3, s7
1555; VI-NEXT:    s_waitcnt lgkmcnt(0)
1556; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
1557; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1558; VI-NEXT:    s_mov_b32 s10, s6
1559; VI-NEXT:    s_mov_b32 s11, s7
1560; VI-NEXT:    s_waitcnt lgkmcnt(0)
1561; VI-NEXT:    s_mov_b32 s4, s0
1562; VI-NEXT:    s_mov_b32 s5, s1
1563; VI-NEXT:    s_mov_b32 s8, s2
1564; VI-NEXT:    s_mov_b32 s9, s3
1565; VI-NEXT:    s_waitcnt vmcnt(0)
1566; VI-NEXT:    v_and_b32_e32 v0, 63, v0
1567; VI-NEXT:    v_bfe_u32 v1, v0, 2, 2
1568; VI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
1569; VI-NEXT:    buffer_store_dword v0, off, s[8:11], 0
1570; VI-NEXT:    s_endpgm
1571                                            ptr addrspace(1) %out1,
1572                                            ptr addrspace(1) %in) #0 {
1573  %src = load i32, ptr addrspace(1) %in, align 4
1574  %and = and i32 %src, 63
1575  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %and, i32 2, i32 2)
1576  store i32 %bfe_u32, ptr addrspace(1) %out0, align 4
1577  store i32 %and, ptr addrspace(1) %out1, align 4
1578  ret void
1579}
1580
; (a >> 6) & 7 is pattern-matched to s_bfe_u32 with operand 0x30006
; (width 3 in bits [22:16], offset 6 in bits [5:0]).
1581define amdgpu_kernel void @lshr_and(ptr addrspace(1) %out, i32 %a) #0 {
1582; SI-LABEL: lshr_and:
1583; SI:       ; %bb.0:
1584; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
1585; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1586; SI-NEXT:    s_mov_b32 s3, 0xf000
1587; SI-NEXT:    s_waitcnt lgkmcnt(0)
1588; SI-NEXT:    s_bfe_u32 s4, s2, 0x30006
1589; SI-NEXT:    s_mov_b32 s2, -1
1590; SI-NEXT:    v_mov_b32_e32 v0, s4
1591; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1592; SI-NEXT:    s_endpgm
1593;
1594; VI-LABEL: lshr_and:
1595; VI:       ; %bb.0:
1596; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
1597; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1598; VI-NEXT:    s_mov_b32 s3, 0xf000
1599; VI-NEXT:    s_mov_b32 s2, -1
1600; VI-NEXT:    s_waitcnt lgkmcnt(0)
1601; VI-NEXT:    s_bfe_u32 s4, s6, 0x30006
1602; VI-NEXT:    v_mov_b32_e32 v0, s4
1603; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1604; VI-NEXT:    s_endpgm
1605  %b = lshr i32 %a, 6
1606  %c = and i32 %b, 7
1607  store i32 %c, ptr addrspace(1) %out, align 8
1608  ret void
1609}
1610
; Variable shift amount: the bfe pattern needs a constant offset, so this
; stays a separate s_lshr_b32 + s_and_b32 pair in the checks.
1611define amdgpu_kernel void @v_lshr_and(ptr addrspace(1) %out, i32 %a, i32 %b) #0 {
1612; SI-LABEL: v_lshr_and:
1613; SI:       ; %bb.0:
1614; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1615; SI-NEXT:    s_mov_b32 s7, 0xf000
1616; SI-NEXT:    s_waitcnt lgkmcnt(0)
1617; SI-NEXT:    s_lshr_b32 s2, s2, s3
1618; SI-NEXT:    s_and_b32 s2, s2, 7
1619; SI-NEXT:    s_mov_b32 s6, -1
1620; SI-NEXT:    s_mov_b32 s4, s0
1621; SI-NEXT:    s_mov_b32 s5, s1
1622; SI-NEXT:    v_mov_b32_e32 v0, s2
1623; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1624; SI-NEXT:    s_endpgm
1625;
1626; VI-LABEL: v_lshr_and:
1627; VI:       ; %bb.0:
1628; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1629; VI-NEXT:    s_mov_b32 s7, 0xf000
1630; VI-NEXT:    s_mov_b32 s6, -1
1631; VI-NEXT:    s_waitcnt lgkmcnt(0)
1632; VI-NEXT:    s_mov_b32 s4, s0
1633; VI-NEXT:    s_lshr_b32 s0, s2, s3
1634; VI-NEXT:    s_and_b32 s0, s0, 7
1635; VI-NEXT:    s_mov_b32 s5, s1
1636; VI-NEXT:    v_mov_b32_e32 v0, s0
1637; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1638; VI-NEXT:    s_endpgm
1639  %c = lshr i32 %a, %b
1640  %d = and i32 %c, 7
1641  store i32 %d, ptr addrspace(1) %out, align 8
1642  ret void
1643}
1644
; (a & 448) >> 6, where 448 = 0b111 << 6, matches the same s_bfe_u32
; 0x30006 (width 3, offset 6) as the lshr-then-and form above.
1645define amdgpu_kernel void @and_lshr(ptr addrspace(1) %out, i32 %a) #0 {
1646; SI-LABEL: and_lshr:
1647; SI:       ; %bb.0:
1648; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
1649; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1650; SI-NEXT:    s_mov_b32 s3, 0xf000
1651; SI-NEXT:    s_waitcnt lgkmcnt(0)
1652; SI-NEXT:    s_bfe_u32 s4, s2, 0x30006
1653; SI-NEXT:    s_mov_b32 s2, -1
1654; SI-NEXT:    v_mov_b32_e32 v0, s4
1655; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1656; SI-NEXT:    s_endpgm
1657;
1658; VI-LABEL: and_lshr:
1659; VI:       ; %bb.0:
1660; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
1661; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
1662; VI-NEXT:    s_mov_b32 s3, 0xf000
1663; VI-NEXT:    s_mov_b32 s2, -1
1664; VI-NEXT:    s_waitcnt lgkmcnt(0)
1665; VI-NEXT:    s_bfe_u32 s4, s6, 0x30006
1666; VI-NEXT:    v_mov_b32_e32 v0, s4
1667; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1668; VI-NEXT:    s_endpgm
1669  %b = and i32 %a, 448
1670  %c = lshr i32 %b, 6
1671  store i32 %c, ptr addrspace(1) %out, align 8
1672  ret void
1673}
1674
; Same fold as @and_lshr but with a wider mask: 511 = 0b1_1111_1111.
; After (lshr 6) only 3 of the masked bits survive, so codegen should
; still produce a single s_bfe_u32 with operand 0x30006 (width = 3,
; offset = 6).
define amdgpu_kernel void @and_lshr2(ptr addrspace(1) %out, i32 %a) #0 {
; SI-LABEL: and_lshr2:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s4, s2, 0x30006
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: and_lshr2:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_u32 s4, s6, 0x30006
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %b = and i32 %a, 511
  %c = lshr i32 %b, 6
  store i32 %c, ptr addrspace(1) %out, align 8
  ret void
}
1704
; (shl 9) followed by (lshr 11) clears the top 9 bits and then shifts
; right by 2, i.e. it extracts the 21-bit field at offset 2. Codegen
; should fold this to a single s_bfe_u32 with operand 0x150002
; (width = 0x15 = 21, offset = 2).
define amdgpu_kernel void @shl_lshr(ptr addrspace(1) %out, i32 %a) #0 {
; SI-LABEL: shl_lshr:
; SI:       ; %bb.0:
; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; SI-NEXT:    s_mov_b32 s3, 0xf000
; SI-NEXT:    s_waitcnt lgkmcnt(0)
; SI-NEXT:    s_bfe_u32 s4, s2, 0x150002
; SI-NEXT:    s_mov_b32 s2, -1
; SI-NEXT:    v_mov_b32_e32 v0, s4
; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT:    s_endpgm
;
; VI-LABEL: shl_lshr:
; VI:       ; %bb.0:
; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
; VI-NEXT:    s_mov_b32 s3, 0xf000
; VI-NEXT:    s_mov_b32 s2, -1
; VI-NEXT:    s_waitcnt lgkmcnt(0)
; VI-NEXT:    s_bfe_u32 s4, s6, 0x150002
; VI-NEXT:    v_mov_b32_e32 v0, s4
; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT:    s_endpgm
  %b = shl i32 %a, 9
  %c = lshr i32 %b, 11
  store i32 %c, ptr addrspace(1) %out, align 8
  ret void
}
1734
; Unsigned bitfield extract intrinsic: operands are (src, offset, width).
declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
1739