xref: /llvm-project/llvm/test/CodeGen/AMDGPU/bitreverse.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI
3; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
4; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=FLAT
5; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GISEL
6; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-FLAT
7; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -global-isel -verify-machineinstrs | FileCheck %s --check-prefix=GFX11-GISEL
8
9declare i32 @llvm.amdgcn.workitem.id.x() #1
10
11declare i16 @llvm.bitreverse.i16(i16) #1
12declare i32 @llvm.bitreverse.i32(i32) #1
13declare i64 @llvm.bitreverse.i64(i64) #1
14
15declare <2 x i32> @llvm.bitreverse.v2i32(<2 x i32>) #1
16declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) #1
17
18declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) #1
19declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
20
; Kernel-argument i16 case: %val is passed through llvm.bitreverse.i16 and the
; result is stored to %out. The expected code reverses the whole 32-bit
; register with s_brev_b32 and then either shifts right by 16 (s_lshr_b32)
; or stores with global_store_d16_hi_b16.
; The assertions below are autogenerated (see the note at the top of the
; file); regenerate them with the update script instead of editing by hand.
21define amdgpu_kernel void @s_brev_i16(ptr addrspace(1) noalias %out, i16 %val) #0 {
22; SI-LABEL: s_brev_i16:
23; SI:       ; %bb.0:
24; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
25; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
26; SI-NEXT:    s_mov_b32 s3, 0xf000
27; SI-NEXT:    s_mov_b32 s2, -1
28; SI-NEXT:    s_waitcnt lgkmcnt(0)
29; SI-NEXT:    s_brev_b32 s4, s6
30; SI-NEXT:    s_lshr_b32 s4, s4, 16
31; SI-NEXT:    v_mov_b32_e32 v0, s4
32; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
33; SI-NEXT:    s_endpgm
34;
35; FLAT-LABEL: s_brev_i16:
36; FLAT:       ; %bb.0:
37; FLAT-NEXT:    s_load_dword s6, s[4:5], 0x2c
38; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
39; FLAT-NEXT:    s_mov_b32 s3, 0xf000
40; FLAT-NEXT:    s_mov_b32 s2, -1
41; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
42; FLAT-NEXT:    s_brev_b32 s4, s6
43; FLAT-NEXT:    s_lshr_b32 s4, s4, 16
44; FLAT-NEXT:    v_mov_b32_e32 v0, s4
45; FLAT-NEXT:    buffer_store_short v0, off, s[0:3], 0
46; FLAT-NEXT:    s_endpgm
47;
48; GISEL-LABEL: s_brev_i16:
49; GISEL:       ; %bb.0:
50; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
51; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
52; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
53; GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
54; GISEL-NEXT:    s_brev_b32 s2, s2
55; GISEL-NEXT:    s_lshr_b32 s2, s2, 16
56; GISEL-NEXT:    v_mov_b32_e32 v0, s0
57; GISEL-NEXT:    v_mov_b32_e32 v2, s2
58; GISEL-NEXT:    v_mov_b32_e32 v1, s1
59; GISEL-NEXT:    flat_store_short v[0:1], v2
60; GISEL-NEXT:    s_endpgm
61;
62; GFX11-FLAT-LABEL: s_brev_i16:
63; GFX11-FLAT:       ; %bb.0:
64; GFX11-FLAT-NEXT:    s_clause 0x1
65; GFX11-FLAT-NEXT:    s_load_b32 s2, s[4:5], 0x2c
66; GFX11-FLAT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
67; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX11-FLAT-NEXT:    s_brev_b32 s2, s2
69; GFX11-FLAT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
70; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
71; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
72; GFX11-FLAT-NEXT:    s_endpgm
73;
74; GFX11-GISEL-LABEL: s_brev_i16:
75; GFX11-GISEL:       ; %bb.0:
76; GFX11-GISEL-NEXT:    s_clause 0x1
77; GFX11-GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
78; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
79; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
80; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX11-GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
82; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
83; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
84; GFX11-GISEL-NEXT:    s_lshr_b32 s2, s2, 16
85; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
86; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
87; GFX11-GISEL-NEXT:    global_store_b16 v1, v0, s[0:1]
88; GFX11-GISEL-NEXT:    s_endpgm
89  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
90  store i16 %brev, ptr addrspace(1) %out
91  ret void
92}
93
; Loaded i16 case: the value is read directly from %valptr (no per-workitem
; index is applied), reversed with llvm.bitreverse.i16 and stored to %out.
; The expected code uses v_bfrev_b32 on the loaded value and then either
; v_lshrrev_b32 by 16 or a global_store_d16_hi_b16 store.
; The assertions below are autogenerated; regenerate rather than hand-edit.
94define amdgpu_kernel void @v_brev_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
95; SI-LABEL: v_brev_i16:
96; SI:       ; %bb.0:
97; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
98; SI-NEXT:    s_mov_b32 s7, 0xf000
99; SI-NEXT:    s_mov_b32 s6, -1
100; SI-NEXT:    s_mov_b32 s10, s6
101; SI-NEXT:    s_mov_b32 s11, s7
102; SI-NEXT:    s_waitcnt lgkmcnt(0)
103; SI-NEXT:    s_mov_b32 s8, s2
104; SI-NEXT:    s_mov_b32 s9, s3
105; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
106; SI-NEXT:    s_mov_b32 s4, s0
107; SI-NEXT:    s_mov_b32 s5, s1
108; SI-NEXT:    s_waitcnt vmcnt(0)
109; SI-NEXT:    v_bfrev_b32_e32 v0, v0
110; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
111; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
112; SI-NEXT:    s_endpgm
113;
114; FLAT-LABEL: v_brev_i16:
115; FLAT:       ; %bb.0:
116; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
117; FLAT-NEXT:    s_mov_b32 s7, 0xf000
118; FLAT-NEXT:    s_mov_b32 s6, -1
119; FLAT-NEXT:    s_mov_b32 s10, s6
120; FLAT-NEXT:    s_mov_b32 s11, s7
121; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
122; FLAT-NEXT:    s_mov_b32 s8, s2
123; FLAT-NEXT:    s_mov_b32 s9, s3
124; FLAT-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
125; FLAT-NEXT:    s_mov_b32 s4, s0
126; FLAT-NEXT:    s_mov_b32 s5, s1
127; FLAT-NEXT:    s_waitcnt vmcnt(0)
128; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
129; FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
130; FLAT-NEXT:    buffer_store_short v0, off, s[4:7], 0
131; FLAT-NEXT:    s_endpgm
132;
133; GISEL-LABEL: v_brev_i16:
134; GISEL:       ; %bb.0:
135; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
136; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
137; GISEL-NEXT:    v_mov_b32_e32 v0, s2
138; GISEL-NEXT:    v_mov_b32_e32 v1, s3
139; GISEL-NEXT:    flat_load_ushort v0, v[0:1]
140; GISEL-NEXT:    s_waitcnt vmcnt(0)
141; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
142; GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
143; GISEL-NEXT:    v_mov_b32_e32 v0, s0
144; GISEL-NEXT:    v_mov_b32_e32 v1, s1
145; GISEL-NEXT:    flat_store_short v[0:1], v2
146; GISEL-NEXT:    s_endpgm
147;
148; GFX11-FLAT-LABEL: v_brev_i16:
149; GFX11-FLAT:       ; %bb.0:
150; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
151; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
152; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
153; GFX11-FLAT-NEXT:    v_mov_b32_e32 v1, 0
154; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX11-FLAT-NEXT:    s_mov_b32 s4, s2
156; GFX11-FLAT-NEXT:    s_mov_b32 s5, s3
157; GFX11-FLAT-NEXT:    buffer_load_u16 v0, off, s[4:7], 0
158; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
159; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
160; GFX11-FLAT-NEXT:    global_store_d16_hi_b16 v1, v0, s[0:1]
161; GFX11-FLAT-NEXT:    s_endpgm
162;
163; GFX11-GISEL-LABEL: v_brev_i16:
164; GFX11-GISEL:       ; %bb.0:
165; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
166; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
167; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
168; GFX11-GISEL-NEXT:    global_load_u16 v1, v0, s[2:3]
169; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
170; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
171; GFX11-GISEL-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
172; GFX11-GISEL-NEXT:    s_endpgm
173  %val = load i16, ptr addrspace(1) %valptr
174  %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
175  store i16 %brev, ptr addrspace(1) %out
176  ret void
177}
178
; Kernel-argument i32 case: %val is passed through llvm.bitreverse.i32 and
; stored to %out. Every configuration is expected to emit a single
; s_brev_b32 on the loaded argument, with no extra shift or mask.
; The assertions below are autogenerated; regenerate rather than hand-edit.
179define amdgpu_kernel void @s_brev_i32(ptr addrspace(1) noalias %out, i32 %val) #0 {
180; SI-LABEL: s_brev_i32:
181; SI:       ; %bb.0:
182; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
183; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
184; SI-NEXT:    s_mov_b32 s3, 0xf000
185; SI-NEXT:    s_mov_b32 s2, -1
186; SI-NEXT:    s_waitcnt lgkmcnt(0)
187; SI-NEXT:    s_brev_b32 s4, s6
188; SI-NEXT:    v_mov_b32_e32 v0, s4
189; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
190; SI-NEXT:    s_endpgm
191;
192; FLAT-LABEL: s_brev_i32:
193; FLAT:       ; %bb.0:
194; FLAT-NEXT:    s_load_dword s6, s[4:5], 0x2c
195; FLAT-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
196; FLAT-NEXT:    s_mov_b32 s3, 0xf000
197; FLAT-NEXT:    s_mov_b32 s2, -1
198; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
199; FLAT-NEXT:    s_brev_b32 s4, s6
200; FLAT-NEXT:    v_mov_b32_e32 v0, s4
201; FLAT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
202; FLAT-NEXT:    s_endpgm
203;
204; GISEL-LABEL: s_brev_i32:
205; GISEL:       ; %bb.0:
206; GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
207; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
208; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
209; GISEL-NEXT:    s_brev_b32 s2, s2
210; GISEL-NEXT:    v_mov_b32_e32 v0, s0
211; GISEL-NEXT:    v_mov_b32_e32 v2, s2
212; GISEL-NEXT:    v_mov_b32_e32 v1, s1
213; GISEL-NEXT:    flat_store_dword v[0:1], v2
214; GISEL-NEXT:    s_endpgm
215;
216; GFX11-FLAT-LABEL: s_brev_i32:
217; GFX11-FLAT:       ; %bb.0:
218; GFX11-FLAT-NEXT:    s_clause 0x1
219; GFX11-FLAT-NEXT:    s_load_b32 s2, s[4:5], 0x2c
220; GFX11-FLAT-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
221; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
222; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
223; GFX11-FLAT-NEXT:    s_brev_b32 s2, s2
224; GFX11-FLAT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
225; GFX11-FLAT-NEXT:    v_mov_b32_e32 v0, s2
226; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
227; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
228; GFX11-FLAT-NEXT:    s_endpgm
229;
230; GFX11-GISEL-LABEL: s_brev_i32:
231; GFX11-GISEL:       ; %bb.0:
232; GFX11-GISEL-NEXT:    s_clause 0x1
233; GFX11-GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
234; GFX11-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
235; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0
236; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
237; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
238; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
239; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, s2
240; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
241; GFX11-GISEL-NEXT:    s_endpgm
242  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
243  store i32 %brev, ptr addrspace(1) %out
244  ret void
245}
246
; Loaded i32 case: each workitem loads the i32 at %valptr indexed by
; llvm.amdgcn.workitem.id.x, reverses it with llvm.bitreverse.i32 and stores
; the result to %out (the store address is not indexed). The expected code
; uses a single v_bfrev_b32 on the loaded value.
; The assertions below are autogenerated; regenerate rather than hand-edit.
247define amdgpu_kernel void @v_brev_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
248; SI-LABEL: v_brev_i32:
249; SI:       ; %bb.0:
250; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
251; SI-NEXT:    s_mov_b32 s7, 0xf000
252; SI-NEXT:    s_mov_b32 s10, 0
253; SI-NEXT:    s_mov_b32 s11, s7
254; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
255; SI-NEXT:    s_waitcnt lgkmcnt(0)
256; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
257; SI-NEXT:    v_mov_b32_e32 v1, 0
258; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
259; SI-NEXT:    s_mov_b32 s6, -1
260; SI-NEXT:    s_mov_b32 s4, s0
261; SI-NEXT:    s_mov_b32 s5, s1
262; SI-NEXT:    s_waitcnt vmcnt(0)
263; SI-NEXT:    v_bfrev_b32_e32 v0, v0
264; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
265; SI-NEXT:    s_endpgm
266;
267; FLAT-LABEL: v_brev_i32:
268; FLAT:       ; %bb.0:
269; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
270; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
271; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
272; FLAT-NEXT:    v_mov_b32_e32 v1, s3
273; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
274; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
275; FLAT-NEXT:    flat_load_dword v0, v[0:1]
276; FLAT-NEXT:    s_mov_b32 s3, 0xf000
277; FLAT-NEXT:    s_mov_b32 s2, -1
278; FLAT-NEXT:    s_waitcnt vmcnt(0)
279; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
280; FLAT-NEXT:    buffer_store_dword v0, off, s[0:3], 0
281; FLAT-NEXT:    s_endpgm
282;
283; GISEL-LABEL: v_brev_i32:
284; GISEL:       ; %bb.0:
285; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
286; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
287; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
288; GISEL-NEXT:    v_mov_b32_e32 v0, s2
289; GISEL-NEXT:    v_mov_b32_e32 v1, s3
290; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
291; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
292; GISEL-NEXT:    flat_load_dword v0, v[0:1]
293; GISEL-NEXT:    s_waitcnt vmcnt(0)
294; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
295; GISEL-NEXT:    v_mov_b32_e32 v0, s0
296; GISEL-NEXT:    v_mov_b32_e32 v1, s1
297; GISEL-NEXT:    flat_store_dword v[0:1], v2
298; GISEL-NEXT:    s_endpgm
299;
300; GFX11-FLAT-LABEL: v_brev_i32:
301; GFX11-FLAT:       ; %bb.0:
302; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
303; GFX11-FLAT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
304; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
305; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
306; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
307; GFX11-FLAT-NEXT:    global_load_b32 v0, v0, s[2:3]
308; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
309; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
310; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
311; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
312; GFX11-FLAT-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
313; GFX11-FLAT-NEXT:    s_endpgm
314;
315; GFX11-GISEL-LABEL: v_brev_i32:
316; GFX11-GISEL:       ; %bb.0:
317; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
318; GFX11-GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
319; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
320; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
321; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
322; GFX11-GISEL-NEXT:    global_load_b32 v0, v0, s[2:3]
323; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
324; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
325; GFX11-GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
326; GFX11-GISEL-NEXT:    s_endpgm
327  %tid = call i32 @llvm.amdgcn.workitem.id.x()
328  %gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
329  %val = load i32, ptr addrspace(1) %gep
330  %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
331  store i32 %brev, ptr addrspace(1) %out
332  ret void
333}
334
; Kernel-argument <2 x i32> case: %val is passed through
; llvm.bitreverse.v2i32 and stored to %out. The expected code reverses each
; element separately with its own s_brev_b32 and stores both dwords with a
; single 64-bit store.
; The assertions below are autogenerated; regenerate rather than hand-edit.
335define amdgpu_kernel void @s_brev_v2i32(ptr addrspace(1) noalias %out, <2 x i32> %val) #0 {
336; SI-LABEL: s_brev_v2i32:
337; SI:       ; %bb.0:
338; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
339; SI-NEXT:    s_mov_b32 s7, 0xf000
340; SI-NEXT:    s_mov_b32 s6, -1
341; SI-NEXT:    s_waitcnt lgkmcnt(0)
342; SI-NEXT:    s_mov_b32 s4, s0
343; SI-NEXT:    s_mov_b32 s5, s1
344; SI-NEXT:    s_brev_b32 s0, s3
345; SI-NEXT:    s_brev_b32 s1, s2
346; SI-NEXT:    v_mov_b32_e32 v0, s1
347; SI-NEXT:    v_mov_b32_e32 v1, s0
348; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
349; SI-NEXT:    s_endpgm
350;
351; FLAT-LABEL: s_brev_v2i32:
352; FLAT:       ; %bb.0:
353; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
354; FLAT-NEXT:    s_mov_b32 s7, 0xf000
355; FLAT-NEXT:    s_mov_b32 s6, -1
356; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
357; FLAT-NEXT:    s_mov_b32 s4, s0
358; FLAT-NEXT:    s_mov_b32 s5, s1
359; FLAT-NEXT:    s_brev_b32 s0, s3
360; FLAT-NEXT:    s_brev_b32 s1, s2
361; FLAT-NEXT:    v_mov_b32_e32 v0, s1
362; FLAT-NEXT:    v_mov_b32_e32 v1, s0
363; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
364; FLAT-NEXT:    s_endpgm
365;
366; GISEL-LABEL: s_brev_v2i32:
367; GISEL:       ; %bb.0:
368; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
369; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
370; GISEL-NEXT:    s_brev_b32 s2, s2
371; GISEL-NEXT:    s_brev_b32 s3, s3
372; GISEL-NEXT:    v_mov_b32_e32 v0, s2
373; GISEL-NEXT:    v_mov_b32_e32 v3, s1
374; GISEL-NEXT:    v_mov_b32_e32 v1, s3
375; GISEL-NEXT:    v_mov_b32_e32 v2, s0
376; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
377; GISEL-NEXT:    s_endpgm
378;
379; GFX11-FLAT-LABEL: s_brev_v2i32:
380; GFX11-FLAT:       ; %bb.0:
381; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
382; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
383; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
384; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
385; GFX11-FLAT-NEXT:    s_brev_b32 s2, s2
386; GFX11-FLAT-NEXT:    s_brev_b32 s3, s3
387; GFX11-FLAT-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
388; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
389; GFX11-FLAT-NEXT:    s_mov_b32 s4, s0
390; GFX11-FLAT-NEXT:    s_mov_b32 s5, s1
391; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[4:7], 0
392; GFX11-FLAT-NEXT:    s_endpgm
393;
394; GFX11-GISEL-LABEL: s_brev_v2i32:
395; GFX11-GISEL:       ; %bb.0:
396; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
397; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
398; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
399; GFX11-GISEL-NEXT:    s_brev_b32 s2, s2
400; GFX11-GISEL-NEXT:    s_brev_b32 s3, s3
401; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
402; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
403; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
404; GFX11-GISEL-NEXT:    s_endpgm
405  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
406  store <2 x i32> %brev, ptr addrspace(1) %out
407  ret void
408}
409
; Loaded <2 x i32> case: each workitem loads the vector at %valptr indexed by
; llvm.amdgcn.workitem.id.x, reverses it with llvm.bitreverse.v2i32 and
; stores the result to %out. The expected code reverses each element in
; place with its own v_bfrev_b32 (no element swap).
; The assertions below are autogenerated; regenerate rather than hand-edit.
410define amdgpu_kernel void @v_brev_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
411; SI-LABEL: v_brev_v2i32:
412; SI:       ; %bb.0:
413; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
414; SI-NEXT:    s_mov_b32 s7, 0xf000
415; SI-NEXT:    s_mov_b32 s10, 0
416; SI-NEXT:    s_mov_b32 s11, s7
417; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
418; SI-NEXT:    s_waitcnt lgkmcnt(0)
419; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
420; SI-NEXT:    v_mov_b32_e32 v1, 0
421; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
422; SI-NEXT:    s_mov_b32 s6, -1
423; SI-NEXT:    s_mov_b32 s4, s0
424; SI-NEXT:    s_mov_b32 s5, s1
425; SI-NEXT:    s_waitcnt vmcnt(0)
426; SI-NEXT:    v_bfrev_b32_e32 v1, v1
427; SI-NEXT:    v_bfrev_b32_e32 v0, v0
428; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
429; SI-NEXT:    s_endpgm
430;
431; FLAT-LABEL: v_brev_v2i32:
432; FLAT:       ; %bb.0:
433; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
434; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
435; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
436; FLAT-NEXT:    v_mov_b32_e32 v1, s3
437; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
438; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
439; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
440; FLAT-NEXT:    s_mov_b32 s3, 0xf000
441; FLAT-NEXT:    s_mov_b32 s2, -1
442; FLAT-NEXT:    s_waitcnt vmcnt(0)
443; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
444; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
445; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
446; FLAT-NEXT:    s_endpgm
447;
448; GISEL-LABEL: v_brev_v2i32:
449; GISEL:       ; %bb.0:
450; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
451; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
452; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
453; GISEL-NEXT:    v_mov_b32_e32 v0, s2
454; GISEL-NEXT:    v_mov_b32_e32 v1, s3
455; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
456; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
457; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
458; GISEL-NEXT:    v_mov_b32_e32 v3, s1
459; GISEL-NEXT:    v_mov_b32_e32 v2, s0
460; GISEL-NEXT:    s_waitcnt vmcnt(0)
461; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
462; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
463; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
464; GISEL-NEXT:    s_endpgm
465;
466; GFX11-FLAT-LABEL: v_brev_v2i32:
467; GFX11-FLAT:       ; %bb.0:
468; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
469; GFX11-FLAT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
470; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
471; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
472; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
473; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
474; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
475; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
476; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
477; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
478; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
479; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
480; GFX11-FLAT-NEXT:    s_endpgm
481;
482; GFX11-GISEL-LABEL: v_brev_v2i32:
483; GFX11-GISEL:       ; %bb.0:
484; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
485; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
486; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
487; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
488; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
489; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
490; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
491; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
492; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
493; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
494; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
495; GFX11-GISEL-NEXT:    s_endpgm
496  %tid = call i32 @llvm.amdgcn.workitem.id.x()
497  %gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
498  %val = load <2 x i32>, ptr addrspace(1) %gep
499  %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
500  store <2 x i32> %brev, ptr addrspace(1) %out
501  ret void
502}
503
; Kernel-argument i64 case: %val is passed through llvm.bitreverse.i64 and
; stored to %out. Every configuration is expected to emit a single
; s_brev_b64 on the 64-bit register pair holding the argument.
; The assertions below are autogenerated; regenerate rather than hand-edit.
504define amdgpu_kernel void @s_brev_i64(ptr addrspace(1) noalias %out, i64 %val) #0 {
505; SI-LABEL: s_brev_i64:
506; SI:       ; %bb.0:
507; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
508; SI-NEXT:    s_mov_b32 s7, 0xf000
509; SI-NEXT:    s_mov_b32 s6, -1
510; SI-NEXT:    s_waitcnt lgkmcnt(0)
511; SI-NEXT:    s_mov_b32 s4, s0
512; SI-NEXT:    s_mov_b32 s5, s1
513; SI-NEXT:    s_brev_b64 s[0:1], s[2:3]
514; SI-NEXT:    v_mov_b32_e32 v0, s0
515; SI-NEXT:    v_mov_b32_e32 v1, s1
516; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
517; SI-NEXT:    s_endpgm
518;
519; FLAT-LABEL: s_brev_i64:
520; FLAT:       ; %bb.0:
521; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
522; FLAT-NEXT:    s_mov_b32 s7, 0xf000
523; FLAT-NEXT:    s_mov_b32 s6, -1
524; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
525; FLAT-NEXT:    s_mov_b32 s4, s0
526; FLAT-NEXT:    s_mov_b32 s5, s1
527; FLAT-NEXT:    s_brev_b64 s[0:1], s[2:3]
528; FLAT-NEXT:    v_mov_b32_e32 v0, s0
529; FLAT-NEXT:    v_mov_b32_e32 v1, s1
530; FLAT-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
531; FLAT-NEXT:    s_endpgm
532;
533; GISEL-LABEL: s_brev_i64:
534; GISEL:       ; %bb.0:
535; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
536; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
537; GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
538; GISEL-NEXT:    v_mov_b32_e32 v0, s2
539; GISEL-NEXT:    v_mov_b32_e32 v3, s1
540; GISEL-NEXT:    v_mov_b32_e32 v1, s3
541; GISEL-NEXT:    v_mov_b32_e32 v2, s0
542; GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
543; GISEL-NEXT:    s_endpgm
544;
545; GFX11-FLAT-LABEL: s_brev_i64:
546; GFX11-FLAT:       ; %bb.0:
547; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
548; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
549; GFX11-FLAT-NEXT:    s_brev_b64 s[4:5], s[2:3]
550; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
551; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
552; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
553; GFX11-FLAT-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
554; GFX11-FLAT-NEXT:    s_endpgm
555;
556; GFX11-GISEL-LABEL: s_brev_i64:
557; GFX11-GISEL:       ; %bb.0:
558; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
559; GFX11-GISEL-NEXT:    v_mov_b32_e32 v2, 0
560; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
561; GFX11-GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
562; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
563; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
564; GFX11-GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
565; GFX11-GISEL-NEXT:    s_endpgm
566  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
567  store i64 %brev, ptr addrspace(1) %out
568  ret void
569}
570
; Loaded i64 case: each workitem loads the i64 at %valptr indexed by
; llvm.amdgcn.workitem.id.x, reverses it with llvm.bitreverse.i64 and stores
; the result to %out. The expected code reverses each 32-bit half with
; v_bfrev_b32 and swaps the halves through register assignment: the reversed
; first loaded dword (v0) is written to v2 and the pair v[1:2] is stored.
; The assertions below are autogenerated; regenerate rather than hand-edit.
571define amdgpu_kernel void @v_brev_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
572; SI-LABEL: v_brev_i64:
573; SI:       ; %bb.0:
574; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
575; SI-NEXT:    s_mov_b32 s7, 0xf000
576; SI-NEXT:    s_mov_b32 s10, 0
577; SI-NEXT:    s_mov_b32 s11, s7
578; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
579; SI-NEXT:    s_waitcnt lgkmcnt(0)
580; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
581; SI-NEXT:    v_mov_b32_e32 v1, 0
582; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
583; SI-NEXT:    s_mov_b32 s6, -1
584; SI-NEXT:    s_mov_b32 s4, s0
585; SI-NEXT:    s_mov_b32 s5, s1
586; SI-NEXT:    s_waitcnt vmcnt(0)
587; SI-NEXT:    v_bfrev_b32_e32 v2, v0
588; SI-NEXT:    v_bfrev_b32_e32 v1, v1
589; SI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
590; SI-NEXT:    s_endpgm
591;
592; FLAT-LABEL: v_brev_i64:
593; FLAT:       ; %bb.0:
594; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
595; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
596; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
597; FLAT-NEXT:    v_mov_b32_e32 v1, s3
598; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
599; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
600; FLAT-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
601; FLAT-NEXT:    s_mov_b32 s3, 0xf000
602; FLAT-NEXT:    s_mov_b32 s2, -1
603; FLAT-NEXT:    s_waitcnt vmcnt(0)
604; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
605; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
606; FLAT-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
607; FLAT-NEXT:    s_endpgm
608;
609; GISEL-LABEL: v_brev_i64:
610; GISEL:       ; %bb.0:
611; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
612; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
613; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
614; GISEL-NEXT:    v_mov_b32_e32 v0, s2
615; GISEL-NEXT:    v_mov_b32_e32 v1, s3
616; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
617; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
618; GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
619; GISEL-NEXT:    v_mov_b32_e32 v4, s1
620; GISEL-NEXT:    v_mov_b32_e32 v3, s0
621; GISEL-NEXT:    s_waitcnt vmcnt(0)
622; GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
623; GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
624; GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
625; GISEL-NEXT:    s_endpgm
626;
627; GFX11-FLAT-LABEL: v_brev_i64:
628; GFX11-FLAT:       ; %bb.0:
629; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
630; GFX11-FLAT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
631; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
632; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
633; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
634; GFX11-FLAT-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
635; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
636; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
637; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
638; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
639; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
640; GFX11-FLAT-NEXT:    buffer_store_b64 v[1:2], off, s[0:3], 0
641; GFX11-FLAT-NEXT:    s_endpgm
642;
643; GFX11-GISEL-LABEL: v_brev_i64:
644; GFX11-GISEL:       ; %bb.0:
645; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
646; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
647; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
648; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
649; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
650; GFX11-GISEL-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
651; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
652; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v1, v1
653; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v2, v0
654; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
655; GFX11-GISEL-NEXT:    global_store_b64 v0, v[1:2], s[0:1]
656; GFX11-GISEL-NEXT:    s_endpgm
657  %tid = call i32 @llvm.amdgcn.workitem.id.x()
658  %gep = getelementptr i64, ptr addrspace(1) %valptr, i32 %tid
659  %val = load i64, ptr addrspace(1) %gep
660  %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
661  store i64 %brev, ptr addrspace(1) %out
662  ret void
663}
664
; Kernel-argument <2 x i64> case: %val is passed through
; llvm.bitreverse.v2i64 and stored to %out. The expected code reverses each
; 64-bit element with its own s_brev_b64 and writes all four dwords with a
; single 128-bit store. The vector argument is loaded from a separate kernarg
; offset (0xd / 0x34) than the %out pointer (0x9 / 0x24).
; The assertions below are autogenerated; regenerate rather than hand-edit.
665define amdgpu_kernel void @s_brev_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) #0 {
666; SI-LABEL: s_brev_v2i64:
667; SI:       ; %bb.0:
668; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
669; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
670; SI-NEXT:    s_mov_b32 s7, 0xf000
671; SI-NEXT:    s_mov_b32 s6, -1
672; SI-NEXT:    s_waitcnt lgkmcnt(0)
673; SI-NEXT:    s_brev_b64 s[2:3], s[2:3]
674; SI-NEXT:    s_brev_b64 s[0:1], s[0:1]
675; SI-NEXT:    v_mov_b32_e32 v0, s0
676; SI-NEXT:    v_mov_b32_e32 v1, s1
677; SI-NEXT:    v_mov_b32_e32 v2, s2
678; SI-NEXT:    v_mov_b32_e32 v3, s3
679; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
680; SI-NEXT:    s_endpgm
681;
682; FLAT-LABEL: s_brev_v2i64:
683; FLAT:       ; %bb.0:
684; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
685; FLAT-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
686; FLAT-NEXT:    s_mov_b32 s7, 0xf000
687; FLAT-NEXT:    s_mov_b32 s6, -1
688; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
689; FLAT-NEXT:    s_brev_b64 s[2:3], s[2:3]
690; FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
691; FLAT-NEXT:    v_mov_b32_e32 v0, s0
692; FLAT-NEXT:    v_mov_b32_e32 v1, s1
693; FLAT-NEXT:    v_mov_b32_e32 v2, s2
694; FLAT-NEXT:    v_mov_b32_e32 v3, s3
695; FLAT-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
696; FLAT-NEXT:    s_endpgm
697;
698; GISEL-LABEL: s_brev_v2i64:
699; GISEL:       ; %bb.0:
700; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
701; GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
702; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
703; GISEL-NEXT:    s_brev_b64 s[0:1], s[0:1]
704; GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
705; GISEL-NEXT:    v_mov_b32_e32 v0, s0
706; GISEL-NEXT:    v_mov_b32_e32 v4, s4
707; GISEL-NEXT:    v_mov_b32_e32 v1, s1
708; GISEL-NEXT:    v_mov_b32_e32 v2, s2
709; GISEL-NEXT:    v_mov_b32_e32 v3, s3
710; GISEL-NEXT:    v_mov_b32_e32 v5, s5
711; GISEL-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
712; GISEL-NEXT:    s_endpgm
713;
714; GFX11-FLAT-LABEL: s_brev_v2i64:
715; GFX11-FLAT:       ; %bb.0:
716; GFX11-FLAT-NEXT:    s_clause 0x1
717; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
718; GFX11-FLAT-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
719; GFX11-FLAT-NEXT:    s_mov_b32 s7, 0x31016000
720; GFX11-FLAT-NEXT:    s_mov_b32 s6, -1
721; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
722; GFX11-FLAT-NEXT:    s_brev_b64 s[0:1], s[0:1]
723; GFX11-FLAT-NEXT:    s_brev_b64 s[2:3], s[2:3]
724; GFX11-FLAT-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
725; GFX11-FLAT-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
726; GFX11-FLAT-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
727; GFX11-FLAT-NEXT:    s_endpgm
728;
729; GFX11-GISEL-LABEL: s_brev_v2i64:
730; GFX11-GISEL:       ; %bb.0:
731; GFX11-GISEL-NEXT:    s_clause 0x1
732; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
733; GFX11-GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x24
734; GFX11-GISEL-NEXT:    v_mov_b32_e32 v4, 0
735; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
736; GFX11-GISEL-NEXT:    s_brev_b64 s[0:1], s[0:1]
737; GFX11-GISEL-NEXT:    s_brev_b64 s[2:3], s[2:3]
738; GFX11-GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
739; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
740; GFX11-GISEL-NEXT:    global_store_b128 v4, v[0:3], s[4:5]
741; GFX11-GISEL-NEXT:    s_endpgm
742  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
743  store <2 x i64> %brev, ptr addrspace(1) %out
744  ret void
745}
746
; Vector (per-work-item) bitreverse of a <2 x i64> value loaded from memory.
; Each work-item loads one <2 x i64> from %valptr at index workitem.id.x,
; applies @llvm.bitreverse.v2i64 and stores the result to %out.
; For every check prefix below, the expected code is four v_bfrev_b32
; instructions whose results are stored with the two dwords of each 64-bit
; element swapped.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
747define amdgpu_kernel void @v_brev_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) #0 {
748; SI-LABEL: v_brev_v2i64:
749; SI:       ; %bb.0:
750; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
751; SI-NEXT:    s_mov_b32 s7, 0xf000
752; SI-NEXT:    s_mov_b32 s10, 0
753; SI-NEXT:    s_mov_b32 s11, s7
754; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
755; SI-NEXT:    s_waitcnt lgkmcnt(0)
756; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
757; SI-NEXT:    v_mov_b32_e32 v1, 0
758; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
759; SI-NEXT:    s_mov_b32 s6, -1
760; SI-NEXT:    s_mov_b32 s4, s0
761; SI-NEXT:    s_mov_b32 s5, s1
762; SI-NEXT:    s_waitcnt vmcnt(0)
763; SI-NEXT:    v_bfrev_b32_e32 v4, v2
764; SI-NEXT:    v_bfrev_b32_e32 v3, v3
765; SI-NEXT:    v_bfrev_b32_e32 v2, v0
766; SI-NEXT:    v_bfrev_b32_e32 v1, v1
767; SI-NEXT:    buffer_store_dwordx4 v[1:4], off, s[4:7], 0
768; SI-NEXT:    s_endpgm
769;
770; FLAT-LABEL: v_brev_v2i64:
771; FLAT:       ; %bb.0:
772; FLAT-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
773; FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
774; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
775; FLAT-NEXT:    v_mov_b32_e32 v1, s3
776; FLAT-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
777; FLAT-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
778; FLAT-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
779; FLAT-NEXT:    s_mov_b32 s3, 0xf000
780; FLAT-NEXT:    s_mov_b32 s2, -1
781; FLAT-NEXT:    s_waitcnt vmcnt(0)
782; FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
783; FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
784; FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
785; FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
786; FLAT-NEXT:    buffer_store_dwordx4 v[1:4], off, s[0:3], 0
787; FLAT-NEXT:    s_endpgm
788;
789; GISEL-LABEL: v_brev_v2i64:
790; GISEL:       ; %bb.0:
791; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
792; GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
793; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
794; GISEL-NEXT:    v_mov_b32_e32 v0, s2
795; GISEL-NEXT:    v_mov_b32_e32 v1, s3
796; GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
797; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
798; GISEL-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
799; GISEL-NEXT:    s_waitcnt vmcnt(0)
800; GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
801; GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
802; GISEL-NEXT:    v_mov_b32_e32 v0, s0
803; GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
804; GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
805; GISEL-NEXT:    v_mov_b32_e32 v1, s1
806; GISEL-NEXT:    flat_store_dwordx4 v[0:1], v[4:7]
807; GISEL-NEXT:    s_endpgm
808;
809; GFX11-FLAT-LABEL: v_brev_v2i64:
810; GFX11-FLAT:       ; %bb.0:
811; GFX11-FLAT-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
812; GFX11-FLAT-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
813; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1)
814; GFX11-FLAT-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
815; GFX11-FLAT-NEXT:    s_waitcnt lgkmcnt(0)
816; GFX11-FLAT-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
817; GFX11-FLAT-NEXT:    s_mov_b32 s3, 0x31016000
818; GFX11-FLAT-NEXT:    s_mov_b32 s2, -1
819; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0)
820; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v4, v2
821; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v3, v3
822; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v2, v0
823; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v1, v1
824; GFX11-FLAT-NEXT:    buffer_store_b128 v[1:4], off, s[0:3], 0
825; GFX11-FLAT-NEXT:    s_endpgm
826;
827; GFX11-GISEL-LABEL: v_brev_v2i64:
828; GFX11-GISEL:       ; %bb.0:
829; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
830; GFX11-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
831; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
832; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
833; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
834; GFX11-GISEL-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
835; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
836; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v4, v1
837; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v5, v0
838; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v6, v3
839; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v7, v2
840; GFX11-GISEL-NEXT:    v_mov_b32_e32 v0, 0
841; GFX11-GISEL-NEXT:    global_store_b128 v0, v[4:7], s[0:1]
842; GFX11-GISEL-NEXT:    s_endpgm
  ; Index %valptr by the work-item id so each work-item reads its own element.
843  %tid = call i32 @llvm.amdgcn.workitem.id.x()
844  %gep = getelementptr <2 x i64> , ptr addrspace(1) %valptr, i32 %tid
845  %val = load <2 x i64>, ptr addrspace(1) %gep
  ; Operation under test: element-wise 64-bit bit reversal.
846  %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
  ; The store goes to %out itself (not indexed by %tid).
847  store <2 x i64> %brev, ptr addrspace(1) %out
848  ret void
849}
850
; Non-kernel function: truncates the i32 argument to i16, bit-reverses it,
; reinterprets the 16 bits as half and extends that to float.
; For every check prefix below, the expected code is a 32-bit v_bfrev_b32
; whose upper 16 bits are then consumed (either via a shift right by 16 or via
; an SDWA WORD_1 source select) before the f16 -> f32 conversion.
; NOTE(review): judging by the name, this is presumably a regression test for
; a truncate that went missing when the i16 bitreverse was promoted to i32 --
; confirm against the commit that introduced it.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
851define float @missing_truncate_promote_bitreverse(i32 %arg) {
852; SI-LABEL: missing_truncate_promote_bitreverse:
853; SI:       ; %bb.0: ; %bb
854; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
855; SI-NEXT:    v_bfrev_b32_e32 v0, v0
856; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
857; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
858; SI-NEXT:    s_setpc_b64 s[30:31]
859;
860; FLAT-LABEL: missing_truncate_promote_bitreverse:
861; FLAT:       ; %bb.0: ; %bb
862; FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
863; FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
864; FLAT-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
865; FLAT-NEXT:    s_setpc_b64 s[30:31]
866;
867; GISEL-LABEL: missing_truncate_promote_bitreverse:
868; GISEL:       ; %bb.0: ; %bb
869; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
870; GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
871; GISEL-NEXT:    v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
872; GISEL-NEXT:    s_setpc_b64 s[30:31]
873;
874; GFX11-FLAT-LABEL: missing_truncate_promote_bitreverse:
875; GFX11-FLAT:       ; %bb.0: ; %bb
876; GFX11-FLAT-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
877; GFX11-FLAT-NEXT:    v_bfrev_b32_e32 v0, v0
878; GFX11-FLAT-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
879; GFX11-FLAT-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
880; GFX11-FLAT-NEXT:    v_cvt_f32_f16_e32 v0, v0
881; GFX11-FLAT-NEXT:    s_setpc_b64 s[30:31]
882;
883; GFX11-GISEL-LABEL: missing_truncate_promote_bitreverse:
884; GFX11-GISEL:       ; %bb.0: ; %bb
885; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
886; GFX11-GISEL-NEXT:    v_bfrev_b32_e32 v0, v0
887; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
888; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
889; GFX11-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
890; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
; The block name "bb" also appears in the "; %bb.0: ; %bb" check lines above,
; so renaming it requires regenerating the checks.
891bb:
  ; Only the low 16 bits of %arg take part in the bit reversal.
892  %tmp = trunc i32 %arg to i16
893  %tmp1 = call i16 @llvm.bitreverse.i16(i16 %tmp)
  ; Reinterpret the reversed bits as a half, then widen to the float result.
894  %tmp2 = bitcast i16 %tmp1 to half
895  %tmp3 = fpext half %tmp2 to float
896  ret float %tmp3
897}
898
; Attribute groups referenced in this file:
;   #0 - attached to the amdgpu_kernel test function definitions.
;   #1 - attached to the intrinsic declarations and to several of the
;        bitreverse call sites.
899attributes #0 = { nounwind }
900attributes #1 = { nounwind readnone }
901