; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
; RUN: llc -mtriple=riscv32 -mattr=+v,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB

define void @bitreverse_v8i16(ptr %x, ptr %y) {
; CHECK-LABEL: bitreverse_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    lui a1, 1
; CHECK-NEXT:    addi a1, a1, -241
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    vsrl.vi v9, v8, 4
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    lui a1, 3
; CHECK-NEXT:    addi a1, a1, 819
; CHECK-NEXT:    vsll.vi v8, v8, 4
; CHECK-NEXT:    vor.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    lui a1, 5
; CHECK-NEXT:    addi a1, a1, 1365
; CHECK-NEXT:    vsll.vi v8, v8, 2
; CHECK-NEXT:    vor.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 1
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    vadd.vv v8, v8, v8
; CHECK-NEXT:    vor.vv v8, v9, v8
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: bitreverse_v8i16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; ZVBB-NEXT:    vle16.v v8, (a0)
; ZVBB-NEXT:    vbrev.v v8, v8
; ZVBB-NEXT:    vse16.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <8 x i16>, ptr %x
  %b = load <8 x i16>, ptr %y
  %c = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  store <8 x i16> %c, ptr %x
  ret void
}
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)

define void @bitreverse_v4i32(ptr %x, ptr %y) {
; CHECK-LABEL: bitreverse_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a1, 16
; CHECK-NEXT:    addi a1, a1, -256
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    vsrl.vi v10, v8, 24
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    vor.vv v9, v9, v10
; CHECK-NEXT:    vand.vx v10, v8, a1
; CHECK-NEXT:    lui a1, 61681
; CHECK-NEXT:    addi a1, a1, -241
; CHECK-NEXT:    vsll.vi v8, v8, 24
; CHECK-NEXT:    vsll.vi v10, v10, 8
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    vsrl.vi v9, v8, 4
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    lui a1, 209715
; CHECK-NEXT:    addi a1, a1, 819
; CHECK-NEXT:    vsll.vi v8, v8, 4
; CHECK-NEXT:    vor.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    lui a1, 349525
; CHECK-NEXT:    addi a1, a1, 1365
; CHECK-NEXT:    vsll.vi v8, v8, 2
; CHECK-NEXT:    vor.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 1
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    vadd.vv v8, v8, v8
; CHECK-NEXT:    vor.vv v8, v9, v8
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: bitreverse_v4i32:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; ZVBB-NEXT:    vle32.v v8, (a0)
; ZVBB-NEXT:    vbrev.v v8, v8
; ZVBB-NEXT:    vse32.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <4 x i32>, ptr %x
  %b = load <4 x i32>, ptr %y
  %c = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  store <4 x i32> %c, ptr %x
  ret void
}
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)

define void @bitreverse_v2i64(ptr %x, ptr %y) {
; RV32-LABEL: bitreverse_v2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    lui a2, 1044480
; RV32-NEXT:    li a3, 56
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 16
; RV32-NEXT:    lui a1, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a2, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    addi a2, a5, -256
; RV32-NEXT:    vlse64.v v9, (a6), zero
; RV32-NEXT:    vsrl.vx v10, v8, a3
; RV32-NEXT:    vsrl.vx v11, v8, a4
; RV32-NEXT:    vsrl.vi v12, v8, 24
; RV32-NEXT:    vsll.vx v13, v8, a3
; RV32-NEXT:    vand.vx v11, v11, a2
; RV32-NEXT:    vor.vv v10, v11, v10
; RV32-NEXT:    vand.vx v11, v8, a2
; RV32-NEXT:    vsll.vx v11, v11, a4
; RV32-NEXT:    vor.vv v11, v13, v11
; RV32-NEXT:    vsrl.vi v13, v8, 8
; RV32-NEXT:    vand.vx v12, v12, a1
; RV32-NEXT:    vand.vv v13, v13, v9
; RV32-NEXT:    vor.vv v12, v13, v12
; RV32-NEXT:    lui a2, 61681
; RV32-NEXT:    lui a3, 209715
; RV32-NEXT:    lui a4, 349525
; RV32-NEXT:    addi a2, a2, -241
; RV32-NEXT:    addi a3, a3, 819
; RV32-NEXT:    addi a4, a4, 1365
; RV32-NEXT:    vor.vv v10, v12, v10
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.x v12, a2
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vand.vv v9, v8, v9
; RV32-NEXT:    vand.vx v8, v8, a1
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v9, v9, 8
; RV32-NEXT:    vor.vv v8, v8, v9
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.x v9, a3
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vor.vv v8, v11, v8
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.x v11, a4
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vor.vv v8, v8, v10
; RV32-NEXT:    vsrl.vi v10, v8, 4
; RV32-NEXT:    vand.vv v8, v8, v12
; RV32-NEXT:    vand.vv v10, v10, v12
; RV32-NEXT:    vsll.vi v8, v8, 4
; RV32-NEXT:    vor.vv v8, v10, v8
; RV32-NEXT:    vsrl.vi v10, v8, 2
; RV32-NEXT:    vand.vv v8, v8, v9
; RV32-NEXT:    vand.vv v9, v10, v9
; RV32-NEXT:    vsll.vi v8, v8, 2
; RV32-NEXT:    vor.vv v8, v9, v8
; RV32-NEXT:    vsrl.vi v9, v8, 1
; RV32-NEXT:    vand.vv v8, v8, v11
; RV32-NEXT:    vand.vv v9, v9, v11
; RV32-NEXT:    vadd.vv v8, v8, v8
; RV32-NEXT:    vor.vv v8, v9, v8
; RV32-NEXT:    vse64.v v8, (a0)
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: bitreverse_v2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    li a1, 56
; RV64-NEXT:    li a2, 40
; RV64-NEXT:    lui a3, 16
; RV64-NEXT:    lui a4, 4080
; RV64-NEXT:    li a5, 255
; RV64-NEXT:    addiw a3, a3, -256
; RV64-NEXT:    slli a5, a5, 24
; RV64-NEXT:    vsrl.vx v9, v8, a1
; RV64-NEXT:    vsrl.vx v10, v8, a2
; RV64-NEXT:    vsrl.vi v11, v8, 24
; RV64-NEXT:    vsrl.vi v12, v8, 8
; RV64-NEXT:    vand.vx v10, v10, a3
; RV64-NEXT:    vor.vv v9, v10, v9
; RV64-NEXT:    vand.vx v10, v8, a5
; RV64-NEXT:    vand.vx v11, v11, a4
; RV64-NEXT:    vand.vx v12, v12, a5
; RV64-NEXT:    vor.vv v11, v12, v11
; RV64-NEXT:    vand.vx v12, v8, a4
; RV64-NEXT:    vsll.vi v10, v10, 8
; RV64-NEXT:    vsll.vi v12, v12, 24
; RV64-NEXT:    vor.vv v10, v12, v10
; RV64-NEXT:    vsll.vx v12, v8, a1
; RV64-NEXT:    vand.vx v8, v8, a3
; RV64-NEXT:    vsll.vx v8, v8, a2
; RV64-NEXT:    vor.vv v8, v12, v8
; RV64-NEXT:    lui a1, 61681
; RV64-NEXT:    lui a2, 209715
; RV64-NEXT:    lui a3, 349525
; RV64-NEXT:    addiw a1, a1, -241
; RV64-NEXT:    addiw a2, a2, 819
; RV64-NEXT:    addiw a3, a3, 1365
; RV64-NEXT:    slli a4, a1, 32
; RV64-NEXT:    slli a5, a2, 32
; RV64-NEXT:    add a1, a1, a4
; RV64-NEXT:    slli a4, a3, 32
; RV64-NEXT:    add a2, a2, a5
; RV64-NEXT:    add a3, a3, a4
; RV64-NEXT:    vor.vv v9, v11, v9
; RV64-NEXT:    vor.vv v8, v8, v10
; RV64-NEXT:    vor.vv v8, v8, v9
; RV64-NEXT:    vsrl.vi v9, v8, 4
; RV64-NEXT:    vand.vx v8, v8, a1
; RV64-NEXT:    vand.vx v9, v9, a1
; RV64-NEXT:    vsll.vi v8, v8, 4
; RV64-NEXT:    vor.vv v8, v9, v8
; RV64-NEXT:    vsrl.vi v9, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a2
; RV64-NEXT:    vand.vx v9, v9, a2
; RV64-NEXT:    vsll.vi v8, v8, 2
; RV64-NEXT:    vor.vv v8, v9, v8
; RV64-NEXT:    vsrl.vi v9, v8, 1
; RV64-NEXT:    vand.vx v8, v8, a3
; RV64-NEXT:    vand.vx v9, v9, a3
; RV64-NEXT:    vadd.vv v8, v8, v8
; RV64-NEXT:    vor.vv v8, v9, v8
; RV64-NEXT:    vse64.v v8, (a0)
; RV64-NEXT:    ret
;
; ZVBB-LABEL: bitreverse_v2i64:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; ZVBB-NEXT:    vle64.v v8, (a0)
; ZVBB-NEXT:    vbrev.v v8, v8
; ZVBB-NEXT:    vse64.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <2 x i64>, ptr %x
  %b = load <2 x i64>, ptr %y
  %c = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  store <2 x i64> %c, ptr %x
  ret void
}
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>)

define void @bitreverse_v16i16(ptr %x, ptr %y) {
; CHECK-LABEL: bitreverse_v16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    lui a1, 1
; CHECK-NEXT:    addi a1, a1, -241
; CHECK-NEXT:    vsrl.vi v10, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    vsrl.vi v10, v8, 4
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    lui a1, 3
; CHECK-NEXT:    addi a1, a1, 819
; CHECK-NEXT:    vsll.vi v8, v8, 4
; CHECK-NEXT:    vor.vv v8, v10, v8
; CHECK-NEXT:    vsrl.vi v10, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    lui a1, 5
; CHECK-NEXT:    addi a1, a1, 1365
; CHECK-NEXT:    vsll.vi v8, v8, 2
; CHECK-NEXT:    vor.vv v8, v10, v8
; CHECK-NEXT:    vsrl.vi v10, v8, 1
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    vadd.vv v8, v8, v8
; CHECK-NEXT:    vor.vv v8, v10, v8
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: bitreverse_v16i16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; ZVBB-NEXT:    vle16.v v8, (a0)
; ZVBB-NEXT:    vbrev.v v8, v8
; ZVBB-NEXT:    vse16.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <16 x i16>, ptr %x
  %b = load <16 x i16>, ptr %y
  %c = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  store <16 x i16> %c, ptr %x
  ret void
}
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)

define void @bitreverse_v8i32(ptr %x, ptr %y) {
; CHECK-LABEL: bitreverse_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a1, 16
; CHECK-NEXT:    addi a1, a1, -256
; CHECK-NEXT:    vsrl.vi v10, v8, 8
; CHECK-NEXT:    vsrl.vi v12, v8, 24
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    vor.vv v10, v10, v12
; CHECK-NEXT:    vand.vx v12, v8, a1
; CHECK-NEXT:    lui a1, 61681
; CHECK-NEXT:    addi a1, a1, -241
; CHECK-NEXT:    vsll.vi v8, v8, 24
; CHECK-NEXT:    vsll.vi v12, v12, 8
; CHECK-NEXT:    vor.vv v8, v8, v12
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    vsrl.vi v10, v8, 4
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    lui a1, 209715
; CHECK-NEXT:    addi a1, a1, 819
; CHECK-NEXT:    vsll.vi v8, v8, 4
; CHECK-NEXT:    vor.vv v8, v10, v8
; CHECK-NEXT:    vsrl.vi v10, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    lui a1, 349525
; CHECK-NEXT:    addi a1, a1, 1365
; CHECK-NEXT:    vsll.vi v8, v8, 2
; CHECK-NEXT:    vor.vv v8, v10, v8
; CHECK-NEXT:    vsrl.vi v10, v8, 1
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    vadd.vv v8, v8, v8
; CHECK-NEXT:    vor.vv v8, v10, v8
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: bitreverse_v8i32:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; ZVBB-NEXT:    vle32.v v8, (a0)
; ZVBB-NEXT:    vbrev.v v8, v8
; ZVBB-NEXT:    vse32.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <8 x i32>, ptr %x
  %b = load <8 x i32>, ptr %y
  %c = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
  store <8 x i32> %c, ptr %x
  ret void
}
declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)

define void @bitreverse_v4i64(ptr %x, ptr %y) {
; RV32-LABEL: bitreverse_v4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    lui a2, 1044480
; RV32-NEXT:    li a3, 56
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 16
; RV32-NEXT:    lui a1, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a2, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    addi a2, a5, -256
; RV32-NEXT:    vlse64.v v10, (a6), zero
; RV32-NEXT:    vsrl.vx v12, v8, a3
; RV32-NEXT:    vsrl.vx v14, v8, a4
; RV32-NEXT:    vsrl.vi v16, v8, 24
; RV32-NEXT:    vsll.vx v18, v8, a3
; RV32-NEXT:    vand.vx v14, v14, a2
; RV32-NEXT:    vor.vv v14, v14, v12
; RV32-NEXT:    vand.vx v12, v8, a2
; RV32-NEXT:    vsll.vx v12, v12, a4
; RV32-NEXT:    vor.vv v12, v18, v12
; RV32-NEXT:    vsrl.vi v18, v8, 8
; RV32-NEXT:    vand.vx v16, v16, a1
; RV32-NEXT:    vand.vv v18, v18, v10
; RV32-NEXT:    vor.vv v16, v18, v16
; RV32-NEXT:    lui a2, 61681
; RV32-NEXT:    lui a3, 209715
; RV32-NEXT:    lui a4, 349525
; RV32-NEXT:    addi a2, a2, -241
; RV32-NEXT:    addi a3, a3, 819
; RV32-NEXT:    addi a4, a4, 1365
; RV32-NEXT:    vor.vv v14, v16, v14
; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT:    vmv.v.x v16, a2
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vand.vv v10, v8, v10
; RV32-NEXT:    vand.vx v8, v8, a1
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v10, v10, 8
; RV32-NEXT:    vor.vv v8, v8, v10
; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT:    vmv.v.x v10, a3
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vor.vv v8, v12, v8
; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT:    vmv.v.x v12, a4
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vor.vv v8, v8, v14
; RV32-NEXT:    vsrl.vi v14, v8, 4
; RV32-NEXT:    vand.vv v8, v8, v16
; RV32-NEXT:    vand.vv v14, v14, v16
; RV32-NEXT:    vsll.vi v8, v8, 4
; RV32-NEXT:    vor.vv v8, v14, v8
; RV32-NEXT:    vsrl.vi v14, v8, 2
; RV32-NEXT:    vand.vv v8, v8, v10
; RV32-NEXT:    vand.vv v10, v14, v10
; RV32-NEXT:    vsll.vi v8, v8, 2
; RV32-NEXT:    vor.vv v8, v10, v8
; RV32-NEXT:    vsrl.vi v10, v8, 1
; RV32-NEXT:    vand.vv v8, v8, v12
; RV32-NEXT:    vand.vv v10, v10, v12
; RV32-NEXT:    vadd.vv v8, v8, v8
; RV32-NEXT:    vor.vv v8, v10, v8
; RV32-NEXT:    vse64.v v8, (a0)
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: bitreverse_v4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT:    vle64.v v14, (a0)
; RV64-NEXT:    li a1, 56
; RV64-NEXT:    li a2, 40
; RV64-NEXT:    lui a3, 16
; RV64-NEXT:    lui a4, 4080
; RV64-NEXT:    li a5, 255
; RV64-NEXT:    addiw a3, a3, -256
; RV64-NEXT:    slli a5, a5, 24
; RV64-NEXT:    vsrl.vx v8, v14, a1
; RV64-NEXT:    vsrl.vx v10, v14, a2
; RV64-NEXT:    vsrl.vi v12, v14, 24
; RV64-NEXT:    vsrl.vi v16, v14, 8
; RV64-NEXT:    vand.vx v10, v10, a3
; RV64-NEXT:    vor.vv v8, v10, v8
; RV64-NEXT:    vand.vx v18, v14, a5
; RV64-NEXT:    vand.vx v10, v12, a4
; RV64-NEXT:    vand.vx v12, v16, a5
; RV64-NEXT:    vor.vv v10, v12, v10
; RV64-NEXT:    vand.vx v12, v14, a4
; RV64-NEXT:    vsll.vi v16, v18, 8
; RV64-NEXT:    vsll.vi v12, v12, 24
; RV64-NEXT:    vor.vv v12, v12, v16
; RV64-NEXT:    vsll.vx v16, v14, a1
; RV64-NEXT:    vand.vx v14, v14, a3
; RV64-NEXT:    vsll.vx v14, v14, a2
; RV64-NEXT:    vor.vv v14, v16, v14
; RV64-NEXT:    lui a1, 61681
; RV64-NEXT:    lui a2, 209715
; RV64-NEXT:    lui a3, 349525
; RV64-NEXT:    addiw a1, a1, -241
; RV64-NEXT:    addiw a2, a2, 819
; RV64-NEXT:    addiw a3, a3, 1365
; RV64-NEXT:    slli a4, a1, 32
; RV64-NEXT:    slli a5, a2, 32
; RV64-NEXT:    add a1, a1, a4
; RV64-NEXT:    slli a4, a3, 32
; RV64-NEXT:    add a2, a2, a5
; RV64-NEXT:    add a3, a3, a4
; RV64-NEXT:    vor.vv v8, v10, v8
; RV64-NEXT:    vor.vv v10, v14, v12
; RV64-NEXT:    vor.vv v8, v10, v8
; RV64-NEXT:    vsrl.vi v10, v8, 4
; RV64-NEXT:    vand.vx v8, v8, a1
; RV64-NEXT:    vand.vx v10, v10, a1
; RV64-NEXT:    vsll.vi v8, v8, 4
; RV64-NEXT:    vor.vv v8, v10, v8
; RV64-NEXT:    vsrl.vi v10, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a2
; RV64-NEXT:    vand.vx v10, v10, a2
; RV64-NEXT:    vsll.vi v8, v8, 2
; RV64-NEXT:    vor.vv v8, v10, v8
; RV64-NEXT:    vsrl.vi v10, v8, 1
; RV64-NEXT:    vand.vx v8, v8, a3
; RV64-NEXT:    vand.vx v10, v10, a3
; RV64-NEXT:    vadd.vv v8, v8, v8
; RV64-NEXT:    vor.vv v8, v10, v8
; RV64-NEXT:    vse64.v v8, (a0)
; RV64-NEXT:    ret
;
; ZVBB-LABEL: bitreverse_v4i64:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; ZVBB-NEXT:    vle64.v v8, (a0)
; ZVBB-NEXT:    vbrev.v v8, v8
; ZVBB-NEXT:    vse64.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <4 x i64>, ptr %x
  %b = load <4 x i64>, ptr %y
  %c = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
  store <4 x i64> %c, ptr %x
  ret void
}
declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>)
