; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

declare <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16>, <2 x i1>, i32)

define <2 x i16> @vp_bswap_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
; CHECK-NEXT:    ret
  %v = call <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16> %va, <2 x i1> %m, i32 %evl)
  ret <2 x i16> %v
}

define <2 x i16> @vp_bswap_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v2i16_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    ret
  %v = call <2 x i16> @llvm.vp.bswap.v2i16(<2 x i16> %va, <2 x i1> splat (i1 true), i32 %evl)
  ret <2 x i16> %v
}

declare <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16>, <4 x i1>, i32)

define <4 x i16> @vp_bswap_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
; CHECK-NEXT:    ret
  %v = call <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i16> %v
}

define <4 x i16> @vp_bswap_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v4i16_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    ret
  %v = call <4 x i16> @llvm.vp.bswap.v4i16(<4 x i16> %va, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x i16> %v
}

declare <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16>, <8 x i1>, i32)

define <8 x i16> @vp_bswap_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
; CHECK-NEXT:    ret
  %v = call <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16> %va, <8 x i1> %m, i32 %evl)
  ret <8 x i16> %v
}

define <8 x i16> @vp_bswap_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v8i16_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    ret
  %v = call <8 x i16> @llvm.vp.bswap.v8i16(<8 x i16> %va, <8 x i1> splat (i1 true), i32 %evl)
  ret <8 x i16> %v
}

declare <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16>, <16 x i1>, i32)

define <16 x i16> @vp_bswap_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
; CHECK-NEXT:    ret
  %v = call <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16> %va, <16 x i1> %m, i32 %evl)
  ret <16 x i16> %v
}

define <16 x i16> @vp_bswap_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v16i16_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
; CHECK-NEXT:    vsrl.vi v10, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    ret
  %v = call <16 x i16> @llvm.vp.bswap.v16i16(<16 x i16> %va, <16 x i1> splat (i1 true), i32 %evl)
  ret <16 x i16> %v
}

declare <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32>, <2 x i1>, i32)

define <2 x i32> @vp_bswap_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v2i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
; CHECK-NEXT:    lui a0, 16
; CHECK-NEXT:    addi a0, a0, -256
; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
; CHECK-NEXT:    ret
  %v = call <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32> %va, <2 x i1> %m, i32 %evl)
  ret <2 x i32> %v
}

define <2 x i32> @vp_bswap_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v2i32_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    lui a0, 16
; CHECK-NEXT:    vsrl.vi v10, v8, 24
; CHECK-NEXT:    addi a0, a0, -256
; CHECK-NEXT:    vand.vx v9, v9, a0
; CHECK-NEXT:    vor.vv v9, v9, v10
; CHECK-NEXT:    vand.vx v10, v8, a0
; CHECK-NEXT:    vsll.vi v10, v10, 8
; CHECK-NEXT:    vsll.vi v8, v8, 24
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    ret
  %v = call <2 x i32> @llvm.vp.bswap.v2i32(<2 x i32> %va, <2 x i1> splat (i1 true), i32 %evl)
  ret <2 x i32> %v
}

declare <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32>, <4 x i1>, i32)

define <4 x i32> @vp_bswap_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
; CHECK-NEXT:    lui a0, 16
; CHECK-NEXT:    addi a0, a0, -256
; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
; CHECK-NEXT:    ret
  %v = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i32> %v
}

define <4 x i32> @vp_bswap_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v4i32_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    lui a0, 16
; CHECK-NEXT:    vsrl.vi v10, v8, 24
; CHECK-NEXT:    addi a0, a0, -256
; CHECK-NEXT:    vand.vx v9, v9, a0
; CHECK-NEXT:    vor.vv v9, v9, v10
; CHECK-NEXT:    vand.vx v10, v8, a0
; CHECK-NEXT:    vsll.vi v10, v10, 8
; CHECK-NEXT:    vsll.vi v8, v8, 24
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    ret
  %v = call <4 x i32> @llvm.vp.bswap.v4i32(<4 x i32> %va, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x i32> %v
}

declare <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32>, <8 x i1>, i32)

define <8 x i32> @vp_bswap_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
; CHECK-NEXT:    lui a0, 16
; CHECK-NEXT:    addi a0, a0, -256
; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
; CHECK-NEXT:    vsrl.vi v12, v8, 24, v0.t
; CHECK-NEXT:    vor.vv v10, v10, v12, v0.t
; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
; CHECK-NEXT:    vsll.vi v12, v12, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
; CHECK-NEXT:    ret
  %v = call <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32> %va, <8 x i1> %m, i32 %evl)
  ret <8 x i32> %v
}

define <8 x i32> @vp_bswap_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v8i32_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
; CHECK-NEXT:    vsrl.vi v10, v8, 8
; CHECK-NEXT:    lui a0, 16
; CHECK-NEXT:    vsrl.vi v12, v8, 24
; CHECK-NEXT:    addi a0, a0, -256
; CHECK-NEXT:    vand.vx v10, v10, a0
; CHECK-NEXT:    vor.vv v10, v10, v12
; CHECK-NEXT:    vand.vx v12, v8, a0
; CHECK-NEXT:    vsll.vi v12, v12, 8
; CHECK-NEXT:    vsll.vi v8, v8, 24
; CHECK-NEXT:    vor.vv v8, v8, v12
; CHECK-NEXT:    vor.vv v8, v8, v10
; CHECK-NEXT:    ret
  %v = call <8 x i32> @llvm.vp.bswap.v8i32(<8 x i32> %va, <8 x i1> splat (i1 true), i32 %evl)
  ret <8 x i32> %v
}

declare <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32>, <16 x i1>, i32)

define <16 x i32> @vp_bswap_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT:    vsrl.vi v12, v8, 8, v0.t
; CHECK-NEXT:    lui a0, 16
; CHECK-NEXT:    addi a0, a0, -256
; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
; CHECK-NEXT:    vsrl.vi v16, v8, 24, v0.t
; CHECK-NEXT:    vor.vv v12, v12, v16, v0.t
; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
; CHECK-NEXT:    ret
  %v = call <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32> %va, <16 x i1> %m, i32 %evl)
  ret <16 x i32> %v
}

define <16 x i32> @vp_bswap_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v16i32_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
; CHECK-NEXT:    vsrl.vi v12, v8, 8
; CHECK-NEXT:    lui a0, 16
; CHECK-NEXT:    vsrl.vi v16, v8, 24
; CHECK-NEXT:    addi a0, a0, -256
; CHECK-NEXT:    vand.vx v12, v12, a0
; CHECK-NEXT:    vor.vv v12, v12, v16
; CHECK-NEXT:    vand.vx v16, v8, a0
; CHECK-NEXT:    vsll.vi v16, v16, 8
; CHECK-NEXT:    vsll.vi v8, v8, 24
; CHECK-NEXT:    vor.vv v8, v8, v16
; CHECK-NEXT:    vor.vv v8, v8, v12
; CHECK-NEXT:    ret
  %v = call <16 x i32> @llvm.vp.bswap.v16i32(<16 x i32> %va, <16 x i1> splat (i1 true), i32 %evl)
  ret <16 x i32> %v
}

declare <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64>, <2 x i1>, i32)

define <2 x i64> @vp_bswap_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vand.vx v10, v8, a1, v0.t
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vlse64.v v11, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
; RV32-NEXT:    vand.vx v12, v12, a1, v0.t
; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
; RV64-NEXT:    vsll.vx v10, v8, a3, v0.t
; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
; RV64-NEXT:    vsll.vx v11, v11, a5, v0.t
; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
; RV64-NEXT:    vsrl.vx v10, v8, a3, v0.t
; RV64-NEXT:    vsrl.vx v11, v8, a5, v0.t
; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
; RV64-NEXT:    ret
  %v = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> %va, <2 x i1> %m, i32 %evl)
  ret <2 x i64> %v
}

define <2 x i64> @vp_bswap_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v2i64_unmasked:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vi v9, v8, 24
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsll.vx v10, v8, a2
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vsrl.vx v11, v8, a2
; RV32-NEXT:    vsrl.vx v12, v8, a4
; RV32-NEXT:    vand.vx v13, v8, a1
; RV32-NEXT:    vand.vx v12, v12, a1
; RV32-NEXT:    vor.vv v11, v12, v11
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vlse64.v v12, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
; RV32-NEXT:    vsll.vx v13, v13, a4
; RV32-NEXT:    vor.vv v10, v10, v13
; RV32-NEXT:    vsrl.vi v13, v8, 8
; RV32-NEXT:    vand.vx v9, v9, a5
; RV32-NEXT:    vand.vv v13, v13, v12
; RV32-NEXT:    vor.vv v9, v13, v9
; RV32-NEXT:    vand.vv v12, v8, v12
; RV32-NEXT:    vand.vx v8, v8, a5
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v12, v12, 8
; RV32-NEXT:    vor.vv v8, v8, v12
; RV32-NEXT:    vor.vv v8, v10, v8
; RV32-NEXT:    vor.vv v9, v9, v11
; RV32-NEXT:    vor.vv v8, v8, v9
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v2i64_unmasked:
; RV64:       # %bb.0:
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
; RV64-NEXT:    vsrl.vi v9, v8, 24
; RV64-NEXT:    vsrl.vi v10, v8, 8
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsrl.vx v11, v8, a3
; RV64-NEXT:    vsrl.vx v12, v8, a5
; RV64-NEXT:    vand.vx v12, v12, a0
; RV64-NEXT:    vor.vv v11, v12, v11
; RV64-NEXT:    vand.vx v12, v8, a1
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    vand.vx v9, v9, a1
; RV64-NEXT:    vsll.vi v12, v12, 24
; RV64-NEXT:    vand.vx v10, v10, a2
; RV64-NEXT:    vor.vv v9, v10, v9
; RV64-NEXT:    vand.vx v10, v8, a2
; RV64-NEXT:    vsll.vi v10, v10, 8
; RV64-NEXT:    vor.vv v10, v12, v10
; RV64-NEXT:    vsll.vx v12, v8, a3
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vsll.vx v8, v8, a5
; RV64-NEXT:    vor.vv v8, v12, v8
; RV64-NEXT:    vor.vv v8, v8, v10
; RV64-NEXT:    vor.vv v9, v9, v11
; RV64-NEXT:    vor.vv v8, v8, v9
; RV64-NEXT:    ret
  %v = call <2 x i64> @llvm.vp.bswap.v2i64(<2 x i64> %va, <2 x i1> splat (i1 true), i32 %evl)
  ret <2 x i64> %v
}

declare <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64>, <4 x i1>, i32)

define <4 x i64> @vp_bswap_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT:    vsll.vx v10, v8, a2, v0.t
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vand.vx v12, v8, a1, v0.t
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vlse64.v v14, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT:    vsll.vx v12, v12, a4, v0.t
; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
; RV32-NEXT:    vand.vx v12, v8, a5, v0.t
; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
; RV32-NEXT:    vand.vv v16, v8, v14, v0.t
; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
; RV32-NEXT:    vsrl.vx v16, v8, a4, v0.t
; RV32-NEXT:    vand.vx v16, v16, a1, v0.t
; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
; RV64-NEXT:    vsll.vx v12, v8, a3, v0.t
; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
; RV64-NEXT:    vsll.vx v14, v14, a5, v0.t
; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
; RV64-NEXT:    vsrl.vx v12, v8, a3, v0.t
; RV64-NEXT:    vsrl.vx v14, v8, a5, v0.t
; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
; RV64-NEXT:    ret
  %v = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> %va, <4 x i1> %m, i32 %evl)
  ret <4 x i64> %v
}

define <4 x i64> @vp_bswap_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v4i64_unmasked:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT:    vsrl.vi v10, v8, 24
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsll.vx v12, v8, a2
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vsrl.vx v14, v8, a2
; RV32-NEXT:    vsrl.vx v16, v8, a4
; RV32-NEXT:    vand.vx v18, v8, a1
; RV32-NEXT:    vand.vx v16, v16, a1
; RV32-NEXT:    vor.vv v14, v16, v14
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vlse64.v v16, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
; RV32-NEXT:    vsll.vx v18, v18, a4
; RV32-NEXT:    vor.vv v12, v12, v18
; RV32-NEXT:    vsrl.vi v18, v8, 8
; RV32-NEXT:    vand.vx v10, v10, a5
; RV32-NEXT:    vand.vv v18, v18, v16
; RV32-NEXT:    vor.vv v10, v18, v10
; RV32-NEXT:    vand.vv v16, v8, v16
; RV32-NEXT:    vand.vx v8, v8, a5
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v16, v16, 8
; RV32-NEXT:    vor.vv v8, v8, v16
; RV32-NEXT:    vor.vv v8, v12, v8
; RV32-NEXT:    vor.vv v10, v10, v14
; RV32-NEXT:    vor.vv v8, v8, v10
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v4i64_unmasked:
; RV64:       # %bb.0:
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
; RV64-NEXT:    vsrl.vi v10, v8, 24
; RV64-NEXT:    vsrl.vi v12, v8, 8
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsrl.vx v14, v8, a3
; RV64-NEXT:    vsrl.vx v16, v8, a5
; RV64-NEXT:    vand.vx v16, v16, a0
; RV64-NEXT:    vor.vv v14, v16, v14
; RV64-NEXT:    vand.vx v16, v8, a1
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    vand.vx v10, v10, a1
; RV64-NEXT:    vsll.vi v16, v16, 24
; RV64-NEXT:    vand.vx v12, v12, a2
; RV64-NEXT:    vor.vv v10, v12, v10
; RV64-NEXT:    vand.vx v12, v8, a2
; RV64-NEXT:    vsll.vi v12, v12, 8
; RV64-NEXT:    vor.vv v12, v16, v12
; RV64-NEXT:    vsll.vx v16, v8, a3
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vsll.vx v8, v8, a5
; RV64-NEXT:    vor.vv v8, v16, v8
; RV64-NEXT:    vor.vv v8, v8, v12
; RV64-NEXT:    vor.vv v10, v10, v14
; RV64-NEXT:    vor.vv v8, v8, v10
; RV64-NEXT:    ret
  %v = call <4 x i64> @llvm.vp.bswap.v4i64(<4 x i64> %va, <4 x i1> splat (i1 true), i32 %evl)
  ret <4 x i64> %v
}

declare <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64>, <8 x i1>, i32)

define <8 x i64> @vp_bswap_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v8i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vand.vx v20, v8, a1, v0.t
; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT:    vlse64.v v12, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT:    vsll.vx v20, v20, a4, v0.t
; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v8i64:
; RV64:       # %bb.0:
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
; RV64-NEXT:    vsll.vx v16, v8, a3, v0.t
; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
; RV64-NEXT:    vsll.vx v20, v20, a5, v0.t
; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
; RV64-NEXT:    vsrl.vx v20, v8, a5, v0.t
; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
; RV64-NEXT:    ret
  %v = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> %va, <8 x i1> %m, i32 %evl)
  ret <8 x i64> %v
}

define <8 x i64> @vp_bswap_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v8i64_unmasked:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT:    vsrl.vi v12, v8, 24
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsll.vx v16, v8, a2
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vsrl.vx v20, v8, a2
; RV32-NEXT:    vsrl.vx v24, v8, a4
; RV32-NEXT:    vand.vx v28, v8, a1
; RV32-NEXT:    vand.vx v24, v24, a1
; RV32-NEXT:    vor.vv v20, v24, v20
; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT:    vlse64.v v24, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
; RV32-NEXT:    vsll.vx v28, v28, a4
; RV32-NEXT:    vor.vv v16, v16, v28
; RV32-NEXT:    vsrl.vi v28, v8, 8
; RV32-NEXT:    vand.vx v12, v12, a5
; RV32-NEXT:    vand.vv v28, v28, v24
; RV32-NEXT:    vor.vv v12, v28, v12
; RV32-NEXT:    vand.vv v24, v8, v24
; RV32-NEXT:    vand.vx v8, v8, a5
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v24, v24, 8
; RV32-NEXT:    vor.vv v8, v8, v24
; RV32-NEXT:    vor.vv v8, v16, v8
; RV32-NEXT:    vor.vv v12, v12, v20
; RV32-NEXT:    vor.vv v8, v8, v12
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v8i64_unmasked:
; RV64:       # %bb.0:
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
; RV64-NEXT:    vsrl.vi v12, v8, 24
; RV64-NEXT:    vsrl.vi v16, v8, 8
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsrl.vx v20, v8, a3
; RV64-NEXT:    vsrl.vx v24, v8, a5
; RV64-NEXT:    vand.vx v24, v24, a0
; RV64-NEXT:    vor.vv v20, v24, v20
; RV64-NEXT:    vand.vx v24, v8, a1
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    vand.vx v12, v12, a1
; RV64-NEXT:    vsll.vi v24, v24, 24
; RV64-NEXT:    vand.vx v16, v16, a2
; RV64-NEXT:    vor.vv v12, v16, v12
; RV64-NEXT:    vand.vx v16, v8, a2
; RV64-NEXT:    vsll.vi v16, v16, 8
; RV64-NEXT:    vor.vv v16, v24, v16
; RV64-NEXT:    vsll.vx v24, v8, a3
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vsll.vx v8, v8, a5
; RV64-NEXT:    vor.vv v8, v24, v8
; RV64-NEXT:    vor.vv v8, v8, v16
; RV64-NEXT:    vor.vv v12, v12, v20
; RV64-NEXT:    vor.vv v8, v8, v12
; RV64-NEXT:    ret
  %v = call <8 x i64> @llvm.vp.bswap.v8i64(<8 x i64> %va, <8 x i1> splat (i1 true), i32 %evl)
  ret <8 x i64> %v
}

declare <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64>, <15 x i1>, i32)

define <15 x i64> @vp_bswap_v15i64(<15 x i64> %va, <15 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v15i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    csrr a1, vlenb
; RV32-NEXT:    li a2, 24
; RV32-NEXT:    mul a1, a1, a2
; RV32-NEXT:    sub sp, sp, a1
; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    addi a5, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
; RV32-NEXT:    csrr a3, vlenb
; RV32-NEXT:    slli a3, a3, 4
; RV32-NEXT:    add a3, sp, a3
; RV32-NEXT:    addi a3, a3, 16
; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vlse64.v v16, (a5), zero
; RV32-NEXT:    csrr a3, vlenb
; RV32-NEXT:    slli a3, a3, 3
; RV32-NEXT:    add a3, sp, a3
; RV32-NEXT:    addi a3, a3, 16
; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT:    lui a3, 4080
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    li a1, 24
; RV32-NEXT:    mul a0, a0, a1
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    .cfi_def_cfa sp, 16
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v15i64:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
; RV64-NEXT:    addi a4, sp, 16
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = call <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64> %va, <15 x i1> %m, i32 %evl)
  ret <15 x i64> %v
}

define <15 x i64> @vp_bswap_v15i64_unmasked(<15 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v15i64_unmasked:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    csrr a1, vlenb
; RV32-NEXT:    slli a1, a1, 4
; RV32-NEXT:    sub sp, sp, a1
; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsll.vx v24, v8, a2
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vsrl.vx v16, v8, a2
; RV32-NEXT:    vsrl.vx v0, v8, a4
; RV32-NEXT:    vand.vx v0, v0, a1
; RV32-NEXT:    vor.vv v16, v0, v16
; RV32-NEXT:    csrr a2, vlenb
; RV32-NEXT:    slli a2, a2, 3
; RV32-NEXT:    add a2, sp, a2
; RV32-NEXT:    addi a2, a2, 16
; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT:    vand.vx v0, v8, a1
; RV32-NEXT:    vsll.vx v0, v0, a4
; RV32-NEXT:    vor.vv v16, v24, v0
; RV32-NEXT:    addi a1, sp, 16
; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vlse64.v v0, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsrl.vi v16, v8, 24
; RV32-NEXT:    vand.vx v16, v16, a5
; RV32-NEXT:    vsrl.vi v24, v8, 8
; RV32-NEXT:    vand.vv v24, v24, v0
; RV32-NEXT:    vor.vv v16, v24, v16
; RV32-NEXT:    vand.vv v24, v8, v0
; RV32-NEXT:    vand.vx v8, v8, a5
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v24, v24, 8
; RV32-NEXT:    vor.vv v8, v8, v24
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v24, v8
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v16, v24
; RV32-NEXT:    vor.vv v8, v8, v16
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    .cfi_def_cfa sp, 16
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v15i64_unmasked:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT:    vsrl.vi v24, v8, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsrl.vx v16, v8, a3
; RV64-NEXT:    vsrl.vx v0, v8, a5
; RV64-NEXT:    vand.vx v0, v0, a0
; RV64-NEXT:    vor.vv v16, v0, v16
; RV64-NEXT:    addi a4, sp, 16
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsrl.vi v0, v8, 8
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    vand.vx v24, v24, a1
; RV64-NEXT:    vand.vx v0, v0, a2
; RV64-NEXT:    vor.vv v24, v0, v24
; RV64-NEXT:    vand.vx v0, v8, a1
; RV64-NEXT:    vsll.vi v0, v0, 24
; RV64-NEXT:    vand.vx v16, v8, a2
; RV64-NEXT:    vsll.vi v16, v16, 8
; RV64-NEXT:    vor.vv v16, v0, v16
; RV64-NEXT:    vsll.vx v0, v8, a3
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vsll.vx v8, v8, a5
; RV64-NEXT:    vor.vv v8, v0, v8
; RV64-NEXT:    vor.vv v8, v8, v16
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v16, v24, v16
; RV64-NEXT:    vor.vv v8, v8, v16
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = call <15 x i64> @llvm.vp.bswap.v15i64(<15 x i64> %va, <15 x i1> splat (i1 true), i32 %evl)
  ret <15 x i64> %v
}

declare <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64>, <16 x i1>, i32)

define <16 x i64> @vp_bswap_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v16i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    csrr a1, vlenb
; RV32-NEXT:    li a2, 24
; RV32-NEXT:    mul a1, a1, a2
; RV32-NEXT:    sub sp, sp, a1
; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    addi a5, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
; RV32-NEXT:    csrr a3, vlenb
; RV32-NEXT:    slli a3, a3, 4
; RV32-NEXT:    add a3, sp, a3
; RV32-NEXT:    addi a3, a3, 16
; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vlse64.v v16, (a5), zero
; RV32-NEXT:    csrr a3, vlenb
; RV32-NEXT:    slli a3, a3, 3
; RV32-NEXT:    add a3, sp, a3
; RV32-NEXT:    addi a3, a3, 16
; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT:    lui a3, 4080
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vand.vx v24, v8, a3, v0.t
; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
; RV32-NEXT:    vand.vx v24, v24, a3, v0.t
; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    li a1, 24
; RV32-NEXT:    mul a0, a0, a1
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    .cfi_def_cfa sp, 16
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v16i64:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
; RV64-NEXT:    addi a4, sp, 16
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> %va, <16 x i1> %m, i32 %evl)
  ret <16 x i64> %v
}

define <16 x i64> @vp_bswap_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
; RV32-LABEL: vp_bswap_v16i64_unmasked:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    csrr a1, vlenb
; RV32-NEXT:    slli a1, a1, 4
; RV32-NEXT:    sub sp, sp, a1
; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; RV32-NEXT:    lui a1, 1044480
; RV32-NEXT:    li a2, 56
; RV32-NEXT:    lui a3, 16
; RV32-NEXT:    li a4, 40
; RV32-NEXT:    lui a5, 4080
; RV32-NEXT:    addi a6, sp, 8
; RV32-NEXT:    sw a1, 8(sp)
; RV32-NEXT:    sw zero, 12(sp)
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsll.vx v24, v8, a2
; RV32-NEXT:    addi a1, a3, -256
; RV32-NEXT:    vsrl.vx v16, v8, a2
; RV32-NEXT:    vsrl.vx v0, v8, a4
; RV32-NEXT:    vand.vx v0, v0, a1
; RV32-NEXT:    vor.vv v16, v0, v16
; RV32-NEXT:    csrr a2, vlenb
; RV32-NEXT:    slli a2, a2, 3
; RV32-NEXT:    add a2, sp, a2
; RV32-NEXT:    addi a2, a2, 16
; RV32-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
; RV32-NEXT:    vand.vx v0, v8, a1
; RV32-NEXT:    vsll.vx v0, v0, a4
; RV32-NEXT:    vor.vv v16, v24, v0
; RV32-NEXT:    addi a1, sp, 16
; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vlse64.v v0, (a6), zero
; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV32-NEXT:    vsrl.vi v16, v8, 24
; RV32-NEXT:    vand.vx v16, v16, a5
; RV32-NEXT:    vsrl.vi v24, v8, 8
; RV32-NEXT:    vand.vv v24, v24, v0
; RV32-NEXT:    vor.vv v16, v24, v16
; RV32-NEXT:    vand.vv v24, v8, v0
; RV32-NEXT:    vand.vx v8, v8, a5
; RV32-NEXT:    vsll.vi v8, v8, 24
; RV32-NEXT:    vsll.vi v24, v24, 8
; RV32-NEXT:    vor.vv v8, v8, v24
; RV32-NEXT:    addi a0, sp, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v8, v24, v8
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    add a0, sp, a0
; RV32-NEXT:    addi a0, a0, 16
; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
; RV32-NEXT:    vor.vv v16, v16, v24
; RV32-NEXT:    vor.vv v8, v8, v16
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 4
; RV32-NEXT:    add sp, sp, a0
; RV32-NEXT:    .cfi_def_cfa sp, 16
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vp_bswap_v16i64_unmasked:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    lui a1, 4080
; RV64-NEXT:    li a2, 255
; RV64-NEXT:    li a3, 56
; RV64-NEXT:    lui a4, 16
; RV64-NEXT:    li a5, 40
; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
; RV64-NEXT:    vsrl.vi v24, v8, 24
; RV64-NEXT:    addiw a0, a4, -256
; RV64-NEXT:    vsrl.vx v16, v8, a3
; RV64-NEXT:    vsrl.vx v0, v8, a5
; RV64-NEXT:    vand.vx v0, v0, a0
; RV64-NEXT:    vor.vv v16, v0, v16
; RV64-NEXT:    addi a4, sp, 16
; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
; RV64-NEXT:    vsrl.vi v0, v8, 8
; RV64-NEXT:    slli a2, a2, 24
; RV64-NEXT:    vand.vx v24, v24, a1
; RV64-NEXT:    vand.vx v0, v0, a2
; RV64-NEXT:    vor.vv v24, v0, v24
; RV64-NEXT:    vand.vx v0, v8, a1
; RV64-NEXT:    vsll.vi v0, v0, 24
; RV64-NEXT:    vand.vx v16, v8, a2
; RV64-NEXT:    vsll.vi v16, v16, 8
; RV64-NEXT:    vor.vv v16, v0, v16
; RV64-NEXT:    vsll.vx v0, v8, a3
; RV64-NEXT:    vand.vx v8, v8, a0
; RV64-NEXT:    vsll.vx v8, v8, a5
; RV64-NEXT:    vor.vv v8, v0, v8
; RV64-NEXT:    vor.vv v8, v8, v16
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vor.vv v16, v24, v16
; RV64-NEXT:    vor.vv v8, v8, v16
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    add sp, sp, a0
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = call <16 x i64> @llvm.vp.bswap.v16i64(<16 x i64> %va, <16 x i1> splat (i1 true), i32 %evl)
  ret <16 x i64> %v
}

declare <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16>, <128 x i1>, i32)

define <128 x i16> @vp_bswap_v128i16(<128 x i16> %va, <128 x i1> %m, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v128i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    csrr a1, vlenb
; CHECK-NEXT:    slli a1, a1, 4
; CHECK-NEXT:    sub sp, sp, a1
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
; CHECK-NEXT:    csrr a1, vlenb
; CHECK-NEXT:    slli a1, a1, 3
; CHECK-NEXT:    add a1, sp, a1
; CHECK-NEXT:    addi a1, a1, 16
; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
; CHECK-NEXT:    vslidedown.vi v24, v0, 8
; CHECK-NEXT:    mv a1, a0
; CHECK-NEXT:    bltu a0, a2, .LBB26_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:  .LBB26_2:
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
; CHECK-NEXT:    addi a1, sp, 16
; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT:    addi a1, a0, -64
; CHECK-NEXT:    sltu a0, a0, a1
; CHECK-NEXT:    addi a0, a0, -1
; CHECK-NEXT:    and a0, a0, a1
; CHECK-NEXT:    vmv1r.v v0, v24
; CHECK-NEXT:    csrr a1, vlenb
; CHECK-NEXT:    slli a1, a1, 3
; CHECK-NEXT:    add a1, sp, a1
; CHECK-NEXT:    addi a1, a1, 16
; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
; CHECK-NEXT:    vor.vv v16, v8, v16, v0.t
; CHECK-NEXT:    addi a0, sp, 16
; CHECK-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
; CHECK-NEXT:    csrr a0, vlenb
; CHECK-NEXT:    slli a0, a0, 4
; CHECK-NEXT:    add sp, sp, a0
; CHECK-NEXT:    .cfi_def_cfa sp, 16
; CHECK-NEXT:    addi sp, sp, 16
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    ret
  %v = call <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16> %va, <128 x i1> %m, i32 %evl)
  ret <128 x i16> %v
}

define <128 x i16> @vp_bswap_v128i16_unmasked(<128 x i16> %va, i32 zeroext %evl) {
; CHECK-LABEL: vp_bswap_v128i16_unmasked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 64
; CHECK-NEXT:    mv a1, a0
; CHECK-NEXT:    bltu a0, a2, .LBB27_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:  .LBB27_2:
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vsrl.vi v24, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v24
; CHECK-NEXT:    addi a1, a0, -64
; CHECK-NEXT:    sltu a0, a0, a1
; CHECK-NEXT:    addi a0, a0, -1
; CHECK-NEXT:    and a0, a0, a1
; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT:    vsrl.vi v24, v16, 8
; CHECK-NEXT:    vsll.vi v16, v16, 8
; CHECK-NEXT:    vor.vv v16, v16, v24
; CHECK-NEXT:    ret
  %v = call <128 x i16> @llvm.vp.bswap.v128i16(<128 x i16> %va, <128 x i1> splat (i1 true), i32 %evl)
  ret <128 x i16> %v
}