xref: /llvm-project/llvm/test/CodeGen/RISCV/rvv/bswap-vp.ll (revision 36e4176f1d83d04cdebb4e1870561099b2478d80)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=riscv32 -mattr=+v,+m -target-abi=ilp32d \
3; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
4; RUN: llc -mtriple=riscv64 -mattr=+v,+m -target-abi=lp64d \
5; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
6; RUN: llc -mtriple=riscv32 -mattr=+v,+zvkb,+m -target-abi=ilp32d \
7; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
8; RUN: llc -mtriple=riscv64 -mattr=+v,+zvkb,+m -target-abi=lp64d \
9; RUN:     -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK-ZVKB
10
11declare <vscale x 1 x i16> @llvm.vp.bswap.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i1>, i32)
12
13define <vscale x 1 x i16> @vp_bswap_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
14; CHECK-LABEL: vp_bswap_nxv1i16:
15; CHECK:       # %bb.0:
16; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
17; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
18; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
19; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
20; CHECK-NEXT:    ret
21;
22; CHECK-ZVKB-LABEL: vp_bswap_nxv1i16:
23; CHECK-ZVKB:       # %bb.0:
24; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
25; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
26; CHECK-ZVKB-NEXT:    ret
27  %v = call <vscale x 1 x i16> @llvm.vp.bswap.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 %evl)
28  ret <vscale x 1 x i16> %v
29}
30
31define <vscale x 1 x i16> @vp_bswap_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
32; CHECK-LABEL: vp_bswap_nxv1i16_unmasked:
33; CHECK:       # %bb.0:
34; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
35; CHECK-NEXT:    vsrl.vi v9, v8, 8
36; CHECK-NEXT:    vsll.vi v8, v8, 8
37; CHECK-NEXT:    vor.vv v8, v8, v9
38; CHECK-NEXT:    ret
39;
40; CHECK-ZVKB-LABEL: vp_bswap_nxv1i16_unmasked:
41; CHECK-ZVKB:       # %bb.0:
42; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
43; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
44; CHECK-ZVKB-NEXT:    ret
45  %v = call <vscale x 1 x i16> @llvm.vp.bswap.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
46  ret <vscale x 1 x i16> %v
47}
48
49declare <vscale x 2 x i16> @llvm.vp.bswap.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i32)
50
51define <vscale x 2 x i16> @vp_bswap_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
52; CHECK-LABEL: vp_bswap_nxv2i16:
53; CHECK:       # %bb.0:
54; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
55; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
56; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
57; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
58; CHECK-NEXT:    ret
59;
60; CHECK-ZVKB-LABEL: vp_bswap_nxv2i16:
61; CHECK-ZVKB:       # %bb.0:
62; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
63; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
64; CHECK-ZVKB-NEXT:    ret
65  %v = call <vscale x 2 x i16> @llvm.vp.bswap.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 %evl)
66  ret <vscale x 2 x i16> %v
67}
68
69define <vscale x 2 x i16> @vp_bswap_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
70; CHECK-LABEL: vp_bswap_nxv2i16_unmasked:
71; CHECK:       # %bb.0:
72; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
73; CHECK-NEXT:    vsrl.vi v9, v8, 8
74; CHECK-NEXT:    vsll.vi v8, v8, 8
75; CHECK-NEXT:    vor.vv v8, v8, v9
76; CHECK-NEXT:    ret
77;
78; CHECK-ZVKB-LABEL: vp_bswap_nxv2i16_unmasked:
79; CHECK-ZVKB:       # %bb.0:
80; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
81; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
82; CHECK-ZVKB-NEXT:    ret
83  %v = call <vscale x 2 x i16> @llvm.vp.bswap.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
84  ret <vscale x 2 x i16> %v
85}
86
87declare <vscale x 4 x i16> @llvm.vp.bswap.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i1>, i32)
88
89define <vscale x 4 x i16> @vp_bswap_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
90; CHECK-LABEL: vp_bswap_nxv4i16:
91; CHECK:       # %bb.0:
92; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
93; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
94; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
95; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
96; CHECK-NEXT:    ret
97;
98; CHECK-ZVKB-LABEL: vp_bswap_nxv4i16:
99; CHECK-ZVKB:       # %bb.0:
100; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
101; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
102; CHECK-ZVKB-NEXT:    ret
103  %v = call <vscale x 4 x i16> @llvm.vp.bswap.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 %evl)
104  ret <vscale x 4 x i16> %v
105}
106
107define <vscale x 4 x i16> @vp_bswap_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
108; CHECK-LABEL: vp_bswap_nxv4i16_unmasked:
109; CHECK:       # %bb.0:
110; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
111; CHECK-NEXT:    vsrl.vi v9, v8, 8
112; CHECK-NEXT:    vsll.vi v8, v8, 8
113; CHECK-NEXT:    vor.vv v8, v8, v9
114; CHECK-NEXT:    ret
115;
116; CHECK-ZVKB-LABEL: vp_bswap_nxv4i16_unmasked:
117; CHECK-ZVKB:       # %bb.0:
118; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
119; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
120; CHECK-ZVKB-NEXT:    ret
121  %v = call <vscale x 4 x i16> @llvm.vp.bswap.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
122  ret <vscale x 4 x i16> %v
123}
124
125declare <vscale x 8 x i16> @llvm.vp.bswap.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i32)
126
127define <vscale x 8 x i16> @vp_bswap_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
128; CHECK-LABEL: vp_bswap_nxv8i16:
129; CHECK:       # %bb.0:
130; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
131; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
132; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
133; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
134; CHECK-NEXT:    ret
135;
136; CHECK-ZVKB-LABEL: vp_bswap_nxv8i16:
137; CHECK-ZVKB:       # %bb.0:
138; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
139; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
140; CHECK-ZVKB-NEXT:    ret
141  %v = call <vscale x 8 x i16> @llvm.vp.bswap.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 %evl)
142  ret <vscale x 8 x i16> %v
143}
144
145define <vscale x 8 x i16> @vp_bswap_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
146; CHECK-LABEL: vp_bswap_nxv8i16_unmasked:
147; CHECK:       # %bb.0:
148; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
149; CHECK-NEXT:    vsrl.vi v10, v8, 8
150; CHECK-NEXT:    vsll.vi v8, v8, 8
151; CHECK-NEXT:    vor.vv v8, v8, v10
152; CHECK-NEXT:    ret
153;
154; CHECK-ZVKB-LABEL: vp_bswap_nxv8i16_unmasked:
155; CHECK-ZVKB:       # %bb.0:
156; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
157; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
158; CHECK-ZVKB-NEXT:    ret
159  %v = call <vscale x 8 x i16> @llvm.vp.bswap.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
160  ret <vscale x 8 x i16> %v
161}
162
163declare <vscale x 16 x i16> @llvm.vp.bswap.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i1>, i32)
164
165define <vscale x 16 x i16> @vp_bswap_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
166; CHECK-LABEL: vp_bswap_nxv16i16:
167; CHECK:       # %bb.0:
168; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
169; CHECK-NEXT:    vsrl.vi v12, v8, 8, v0.t
170; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
171; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
172; CHECK-NEXT:    ret
173;
174; CHECK-ZVKB-LABEL: vp_bswap_nxv16i16:
175; CHECK-ZVKB:       # %bb.0:
176; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
177; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
178; CHECK-ZVKB-NEXT:    ret
179  %v = call <vscale x 16 x i16> @llvm.vp.bswap.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 %evl)
180  ret <vscale x 16 x i16> %v
181}
182
183define <vscale x 16 x i16> @vp_bswap_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
184; CHECK-LABEL: vp_bswap_nxv16i16_unmasked:
185; CHECK:       # %bb.0:
186; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
187; CHECK-NEXT:    vsrl.vi v12, v8, 8
188; CHECK-NEXT:    vsll.vi v8, v8, 8
189; CHECK-NEXT:    vor.vv v8, v8, v12
190; CHECK-NEXT:    ret
191;
192; CHECK-ZVKB-LABEL: vp_bswap_nxv16i16_unmasked:
193; CHECK-ZVKB:       # %bb.0:
194; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
195; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
196; CHECK-ZVKB-NEXT:    ret
197  %v = call <vscale x 16 x i16> @llvm.vp.bswap.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
198  ret <vscale x 16 x i16> %v
199}
200
201declare <vscale x 32 x i16> @llvm.vp.bswap.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i1>, i32)
202
203define <vscale x 32 x i16> @vp_bswap_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
204; CHECK-LABEL: vp_bswap_nxv32i16:
205; CHECK:       # %bb.0:
206; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
207; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
208; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
209; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
210; CHECK-NEXT:    ret
211;
212; CHECK-ZVKB-LABEL: vp_bswap_nxv32i16:
213; CHECK-ZVKB:       # %bb.0:
214; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
215; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
216; CHECK-ZVKB-NEXT:    ret
217  %v = call <vscale x 32 x i16> @llvm.vp.bswap.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 %evl)
218  ret <vscale x 32 x i16> %v
219}
220
221define <vscale x 32 x i16> @vp_bswap_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
222; CHECK-LABEL: vp_bswap_nxv32i16_unmasked:
223; CHECK:       # %bb.0:
224; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
225; CHECK-NEXT:    vsrl.vi v16, v8, 8
226; CHECK-NEXT:    vsll.vi v8, v8, 8
227; CHECK-NEXT:    vor.vv v8, v8, v16
228; CHECK-NEXT:    ret
229;
230; CHECK-ZVKB-LABEL: vp_bswap_nxv32i16_unmasked:
231; CHECK-ZVKB:       # %bb.0:
232; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
233; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
234; CHECK-ZVKB-NEXT:    ret
235  %v = call <vscale x 32 x i16> @llvm.vp.bswap.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> splat (i1 true), i32 %evl)
236  ret <vscale x 32 x i16> %v
237}
238
239declare <vscale x 1 x i32> @llvm.vp.bswap.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i1>, i32)
240
241define <vscale x 1 x i32> @vp_bswap_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
242; CHECK-LABEL: vp_bswap_nxv1i32:
243; CHECK:       # %bb.0:
244; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
245; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
246; CHECK-NEXT:    lui a0, 16
247; CHECK-NEXT:    addi a0, a0, -256
248; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
249; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
250; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
251; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
252; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
253; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
254; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
255; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
256; CHECK-NEXT:    ret
257;
258; CHECK-ZVKB-LABEL: vp_bswap_nxv1i32:
259; CHECK-ZVKB:       # %bb.0:
260; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
261; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
262; CHECK-ZVKB-NEXT:    ret
263  %v = call <vscale x 1 x i32> @llvm.vp.bswap.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 %evl)
264  ret <vscale x 1 x i32> %v
265}
266
267define <vscale x 1 x i32> @vp_bswap_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
268; CHECK-LABEL: vp_bswap_nxv1i32_unmasked:
269; CHECK:       # %bb.0:
270; CHECK-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
271; CHECK-NEXT:    vsrl.vi v9, v8, 8
272; CHECK-NEXT:    lui a0, 16
273; CHECK-NEXT:    vsrl.vi v10, v8, 24
274; CHECK-NEXT:    addi a0, a0, -256
275; CHECK-NEXT:    vand.vx v9, v9, a0
276; CHECK-NEXT:    vor.vv v9, v9, v10
277; CHECK-NEXT:    vand.vx v10, v8, a0
278; CHECK-NEXT:    vsll.vi v10, v10, 8
279; CHECK-NEXT:    vsll.vi v8, v8, 24
280; CHECK-NEXT:    vor.vv v8, v8, v10
281; CHECK-NEXT:    vor.vv v8, v8, v9
282; CHECK-NEXT:    ret
283;
284; CHECK-ZVKB-LABEL: vp_bswap_nxv1i32_unmasked:
285; CHECK-ZVKB:       # %bb.0:
286; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, mf2, ta, ma
287; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
288; CHECK-ZVKB-NEXT:    ret
289  %v = call <vscale x 1 x i32> @llvm.vp.bswap.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
290  ret <vscale x 1 x i32> %v
291}
292
293declare <vscale x 2 x i32> @llvm.vp.bswap.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32)
294
295define <vscale x 2 x i32> @vp_bswap_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
296; CHECK-LABEL: vp_bswap_nxv2i32:
297; CHECK:       # %bb.0:
298; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
299; CHECK-NEXT:    vsrl.vi v9, v8, 8, v0.t
300; CHECK-NEXT:    lui a0, 16
301; CHECK-NEXT:    addi a0, a0, -256
302; CHECK-NEXT:    vand.vx v9, v9, a0, v0.t
303; CHECK-NEXT:    vsrl.vi v10, v8, 24, v0.t
304; CHECK-NEXT:    vor.vv v9, v9, v10, v0.t
305; CHECK-NEXT:    vand.vx v10, v8, a0, v0.t
306; CHECK-NEXT:    vsll.vi v10, v10, 8, v0.t
307; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
308; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
309; CHECK-NEXT:    vor.vv v8, v8, v9, v0.t
310; CHECK-NEXT:    ret
311;
312; CHECK-ZVKB-LABEL: vp_bswap_nxv2i32:
313; CHECK-ZVKB:       # %bb.0:
314; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
315; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
316; CHECK-ZVKB-NEXT:    ret
317  %v = call <vscale x 2 x i32> @llvm.vp.bswap.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 %evl)
318  ret <vscale x 2 x i32> %v
319}
320
321define <vscale x 2 x i32> @vp_bswap_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
322; CHECK-LABEL: vp_bswap_nxv2i32_unmasked:
323; CHECK:       # %bb.0:
324; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
325; CHECK-NEXT:    vsrl.vi v9, v8, 8
326; CHECK-NEXT:    lui a0, 16
327; CHECK-NEXT:    vsrl.vi v10, v8, 24
328; CHECK-NEXT:    addi a0, a0, -256
329; CHECK-NEXT:    vand.vx v9, v9, a0
330; CHECK-NEXT:    vor.vv v9, v9, v10
331; CHECK-NEXT:    vand.vx v10, v8, a0
332; CHECK-NEXT:    vsll.vi v10, v10, 8
333; CHECK-NEXT:    vsll.vi v8, v8, 24
334; CHECK-NEXT:    vor.vv v8, v8, v10
335; CHECK-NEXT:    vor.vv v8, v8, v9
336; CHECK-NEXT:    ret
337;
338; CHECK-ZVKB-LABEL: vp_bswap_nxv2i32_unmasked:
339; CHECK-ZVKB:       # %bb.0:
340; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
341; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
342; CHECK-ZVKB-NEXT:    ret
343  %v = call <vscale x 2 x i32> @llvm.vp.bswap.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
344  ret <vscale x 2 x i32> %v
345}
346
347declare <vscale x 4 x i32> @llvm.vp.bswap.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
348
349define <vscale x 4 x i32> @vp_bswap_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
350; CHECK-LABEL: vp_bswap_nxv4i32:
351; CHECK:       # %bb.0:
352; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
353; CHECK-NEXT:    vsrl.vi v10, v8, 8, v0.t
354; CHECK-NEXT:    lui a0, 16
355; CHECK-NEXT:    addi a0, a0, -256
356; CHECK-NEXT:    vand.vx v10, v10, a0, v0.t
357; CHECK-NEXT:    vsrl.vi v12, v8, 24, v0.t
358; CHECK-NEXT:    vor.vv v10, v10, v12, v0.t
359; CHECK-NEXT:    vand.vx v12, v8, a0, v0.t
360; CHECK-NEXT:    vsll.vi v12, v12, 8, v0.t
361; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
362; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
363; CHECK-NEXT:    vor.vv v8, v8, v10, v0.t
364; CHECK-NEXT:    ret
365;
366; CHECK-ZVKB-LABEL: vp_bswap_nxv4i32:
367; CHECK-ZVKB:       # %bb.0:
368; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
369; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
370; CHECK-ZVKB-NEXT:    ret
371  %v = call <vscale x 4 x i32> @llvm.vp.bswap.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 %evl)
372  ret <vscale x 4 x i32> %v
373}
374
375define <vscale x 4 x i32> @vp_bswap_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
376; CHECK-LABEL: vp_bswap_nxv4i32_unmasked:
377; CHECK:       # %bb.0:
378; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
379; CHECK-NEXT:    vsrl.vi v10, v8, 8
380; CHECK-NEXT:    lui a0, 16
381; CHECK-NEXT:    vsrl.vi v12, v8, 24
382; CHECK-NEXT:    addi a0, a0, -256
383; CHECK-NEXT:    vand.vx v10, v10, a0
384; CHECK-NEXT:    vor.vv v10, v10, v12
385; CHECK-NEXT:    vand.vx v12, v8, a0
386; CHECK-NEXT:    vsll.vi v12, v12, 8
387; CHECK-NEXT:    vsll.vi v8, v8, 24
388; CHECK-NEXT:    vor.vv v8, v8, v12
389; CHECK-NEXT:    vor.vv v8, v8, v10
390; CHECK-NEXT:    ret
391;
392; CHECK-ZVKB-LABEL: vp_bswap_nxv4i32_unmasked:
393; CHECK-ZVKB:       # %bb.0:
394; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
395; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
396; CHECK-ZVKB-NEXT:    ret
397  %v = call <vscale x 4 x i32> @llvm.vp.bswap.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
398  ret <vscale x 4 x i32> %v
399}
400
401declare <vscale x 8 x i32> @llvm.vp.bswap.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i1>, i32)
402
403define <vscale x 8 x i32> @vp_bswap_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
404; CHECK-LABEL: vp_bswap_nxv8i32:
405; CHECK:       # %bb.0:
406; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
407; CHECK-NEXT:    vsrl.vi v12, v8, 8, v0.t
408; CHECK-NEXT:    lui a0, 16
409; CHECK-NEXT:    addi a0, a0, -256
410; CHECK-NEXT:    vand.vx v12, v12, a0, v0.t
411; CHECK-NEXT:    vsrl.vi v16, v8, 24, v0.t
412; CHECK-NEXT:    vor.vv v12, v12, v16, v0.t
413; CHECK-NEXT:    vand.vx v16, v8, a0, v0.t
414; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
415; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
416; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
417; CHECK-NEXT:    vor.vv v8, v8, v12, v0.t
418; CHECK-NEXT:    ret
419;
420; CHECK-ZVKB-LABEL: vp_bswap_nxv8i32:
421; CHECK-ZVKB:       # %bb.0:
422; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
423; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
424; CHECK-ZVKB-NEXT:    ret
425  %v = call <vscale x 8 x i32> @llvm.vp.bswap.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 %evl)
426  ret <vscale x 8 x i32> %v
427}
428
429define <vscale x 8 x i32> @vp_bswap_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
430; CHECK-LABEL: vp_bswap_nxv8i32_unmasked:
431; CHECK:       # %bb.0:
432; CHECK-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
433; CHECK-NEXT:    vsrl.vi v12, v8, 8
434; CHECK-NEXT:    lui a0, 16
435; CHECK-NEXT:    vsrl.vi v16, v8, 24
436; CHECK-NEXT:    addi a0, a0, -256
437; CHECK-NEXT:    vand.vx v12, v12, a0
438; CHECK-NEXT:    vor.vv v12, v12, v16
439; CHECK-NEXT:    vand.vx v16, v8, a0
440; CHECK-NEXT:    vsll.vi v16, v16, 8
441; CHECK-NEXT:    vsll.vi v8, v8, 24
442; CHECK-NEXT:    vor.vv v8, v8, v16
443; CHECK-NEXT:    vor.vv v8, v8, v12
444; CHECK-NEXT:    ret
445;
446; CHECK-ZVKB-LABEL: vp_bswap_nxv8i32_unmasked:
447; CHECK-ZVKB:       # %bb.0:
448; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m4, ta, ma
449; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
450; CHECK-ZVKB-NEXT:    ret
451  %v = call <vscale x 8 x i32> @llvm.vp.bswap.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
452  ret <vscale x 8 x i32> %v
453}
454
455declare <vscale x 16 x i32> @llvm.vp.bswap.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i1>, i32)
456
457define <vscale x 16 x i32> @vp_bswap_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
458; CHECK-LABEL: vp_bswap_nxv16i32:
459; CHECK:       # %bb.0:
460; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
461; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
462; CHECK-NEXT:    lui a0, 16
463; CHECK-NEXT:    addi a0, a0, -256
464; CHECK-NEXT:    vand.vx v16, v16, a0, v0.t
465; CHECK-NEXT:    vsrl.vi v24, v8, 24, v0.t
466; CHECK-NEXT:    vor.vv v16, v16, v24, v0.t
467; CHECK-NEXT:    vand.vx v24, v8, a0, v0.t
468; CHECK-NEXT:    vsll.vi v24, v24, 8, v0.t
469; CHECK-NEXT:    vsll.vi v8, v8, 24, v0.t
470; CHECK-NEXT:    vor.vv v8, v8, v24, v0.t
471; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
472; CHECK-NEXT:    ret
473;
474; CHECK-ZVKB-LABEL: vp_bswap_nxv16i32:
475; CHECK-ZVKB:       # %bb.0:
476; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
477; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
478; CHECK-ZVKB-NEXT:    ret
479  %v = call <vscale x 16 x i32> @llvm.vp.bswap.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 %evl)
480  ret <vscale x 16 x i32> %v
481}
482
483define <vscale x 16 x i32> @vp_bswap_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
484; CHECK-LABEL: vp_bswap_nxv16i32_unmasked:
485; CHECK:       # %bb.0:
486; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
487; CHECK-NEXT:    vsrl.vi v16, v8, 8
488; CHECK-NEXT:    lui a0, 16
489; CHECK-NEXT:    vsrl.vi v24, v8, 24
490; CHECK-NEXT:    addi a0, a0, -256
491; CHECK-NEXT:    vand.vx v16, v16, a0
492; CHECK-NEXT:    vor.vv v16, v16, v24
493; CHECK-NEXT:    vand.vx v24, v8, a0
494; CHECK-NEXT:    vsll.vi v24, v24, 8
495; CHECK-NEXT:    vsll.vi v8, v8, 24
496; CHECK-NEXT:    vor.vv v8, v8, v24
497; CHECK-NEXT:    vor.vv v8, v8, v16
498; CHECK-NEXT:    ret
499;
500; CHECK-ZVKB-LABEL: vp_bswap_nxv16i32_unmasked:
501; CHECK-ZVKB:       # %bb.0:
502; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
503; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
504; CHECK-ZVKB-NEXT:    ret
505  %v = call <vscale x 16 x i32> @llvm.vp.bswap.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> splat (i1 true), i32 %evl)
506  ret <vscale x 16 x i32> %v
507}
508
509declare <vscale x 1 x i64> @llvm.vp.bswap.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i1>, i32)
510
511define <vscale x 1 x i64> @vp_bswap_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
512; RV32-LABEL: vp_bswap_nxv1i64:
513; RV32:       # %bb.0:
514; RV32-NEXT:    addi sp, sp, -16
515; RV32-NEXT:    .cfi_def_cfa_offset 16
516; RV32-NEXT:    lui a1, 1044480
517; RV32-NEXT:    li a2, 56
518; RV32-NEXT:    lui a3, 16
519; RV32-NEXT:    li a4, 40
520; RV32-NEXT:    lui a5, 4080
521; RV32-NEXT:    addi a6, sp, 8
522; RV32-NEXT:    sw a1, 8(sp)
523; RV32-NEXT:    sw zero, 12(sp)
524; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
525; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
526; RV32-NEXT:    addi a0, a3, -256
527; RV32-NEXT:    vand.vx v10, v8, a0, v0.t
528; RV32-NEXT:    vlse64.v v11, (a6), zero
529; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
530; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
531; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
532; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
533; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
534; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
535; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
536; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
537; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
538; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
539; RV32-NEXT:    vand.vx v12, v12, a0, v0.t
540; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
541; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
542; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
543; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
544; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
545; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
546; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
547; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
548; RV32-NEXT:    addi sp, sp, 16
549; RV32-NEXT:    .cfi_def_cfa_offset 0
550; RV32-NEXT:    ret
551;
552; RV64-LABEL: vp_bswap_nxv1i64:
553; RV64:       # %bb.0:
554; RV64-NEXT:    lui a1, 4080
555; RV64-NEXT:    li a2, 255
556; RV64-NEXT:    li a3, 56
557; RV64-NEXT:    lui a4, 16
558; RV64-NEXT:    li a5, 40
559; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
560; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
561; RV64-NEXT:    slli a2, a2, 24
562; RV64-NEXT:    addiw a0, a4, -256
563; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
564; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
565; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
566; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
567; RV64-NEXT:    vsll.vx v10, v8, a3, v0.t
568; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
569; RV64-NEXT:    vsll.vx v11, v11, a5, v0.t
570; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
571; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
572; RV64-NEXT:    vsrl.vx v10, v8, a3, v0.t
573; RV64-NEXT:    vsrl.vx v11, v8, a5, v0.t
574; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
575; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
576; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
577; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
578; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
579; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
580; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
581; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
582; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
583; RV64-NEXT:    ret
584;
585; CHECK-ZVKB-LABEL: vp_bswap_nxv1i64:
586; CHECK-ZVKB:       # %bb.0:
587; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
588; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
589; CHECK-ZVKB-NEXT:    ret
590  %v = call <vscale x 1 x i64> @llvm.vp.bswap.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 %evl)
591  ret <vscale x 1 x i64> %v
592}
593
594define <vscale x 1 x i64> @vp_bswap_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
595; RV32-LABEL: vp_bswap_nxv1i64_unmasked:
596; RV32:       # %bb.0:
597; RV32-NEXT:    addi sp, sp, -16
598; RV32-NEXT:    .cfi_def_cfa_offset 16
599; RV32-NEXT:    lui a1, 1044480
600; RV32-NEXT:    li a2, 56
601; RV32-NEXT:    lui a3, 16
602; RV32-NEXT:    li a4, 40
603; RV32-NEXT:    lui a5, 4080
604; RV32-NEXT:    addi a6, sp, 8
605; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
606; RV32-NEXT:    vsrl.vi v9, v8, 24
607; RV32-NEXT:    sw a1, 8(sp)
608; RV32-NEXT:    sw zero, 12(sp)
609; RV32-NEXT:    vsll.vx v10, v8, a2
610; RV32-NEXT:    addi a0, a3, -256
611; RV32-NEXT:    vsrl.vx v11, v8, a2
612; RV32-NEXT:    vsrl.vx v12, v8, a4
613; RV32-NEXT:    vand.vx v13, v8, a0
614; RV32-NEXT:    vand.vx v12, v12, a0
615; RV32-NEXT:    vor.vv v11, v12, v11
616; RV32-NEXT:    vlse64.v v12, (a6), zero
617; RV32-NEXT:    vsll.vx v13, v13, a4
618; RV32-NEXT:    vor.vv v10, v10, v13
619; RV32-NEXT:    vsrl.vi v13, v8, 8
620; RV32-NEXT:    vand.vx v9, v9, a5
621; RV32-NEXT:    vand.vv v13, v13, v12
622; RV32-NEXT:    vor.vv v9, v13, v9
623; RV32-NEXT:    vand.vv v12, v8, v12
624; RV32-NEXT:    vand.vx v8, v8, a5
625; RV32-NEXT:    vsll.vi v8, v8, 24
626; RV32-NEXT:    vsll.vi v12, v12, 8
627; RV32-NEXT:    vor.vv v8, v8, v12
628; RV32-NEXT:    vor.vv v8, v10, v8
629; RV32-NEXT:    vor.vv v9, v9, v11
630; RV32-NEXT:    vor.vv v8, v8, v9
631; RV32-NEXT:    addi sp, sp, 16
632; RV32-NEXT:    .cfi_def_cfa_offset 0
633; RV32-NEXT:    ret
634;
635; RV64-LABEL: vp_bswap_nxv1i64_unmasked:
636; RV64:       # %bb.0:
637; RV64-NEXT:    lui a1, 4080
638; RV64-NEXT:    li a2, 255
639; RV64-NEXT:    li a3, 56
640; RV64-NEXT:    lui a4, 16
641; RV64-NEXT:    li a5, 40
642; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
643; RV64-NEXT:    vsrl.vi v9, v8, 24
644; RV64-NEXT:    vsrl.vi v10, v8, 8
645; RV64-NEXT:    addiw a0, a4, -256
646; RV64-NEXT:    vsrl.vx v11, v8, a3
647; RV64-NEXT:    vsrl.vx v12, v8, a5
648; RV64-NEXT:    vand.vx v12, v12, a0
649; RV64-NEXT:    vor.vv v11, v12, v11
650; RV64-NEXT:    vand.vx v12, v8, a1
651; RV64-NEXT:    slli a2, a2, 24
652; RV64-NEXT:    vand.vx v9, v9, a1
653; RV64-NEXT:    vsll.vi v12, v12, 24
654; RV64-NEXT:    vand.vx v10, v10, a2
655; RV64-NEXT:    vor.vv v9, v10, v9
656; RV64-NEXT:    vand.vx v10, v8, a2
657; RV64-NEXT:    vsll.vi v10, v10, 8
658; RV64-NEXT:    vor.vv v10, v12, v10
659; RV64-NEXT:    vsll.vx v12, v8, a3
660; RV64-NEXT:    vand.vx v8, v8, a0
661; RV64-NEXT:    vsll.vx v8, v8, a5
662; RV64-NEXT:    vor.vv v8, v12, v8
663; RV64-NEXT:    vor.vv v8, v8, v10
664; RV64-NEXT:    vor.vv v9, v9, v11
665; RV64-NEXT:    vor.vv v8, v8, v9
666; RV64-NEXT:    ret
667;
668; CHECK-ZVKB-LABEL: vp_bswap_nxv1i64_unmasked:
669; CHECK-ZVKB:       # %bb.0:
670; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
671; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
672; CHECK-ZVKB-NEXT:    ret
673  %v = call <vscale x 1 x i64> @llvm.vp.bswap.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> splat (i1 true), i32 %evl)
674  ret <vscale x 1 x i64> %v
675}
676
677declare <vscale x 2 x i64> @llvm.vp.bswap.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i32)
678
679define <vscale x 2 x i64> @vp_bswap_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
680; RV32-LABEL: vp_bswap_nxv2i64:
681; RV32:       # %bb.0:
682; RV32-NEXT:    addi sp, sp, -16
683; RV32-NEXT:    .cfi_def_cfa_offset 16
684; RV32-NEXT:    lui a1, 1044480
685; RV32-NEXT:    li a2, 56
686; RV32-NEXT:    lui a3, 16
687; RV32-NEXT:    li a4, 40
688; RV32-NEXT:    lui a5, 4080
689; RV32-NEXT:    addi a6, sp, 8
690; RV32-NEXT:    sw a1, 8(sp)
691; RV32-NEXT:    sw zero, 12(sp)
692; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
693; RV32-NEXT:    vsll.vx v10, v8, a2, v0.t
694; RV32-NEXT:    addi a0, a3, -256
695; RV32-NEXT:    vand.vx v12, v8, a0, v0.t
696; RV32-NEXT:    vlse64.v v14, (a6), zero
697; RV32-NEXT:    vsll.vx v12, v12, a4, v0.t
698; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
699; RV32-NEXT:    vand.vx v12, v8, a5, v0.t
700; RV32-NEXT:    vsll.vi v12, v12, 24, v0.t
701; RV32-NEXT:    vand.vv v16, v8, v14, v0.t
702; RV32-NEXT:    vsll.vi v16, v16, 8, v0.t
703; RV32-NEXT:    vor.vv v12, v12, v16, v0.t
704; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
705; RV32-NEXT:    vsrl.vx v12, v8, a2, v0.t
706; RV32-NEXT:    vsrl.vx v16, v8, a4, v0.t
707; RV32-NEXT:    vand.vx v16, v16, a0, v0.t
708; RV32-NEXT:    vor.vv v12, v16, v12, v0.t
709; RV32-NEXT:    vsrl.vi v16, v8, 24, v0.t
710; RV32-NEXT:    vand.vx v16, v16, a5, v0.t
711; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
712; RV32-NEXT:    vand.vv v8, v8, v14, v0.t
713; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
714; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
715; RV32-NEXT:    vor.vv v8, v10, v8, v0.t
716; RV32-NEXT:    addi sp, sp, 16
717; RV32-NEXT:    .cfi_def_cfa_offset 0
718; RV32-NEXT:    ret
719;
720; RV64-LABEL: vp_bswap_nxv2i64:
721; RV64:       # %bb.0:
722; RV64-NEXT:    lui a1, 4080
723; RV64-NEXT:    li a2, 255
724; RV64-NEXT:    li a3, 56
725; RV64-NEXT:    lui a4, 16
726; RV64-NEXT:    li a5, 40
727; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
728; RV64-NEXT:    vand.vx v10, v8, a1, v0.t
729; RV64-NEXT:    slli a2, a2, 24
730; RV64-NEXT:    addiw a0, a4, -256
731; RV64-NEXT:    vsll.vi v10, v10, 24, v0.t
732; RV64-NEXT:    vand.vx v12, v8, a2, v0.t
733; RV64-NEXT:    vsll.vi v12, v12, 8, v0.t
734; RV64-NEXT:    vor.vv v10, v10, v12, v0.t
735; RV64-NEXT:    vsll.vx v12, v8, a3, v0.t
736; RV64-NEXT:    vand.vx v14, v8, a0, v0.t
737; RV64-NEXT:    vsll.vx v14, v14, a5, v0.t
738; RV64-NEXT:    vor.vv v12, v12, v14, v0.t
739; RV64-NEXT:    vor.vv v10, v12, v10, v0.t
740; RV64-NEXT:    vsrl.vx v12, v8, a3, v0.t
741; RV64-NEXT:    vsrl.vx v14, v8, a5, v0.t
742; RV64-NEXT:    vand.vx v14, v14, a0, v0.t
743; RV64-NEXT:    vor.vv v12, v14, v12, v0.t
744; RV64-NEXT:    vsrl.vi v14, v8, 24, v0.t
745; RV64-NEXT:    vand.vx v14, v14, a1, v0.t
746; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
747; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
748; RV64-NEXT:    vor.vv v8, v8, v14, v0.t
749; RV64-NEXT:    vor.vv v8, v8, v12, v0.t
750; RV64-NEXT:    vor.vv v8, v10, v8, v0.t
751; RV64-NEXT:    ret
752;
753; CHECK-ZVKB-LABEL: vp_bswap_nxv2i64:
754; CHECK-ZVKB:       # %bb.0:
755; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
756; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
757; CHECK-ZVKB-NEXT:    ret
758  %v = call <vscale x 2 x i64> @llvm.vp.bswap.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 %evl)
759  ret <vscale x 2 x i64> %v
760}
761
762define <vscale x 2 x i64> @vp_bswap_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
763; RV32-LABEL: vp_bswap_nxv2i64_unmasked:
764; RV32:       # %bb.0:
765; RV32-NEXT:    addi sp, sp, -16
766; RV32-NEXT:    .cfi_def_cfa_offset 16
767; RV32-NEXT:    lui a1, 1044480
768; RV32-NEXT:    li a2, 56
769; RV32-NEXT:    lui a3, 16
770; RV32-NEXT:    li a4, 40
771; RV32-NEXT:    lui a5, 4080
772; RV32-NEXT:    addi a6, sp, 8
773; RV32-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
774; RV32-NEXT:    vsrl.vi v10, v8, 24
775; RV32-NEXT:    sw a1, 8(sp)
776; RV32-NEXT:    sw zero, 12(sp)
777; RV32-NEXT:    vsll.vx v12, v8, a2
778; RV32-NEXT:    addi a0, a3, -256
779; RV32-NEXT:    vsrl.vx v14, v8, a2
780; RV32-NEXT:    vsrl.vx v16, v8, a4
781; RV32-NEXT:    vand.vx v18, v8, a0
782; RV32-NEXT:    vand.vx v16, v16, a0
783; RV32-NEXT:    vor.vv v14, v16, v14
784; RV32-NEXT:    vlse64.v v16, (a6), zero
785; RV32-NEXT:    vsll.vx v18, v18, a4
786; RV32-NEXT:    vor.vv v12, v12, v18
787; RV32-NEXT:    vsrl.vi v18, v8, 8
788; RV32-NEXT:    vand.vx v10, v10, a5
789; RV32-NEXT:    vand.vv v18, v18, v16
790; RV32-NEXT:    vor.vv v10, v18, v10
791; RV32-NEXT:    vand.vv v16, v8, v16
792; RV32-NEXT:    vand.vx v8, v8, a5
793; RV32-NEXT:    vsll.vi v8, v8, 24
794; RV32-NEXT:    vsll.vi v16, v16, 8
795; RV32-NEXT:    vor.vv v8, v8, v16
796; RV32-NEXT:    vor.vv v8, v12, v8
797; RV32-NEXT:    vor.vv v10, v10, v14
798; RV32-NEXT:    vor.vv v8, v8, v10
799; RV32-NEXT:    addi sp, sp, 16
800; RV32-NEXT:    .cfi_def_cfa_offset 0
801; RV32-NEXT:    ret
802;
803; RV64-LABEL: vp_bswap_nxv2i64_unmasked:
804; RV64:       # %bb.0:
805; RV64-NEXT:    lui a1, 4080
806; RV64-NEXT:    li a2, 255
807; RV64-NEXT:    li a3, 56
808; RV64-NEXT:    lui a4, 16
809; RV64-NEXT:    li a5, 40
810; RV64-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
811; RV64-NEXT:    vsrl.vi v10, v8, 24
812; RV64-NEXT:    vsrl.vi v12, v8, 8
813; RV64-NEXT:    addiw a0, a4, -256
814; RV64-NEXT:    vsrl.vx v14, v8, a3
815; RV64-NEXT:    vsrl.vx v16, v8, a5
816; RV64-NEXT:    vand.vx v16, v16, a0
817; RV64-NEXT:    vor.vv v14, v16, v14
818; RV64-NEXT:    vand.vx v16, v8, a1
819; RV64-NEXT:    slli a2, a2, 24
820; RV64-NEXT:    vand.vx v10, v10, a1
821; RV64-NEXT:    vsll.vi v16, v16, 24
822; RV64-NEXT:    vand.vx v12, v12, a2
823; RV64-NEXT:    vor.vv v10, v12, v10
824; RV64-NEXT:    vand.vx v12, v8, a2
825; RV64-NEXT:    vsll.vi v12, v12, 8
826; RV64-NEXT:    vor.vv v12, v16, v12
827; RV64-NEXT:    vsll.vx v16, v8, a3
828; RV64-NEXT:    vand.vx v8, v8, a0
829; RV64-NEXT:    vsll.vx v8, v8, a5
830; RV64-NEXT:    vor.vv v8, v16, v8
831; RV64-NEXT:    vor.vv v8, v8, v12
832; RV64-NEXT:    vor.vv v10, v10, v14
833; RV64-NEXT:    vor.vv v8, v8, v10
834; RV64-NEXT:    ret
835;
836; CHECK-ZVKB-LABEL: vp_bswap_nxv2i64_unmasked:
837; CHECK-ZVKB:       # %bb.0:
838; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m2, ta, ma
839; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
840; CHECK-ZVKB-NEXT:    ret
841  %v = call <vscale x 2 x i64> @llvm.vp.bswap.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> splat (i1 true), i32 %evl)
842  ret <vscale x 2 x i64> %v
843}
844
845declare <vscale x 4 x i64> @llvm.vp.bswap.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i1>, i32)
846
847define <vscale x 4 x i64> @vp_bswap_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
848; RV32-LABEL: vp_bswap_nxv4i64:
849; RV32:       # %bb.0:
850; RV32-NEXT:    addi sp, sp, -16
851; RV32-NEXT:    .cfi_def_cfa_offset 16
852; RV32-NEXT:    lui a1, 1044480
853; RV32-NEXT:    li a2, 56
854; RV32-NEXT:    lui a3, 16
855; RV32-NEXT:    li a4, 40
856; RV32-NEXT:    lui a5, 4080
857; RV32-NEXT:    addi a6, sp, 8
858; RV32-NEXT:    sw a1, 8(sp)
859; RV32-NEXT:    sw zero, 12(sp)
860; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
861; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
862; RV32-NEXT:    addi a0, a3, -256
863; RV32-NEXT:    vand.vx v20, v8, a0, v0.t
864; RV32-NEXT:    vlse64.v v12, (a6), zero
865; RV32-NEXT:    vsll.vx v20, v20, a4, v0.t
866; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
867; RV32-NEXT:    vand.vx v20, v8, a5, v0.t
868; RV32-NEXT:    vsll.vi v20, v20, 24, v0.t
869; RV32-NEXT:    vand.vv v24, v8, v12, v0.t
870; RV32-NEXT:    vsll.vi v24, v24, 8, v0.t
871; RV32-NEXT:    vor.vv v20, v20, v24, v0.t
872; RV32-NEXT:    vor.vv v16, v16, v20, v0.t
873; RV32-NEXT:    vsrl.vx v20, v8, a2, v0.t
874; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
875; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
876; RV32-NEXT:    vor.vv v20, v24, v20, v0.t
877; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
878; RV32-NEXT:    vand.vx v24, v24, a5, v0.t
879; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
880; RV32-NEXT:    vand.vv v8, v8, v12, v0.t
881; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
882; RV32-NEXT:    vor.vv v8, v8, v20, v0.t
883; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
884; RV32-NEXT:    addi sp, sp, 16
885; RV32-NEXT:    .cfi_def_cfa_offset 0
886; RV32-NEXT:    ret
887;
888; RV64-LABEL: vp_bswap_nxv4i64:
889; RV64:       # %bb.0:
890; RV64-NEXT:    lui a1, 4080
891; RV64-NEXT:    li a2, 255
892; RV64-NEXT:    li a3, 56
893; RV64-NEXT:    lui a4, 16
894; RV64-NEXT:    li a5, 40
895; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
896; RV64-NEXT:    vand.vx v12, v8, a1, v0.t
897; RV64-NEXT:    slli a2, a2, 24
898; RV64-NEXT:    addiw a0, a4, -256
899; RV64-NEXT:    vsll.vi v12, v12, 24, v0.t
900; RV64-NEXT:    vand.vx v16, v8, a2, v0.t
901; RV64-NEXT:    vsll.vi v16, v16, 8, v0.t
902; RV64-NEXT:    vor.vv v12, v12, v16, v0.t
903; RV64-NEXT:    vsll.vx v16, v8, a3, v0.t
904; RV64-NEXT:    vand.vx v20, v8, a0, v0.t
905; RV64-NEXT:    vsll.vx v20, v20, a5, v0.t
906; RV64-NEXT:    vor.vv v16, v16, v20, v0.t
907; RV64-NEXT:    vor.vv v12, v16, v12, v0.t
908; RV64-NEXT:    vsrl.vx v16, v8, a3, v0.t
909; RV64-NEXT:    vsrl.vx v20, v8, a5, v0.t
910; RV64-NEXT:    vand.vx v20, v20, a0, v0.t
911; RV64-NEXT:    vor.vv v16, v20, v16, v0.t
912; RV64-NEXT:    vsrl.vi v20, v8, 24, v0.t
913; RV64-NEXT:    vand.vx v20, v20, a1, v0.t
914; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
915; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
916; RV64-NEXT:    vor.vv v8, v8, v20, v0.t
917; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
918; RV64-NEXT:    vor.vv v8, v12, v8, v0.t
919; RV64-NEXT:    ret
920;
921; CHECK-ZVKB-LABEL: vp_bswap_nxv4i64:
922; CHECK-ZVKB:       # %bb.0:
923; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
924; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
925; CHECK-ZVKB-NEXT:    ret
926  %v = call <vscale x 4 x i64> @llvm.vp.bswap.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 %evl)
927  ret <vscale x 4 x i64> %v
928}
929
930define <vscale x 4 x i64> @vp_bswap_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
931; RV32-LABEL: vp_bswap_nxv4i64_unmasked:
932; RV32:       # %bb.0:
933; RV32-NEXT:    addi sp, sp, -16
934; RV32-NEXT:    .cfi_def_cfa_offset 16
935; RV32-NEXT:    lui a1, 1044480
936; RV32-NEXT:    li a2, 56
937; RV32-NEXT:    lui a3, 16
938; RV32-NEXT:    li a4, 40
939; RV32-NEXT:    lui a5, 4080
940; RV32-NEXT:    addi a6, sp, 8
941; RV32-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
942; RV32-NEXT:    vsrl.vi v12, v8, 24
943; RV32-NEXT:    sw a1, 8(sp)
944; RV32-NEXT:    sw zero, 12(sp)
945; RV32-NEXT:    vsll.vx v16, v8, a2
946; RV32-NEXT:    addi a0, a3, -256
947; RV32-NEXT:    vsrl.vx v20, v8, a2
948; RV32-NEXT:    vsrl.vx v24, v8, a4
949; RV32-NEXT:    vand.vx v28, v8, a0
950; RV32-NEXT:    vand.vx v24, v24, a0
951; RV32-NEXT:    vor.vv v20, v24, v20
952; RV32-NEXT:    vlse64.v v24, (a6), zero
953; RV32-NEXT:    vsll.vx v28, v28, a4
954; RV32-NEXT:    vor.vv v16, v16, v28
955; RV32-NEXT:    vsrl.vi v28, v8, 8
956; RV32-NEXT:    vand.vx v12, v12, a5
957; RV32-NEXT:    vand.vv v28, v28, v24
958; RV32-NEXT:    vor.vv v12, v28, v12
959; RV32-NEXT:    vand.vv v24, v8, v24
960; RV32-NEXT:    vand.vx v8, v8, a5
961; RV32-NEXT:    vsll.vi v8, v8, 24
962; RV32-NEXT:    vsll.vi v24, v24, 8
963; RV32-NEXT:    vor.vv v8, v8, v24
964; RV32-NEXT:    vor.vv v8, v16, v8
965; RV32-NEXT:    vor.vv v12, v12, v20
966; RV32-NEXT:    vor.vv v8, v8, v12
967; RV32-NEXT:    addi sp, sp, 16
968; RV32-NEXT:    .cfi_def_cfa_offset 0
969; RV32-NEXT:    ret
970;
971; RV64-LABEL: vp_bswap_nxv4i64_unmasked:
972; RV64:       # %bb.0:
973; RV64-NEXT:    lui a1, 4080
974; RV64-NEXT:    li a2, 255
975; RV64-NEXT:    li a3, 56
976; RV64-NEXT:    lui a4, 16
977; RV64-NEXT:    li a5, 40
978; RV64-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
979; RV64-NEXT:    vsrl.vi v12, v8, 24
980; RV64-NEXT:    vsrl.vi v16, v8, 8
981; RV64-NEXT:    addiw a0, a4, -256
982; RV64-NEXT:    vsrl.vx v20, v8, a3
983; RV64-NEXT:    vsrl.vx v24, v8, a5
984; RV64-NEXT:    vand.vx v24, v24, a0
985; RV64-NEXT:    vor.vv v20, v24, v20
986; RV64-NEXT:    vand.vx v24, v8, a1
987; RV64-NEXT:    slli a2, a2, 24
988; RV64-NEXT:    vand.vx v12, v12, a1
989; RV64-NEXT:    vsll.vi v24, v24, 24
990; RV64-NEXT:    vand.vx v16, v16, a2
991; RV64-NEXT:    vor.vv v12, v16, v12
992; RV64-NEXT:    vand.vx v16, v8, a2
993; RV64-NEXT:    vsll.vi v16, v16, 8
994; RV64-NEXT:    vor.vv v16, v24, v16
995; RV64-NEXT:    vsll.vx v24, v8, a3
996; RV64-NEXT:    vand.vx v8, v8, a0
997; RV64-NEXT:    vsll.vx v8, v8, a5
998; RV64-NEXT:    vor.vv v8, v24, v8
999; RV64-NEXT:    vor.vv v8, v8, v16
1000; RV64-NEXT:    vor.vv v12, v12, v20
1001; RV64-NEXT:    vor.vv v8, v8, v12
1002; RV64-NEXT:    ret
1003;
1004; CHECK-ZVKB-LABEL: vp_bswap_nxv4i64_unmasked:
1005; CHECK-ZVKB:       # %bb.0:
1006; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m4, ta, ma
1007; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
1008; CHECK-ZVKB-NEXT:    ret
1009  %v = call <vscale x 4 x i64> @llvm.vp.bswap.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> splat (i1 true), i32 %evl)
1010  ret <vscale x 4 x i64> %v
1011}
1012
1013declare <vscale x 7 x i64> @llvm.vp.bswap.nxv7i64(<vscale x 7 x i64>, <vscale x 7 x i1>, i32)
1014
1015define <vscale x 7 x i64> @vp_bswap_nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 zeroext %evl) {
1016; RV32-LABEL: vp_bswap_nxv7i64:
1017; RV32:       # %bb.0:
1018; RV32-NEXT:    addi sp, sp, -16
1019; RV32-NEXT:    .cfi_def_cfa_offset 16
1020; RV32-NEXT:    csrr a1, vlenb
1021; RV32-NEXT:    li a2, 24
1022; RV32-NEXT:    mul a1, a1, a2
1023; RV32-NEXT:    sub sp, sp, a1
1024; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
1025; RV32-NEXT:    lui a1, 1044480
1026; RV32-NEXT:    li a2, 56
1027; RV32-NEXT:    lui a3, 16
1028; RV32-NEXT:    li a4, 40
1029; RV32-NEXT:    addi a5, sp, 8
1030; RV32-NEXT:    sw a1, 8(sp)
1031; RV32-NEXT:    sw zero, 12(sp)
1032; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1033; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
1034; RV32-NEXT:    addi a0, a3, -256
1035; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
1036; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
1037; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
1038; RV32-NEXT:    csrr a1, vlenb
1039; RV32-NEXT:    slli a1, a1, 4
1040; RV32-NEXT:    add a1, sp, a1
1041; RV32-NEXT:    addi a1, a1, 16
1042; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
1043; RV32-NEXT:    vlse64.v v16, (a5), zero
1044; RV32-NEXT:    csrr a1, vlenb
1045; RV32-NEXT:    slli a1, a1, 3
1046; RV32-NEXT:    add a1, sp, a1
1047; RV32-NEXT:    addi a1, a1, 16
1048; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
1049; RV32-NEXT:    lui a1, 4080
1050; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
1051; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
1052; RV32-NEXT:    addi a3, sp, 16
1053; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
1054; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
1055; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
1056; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
1057; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
1058; RV32-NEXT:    csrr a3, vlenb
1059; RV32-NEXT:    slli a3, a3, 4
1060; RV32-NEXT:    add a3, sp, a3
1061; RV32-NEXT:    addi a3, a3, 16
1062; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
1063; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
1064; RV32-NEXT:    csrr a3, vlenb
1065; RV32-NEXT:    slli a3, a3, 4
1066; RV32-NEXT:    add a3, sp, a3
1067; RV32-NEXT:    addi a3, a3, 16
1068; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
1069; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
1070; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
1071; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
1072; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
1073; RV32-NEXT:    addi a0, sp, 16
1074; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
1075; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
1076; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
1077; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
1078; RV32-NEXT:    csrr a0, vlenb
1079; RV32-NEXT:    slli a0, a0, 3
1080; RV32-NEXT:    add a0, sp, a0
1081; RV32-NEXT:    addi a0, a0, 16
1082; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1083; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
1084; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
1085; RV32-NEXT:    addi a0, sp, 16
1086; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1087; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
1088; RV32-NEXT:    csrr a0, vlenb
1089; RV32-NEXT:    slli a0, a0, 4
1090; RV32-NEXT:    add a0, sp, a0
1091; RV32-NEXT:    addi a0, a0, 16
1092; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1093; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
1094; RV32-NEXT:    csrr a0, vlenb
1095; RV32-NEXT:    li a1, 24
1096; RV32-NEXT:    mul a0, a0, a1
1097; RV32-NEXT:    add sp, sp, a0
1098; RV32-NEXT:    .cfi_def_cfa sp, 16
1099; RV32-NEXT:    addi sp, sp, 16
1100; RV32-NEXT:    .cfi_def_cfa_offset 0
1101; RV32-NEXT:    ret
1102;
1103; RV64-LABEL: vp_bswap_nxv7i64:
1104; RV64:       # %bb.0:
1105; RV64-NEXT:    addi sp, sp, -16
1106; RV64-NEXT:    .cfi_def_cfa_offset 16
1107; RV64-NEXT:    csrr a1, vlenb
1108; RV64-NEXT:    slli a1, a1, 3
1109; RV64-NEXT:    sub sp, sp, a1
1110; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
1111; RV64-NEXT:    lui a1, 4080
1112; RV64-NEXT:    li a2, 255
1113; RV64-NEXT:    li a3, 56
1114; RV64-NEXT:    lui a4, 16
1115; RV64-NEXT:    li a5, 40
1116; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1117; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
1118; RV64-NEXT:    slli a2, a2, 24
1119; RV64-NEXT:    addiw a0, a4, -256
1120; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
1121; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
1122; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
1123; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
1124; RV64-NEXT:    addi a4, sp, 16
1125; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
1126; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
1127; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
1128; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
1129; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
1130; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
1131; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
1132; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
1133; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
1134; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
1135; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
1136; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
1137; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
1138; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
1139; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
1140; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
1141; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
1142; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
1143; RV64-NEXT:    addi a0, sp, 16
1144; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1145; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
1146; RV64-NEXT:    csrr a0, vlenb
1147; RV64-NEXT:    slli a0, a0, 3
1148; RV64-NEXT:    add sp, sp, a0
1149; RV64-NEXT:    .cfi_def_cfa sp, 16
1150; RV64-NEXT:    addi sp, sp, 16
1151; RV64-NEXT:    .cfi_def_cfa_offset 0
1152; RV64-NEXT:    ret
1153;
1154; CHECK-ZVKB-LABEL: vp_bswap_nxv7i64:
1155; CHECK-ZVKB:       # %bb.0:
1156; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1157; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
1158; CHECK-ZVKB-NEXT:    ret
1159  %v = call <vscale x 7 x i64> @llvm.vp.bswap.nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> %m, i32 %evl)
1160  ret <vscale x 7 x i64> %v
1161}
1162
1163define <vscale x 7 x i64> @vp_bswap_nxv7i64_unmasked(<vscale x 7 x i64> %va, i32 zeroext %evl) {
1164; RV32-LABEL: vp_bswap_nxv7i64_unmasked:
1165; RV32:       # %bb.0:
1166; RV32-NEXT:    addi sp, sp, -16
1167; RV32-NEXT:    .cfi_def_cfa_offset 16
1168; RV32-NEXT:    csrr a1, vlenb
1169; RV32-NEXT:    slli a1, a1, 4
1170; RV32-NEXT:    sub sp, sp, a1
1171; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1172; RV32-NEXT:    lui a1, 1044480
1173; RV32-NEXT:    li a2, 56
1174; RV32-NEXT:    lui a3, 16
1175; RV32-NEXT:    li a4, 40
1176; RV32-NEXT:    lui a5, 4080
1177; RV32-NEXT:    addi a6, sp, 8
1178; RV32-NEXT:    sw a1, 8(sp)
1179; RV32-NEXT:    sw zero, 12(sp)
1180; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1181; RV32-NEXT:    vsll.vx v24, v8, a2
1182; RV32-NEXT:    addi a0, a3, -256
1183; RV32-NEXT:    vsrl.vx v16, v8, a2
1184; RV32-NEXT:    vsrl.vx v0, v8, a4
1185; RV32-NEXT:    vand.vx v0, v0, a0
1186; RV32-NEXT:    vor.vv v16, v0, v16
1187; RV32-NEXT:    csrr a1, vlenb
1188; RV32-NEXT:    slli a1, a1, 3
1189; RV32-NEXT:    add a1, sp, a1
1190; RV32-NEXT:    addi a1, a1, 16
1191; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
1192; RV32-NEXT:    vand.vx v0, v8, a0
1193; RV32-NEXT:    vsll.vx v0, v0, a4
1194; RV32-NEXT:    vor.vv v16, v24, v0
1195; RV32-NEXT:    addi a0, sp, 16
1196; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
1197; RV32-NEXT:    vlse64.v v0, (a6), zero
1198; RV32-NEXT:    vsrl.vi v16, v8, 24
1199; RV32-NEXT:    vand.vx v16, v16, a5
1200; RV32-NEXT:    vsrl.vi v24, v8, 8
1201; RV32-NEXT:    vand.vv v24, v24, v0
1202; RV32-NEXT:    vor.vv v16, v24, v16
1203; RV32-NEXT:    vand.vv v24, v8, v0
1204; RV32-NEXT:    vand.vx v8, v8, a5
1205; RV32-NEXT:    vsll.vi v8, v8, 24
1206; RV32-NEXT:    vsll.vi v24, v24, 8
1207; RV32-NEXT:    vor.vv v8, v8, v24
1208; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
1209; RV32-NEXT:    vor.vv v8, v24, v8
1210; RV32-NEXT:    csrr a0, vlenb
1211; RV32-NEXT:    slli a0, a0, 3
1212; RV32-NEXT:    add a0, sp, a0
1213; RV32-NEXT:    addi a0, a0, 16
1214; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
1215; RV32-NEXT:    vor.vv v16, v16, v24
1216; RV32-NEXT:    vor.vv v8, v8, v16
1217; RV32-NEXT:    csrr a0, vlenb
1218; RV32-NEXT:    slli a0, a0, 4
1219; RV32-NEXT:    add sp, sp, a0
1220; RV32-NEXT:    .cfi_def_cfa sp, 16
1221; RV32-NEXT:    addi sp, sp, 16
1222; RV32-NEXT:    .cfi_def_cfa_offset 0
1223; RV32-NEXT:    ret
1224;
1225; RV64-LABEL: vp_bswap_nxv7i64_unmasked:
1226; RV64:       # %bb.0:
1227; RV64-NEXT:    addi sp, sp, -16
1228; RV64-NEXT:    .cfi_def_cfa_offset 16
1229; RV64-NEXT:    csrr a1, vlenb
1230; RV64-NEXT:    slli a1, a1, 3
1231; RV64-NEXT:    sub sp, sp, a1
1232; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
1233; RV64-NEXT:    lui a1, 4080
1234; RV64-NEXT:    li a2, 255
1235; RV64-NEXT:    li a3, 56
1236; RV64-NEXT:    lui a4, 16
1237; RV64-NEXT:    li a5, 40
1238; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1239; RV64-NEXT:    vsrl.vi v24, v8, 24
1240; RV64-NEXT:    addiw a0, a4, -256
1241; RV64-NEXT:    vsrl.vx v16, v8, a3
1242; RV64-NEXT:    vsrl.vx v0, v8, a5
1243; RV64-NEXT:    vand.vx v0, v0, a0
1244; RV64-NEXT:    vor.vv v16, v0, v16
1245; RV64-NEXT:    addi a4, sp, 16
1246; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
1247; RV64-NEXT:    vsrl.vi v0, v8, 8
1248; RV64-NEXT:    slli a2, a2, 24
1249; RV64-NEXT:    vand.vx v24, v24, a1
1250; RV64-NEXT:    vand.vx v0, v0, a2
1251; RV64-NEXT:    vor.vv v24, v0, v24
1252; RV64-NEXT:    vand.vx v0, v8, a1
1253; RV64-NEXT:    vsll.vi v0, v0, 24
1254; RV64-NEXT:    vand.vx v16, v8, a2
1255; RV64-NEXT:    vsll.vi v16, v16, 8
1256; RV64-NEXT:    vor.vv v16, v0, v16
1257; RV64-NEXT:    vsll.vx v0, v8, a3
1258; RV64-NEXT:    vand.vx v8, v8, a0
1259; RV64-NEXT:    vsll.vx v8, v8, a5
1260; RV64-NEXT:    vor.vv v8, v0, v8
1261; RV64-NEXT:    vor.vv v8, v8, v16
1262; RV64-NEXT:    addi a0, sp, 16
1263; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1264; RV64-NEXT:    vor.vv v16, v24, v16
1265; RV64-NEXT:    vor.vv v8, v8, v16
1266; RV64-NEXT:    csrr a0, vlenb
1267; RV64-NEXT:    slli a0, a0, 3
1268; RV64-NEXT:    add sp, sp, a0
1269; RV64-NEXT:    .cfi_def_cfa sp, 16
1270; RV64-NEXT:    addi sp, sp, 16
1271; RV64-NEXT:    .cfi_def_cfa_offset 0
1272; RV64-NEXT:    ret
1273;
1274; CHECK-ZVKB-LABEL: vp_bswap_nxv7i64_unmasked:
1275; CHECK-ZVKB:       # %bb.0:
1276; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1277; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
1278; CHECK-ZVKB-NEXT:    ret
1279  %v = call <vscale x 7 x i64> @llvm.vp.bswap.nxv7i64(<vscale x 7 x i64> %va, <vscale x 7 x i1> splat (i1 true), i32 %evl)
1280  ret <vscale x 7 x i64> %v
1281}
1282
1283declare <vscale x 8 x i64> @llvm.vp.bswap.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i1>, i32)
1284
1285define <vscale x 8 x i64> @vp_bswap_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
1286; RV32-LABEL: vp_bswap_nxv8i64:
1287; RV32:       # %bb.0:
1288; RV32-NEXT:    addi sp, sp, -16
1289; RV32-NEXT:    .cfi_def_cfa_offset 16
1290; RV32-NEXT:    csrr a1, vlenb
1291; RV32-NEXT:    li a2, 24
1292; RV32-NEXT:    mul a1, a1, a2
1293; RV32-NEXT:    sub sp, sp, a1
1294; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
1295; RV32-NEXT:    lui a1, 1044480
1296; RV32-NEXT:    li a2, 56
1297; RV32-NEXT:    lui a3, 16
1298; RV32-NEXT:    li a4, 40
1299; RV32-NEXT:    addi a5, sp, 8
1300; RV32-NEXT:    sw a1, 8(sp)
1301; RV32-NEXT:    sw zero, 12(sp)
1302; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1303; RV32-NEXT:    vsll.vx v16, v8, a2, v0.t
1304; RV32-NEXT:    addi a0, a3, -256
1305; RV32-NEXT:    vand.vx v24, v8, a0, v0.t
1306; RV32-NEXT:    vsll.vx v24, v24, a4, v0.t
1307; RV32-NEXT:    vor.vv v16, v16, v24, v0.t
1308; RV32-NEXT:    csrr a1, vlenb
1309; RV32-NEXT:    slli a1, a1, 4
1310; RV32-NEXT:    add a1, sp, a1
1311; RV32-NEXT:    addi a1, a1, 16
1312; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
1313; RV32-NEXT:    vlse64.v v16, (a5), zero
1314; RV32-NEXT:    csrr a1, vlenb
1315; RV32-NEXT:    slli a1, a1, 3
1316; RV32-NEXT:    add a1, sp, a1
1317; RV32-NEXT:    addi a1, a1, 16
1318; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
1319; RV32-NEXT:    lui a1, 4080
1320; RV32-NEXT:    vand.vx v24, v8, a1, v0.t
1321; RV32-NEXT:    vsll.vi v24, v24, 24, v0.t
1322; RV32-NEXT:    addi a3, sp, 16
1323; RV32-NEXT:    vs8r.v v24, (a3) # Unknown-size Folded Spill
1324; RV32-NEXT:    vand.vv v24, v8, v16, v0.t
1325; RV32-NEXT:    vsll.vi v16, v24, 8, v0.t
1326; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
1327; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
1328; RV32-NEXT:    csrr a3, vlenb
1329; RV32-NEXT:    slli a3, a3, 4
1330; RV32-NEXT:    add a3, sp, a3
1331; RV32-NEXT:    addi a3, a3, 16
1332; RV32-NEXT:    vl8r.v v24, (a3) # Unknown-size Folded Reload
1333; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
1334; RV32-NEXT:    csrr a3, vlenb
1335; RV32-NEXT:    slli a3, a3, 4
1336; RV32-NEXT:    add a3, sp, a3
1337; RV32-NEXT:    addi a3, a3, 16
1338; RV32-NEXT:    vs8r.v v16, (a3) # Unknown-size Folded Spill
1339; RV32-NEXT:    vsrl.vx v16, v8, a2, v0.t
1340; RV32-NEXT:    vsrl.vx v24, v8, a4, v0.t
1341; RV32-NEXT:    vand.vx v24, v24, a0, v0.t
1342; RV32-NEXT:    vor.vv v16, v24, v16, v0.t
1343; RV32-NEXT:    addi a0, sp, 16
1344; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
1345; RV32-NEXT:    vsrl.vi v24, v8, 24, v0.t
1346; RV32-NEXT:    vand.vx v24, v24, a1, v0.t
1347; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
1348; RV32-NEXT:    csrr a0, vlenb
1349; RV32-NEXT:    slli a0, a0, 3
1350; RV32-NEXT:    add a0, sp, a0
1351; RV32-NEXT:    addi a0, a0, 16
1352; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1353; RV32-NEXT:    vand.vv v8, v8, v16, v0.t
1354; RV32-NEXT:    vor.vv v8, v8, v24, v0.t
1355; RV32-NEXT:    addi a0, sp, 16
1356; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1357; RV32-NEXT:    vor.vv v8, v8, v16, v0.t
1358; RV32-NEXT:    csrr a0, vlenb
1359; RV32-NEXT:    slli a0, a0, 4
1360; RV32-NEXT:    add a0, sp, a0
1361; RV32-NEXT:    addi a0, a0, 16
1362; RV32-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1363; RV32-NEXT:    vor.vv v8, v16, v8, v0.t
1364; RV32-NEXT:    csrr a0, vlenb
1365; RV32-NEXT:    li a1, 24
1366; RV32-NEXT:    mul a0, a0, a1
1367; RV32-NEXT:    add sp, sp, a0
1368; RV32-NEXT:    .cfi_def_cfa sp, 16
1369; RV32-NEXT:    addi sp, sp, 16
1370; RV32-NEXT:    .cfi_def_cfa_offset 0
1371; RV32-NEXT:    ret
1372;
1373; RV64-LABEL: vp_bswap_nxv8i64:
1374; RV64:       # %bb.0:
1375; RV64-NEXT:    addi sp, sp, -16
1376; RV64-NEXT:    .cfi_def_cfa_offset 16
1377; RV64-NEXT:    csrr a1, vlenb
1378; RV64-NEXT:    slli a1, a1, 3
1379; RV64-NEXT:    sub sp, sp, a1
1380; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
1381; RV64-NEXT:    lui a1, 4080
1382; RV64-NEXT:    li a2, 255
1383; RV64-NEXT:    li a3, 56
1384; RV64-NEXT:    lui a4, 16
1385; RV64-NEXT:    li a5, 40
1386; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1387; RV64-NEXT:    vand.vx v16, v8, a1, v0.t
1388; RV64-NEXT:    slli a2, a2, 24
1389; RV64-NEXT:    addiw a0, a4, -256
1390; RV64-NEXT:    vsll.vi v16, v16, 24, v0.t
1391; RV64-NEXT:    vand.vx v24, v8, a2, v0.t
1392; RV64-NEXT:    vsll.vi v24, v24, 8, v0.t
1393; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
1394; RV64-NEXT:    addi a4, sp, 16
1395; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
1396; RV64-NEXT:    vsll.vx v24, v8, a3, v0.t
1397; RV64-NEXT:    vand.vx v16, v8, a0, v0.t
1398; RV64-NEXT:    vsll.vx v16, v16, a5, v0.t
1399; RV64-NEXT:    vor.vv v16, v24, v16, v0.t
1400; RV64-NEXT:    vl8r.v v24, (a4) # Unknown-size Folded Reload
1401; RV64-NEXT:    vor.vv v16, v16, v24, v0.t
1402; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
1403; RV64-NEXT:    vsrl.vx v24, v8, a3, v0.t
1404; RV64-NEXT:    vsrl.vx v16, v8, a5, v0.t
1405; RV64-NEXT:    vand.vx v16, v16, a0, v0.t
1406; RV64-NEXT:    vor.vv v24, v16, v24, v0.t
1407; RV64-NEXT:    vsrl.vi v16, v8, 24, v0.t
1408; RV64-NEXT:    vand.vx v16, v16, a1, v0.t
1409; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
1410; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
1411; RV64-NEXT:    vor.vv v8, v8, v16, v0.t
1412; RV64-NEXT:    vor.vv v8, v8, v24, v0.t
1413; RV64-NEXT:    addi a0, sp, 16
1414; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1415; RV64-NEXT:    vor.vv v8, v16, v8, v0.t
1416; RV64-NEXT:    csrr a0, vlenb
1417; RV64-NEXT:    slli a0, a0, 3
1418; RV64-NEXT:    add sp, sp, a0
1419; RV64-NEXT:    .cfi_def_cfa sp, 16
1420; RV64-NEXT:    addi sp, sp, 16
1421; RV64-NEXT:    .cfi_def_cfa_offset 0
1422; RV64-NEXT:    ret
1423;
1424; CHECK-ZVKB-LABEL: vp_bswap_nxv8i64:
1425; CHECK-ZVKB:       # %bb.0:
1426; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1427; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
1428; CHECK-ZVKB-NEXT:    ret
1429  %v = call <vscale x 8 x i64> @llvm.vp.bswap.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 %evl)
1430  ret <vscale x 8 x i64> %v
1431}
1432
1433define <vscale x 8 x i64> @vp_bswap_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
1434; RV32-LABEL: vp_bswap_nxv8i64_unmasked:
1435; RV32:       # %bb.0:
1436; RV32-NEXT:    addi sp, sp, -16
1437; RV32-NEXT:    .cfi_def_cfa_offset 16
1438; RV32-NEXT:    csrr a1, vlenb
1439; RV32-NEXT:    slli a1, a1, 4
1440; RV32-NEXT:    sub sp, sp, a1
1441; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1442; RV32-NEXT:    lui a1, 1044480
1443; RV32-NEXT:    li a2, 56
1444; RV32-NEXT:    lui a3, 16
1445; RV32-NEXT:    li a4, 40
1446; RV32-NEXT:    lui a5, 4080
1447; RV32-NEXT:    addi a6, sp, 8
1448; RV32-NEXT:    sw a1, 8(sp)
1449; RV32-NEXT:    sw zero, 12(sp)
1450; RV32-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1451; RV32-NEXT:    vsll.vx v24, v8, a2
1452; RV32-NEXT:    addi a0, a3, -256
1453; RV32-NEXT:    vsrl.vx v16, v8, a2
1454; RV32-NEXT:    vsrl.vx v0, v8, a4
1455; RV32-NEXT:    vand.vx v0, v0, a0
1456; RV32-NEXT:    vor.vv v16, v0, v16
1457; RV32-NEXT:    csrr a1, vlenb
1458; RV32-NEXT:    slli a1, a1, 3
1459; RV32-NEXT:    add a1, sp, a1
1460; RV32-NEXT:    addi a1, a1, 16
1461; RV32-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
1462; RV32-NEXT:    vand.vx v0, v8, a0
1463; RV32-NEXT:    vsll.vx v0, v0, a4
1464; RV32-NEXT:    vor.vv v16, v24, v0
1465; RV32-NEXT:    addi a0, sp, 16
1466; RV32-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
1467; RV32-NEXT:    vlse64.v v0, (a6), zero
1468; RV32-NEXT:    vsrl.vi v16, v8, 24
1469; RV32-NEXT:    vand.vx v16, v16, a5
1470; RV32-NEXT:    vsrl.vi v24, v8, 8
1471; RV32-NEXT:    vand.vv v24, v24, v0
1472; RV32-NEXT:    vor.vv v16, v24, v16
1473; RV32-NEXT:    vand.vv v24, v8, v0
1474; RV32-NEXT:    vand.vx v8, v8, a5
1475; RV32-NEXT:    vsll.vi v8, v8, 24
1476; RV32-NEXT:    vsll.vi v24, v24, 8
1477; RV32-NEXT:    vor.vv v8, v8, v24
1478; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
1479; RV32-NEXT:    vor.vv v8, v24, v8
1480; RV32-NEXT:    csrr a0, vlenb
1481; RV32-NEXT:    slli a0, a0, 3
1482; RV32-NEXT:    add a0, sp, a0
1483; RV32-NEXT:    addi a0, a0, 16
1484; RV32-NEXT:    vl8r.v v24, (a0) # Unknown-size Folded Reload
1485; RV32-NEXT:    vor.vv v16, v16, v24
1486; RV32-NEXT:    vor.vv v8, v8, v16
1487; RV32-NEXT:    csrr a0, vlenb
1488; RV32-NEXT:    slli a0, a0, 4
1489; RV32-NEXT:    add sp, sp, a0
1490; RV32-NEXT:    .cfi_def_cfa sp, 16
1491; RV32-NEXT:    addi sp, sp, 16
1492; RV32-NEXT:    .cfi_def_cfa_offset 0
1493; RV32-NEXT:    ret
1494;
1495; RV64-LABEL: vp_bswap_nxv8i64_unmasked:
1496; RV64:       # %bb.0:
1497; RV64-NEXT:    addi sp, sp, -16
1498; RV64-NEXT:    .cfi_def_cfa_offset 16
1499; RV64-NEXT:    csrr a1, vlenb
1500; RV64-NEXT:    slli a1, a1, 3
1501; RV64-NEXT:    sub sp, sp, a1
1502; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
1503; RV64-NEXT:    lui a1, 4080
1504; RV64-NEXT:    li a2, 255
1505; RV64-NEXT:    li a3, 56
1506; RV64-NEXT:    lui a4, 16
1507; RV64-NEXT:    li a5, 40
1508; RV64-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1509; RV64-NEXT:    vsrl.vi v24, v8, 24
1510; RV64-NEXT:    addiw a0, a4, -256
1511; RV64-NEXT:    vsrl.vx v16, v8, a3
1512; RV64-NEXT:    vsrl.vx v0, v8, a5
1513; RV64-NEXT:    vand.vx v0, v0, a0
1514; RV64-NEXT:    vor.vv v16, v0, v16
1515; RV64-NEXT:    addi a4, sp, 16
1516; RV64-NEXT:    vs8r.v v16, (a4) # Unknown-size Folded Spill
1517; RV64-NEXT:    vsrl.vi v0, v8, 8
1518; RV64-NEXT:    slli a2, a2, 24
1519; RV64-NEXT:    vand.vx v24, v24, a1
1520; RV64-NEXT:    vand.vx v0, v0, a2
1521; RV64-NEXT:    vor.vv v24, v0, v24
1522; RV64-NEXT:    vand.vx v0, v8, a1
1523; RV64-NEXT:    vsll.vi v0, v0, 24
1524; RV64-NEXT:    vand.vx v16, v8, a2
1525; RV64-NEXT:    vsll.vi v16, v16, 8
1526; RV64-NEXT:    vor.vv v16, v0, v16
1527; RV64-NEXT:    vsll.vx v0, v8, a3
1528; RV64-NEXT:    vand.vx v8, v8, a0
1529; RV64-NEXT:    vsll.vx v8, v8, a5
1530; RV64-NEXT:    vor.vv v8, v0, v8
1531; RV64-NEXT:    vor.vv v8, v8, v16
1532; RV64-NEXT:    addi a0, sp, 16
1533; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1534; RV64-NEXT:    vor.vv v16, v24, v16
1535; RV64-NEXT:    vor.vv v8, v8, v16
1536; RV64-NEXT:    csrr a0, vlenb
1537; RV64-NEXT:    slli a0, a0, 3
1538; RV64-NEXT:    add sp, sp, a0
1539; RV64-NEXT:    .cfi_def_cfa sp, 16
1540; RV64-NEXT:    addi sp, sp, 16
1541; RV64-NEXT:    .cfi_def_cfa_offset 0
1542; RV64-NEXT:    ret
1543;
1544; CHECK-ZVKB-LABEL: vp_bswap_nxv8i64_unmasked:
1545; CHECK-ZVKB:       # %bb.0:
1546; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m8, ta, ma
1547; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
1548; CHECK-ZVKB-NEXT:    ret
1549  %v = call <vscale x 8 x i64> @llvm.vp.bswap.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> splat (i1 true), i32 %evl)
1550  ret <vscale x 8 x i64> %v
1551}
1552
1553; Test splitting. Use i16 version for easier check.
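; A <vscale x 64 x i16> value spans two LMUL=8 register groups (v8 and v16),
; so the VP op is split in two. Roughly: the high half is processed with an
; EVL of max(evl - 4*vlenb, 0) (the sltu/addi/and sequence below is a
; branchless clamp to zero) and a mask slid down by half, then the low half
; with an EVL capped at 4*vlenb, 4*vlenb being the number of i16 elements in
; one m8 register group.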
1554declare <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16>, <vscale x 64 x i1>, i32)
1555
1556define <vscale x 64 x i16> @vp_bswap_nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
1557; CHECK-LABEL: vp_bswap_nxv64i16:
1558; CHECK:       # %bb.0:
1559; CHECK-NEXT:    addi sp, sp, -16
1560; CHECK-NEXT:    .cfi_def_cfa_offset 16
1561; CHECK-NEXT:    csrr a1, vlenb
1562; CHECK-NEXT:    slli a1, a1, 4
1563; CHECK-NEXT:    sub sp, sp, a1
1564; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 16 * vlenb
1565; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
1566; CHECK-NEXT:    vmv1r.v v24, v0
1567; CHECK-NEXT:    csrr a1, vlenb
1568; CHECK-NEXT:    slli a1, a1, 3
1569; CHECK-NEXT:    add a1, sp, a1
1570; CHECK-NEXT:    addi a1, a1, 16
1571; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
1572; CHECK-NEXT:    csrr a1, vlenb
1573; CHECK-NEXT:    srli a2, a1, 1
1574; CHECK-NEXT:    slli a1, a1, 2
1575; CHECK-NEXT:    vslidedown.vx v0, v0, a2
1576; CHECK-NEXT:    sub a2, a0, a1
1577; CHECK-NEXT:    sltu a3, a0, a2
1578; CHECK-NEXT:    addi a3, a3, -1
1579; CHECK-NEXT:    and a2, a3, a2
1580; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
1581; CHECK-NEXT:    vsrl.vi v8, v16, 8, v0.t
1582; CHECK-NEXT:    vsll.vi v16, v16, 8, v0.t
1583; CHECK-NEXT:    vor.vv v8, v16, v8, v0.t
1584; CHECK-NEXT:    addi a2, sp, 16
1585; CHECK-NEXT:    vs8r.v v8, (a2) # Unknown-size Folded Spill
1586; CHECK-NEXT:    bltu a0, a1, .LBB32_2
1587; CHECK-NEXT:  # %bb.1:
1588; CHECK-NEXT:    mv a0, a1
1589; CHECK-NEXT:  .LBB32_2:
1590; CHECK-NEXT:    vmv1r.v v0, v24
1591; CHECK-NEXT:    csrr a1, vlenb
1592; CHECK-NEXT:    slli a1, a1, 3
1593; CHECK-NEXT:    add a1, sp, a1
1594; CHECK-NEXT:    addi a1, a1, 16
1595; CHECK-NEXT:    vl8r.v v8, (a1) # Unknown-size Folded Reload
1596; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
1597; CHECK-NEXT:    vsrl.vi v16, v8, 8, v0.t
1598; CHECK-NEXT:    vsll.vi v8, v8, 8, v0.t
1599; CHECK-NEXT:    vor.vv v8, v8, v16, v0.t
1600; CHECK-NEXT:    addi a0, sp, 16
1601; CHECK-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
1602; CHECK-NEXT:    csrr a0, vlenb
1603; CHECK-NEXT:    slli a0, a0, 4
1604; CHECK-NEXT:    add sp, sp, a0
1605; CHECK-NEXT:    .cfi_def_cfa sp, 16
1606; CHECK-NEXT:    addi sp, sp, 16
1607; CHECK-NEXT:    .cfi_def_cfa_offset 0
1608; CHECK-NEXT:    ret
1609;
1610; CHECK-ZVKB-LABEL: vp_bswap_nxv64i16:
1611; CHECK-ZVKB:       # %bb.0:
1612; CHECK-ZVKB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
1613; CHECK-ZVKB-NEXT:    vmv1r.v v24, v0
1614; CHECK-ZVKB-NEXT:    csrr a1, vlenb
1615; CHECK-ZVKB-NEXT:    srli a2, a1, 1
1616; CHECK-ZVKB-NEXT:    slli a1, a1, 2
1617; CHECK-ZVKB-NEXT:    vslidedown.vx v0, v0, a2
1618; CHECK-ZVKB-NEXT:    sub a2, a0, a1
1619; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
1620; CHECK-ZVKB-NEXT:    addi a3, a3, -1
1621; CHECK-ZVKB-NEXT:    and a2, a3, a2
1622; CHECK-ZVKB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
1623; CHECK-ZVKB-NEXT:    vrev8.v v16, v16, v0.t
1624; CHECK-ZVKB-NEXT:    bltu a0, a1, .LBB32_2
1625; CHECK-ZVKB-NEXT:  # %bb.1:
1626; CHECK-ZVKB-NEXT:    mv a0, a1
1627; CHECK-ZVKB-NEXT:  .LBB32_2:
1628; CHECK-ZVKB-NEXT:    vmv1r.v v0, v24
1629; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
1630; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
1631; CHECK-ZVKB-NEXT:    ret
1632  %v = call <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> %m, i32 %evl)
1633  ret <vscale x 64 x i16> %v
1634}
1635
1636define <vscale x 64 x i16> @vp_bswap_nxv64i16_unmasked(<vscale x 64 x i16> %va, i32 zeroext %evl) {
1637; CHECK-LABEL: vp_bswap_nxv64i16_unmasked:
1638; CHECK:       # %bb.0:
1639; CHECK-NEXT:    csrr a1, vlenb
1640; CHECK-NEXT:    slli a1, a1, 2
1641; CHECK-NEXT:    sub a2, a0, a1
1642; CHECK-NEXT:    sltu a3, a0, a2
1643; CHECK-NEXT:    addi a3, a3, -1
1644; CHECK-NEXT:    and a2, a3, a2
1645; CHECK-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
1646; CHECK-NEXT:    vsrl.vi v24, v16, 8
1647; CHECK-NEXT:    vsll.vi v16, v16, 8
1648; CHECK-NEXT:    vor.vv v16, v16, v24
1649; CHECK-NEXT:    bltu a0, a1, .LBB33_2
1650; CHECK-NEXT:  # %bb.1:
1651; CHECK-NEXT:    mv a0, a1
1652; CHECK-NEXT:  .LBB33_2:
1653; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
1654; CHECK-NEXT:    vsrl.vi v24, v8, 8
1655; CHECK-NEXT:    vsll.vi v8, v8, 8
1656; CHECK-NEXT:    vor.vv v8, v8, v24
1657; CHECK-NEXT:    ret
1658;
1659; CHECK-ZVKB-LABEL: vp_bswap_nxv64i16_unmasked:
1660; CHECK-ZVKB:       # %bb.0:
1661; CHECK-ZVKB-NEXT:    csrr a1, vlenb
1662; CHECK-ZVKB-NEXT:    slli a1, a1, 2
1663; CHECK-ZVKB-NEXT:    sub a2, a0, a1
1664; CHECK-ZVKB-NEXT:    sltu a3, a0, a2
1665; CHECK-ZVKB-NEXT:    addi a3, a3, -1
1666; CHECK-ZVKB-NEXT:    and a2, a3, a2
1667; CHECK-ZVKB-NEXT:    vsetvli zero, a2, e16, m8, ta, ma
1668; CHECK-ZVKB-NEXT:    vrev8.v v16, v16
1669; CHECK-ZVKB-NEXT:    bltu a0, a1, .LBB33_2
1670; CHECK-ZVKB-NEXT:  # %bb.1:
1671; CHECK-ZVKB-NEXT:    mv a0, a1
1672; CHECK-ZVKB-NEXT:  .LBB33_2:
1673; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
1674; CHECK-ZVKB-NEXT:    vrev8.v v8, v8
1675; CHECK-ZVKB-NEXT:    ret
1676  %v = call <vscale x 64 x i16> @llvm.vp.bswap.nxv64i16(<vscale x 64 x i16> %va, <vscale x 64 x i1> splat (i1 true), i32 %evl)
1677  ret <vscale x 64 x i16> %v
1678}
1679
1680; Test promotion.
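; i48 is not a legal element type, so it is promoted to i64: the value is
; byte-swapped as an i64 and the result shifted right by 16 (the trailing
; vsrl.vi ..., 16) so the swapped 48-bit value lands in the low bits.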
1681declare <vscale x 1 x i48> @llvm.vp.bswap.nxv1i48(<vscale x 1 x i48>, <vscale x 1 x i1>, i32)
1682define <vscale x 1 x i48> @vp_bswap_nxv1i48(<vscale x 1 x i48> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
1683; RV32-LABEL: vp_bswap_nxv1i48:
1684; RV32:       # %bb.0:
1685; RV32-NEXT:    addi sp, sp, -16
1686; RV32-NEXT:    .cfi_def_cfa_offset 16
1687; RV32-NEXT:    lui a1, 1044480
1688; RV32-NEXT:    li a2, 56
1689; RV32-NEXT:    lui a3, 16
1690; RV32-NEXT:    li a4, 40
1691; RV32-NEXT:    lui a5, 4080
1692; RV32-NEXT:    addi a6, sp, 8
1693; RV32-NEXT:    sw a1, 8(sp)
1694; RV32-NEXT:    sw zero, 12(sp)
1695; RV32-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
1696; RV32-NEXT:    vsll.vx v9, v8, a2, v0.t
1697; RV32-NEXT:    addi a0, a3, -256
1698; RV32-NEXT:    vand.vx v10, v8, a0, v0.t
1699; RV32-NEXT:    vlse64.v v11, (a6), zero
1700; RV32-NEXT:    vsll.vx v10, v10, a4, v0.t
1701; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
1702; RV32-NEXT:    vand.vx v10, v8, a5, v0.t
1703; RV32-NEXT:    vsll.vi v10, v10, 24, v0.t
1704; RV32-NEXT:    vand.vv v12, v8, v11, v0.t
1705; RV32-NEXT:    vsll.vi v12, v12, 8, v0.t
1706; RV32-NEXT:    vor.vv v10, v10, v12, v0.t
1707; RV32-NEXT:    vor.vv v9, v9, v10, v0.t
1708; RV32-NEXT:    vsrl.vx v10, v8, a2, v0.t
1709; RV32-NEXT:    vsrl.vx v12, v8, a4, v0.t
1710; RV32-NEXT:    vand.vx v12, v12, a0, v0.t
1711; RV32-NEXT:    vor.vv v10, v12, v10, v0.t
1712; RV32-NEXT:    vsrl.vi v12, v8, 24, v0.t
1713; RV32-NEXT:    vand.vx v12, v12, a5, v0.t
1714; RV32-NEXT:    vsrl.vi v8, v8, 8, v0.t
1715; RV32-NEXT:    vand.vv v8, v8, v11, v0.t
1716; RV32-NEXT:    vor.vv v8, v8, v12, v0.t
1717; RV32-NEXT:    vor.vv v8, v8, v10, v0.t
1718; RV32-NEXT:    vor.vv v8, v9, v8, v0.t
1719; RV32-NEXT:    vsrl.vi v8, v8, 16, v0.t
1720; RV32-NEXT:    addi sp, sp, 16
1721; RV32-NEXT:    .cfi_def_cfa_offset 0
1722; RV32-NEXT:    ret
1723;
1724; RV64-LABEL: vp_bswap_nxv1i48:
1725; RV64:       # %bb.0:
1726; RV64-NEXT:    lui a1, 4080
1727; RV64-NEXT:    li a2, 255
1728; RV64-NEXT:    li a3, 56
1729; RV64-NEXT:    lui a4, 16
1730; RV64-NEXT:    li a5, 40
1731; RV64-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
1732; RV64-NEXT:    vand.vx v9, v8, a1, v0.t
1733; RV64-NEXT:    slli a2, a2, 24
1734; RV64-NEXT:    addiw a0, a4, -256
1735; RV64-NEXT:    vsll.vi v9, v9, 24, v0.t
1736; RV64-NEXT:    vand.vx v10, v8, a2, v0.t
1737; RV64-NEXT:    vsll.vi v10, v10, 8, v0.t
1738; RV64-NEXT:    vor.vv v9, v9, v10, v0.t
1739; RV64-NEXT:    vsll.vx v10, v8, a3, v0.t
1740; RV64-NEXT:    vand.vx v11, v8, a0, v0.t
1741; RV64-NEXT:    vsll.vx v11, v11, a5, v0.t
1742; RV64-NEXT:    vor.vv v10, v10, v11, v0.t
1743; RV64-NEXT:    vor.vv v9, v10, v9, v0.t
1744; RV64-NEXT:    vsrl.vx v10, v8, a3, v0.t
1745; RV64-NEXT:    vsrl.vx v11, v8, a5, v0.t
1746; RV64-NEXT:    vand.vx v11, v11, a0, v0.t
1747; RV64-NEXT:    vor.vv v10, v11, v10, v0.t
1748; RV64-NEXT:    vsrl.vi v11, v8, 24, v0.t
1749; RV64-NEXT:    vand.vx v11, v11, a1, v0.t
1750; RV64-NEXT:    vsrl.vi v8, v8, 8, v0.t
1751; RV64-NEXT:    vand.vx v8, v8, a2, v0.t
1752; RV64-NEXT:    vor.vv v8, v8, v11, v0.t
1753; RV64-NEXT:    vor.vv v8, v8, v10, v0.t
1754; RV64-NEXT:    vor.vv v8, v9, v8, v0.t
1755; RV64-NEXT:    vsrl.vi v8, v8, 16, v0.t
1756; RV64-NEXT:    ret
1757;
1758; CHECK-ZVKB-LABEL: vp_bswap_nxv1i48:
1759; CHECK-ZVKB:       # %bb.0:
1760; CHECK-ZVKB-NEXT:    vsetvli zero, a0, e64, m1, ta, ma
1761; CHECK-ZVKB-NEXT:    vrev8.v v8, v8, v0.t
1762; CHECK-ZVKB-NEXT:    vsrl.vi v8, v8, 16, v0.t
1763; CHECK-ZVKB-NEXT:    ret
1764  %v = call <vscale x 1 x i48> @llvm.vp.bswap.nxv1i48(<vscale x 1 x i48> %va, <vscale x 1 x i1> %m, i32 %evl)
1765  ret <vscale x 1 x i48> %v
1766}
1767