; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV32-V128
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl128b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V128,RV64-V128
; RUN: llc -mtriple=riscv32 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV32-V512
; RUN: llc -mtriple=riscv64 -mattr=+v,+m,+zvl512b -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,V512,RV64-V512

; Test optimizing interleaves to widening arithmetic.
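;
; In the tests below, an interleave of x and y is expected to lower to two
; widening ops (a sketch of the identity, for element width SEW):
;   vwaddu.vv  vd, x, y    ; vd = zext(x) + zext(y)
;   vwmaccu.vx vd, -1, y   ; vd += (2^SEW - 1) * zext(y); -1 is zero-extended
; so vd = zext(x) + 2^SEW * zext(y): each double-width element holds an x
; element in its low half and the matching y element in its high half, which
; is exactly the interleaved pair when reinterpreted at width SEW.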

define <4 x i8> @interleave_v2i8(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: interleave_v2i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
; CHECK-NEXT:    vmv1r.v v8, v10
; CHECK-NEXT:    ret
  %a = shufflevector <2 x i8> %x, <2 x i8> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i8> %a
}

define <4 x i16> @interleave_v2i16(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: interleave_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v10, a0, v9
; CHECK-NEXT:    vmv1r.v v8, v10
; CHECK-NEXT:    ret
  %a = shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i16> %a
}

; Vector order switched for coverage.
define <4 x i32> @interleave_v2i32(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: interleave_v2i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v9, v8
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vwmaccu.vx v10, a0, v8
; CHECK-NEXT:    vmv1r.v v8, v10
; CHECK-NEXT:    ret
  %a = shufflevector <2 x i32> %x, <2 x i32> %y, <4 x i32> <i32 2, i32 0, i32 3, i32 1>
  ret <4 x i32> %a
}

; One vXi64 test case to verify that we don't optimize it.
; FIXME: Is there better codegen we can do here?
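; (vwaddu/vwmaccu are not available here: widening from SEW=64 would need
; EEW=128, which RVV does not provide, so the i64 interleaves take the
; vrgather path below.)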
define <4 x i64> @interleave_v2i64(<2 x i64> %x, <2 x i64> %y) {
; V128-LABEL: interleave_v2i64:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; V128-NEXT:    vmv1r.v v12, v9
; V128-NEXT:    vid.v v9
; V128-NEXT:    vmv.v.i v0, 10
; V128-NEXT:    vsrl.vi v14, v9, 1
; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, mu
; V128-NEXT:    vrgatherei16.vv v10, v8, v14
; V128-NEXT:    vrgatherei16.vv v10, v12, v14, v0.t
; V128-NEXT:    vmv.v.v v8, v10
; V128-NEXT:    ret
;
; RV32-V512-LABEL: interleave_v2i64:
; RV32-V512:       # %bb.0:
; RV32-V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
; RV32-V512-NEXT:    vid.v v10
; RV32-V512-NEXT:    vsrl.vi v11, v10, 1
; RV32-V512-NEXT:    vmv.v.i v0, 10
; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, mu
; RV32-V512-NEXT:    vrgatherei16.vv v10, v8, v11
; RV32-V512-NEXT:    vrgatherei16.vv v10, v9, v11, v0.t
; RV32-V512-NEXT:    vmv.v.v v8, v10
; RV32-V512-NEXT:    ret
;
; RV64-V512-LABEL: interleave_v2i64:
; RV64-V512:       # %bb.0:
; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, mu
; RV64-V512-NEXT:    vid.v v10
; RV64-V512-NEXT:    vsrl.vi v11, v10, 1
; RV64-V512-NEXT:    vmv.v.i v0, 10
; RV64-V512-NEXT:    vrgather.vv v10, v8, v11
; RV64-V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
; RV64-V512-NEXT:    vmv.v.v v8, v10
; RV64-V512-NEXT:    ret
  %a = shufflevector <2 x i64> %x, <2 x i64> %y, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i64> %a
}

; Vector order switched for coverage.
define <8 x i8> @interleave_v4i8(<4 x i8> %x, <4 x i8> %y) {
; V128-LABEL: interleave_v4i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; V128-NEXT:    vwaddu.vv v10, v9, v8
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v8
; V128-NEXT:    vmv1r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
; V512-NEXT:    vwaddu.vv v10, v9, v8
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v8
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <4 x i8> %x, <4 x i8> %y, <8 x i32> <i32 4, i32 0, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  ret <8 x i8> %a
}

; Undef elements for coverage
define <8 x i16> @interleave_v4i16(<4 x i16> %x, <4 x i16> %y) {
; V128-LABEL: interleave_v4i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v9
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v9
; V128-NEXT:    vmv1r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <4 x i16> %x, <4 x i16> %y, <8 x i32> <i32 0, i32 4, i32 undef, i32 5, i32 2, i32 undef, i32 3, i32 7>
  ret <8 x i16> %a
}

define <8 x i32> @interleave_v4i32(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v9
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v9
; V128-NEXT:    vmv2r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i32> %a
}

; %y should be slid down by 2
define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32_offset_2:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
; V128-NEXT:    vslidedown.vi v10, v9, 2
; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i32_offset_2:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; V512-NEXT:    vslidedown.vi v10, v9, 2
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 6, i32 1, i32 7>
  ret <4 x i32> %a
}

; %y should be slid down by 1
define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) {
; V128-LABEL: interleave_v4i32_offset_1:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vid.v v10
; V128-NEXT:    vmv.v.i v0, 10
; V128-NEXT:    vsrl.vi v10, v10, 1
; V128-NEXT:    vadd.vi v11, v10, 1
; V128-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V128-NEXT:    vzext.vf2 v10, v8
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
; V128-NEXT:    vrgather.vv v10, v9, v11, v0.t
; V128-NEXT:    vmv.v.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v4i32_offset_1:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
; V512-NEXT:    vid.v v10
; V512-NEXT:    vmv.v.i v0, 10
; V512-NEXT:    vsrl.vi v10, v10, 1
; V512-NEXT:    vadd.vi v11, v10, 1
; V512-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V512-NEXT:    vzext.vf2 v10, v8
; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, mu
; V512-NEXT:    vrgather.vv v10, v9, v11, v0.t
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 5, i32 1, i32 6>
  ret <4 x i32> %a
}

define <16 x i8> @interleave_v8i8(<8 x i8> %x, <8 x i8> %y) {
; V128-LABEL: interleave_v8i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v9
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v9
; V128-NEXT:    vmv1r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v8i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 8, e8, mf8, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <8 x i8> %x, <8 x i8> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i8> %a
}

; Vector order switched for coverage.
define <16 x i16> @interleave_v8i16(<8 x i16> %x, <8 x i16> %y) {
; V128-LABEL: interleave_v8i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; V128-NEXT:    vwaddu.vv v10, v9, v8
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v8
; V128-NEXT:    vmv2r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v8i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 8, e16, mf4, ta, ma
; V512-NEXT:    vwaddu.vv v10, v9, v8
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v8
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <16 x i32> <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  ret <16 x i16> %a
}

define <16 x i32> @interleave_v8i32(<8 x i32> %x, <8 x i32> %y) {
; V128-LABEL: interleave_v8i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; V128-NEXT:    vwaddu.vv v12, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v12, a0, v10
; V128-NEXT:    vmv4r.v v8, v12
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v8i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 8, e32, mf2, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <8 x i32> %x, <8 x i32> %y, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i32> %a
}

define <32 x i8> @interleave_v16i8(<16 x i8> %x, <16 x i8> %y) {
; V128-LABEL: interleave_v16i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v9
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v9
; V128-NEXT:    vmv2r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v16i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 16, e8, mf4, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <16 x i8> %x, <16 x i8> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i8> %a
}

define <32 x i16> @interleave_v16i16(<16 x i16> %x, <16 x i16> %y) {
; V128-LABEL: interleave_v16i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; V128-NEXT:    vwaddu.vv v12, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v12, a0, v10
; V128-NEXT:    vmv4r.v v8, v12
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v16i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 16, e16, mf2, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <16 x i16> %x, <16 x i16> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i16> %a
}

define <32 x i32> @interleave_v16i32(<16 x i32> %x, <16 x i32> %y) {
; V128-LABEL: interleave_v16i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT:    vwaddu.vv v16, v8, v12
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v16, a0, v12
; V128-NEXT:    vmv8r.v v8, v16
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v16i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 16, e32, m1, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv2r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <16 x i32> %x, <16 x i32> %y, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i32> %a
}

define <64 x i8> @interleave_v32i8(<32 x i8> %x, <32 x i8> %y) {
; V128-LABEL: interleave_v32i8:
; V128:       # %bb.0:
; V128-NEXT:    li a0, 32
; V128-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
; V128-NEXT:    vwaddu.vv v12, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v12, a0, v10
; V128-NEXT:    vmv4r.v v8, v12
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v32i8:
; V512:       # %bb.0:
; V512-NEXT:    li a0, 32
; V512-NEXT:    vsetvli zero, a0, e8, mf2, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv1r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <32 x i8> %x, <32 x i8> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i8> %a
}

define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) {
; V128-LABEL: interleave_v32i16:
; V128:       # %bb.0:
; V128-NEXT:    li a0, 32
; V128-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
; V128-NEXT:    vwaddu.vv v16, v8, v12
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v16, a0, v12
; V128-NEXT:    vmv8r.v v8, v16
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v32i16:
; V512:       # %bb.0:
; V512-NEXT:    li a0, 32
; V512-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
; V512-NEXT:    vwaddu.vv v10, v8, v9
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v10, a0, v9
; V512-NEXT:    vmv2r.v v8, v10
; V512-NEXT:    ret
  %a = shufflevector <32 x i16> %x, <32 x i16> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i16> %a
}

define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) {
; V128-LABEL: interleave_v32i32:
; V128:       # %bb.0:
; V128-NEXT:    addi sp, sp, -16
; V128-NEXT:    .cfi_def_cfa_offset 16
; V128-NEXT:    csrr a0, vlenb
; V128-NEXT:    slli a0, a0, 3
; V128-NEXT:    sub sp, sp, a0
; V128-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; V128-NEXT:    addi a0, sp, 16
; V128-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
; V128-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; V128-NEXT:    vslidedown.vi v24, v16, 16
; V128-NEXT:    li a0, 32
; V128-NEXT:    lui a1, 699051
; V128-NEXT:    vslidedown.vi v0, v8, 16
; V128-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; V128-NEXT:    vzext.vf2 v8, v24
; V128-NEXT:    addi a1, a1, -1366
; V128-NEXT:    vzext.vf2 v24, v0
; V128-NEXT:    vmv.s.x v0, a1
; V128-NEXT:    vsll.vx v8, v8, a0
; V128-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
; V128-NEXT:    vmerge.vvm v24, v24, v8, v0
; V128-NEXT:    addi a0, sp, 16
; V128-NEXT:    vl8r.v v8, (a0) # Unknown-size Folded Reload
; V128-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; V128-NEXT:    vwaddu.vv v0, v8, v16
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v0, a0, v16
; V128-NEXT:    vmv8r.v v8, v0
; V128-NEXT:    vmv8r.v v16, v24
; V128-NEXT:    csrr a0, vlenb
; V128-NEXT:    slli a0, a0, 3
; V128-NEXT:    add sp, sp, a0
; V128-NEXT:    .cfi_def_cfa sp, 16
; V128-NEXT:    addi sp, sp, 16
; V128-NEXT:    .cfi_def_cfa_offset 0
; V128-NEXT:    ret
;
; V512-LABEL: interleave_v32i32:
; V512:       # %bb.0:
; V512-NEXT:    li a0, 32
; V512-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
; V512-NEXT:    vwaddu.vv v12, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v12, a0, v10
; V512-NEXT:    vmv4r.v v8, v12
; V512-NEXT:    ret
  %a = shufflevector <32 x i32> %x, <32 x i32> %y, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i32> %a
}

define <4 x i8> @unary_interleave_v4i8(<4 x i8> %x) {
; V128-LABEL: unary_interleave_v4i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 2
; V128-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v4i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 2
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i8> %a
}

; This shouldn't be interleaved
define <4 x i8> @unary_interleave_v4i8_invalid(<4 x i8> %x) {
; V128-LABEL: unary_interleave_v4i8_invalid:
; V128:       # %bb.0:
; V128-NEXT:    lui a0, 16
; V128-NEXT:    addi a0, a0, 768
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vmv.s.x v10, a0
; V128-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
; V128-NEXT:    vrgather.vv v9, v8, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v4i8_invalid:
; V512:       # %bb.0:
; V512-NEXT:    lui a0, 16
; V512-NEXT:    addi a0, a0, 768
; V512-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V512-NEXT:    vmv.s.x v10, a0
; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
; V512-NEXT:    vrgather.vv v9, v8, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 0, i32 3, i32 1, i32 4>
  ret <4 x i8> %a
}

define <4 x i16> @unary_interleave_v4i16(<4 x i16> %x) {
; V128-LABEL: unary_interleave_v4i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 2, e16, mf2, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 2
; V128-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v4i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 2
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i16> %x, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i16> %a
}

define <4 x i32> @unary_interleave_v4i32(<4 x i32> %x) {
; V128-LABEL: unary_interleave_v4i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 2
; V128-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v4i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 2
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i32> %a
}

; FIXME: Is there better codegen we can do here?
define <4 x i64> @unary_interleave_v4i64(<4 x i64> %x) {
; V128-LABEL: unary_interleave_v4i64:
; V128:       # %bb.0:
; V128-NEXT:    lui a0, 12304
; V128-NEXT:    addi a0, a0, 512
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vmv.s.x v10, a0
; V128-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; V128-NEXT:    vsext.vf2 v12, v10
; V128-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; V128-NEXT:    vrgatherei16.vv v10, v8, v12
; V128-NEXT:    vmv.v.v v8, v10
; V128-NEXT:    ret
;
; RV32-V512-LABEL: unary_interleave_v4i64:
; RV32-V512:       # %bb.0:
; RV32-V512-NEXT:    lui a0, 12304
; RV32-V512-NEXT:    addi a0, a0, 512
; RV32-V512-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-V512-NEXT:    vmv.s.x v9, a0
; RV32-V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
; RV32-V512-NEXT:    vsext.vf2 v10, v9
; RV32-V512-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
; RV32-V512-NEXT:    vrgatherei16.vv v9, v8, v10
; RV32-V512-NEXT:    vmv.v.v v8, v9
; RV32-V512-NEXT:    ret
;
; RV64-V512-LABEL: unary_interleave_v4i64:
; RV64-V512:       # %bb.0:
; RV64-V512-NEXT:    lui a0, 12304
; RV64-V512-NEXT:    addi a0, a0, 512
; RV64-V512-NEXT:    vsetivli zero, 4, e64, m1, ta, ma
; RV64-V512-NEXT:    vmv.s.x v9, a0
; RV64-V512-NEXT:    vsext.vf8 v10, v9
; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
; RV64-V512-NEXT:    vmv.v.v v8, v9
; RV64-V512-NEXT:    ret
  %a = shufflevector <4 x i64> %x, <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
  ret <4 x i64> %a
}

define <8 x i8> @unary_interleave_v8i8(<8 x i8> %x) {
; V128-LABEL: unary_interleave_v8i8:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e8, mf2, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 4
; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; V128-NEXT:    vwaddu.vv v9, v8, v10
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v10
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v8i8:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 4
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <8 x i8> %x, <8 x i8> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 undef, i32 6, i32 3, i32 7>
  ret <8 x i8> %a
}

define <8 x i16> @unary_interleave_v8i16(<8 x i16> %x) {
; V128-LABEL: unary_interleave_v8i16:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e16, m1, ta, ma
; V128-NEXT:    vslidedown.vi v10, v8, 4
; V128-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; V128-NEXT:    vwaddu.vv v9, v10, v8
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v9, a0, v8
; V128-NEXT:    vmv1r.v v8, v9
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v8i16:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 4
; V512-NEXT:    vwaddu.vv v9, v10, v8
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v8
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 6, i32 2, i32 7, i32 3>
  ret <8 x i16> %a
}

define <8 x i32> @unary_interleave_v8i32(<8 x i32> %x) {
; V128-LABEL: unary_interleave_v8i32:
; V128:       # %bb.0:
; V128-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
; V128-NEXT:    vslidedown.vi v12, v8, 4
; V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; V128-NEXT:    vwaddu.vv v10, v8, v12
; V128-NEXT:    li a0, -1
; V128-NEXT:    vwmaccu.vx v10, a0, v12
; V128-NEXT:    vmv2r.v v8, v10
; V128-NEXT:    ret
;
; V512-LABEL: unary_interleave_v8i32:
; V512:       # %bb.0:
; V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
; V512-NEXT:    vslidedown.vi v10, v8, 4
; V512-NEXT:    vwaddu.vv v9, v8, v10
; V512-NEXT:    li a0, -1
; V512-NEXT:    vwmaccu.vx v9, a0, v10
; V512-NEXT:    vmv1r.v v8, v9
; V512-NEXT:    ret
  %a = shufflevector <8 x i32> %x, <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i32> %a
}

; This interleaves the first 2 elements of a vector in opposite order, with
; undefs for the remaining elements. We used to miscompile this.
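; The expected lowering reinterprets the vector at e16 and swaps the two
; bytes of the low element with a shift pair: (x << 8) | (x >> 8).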
define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) {
; CHECK-LABEL: unary_interleave_10uu_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vsrl.vi v9, v8, 8
; CHECK-NEXT:    vsll.vi v8, v8, 8
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    ret
  %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
  ret <4 x i8> %a
}

;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32-V128: {{.*}}
; RV64-V128: {{.*}}