xref: /llvm-project/llvm/test/CodeGen/AArch64/named-vector-shuffles-sve.ll (revision b277bf56d7654877a1c4b59dc08bc96b4d75b649)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -verify-machineinstrs < %s | FileCheck %s
3
4target triple = "aarch64-unknown-linux-gnu"
5
6;
7; VECTOR_SPLICE (index)
8;
9
; Index 0 selects exactly the elements of %a, so the splice folds away to a
; plain ret.
10define <vscale x 16 x i8> @splice_nxv16i8_zero_idx(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
11; CHECK-LABEL: splice_nxv16i8_zero_idx:
12; CHECK:       // %bb.0:
13; CHECK-NEXT:    ret
14  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 0)
15  ret <vscale x 16 x i8> %res
16}
17
; A small positive index lowers to a single EXT; for i8 elements the element
; index equals the EXT byte immediate (#1).
18define <vscale x 16 x i8> @splice_nxv16i8_first_idx(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
19; CHECK-LABEL: splice_nxv16i8_first_idx:
20; CHECK:       // %bb.0:
21; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #1
22; CHECK-NEXT:    ret
23  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 1)
24  ret <vscale x 16 x i8> %res
25}
26
; vscale_range(16,16) pins VL at 256 bytes, so index 255 (the last element) is
; provably in range for EXT's immediate (#255).
27define <vscale x 16 x i8> @splice_nxv16i8_last_idx(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) vscale_range(16,16) #0 {
28; CHECK-LABEL: splice_nxv16i8_last_idx:
29; CHECK:       // %bb.0:
30; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #255
31; CHECK-NEXT:    ret
32  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 255)
33  ret <vscale x 16 x i8> %res
34}
35
; EXT's immediate is a byte offset: element index 1 of i16 -> #2.
36define <vscale x 8 x i16> @splice_nxv8i16_first_idx(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
37; CHECK-LABEL: splice_nxv8i16_first_idx:
38; CHECK:       // %bb.0:
39; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #2
40; CHECK-NEXT:    ret
41  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 1)
42  ret <vscale x 8 x i16> %res
43}
44
; Element index 1 of i32 -> byte offset #4.
45define <vscale x 4 x i32> @splice_nxv4i32_first_idx(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
46; CHECK-LABEL: splice_nxv4i32_first_idx:
47; CHECK:       // %bb.0:
48; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #4
49; CHECK-NEXT:    ret
50  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 1)
51  ret <vscale x 4 x i32> %res
52}
53
; Fixed 256-byte VL: last i32 element index 63 -> byte offset #252.
54define <vscale x 4 x i32> @splice_nxv4i32_last_idx(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) vscale_range(16,16) #0 {
55; CHECK-LABEL: splice_nxv4i32_last_idx:
56; CHECK:       // %bb.0:
57; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #252
58; CHECK-NEXT:    ret
59  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 63)
60  ret <vscale x 4 x i32> %res
61}
62
; Element index 1 of i64 -> byte offset #8.
63define <vscale x 2 x i64> @splice_nxv2i64_first_idx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
64; CHECK-LABEL: splice_nxv2i64_first_idx:
65; CHECK:       // %bb.0:
66; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
67; CHECK-NEXT:    ret
68  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 1)
69  ret <vscale x 2 x i64> %res
70}
71
; Fixed 256-byte VL: last i64 element index 31 -> byte offset #248.
72define <vscale x 2 x i64> @splice_nxv2i64_last_idx(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) vscale_range(16,16) #0 {
73; CHECK-LABEL: splice_nxv2i64_last_idx:
74; CHECK:       // %bb.0:
75; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
76; CHECK-NEXT:    ret
77  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 31)
78  ret <vscale x 2 x i64> %res
79}
80
; Negative index (last N elements of %a): lowered as ptrue vlN + rev to build a
; tail-active predicate, then SPLICE. nxv2f16 is operated on in 64-bit
; containers (note the .d forms).
81define <vscale x 2 x half> @splice_nxv2f16_neg_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
82; CHECK-LABEL: splice_nxv2f16_neg_idx:
83; CHECK:       // %bb.0:
84; CHECK-NEXT:    ptrue p0.d, vl1
85; CHECK-NEXT:    rev p0.d, p0.d
86; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
87; CHECK-NEXT:    ret
88  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -1)
89  ret <vscale x 2 x half> %res
90}
91
; Same shape as above with the last two elements (vl2).
92define <vscale x 2 x half> @splice_nxv2f16_neg2_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
93; CHECK-LABEL: splice_nxv2f16_neg2_idx:
94; CHECK:       // %bb.0:
95; CHECK-NEXT:    ptrue p0.d, vl2
96; CHECK-NEXT:    rev p0.d, p0.d
97; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
98; CHECK-NEXT:    ret
99  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 -2)
100  ret <vscale x 2 x half> %res
101}
102
; Element 1 sits in the second 64-bit container -> EXT byte offset #8.
103define <vscale x 2 x half> @splice_nxv2f16_first_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) #0 {
104; CHECK-LABEL: splice_nxv2f16_first_idx:
105; CHECK:       // %bb.0:
106; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
107; CHECK-NEXT:    ret
108  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 1)
109  ret <vscale x 2 x half> %res
110}
111
; Fixed 256-byte VL: last container index 31 -> byte offset #248.
112define <vscale x 2 x half> @splice_nxv2f16_last_idx(<vscale x 2 x half> %a, <vscale x 2 x half> %b) vscale_range(16,16) #0 {
113; CHECK-LABEL: splice_nxv2f16_last_idx:
114; CHECK:       // %bb.0:
115; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
116; CHECK-NEXT:    ret
117  %res = call <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, i32 31)
118  ret <vscale x 2 x half> %res
119}
120
; nxv4f16 uses 32-bit containers (.s forms); trailing element via ptrue
; vl1 + rev + splice.
121define <vscale x 4 x half> @splice_nxv4f16_neg_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
122; CHECK-LABEL: splice_nxv4f16_neg_idx:
123; CHECK:       // %bb.0:
124; CHECK-NEXT:    ptrue p0.s, vl1
125; CHECK-NEXT:    rev p0.s, p0.s
126; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
127; CHECK-NEXT:    ret
128  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -1)
129  ret <vscale x 4 x half> %res
130}
131
; Trailing three elements (vl3).
132define <vscale x 4 x half> @splice_nxv4f16_neg3_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
133; CHECK-LABEL: splice_nxv4f16_neg3_idx:
134; CHECK:       // %bb.0:
135; CHECK-NEXT:    ptrue p0.s, vl3
136; CHECK-NEXT:    rev p0.s, p0.s
137; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
138; CHECK-NEXT:    ret
139  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 -3)
140  ret <vscale x 4 x half> %res
141}
142
; Element 1 in a 32-bit container -> EXT byte offset #4.
143define <vscale x 4 x half> @splice_nxv4f16_first_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) #0 {
144; CHECK-LABEL: splice_nxv4f16_first_idx:
145; CHECK:       // %bb.0:
146; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #4
147; CHECK-NEXT:    ret
148  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 1)
149  ret <vscale x 4 x half> %res
150}
151
; Fixed 256-byte VL: container index 63 -> byte offset #252.
152define <vscale x 4 x half> @splice_nxv4f16_last_idx(<vscale x 4 x half> %a, <vscale x 4 x half> %b) vscale_range(16,16) #0 {
153; CHECK-LABEL: splice_nxv4f16_last_idx:
154; CHECK:       // %bb.0:
155; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #252
156; CHECK-NEXT:    ret
157  %res = call <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, i32 63)
158  ret <vscale x 4 x half> %res
159}
160
; Packed nxv8f16: element 1 -> EXT byte offset #2.
161define <vscale x 8 x half> @splice_nxv8f16_first_idx(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
162; CHECK-LABEL: splice_nxv8f16_first_idx:
163; CHECK:       // %bb.0:
164; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #2
165; CHECK-NEXT:    ret
166  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 1)
167  ret <vscale x 8 x half> %res
168}
169
; Fixed 256-byte VL: last f16 element index 127 -> byte offset #254.
170define <vscale x 8 x half> @splice_nxv8f16_last_idx(<vscale x 8 x half> %a, <vscale x 8 x half> %b) vscale_range(16,16) #0 {
171; CHECK-LABEL: splice_nxv8f16_last_idx:
172; CHECK:       // %bb.0:
173; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #254
174; CHECK-NEXT:    ret
175  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 127)
176  ret <vscale x 8 x half> %res
177}
178
; nxv2f32 uses 64-bit containers (.d forms); trailing element via ptrue
; vl1 + rev + splice.
179define <vscale x 2 x float> @splice_nxv2f32_neg_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
180; CHECK-LABEL: splice_nxv2f32_neg_idx:
181; CHECK:       // %bb.0:
182; CHECK-NEXT:    ptrue p0.d, vl1
183; CHECK-NEXT:    rev p0.d, p0.d
184; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
185; CHECK-NEXT:    ret
186  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -1)
187  ret <vscale x 2 x float> %res
188}
189
; Trailing two elements (vl2).
190define <vscale x 2 x float> @splice_nxv2f32_neg2_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
191; CHECK-LABEL: splice_nxv2f32_neg2_idx:
192; CHECK:       // %bb.0:
193; CHECK-NEXT:    ptrue p0.d, vl2
194; CHECK-NEXT:    rev p0.d, p0.d
195; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
196; CHECK-NEXT:    ret
197  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 -2)
198  ret <vscale x 2 x float> %res
199}
200
; Element 1 in a 64-bit container -> EXT byte offset #8.
201define <vscale x 2 x float> @splice_nxv2f32_first_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) #0 {
202; CHECK-LABEL: splice_nxv2f32_first_idx:
203; CHECK:       // %bb.0:
204; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
205; CHECK-NEXT:    ret
206  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 1)
207  ret <vscale x 2 x float> %res
208}
209
; Fixed 256-byte VL: container index 31 -> byte offset #248.
210define <vscale x 2 x float> @splice_nxv2f32_last_idx(<vscale x 2 x float> %a, <vscale x 2 x float> %b) vscale_range(16,16) #0 {
211; CHECK-LABEL: splice_nxv2f32_last_idx:
212; CHECK:       // %bb.0:
213; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
214; CHECK-NEXT:    ret
215  %res = call <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, i32 31)
216  ret <vscale x 2 x float> %res
217}
218
; Packed nxv4f32: element 1 -> EXT byte offset #4.
219define <vscale x 4 x float> @splice_nxv4f32_first_idx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
220; CHECK-LABEL: splice_nxv4f32_first_idx:
221; CHECK:       // %bb.0:
222; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #4
223; CHECK-NEXT:    ret
224  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 1)
225  ret <vscale x 4 x float> %res
226}
227
; Fixed 256-byte VL: last f32 element index 63 -> byte offset #252.
228define <vscale x 4 x float> @splice_nxv4f32_last_idx(<vscale x 4 x float> %a, <vscale x 4 x float> %b) vscale_range(16,16) #0 {
229; CHECK-LABEL: splice_nxv4f32_last_idx:
230; CHECK:       // %bb.0:
231; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #252
232; CHECK-NEXT:    ret
233  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 63)
234  ret <vscale x 4 x float> %res
235}
236
; Element 1 of f64 -> EXT byte offset #8.
237define <vscale x 2 x double> @splice_nxv2f64_first_idx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
238; CHECK-LABEL: splice_nxv2f64_first_idx:
239; CHECK:       // %bb.0:
240; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
241; CHECK-NEXT:    ret
242  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 1)
243  ret <vscale x 2 x double> %res
244}
245
; Fixed 256-byte VL: last f64 element index 31 -> byte offset #248.
246define <vscale x 2 x double> @splice_nxv2f64_last_idx(<vscale x 2 x double> %a, <vscale x 2 x double> %b) vscale_range(16,16) #0 {
247; CHECK-LABEL: splice_nxv2f64_last_idx:
248; CHECK:       // %bb.0:
249; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
250; CHECK-NEXT:    ret
251  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 31)
252  ret <vscale x 2 x double> %res
253}
254
255; Ensure predicate based splice is promoted to use ZPRs.
; Promotion pattern: each predicate is expanded to a Z vector of 0/1
; (mov zN.<t>, pN/z, #1), the vectors are spliced with EXT, and the result is
; narrowed back to a predicate with AND #1 + CMPNE.
256define <vscale x 2 x i1> @splice_nxv2i1_idx(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
257; CHECK-LABEL: splice_nxv2i1_idx:
258; CHECK:       // %bb.0:
259; CHECK-NEXT:    mov z0.d, p1/z, #1 // =0x1
260; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
261; CHECK-NEXT:    ptrue p0.d
262; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
263; CHECK-NEXT:    and z1.d, z1.d, #0x1
264; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
265; CHECK-NEXT:    ret
266  %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 1)
267  ret <vscale x 2 x i1> %res
268}
269
270; Ensure predicate based splice is promoted to use ZPRs.
; Same pattern at .s granularity; index 2 x 4-byte elements -> EXT #8.
271define <vscale x 4 x i1> @splice_nxv4i1_idx(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
272; CHECK-LABEL: splice_nxv4i1_idx:
273; CHECK:       // %bb.0:
274; CHECK-NEXT:    mov z0.s, p1/z, #1 // =0x1
275; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
276; CHECK-NEXT:    ptrue p0.s
277; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
278; CHECK-NEXT:    and z1.s, z1.s, #0x1
279; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
280; CHECK-NEXT:    ret
281  %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 2)
282  ret <vscale x 4 x i1> %res
283}
284
285; Ensure predicate based splice is promoted to use ZPRs.
; Same pattern at .h granularity; index 4 x 2-byte elements -> EXT #8.
286define <vscale x 8 x i1> @splice_nxv8i1_idx(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
287; CHECK-LABEL: splice_nxv8i1_idx:
288; CHECK:       // %bb.0:
289; CHECK-NEXT:    mov z0.h, p1/z, #1 // =0x1
290; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
291; CHECK-NEXT:    ptrue p0.h
292; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
293; CHECK-NEXT:    and z1.h, z1.h, #0x1
294; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
295; CHECK-NEXT:    ret
296  %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 4)
297  ret <vscale x 8 x i1> %res
298}
299
300; Ensure predicate based splice is promoted to use ZPRs.
; Same pattern at .b granularity; index 8 x 1-byte elements -> EXT #8.
301define <vscale x 16 x i1> @splice_nxv16i1_idx(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
302; CHECK-LABEL: splice_nxv16i1_idx:
303; CHECK:       // %bb.0:
304; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
305; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
306; CHECK-NEXT:    ptrue p0.b
307; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #8
308; CHECK-NEXT:    and z1.b, z1.b, #0x1
309; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
310; CHECK-NEXT:    ret
311  %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 8)
312  ret <vscale x 16 x i1> %res
313}
314
315; Verify promote type legalisation works as expected.
; nxv2i8 is promoted to 64-bit containers, so element 1 -> EXT byte offset #8.
316define <vscale x 2 x i8> @splice_nxv2i8_idx(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
317; CHECK-LABEL: splice_nxv2i8_idx:
318; CHECK:       // %bb.0:
319; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
320; CHECK-NEXT:    ret
321  %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 1)
322  ret <vscale x 2 x i8> %res
323}
324
325; Verify splitvec type legalisation works as expected.
; nxv8i32 is split across two Z registers, so the splice goes through the
; stack: all four input registers are stored, then the result is reloaded
; from base + 8 bytes (index 2 x i32 -> the "orr x8, x8, #0x8").
326define <vscale x 8 x i32> @splice_nxv8i32_idx(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) #0 {
327; CHECK-LABEL: splice_nxv8i32_idx:
328; CHECK:       // %bb.0:
329; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
330; CHECK-NEXT:    addvl sp, sp, #-4
331; CHECK-NEXT:    ptrue p0.s
332; CHECK-NEXT:    mov x8, sp
333; CHECK-NEXT:    orr x8, x8, #0x8
334; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
335; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
336; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
337; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
338; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
339; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
340; CHECK-NEXT:    addvl sp, sp, #4
341; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
342; CHECK-NEXT:    ret
343  %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 2)
344  ret <vscale x 8 x i32> %res
345}
346
347; Verify splitvec type legalisation works as expected.
; With vscale_range(2,16) index 16 may exceed the runtime element count, so
; the stack lowering clamps the splice point at runtime: the cmp/csel pair
; picks min(16, last-element-index) before forming the reload address.
348define <vscale x 16 x float> @splice_nxv16f32_16(<vscale x 16 x float> %a, <vscale x 16 x float> %b) vscale_range(2,16) #0 {
349; CHECK-LABEL: splice_nxv16f32_16:
350; CHECK:       // %bb.0:
351; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
352; CHECK-NEXT:    addvl sp, sp, #-8
353; CHECK-NEXT:    rdvl x8, #1
354; CHECK-NEXT:    mov w9, #16 // =0x10
355; CHECK-NEXT:    ptrue p0.s
356; CHECK-NEXT:    sub x8, x8, #1
357; CHECK-NEXT:    cmp x8, #16
358; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
359; CHECK-NEXT:    csel x8, x8, x9, lo
360; CHECK-NEXT:    mov x9, sp
361; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
362; CHECK-NEXT:    add x10, x9, x8, lsl #2
363; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
364; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
365; CHECK-NEXT:    st1w { z7.s }, p0, [sp, #7, mul vl]
366; CHECK-NEXT:    st1w { z4.s }, p0, [sp, #4, mul vl]
367; CHECK-NEXT:    st1w { z5.s }, p0, [sp, #5, mul vl]
368; CHECK-NEXT:    st1w { z6.s }, p0, [sp, #6, mul vl]
369; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x9, x8, lsl #2]
370; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, #1, mul vl]
371; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x10, #2, mul vl]
372; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x10, #3, mul vl]
373; CHECK-NEXT:    addvl sp, sp, #8
374; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
375; CHECK-NEXT:    ret
376  %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 16)
377  ret <vscale x 16 x float> %res
378}
379
380;
381; VECTOR_SPLICE (trailing elements)
382;
383
; Trailing N elements: ptrue vlN sets the first N lanes, rev moves them to the
; tail, and SPLICE then takes the last N elements of %a followed by %b.
384define <vscale x 16 x i8> @splice_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
385; CHECK-LABEL: splice_nxv16i8:
386; CHECK:       // %bb.0:
387; CHECK-NEXT:    ptrue p0.b, vl16
388; CHECK-NEXT:    rev p0.b, p0.b
389; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
390; CHECK-NEXT:    ret
391  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -16)
392  ret <vscale x 16 x i8> %res
393}
394
; vscale_range(2,16) guarantees at least 32 elements, so vl32 is in range.
395define <vscale x 16 x i8> @splice_nxv16i8_neg32(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) vscale_range(2,16) #0 {
396; CHECK-LABEL: splice_nxv16i8_neg32:
397; CHECK:       // %bb.0:
398; CHECK-NEXT:    ptrue p0.b, vl32
399; CHECK-NEXT:    rev p0.b, p0.b
400; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
401; CHECK-NEXT:    ret
402  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -32)
403  ret <vscale x 16 x i8> %res
404}
405
; vscale_range(4,16) guarantees at least 64 elements, so vl64 is in range.
406define <vscale x 16 x i8> @splice_nxv16i8_neg64(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) vscale_range(4,16) #0 {
407; CHECK-LABEL: splice_nxv16i8_neg64:
408; CHECK:       // %bb.0:
409; CHECK-NEXT:    ptrue p0.b, vl64
410; CHECK-NEXT:    rev p0.b, p0.b
411; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
412; CHECK-NEXT:    ret
413  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -64)
414  ret <vscale x 16 x i8> %res
415}
416
; vscale_range(8,16) guarantees at least 128 elements, so vl128 is in range.
417define <vscale x 16 x i8> @splice_nxv16i8_neg128(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) vscale_range(8,16) #0 {
418; CHECK-LABEL: splice_nxv16i8_neg128:
419; CHECK:       // %bb.0:
420; CHECK-NEXT:    ptrue p0.b, vl128
421; CHECK-NEXT:    rev p0.b, p0.b
422; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
423; CHECK-NEXT:    ret
424  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -128)
425  ret <vscale x 16 x i8> %res
426}
427
; Fixed 256-element VL: the whole of %a is selected via vl256.
428define <vscale x 16 x i8> @splice_nxv16i8_neg256(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) vscale_range(16,16) #0 {
429; CHECK-LABEL: splice_nxv16i8_neg256:
430; CHECK:       // %bb.0:
431; CHECK-NEXT:    ptrue p0.b, vl256
432; CHECK-NEXT:    rev p0.b, p0.b
433; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
434; CHECK-NEXT:    ret
435  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -256)
436  ret <vscale x 16 x i8> %res
437}
438
; Index -1 (last element of %a) maps to the ptrue vl1 + rev + splice pattern.
439define <vscale x 16 x i8> @splice_nxv16i8_1(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
440; CHECK-LABEL: splice_nxv16i8_1:
441; CHECK:       // %bb.0:
442; CHECK-NEXT:    ptrue p0.b, vl1
443; CHECK-NEXT:    rev p0.b, p0.b
444; CHECK-NEXT:    splice z0.b, p0, z0.b, z1.b
445; CHECK-NEXT:    ret
446  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -1)
447  ret <vscale x 16 x i8> %res
448}
449
; 17 is not a valid PTRUE vl-immediate pattern, so the lowering falls back to
; the stack: store both inputs, clamp the trailing count with csel
; (min(VL bytes, 17)), and reload from end-of-%a minus that count.
450define <vscale x 16 x i8> @splice_nxv16i8_neg17(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) vscale_range(2,16) #0 {
451; CHECK-LABEL: splice_nxv16i8_neg17:
452; CHECK:       // %bb.0:
453; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
454; CHECK-NEXT:    addvl sp, sp, #-2
455; CHECK-NEXT:    rdvl x8, #1
456; CHECK-NEXT:    ptrue p0.b
457; CHECK-NEXT:    mov w9, #17 // =0x11
458; CHECK-NEXT:    cmp x8, #17
459; CHECK-NEXT:    mov x10, sp
460; CHECK-NEXT:    csel x9, x8, x9, lo
461; CHECK-NEXT:    add x8, x10, x8
462; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
463; CHECK-NEXT:    st1b { z1.b }, p0, [sp, #1, mul vl]
464; CHECK-NEXT:    sub x8, x8, x9
465; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x8]
466; CHECK-NEXT:    addvl sp, sp, #2
467; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
468; CHECK-NEXT:    ret
469  %res = call <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i32 -17)
470  ret <vscale x 16 x i8> %res
471}
472
; Trailing 8 i16 elements via ptrue vl8 + rev + splice.
473define <vscale x 8 x i16> @splice_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
474; CHECK-LABEL: splice_nxv8i16:
475; CHECK:       // %bb.0:
476; CHECK-NEXT:    ptrue p0.h, vl8
477; CHECK-NEXT:    rev p0.h, p0.h
478; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
479; CHECK-NEXT:    ret
480  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -8)
481  ret <vscale x 8 x i16> %res
482}
483
; Last element (-1) via vl1.
484define <vscale x 8 x i16> @splice_nxv8i16_1(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
485; CHECK-LABEL: splice_nxv8i16_1:
486; CHECK:       // %bb.0:
487; CHECK-NEXT:    ptrue p0.h, vl1
488; CHECK-NEXT:    rev p0.h, p0.h
489; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
490; CHECK-NEXT:    ret
491  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -1)
492  ret <vscale x 8 x i16> %res
493}
494
; -9 i16 elements = 18 bytes; 9 has no PTRUE vl pattern, so this uses the
; stack fallback with a csel clamp against 18 bytes.
495define <vscale x 8 x i16> @splice_nxv8i16_neg9(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) vscale_range(2,16) #0 {
496; CHECK-LABEL: splice_nxv8i16_neg9:
497; CHECK:       // %bb.0:
498; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
499; CHECK-NEXT:    addvl sp, sp, #-2
500; CHECK-NEXT:    rdvl x8, #1
501; CHECK-NEXT:    ptrue p0.h
502; CHECK-NEXT:    mov w9, #18 // =0x12
503; CHECK-NEXT:    cmp x8, #18
504; CHECK-NEXT:    mov x10, sp
505; CHECK-NEXT:    csel x9, x8, x9, lo
506; CHECK-NEXT:    add x8, x10, x8
507; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
508; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
509; CHECK-NEXT:    sub x8, x8, x9
510; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
511; CHECK-NEXT:    addvl sp, sp, #2
512; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
513; CHECK-NEXT:    ret
514  %res = call <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, i32 -9)
515  ret <vscale x 8 x i16> %res
516}
517
; Trailing 4 i32 elements via ptrue vl4 + rev + splice.
518define <vscale x 4 x i32> @splice_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
519; CHECK-LABEL: splice_nxv4i32:
520; CHECK:       // %bb.0:
521; CHECK-NEXT:    ptrue p0.s, vl4
522; CHECK-NEXT:    rev p0.s, p0.s
523; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
524; CHECK-NEXT:    ret
525  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -4)
526  ret <vscale x 4 x i32> %res
527}
528
; Last element (-1) via vl1.
529define <vscale x 4 x i32> @splice_nxv4i32_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) #0 {
530; CHECK-LABEL: splice_nxv4i32_1:
531; CHECK:       // %bb.0:
532; CHECK-NEXT:    ptrue p0.s, vl1
533; CHECK-NEXT:    rev p0.s, p0.s
534; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
535; CHECK-NEXT:    ret
536  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -1)
537  ret <vscale x 4 x i32> %res
538}
539
; vl5 is a valid PTRUE pattern and vscale_range(2,16) guarantees >= 8
; elements, so -5 stays on the predicate path (no stack fallback).
540define <vscale x 4 x i32> @splice_nxv4i32_neg5(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) vscale_range(2,16) #0 {
541; CHECK-LABEL: splice_nxv4i32_neg5:
542; CHECK:       // %bb.0:
543; CHECK-NEXT:    ptrue p0.s, vl5
544; CHECK-NEXT:    rev p0.s, p0.s
545; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
546; CHECK-NEXT:    ret
547  %res = call <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 -5)
548  ret <vscale x 4 x i32> %res
549}
550
; Trailing 2 i64 elements via ptrue vl2 + rev + splice.
551define <vscale x 2 x i64> @splice_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
552; CHECK-LABEL: splice_nxv2i64:
553; CHECK:       // %bb.0:
554; CHECK-NEXT:    ptrue p0.d, vl2
555; CHECK-NEXT:    rev p0.d, p0.d
556; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
557; CHECK-NEXT:    ret
558  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -2)
559  ret <vscale x 2 x i64> %res
560}
561
; Last element (-1) via vl1.
562define <vscale x 2 x i64> @splice_nxv2i64_1(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) #0 {
563; CHECK-LABEL: splice_nxv2i64_1:
564; CHECK:       // %bb.0:
565; CHECK-NEXT:    ptrue p0.d, vl1
566; CHECK-NEXT:    rev p0.d, p0.d
567; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
568; CHECK-NEXT:    ret
569  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -1)
570  ret <vscale x 2 x i64> %res
571}
572
; vl3 is valid and vscale_range(2,16) guarantees >= 4 elements.
573define <vscale x 2 x i64> @splice_nxv2i64_neg3(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b) vscale_range(2,16) #0 {
574; CHECK-LABEL: splice_nxv2i64_neg3:
575; CHECK:       // %bb.0:
576; CHECK-NEXT:    ptrue p0.d, vl3
577; CHECK-NEXT:    rev p0.d, p0.d
578; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
579; CHECK-NEXT:    ret
580  %res = call <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, i32 -3)
581  ret <vscale x 2 x i64> %res
582}
583
; Trailing 8 f16 elements via ptrue vl8 + rev + splice.
584define <vscale x 8 x half> @splice_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
585; CHECK-LABEL: splice_nxv8f16:
586; CHECK:       // %bb.0:
587; CHECK-NEXT:    ptrue p0.h, vl8
588; CHECK-NEXT:    rev p0.h, p0.h
589; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
590; CHECK-NEXT:    ret
591  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -8)
592  ret <vscale x 8 x half> %res
593}
594
; Last element (-1) via vl1.
595define <vscale x 8 x half> @splice_nxv8f16_1(<vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
596; CHECK-LABEL: splice_nxv8f16_1:
597; CHECK:       // %bb.0:
598; CHECK-NEXT:    ptrue p0.h, vl1
599; CHECK-NEXT:    rev p0.h, p0.h
600; CHECK-NEXT:    splice z0.h, p0, z0.h, z1.h
601; CHECK-NEXT:    ret
602  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -1)
603  ret <vscale x 8 x half> %res
604}
605
; Mirrors splice_nxv8i16_neg9: 9 f16 elements = 18 bytes, no PTRUE vl
; pattern for 9, so the lowering spills to the stack and clamps with csel.
606define <vscale x 8 x half> @splice_nxv8f16_neg9(<vscale x 8 x half> %a, <vscale x 8 x half> %b) vscale_range(2,16) #0 {
607; CHECK-LABEL: splice_nxv8f16_neg9:
608; CHECK:       // %bb.0:
609; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
610; CHECK-NEXT:    addvl sp, sp, #-2
611; CHECK-NEXT:    rdvl x8, #1
612; CHECK-NEXT:    ptrue p0.h
613; CHECK-NEXT:    mov w9, #18 // =0x12
614; CHECK-NEXT:    cmp x8, #18
615; CHECK-NEXT:    mov x10, sp
616; CHECK-NEXT:    csel x9, x8, x9, lo
617; CHECK-NEXT:    add x8, x10, x8
618; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
619; CHECK-NEXT:    st1h { z1.h }, p0, [sp, #1, mul vl]
620; CHECK-NEXT:    sub x8, x8, x9
621; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x8]
622; CHECK-NEXT:    addvl sp, sp, #2
623; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
624; CHECK-NEXT:    ret
625  %res = call <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, i32 -9)
626  ret <vscale x 8 x half> %res
627}
628
; Trailing 4 f32 elements via ptrue vl4 + rev + splice.
629define <vscale x 4 x float> @splice_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
630; CHECK-LABEL: splice_nxv4f32:
631; CHECK:       // %bb.0:
632; CHECK-NEXT:    ptrue p0.s, vl4
633; CHECK-NEXT:    rev p0.s, p0.s
634; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
635; CHECK-NEXT:    ret
636  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -4)
637  ret <vscale x 4 x float> %res
638}
639
; Last element (-1) via vl1.
640define <vscale x 4 x float> @splice_nxv4f32_1(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
641; CHECK-LABEL: splice_nxv4f32_1:
642; CHECK:       // %bb.0:
643; CHECK-NEXT:    ptrue p0.s, vl1
644; CHECK-NEXT:    rev p0.s, p0.s
645; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
646; CHECK-NEXT:    ret
647  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -1)
648  ret <vscale x 4 x float> %res
649}
650
; vl5 is valid and vscale_range(2,16) guarantees >= 8 elements.
651define <vscale x 4 x float> @splice_nxv4f32_neg5(<vscale x 4 x float> %a, <vscale x 4 x float> %b) vscale_range(2,16) #0 {
652; CHECK-LABEL: splice_nxv4f32_neg5:
653; CHECK:       // %bb.0:
654; CHECK-NEXT:    ptrue p0.s, vl5
655; CHECK-NEXT:    rev p0.s, p0.s
656; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
657; CHECK-NEXT:    ret
658  %res = call <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 -5)
659  ret <vscale x 4 x float> %res
660}
661
; Trailing 2 f64 elements via ptrue vl2 + rev + splice.
662define <vscale x 2 x double> @splice_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
663; CHECK-LABEL: splice_nxv2f64:
664; CHECK:       // %bb.0:
665; CHECK-NEXT:    ptrue p0.d, vl2
666; CHECK-NEXT:    rev p0.d, p0.d
667; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
668; CHECK-NEXT:    ret
669  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -2)
670  ret <vscale x 2 x double> %res
671}
672
; Last element (-1) via vl1.
673define <vscale x 2 x double> @splice_nxv2f64_1(<vscale x 2 x double> %a, <vscale x 2 x double> %b) #0 {
674; CHECK-LABEL: splice_nxv2f64_1:
675; CHECK:       // %bb.0:
676; CHECK-NEXT:    ptrue p0.d, vl1
677; CHECK-NEXT:    rev p0.d, p0.d
678; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
679; CHECK-NEXT:    ret
680  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -1)
681  ret <vscale x 2 x double> %res
682}
683
; vl3 is valid and vscale_range(2,16) guarantees >= 4 elements.
684define <vscale x 2 x double> @splice_nxv2f64_neg3(<vscale x 2 x double> %a, <vscale x 2 x double> %b) vscale_range(2,16) #0 {
685; CHECK-LABEL: splice_nxv2f64_neg3:
686; CHECK:       // %bb.0:
687; CHECK-NEXT:    ptrue p0.d, vl3
688; CHECK-NEXT:    rev p0.d, p0.d
689; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
690; CHECK-NEXT:    ret
691  %res = call <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, i32 -3)
692  ret <vscale x 2 x double> %res
693}
694
695define <vscale x 2 x bfloat> @splice_nxv2bf16_neg_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
696; CHECK-LABEL: splice_nxv2bf16_neg_idx:
697; CHECK:       // %bb.0:
698; CHECK-NEXT:    ptrue p0.d, vl1
699; CHECK-NEXT:    rev p0.d, p0.d
700; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
701; CHECK-NEXT:    ret
702  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -1)
703  ret <vscale x 2 x bfloat> %res
704}
705
706define <vscale x 2 x bfloat> @splice_nxv2bf16_neg2_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
707; CHECK-LABEL: splice_nxv2bf16_neg2_idx:
708; CHECK:       // %bb.0:
709; CHECK-NEXT:    ptrue p0.d, vl2
710; CHECK-NEXT:    rev p0.d, p0.d
711; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
712; CHECK-NEXT:    ret
713  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 -2)
714  ret <vscale x 2 x bfloat> %res
715}
716
; A positive, always-in-range index lowers to a single EXT. For nxv2bf16
; each element lives in a 64-bit container, so index 1 becomes byte
; offset #8.
define <vscale x 2 x bfloat> @splice_nxv2bf16_first_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) #0 {
; CHECK-LABEL: splice_nxv2bf16_first_idx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #8
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 1)
  ret <vscale x 2 x bfloat> %res
}
725
; vscale_range(16,16) fixes the vector length at 2048 bits (32 elements
; here), so the maximum index 31 still fits EXT's byte immediate:
; 31 * 8 bytes = #248.
define <vscale x 2 x bfloat> @splice_nxv2bf16_last_idx(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) vscale_range(16,16) #0 {
; CHECK-LABEL: splice_nxv2bf16_last_idx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #248
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b, i32 31)
  ret <vscale x 2 x bfloat> %res
}
734
; nxv4bf16 uses 32-bit containers, so the PTRUE/REV/SPLICE sequence for
; index -1 operates on the .s element size.
define <vscale x 4 x bfloat> @splice_nxv4bf16_neg_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
; CHECK-LABEL: splice_nxv4bf16_neg_idx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl1
; CHECK-NEXT:    rev p0.s, p0.s
; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -1)
  ret <vscale x 4 x bfloat> %res
}
745
; Index -3 on nxv4bf16: a minimum of 4 elements is always available, so
; a fixed PTRUE vl3 (reversed) forms the splice predicate.
define <vscale x 4 x bfloat> @splice_nxv4bf16_neg3_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
; CHECK-LABEL: splice_nxv4bf16_neg3_idx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl3
; CHECK-NEXT:    rev p0.s, p0.s
; CHECK-NEXT:    splice z0.s, p0, z0.s, z1.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 -3)
  ret <vscale x 4 x bfloat> %res
}
756
; Positive index 1 with 32-bit containers gives an EXT byte offset of #4.
define <vscale x 4 x bfloat> @splice_nxv4bf16_first_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) #0 {
; CHECK-LABEL: splice_nxv4bf16_first_idx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #4
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 1)
  ret <vscale x 4 x bfloat> %res
}
765
; Fixed 2048-bit VL (64 elements here): last index 63 * 4 bytes = #252,
; within EXT's 0-255 immediate range.
define <vscale x 4 x bfloat> @splice_nxv4bf16_last_idx(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) vscale_range(16,16) #0 {
; CHECK-LABEL: splice_nxv4bf16_last_idx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #252
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b, i32 63)
  ret <vscale x 4 x bfloat> %res
}
774
; Fully-packed nxv8bf16 has 16-bit elements: index 1 -> EXT #2.
define <vscale x 8 x bfloat> @splice_nxv8bf16_first_idx(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) #0 {
; CHECK-LABEL: splice_nxv8bf16_first_idx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #2
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 1)
  ret <vscale x 8 x bfloat> %res
}
783
; Fixed 2048-bit VL (128 elements here): last index 127 * 2 bytes = #254.
define <vscale x 8 x bfloat> @splice_nxv8bf16_last_idx(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) vscale_range(16,16) #0 {
; CHECK-LABEL: splice_nxv8bf16_last_idx:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext z0.b, z0.b, z1.b, #254
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b, i32 127)
  ret <vscale x 8 x bfloat> %res
}
792
; Ensure predicate based splice is promoted to use ZPRs.
; The i1 inputs are materialised as 0/1 data vectors (mov pX/z, #1), the
; splice is performed in ZPRs, and the result is converted back to a
; predicate via AND #0x1 and CMPNE against zero.
define <vscale x 2 x i1> @splice_nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv2i1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p2.d, vl1
; CHECK-NEXT:    mov z0.d, p1/z, #1 // =0x1
; CHECK-NEXT:    mov z1.d, p0/z, #1 // =0x1
; CHECK-NEXT:    rev p0.d, p2.d
; CHECK-NEXT:    splice z1.d, p0, z1.d, z0.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    and z1.d, z1.d, #0x1
; CHECK-NEXT:    cmpne p0.d, p0/z, z1.d, #0
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1> %a, <vscale x 2 x i1> %b, i32 -1)
  ret <vscale x 2 x i1> %res
}
809
; Ensure predicate based splice is promoted to use ZPRs.
; Same promote-then-compare pattern as splice_nxv2i1, at the .s element
; size.
define <vscale x 4 x i1> @splice_nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv4i1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p2.s, vl1
; CHECK-NEXT:    mov z0.s, p1/z, #1 // =0x1
; CHECK-NEXT:    mov z1.s, p0/z, #1 // =0x1
; CHECK-NEXT:    rev p0.s, p2.s
; CHECK-NEXT:    splice z1.s, p0, z1.s, z0.s
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    and z1.s, z1.s, #0x1
; CHECK-NEXT:    cmpne p0.s, p0/z, z1.s, #0
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1> %a, <vscale x 4 x i1> %b, i32 -1)
  ret <vscale x 4 x i1> %res
}
826
; Ensure predicate based splice is promoted to use ZPRs.
; Same promote-then-compare pattern as splice_nxv2i1, at the .h element
; size.
define <vscale x 8 x i1> @splice_nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv8i1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p2.h, vl1
; CHECK-NEXT:    mov z0.h, p1/z, #1 // =0x1
; CHECK-NEXT:    mov z1.h, p0/z, #1 // =0x1
; CHECK-NEXT:    rev p0.h, p2.h
; CHECK-NEXT:    splice z1.h, p0, z1.h, z0.h
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    and z1.h, z1.h, #0x1
; CHECK-NEXT:    cmpne p0.h, p0/z, z1.h, #0
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1> %a, <vscale x 8 x i1> %b, i32 -1)
  ret <vscale x 8 x i1> %res
}
843
; Ensure predicate based splice is promoted to use ZPRs.
; Same promote-then-compare pattern as splice_nxv2i1, at the .b element
; size.
define <vscale x 16 x i1> @splice_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b) #0 {
; CHECK-LABEL: splice_nxv16i1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p2.b, vl1
; CHECK-NEXT:    mov z0.b, p1/z, #1 // =0x1
; CHECK-NEXT:    mov z1.b, p0/z, #1 // =0x1
; CHECK-NEXT:    rev p0.b, p2.b
; CHECK-NEXT:    splice z1.b, p0, z1.b, z0.b
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    and z1.b, z1.b, #0x1
; CHECK-NEXT:    cmpne p0.b, p0/z, z1.b, #0
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, i32 -1)
  ret <vscale x 16 x i1> %res
}
860
; Verify promote type legalisation works as expected.
; nxv2i8 is promoted into 64-bit containers, so the PTRUE/REV/SPLICE
; sequence for index -2 operates on the .d element size.
define <vscale x 2 x i8> @splice_nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b) #0 {
; CHECK-LABEL: splice_nxv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    rev p0.d, p0.d
; CHECK-NEXT:    splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8> %a, <vscale x 2 x i8> %b, i32 -2)
  ret <vscale x 2 x i8> %res
}
872
; Verify splitvec type legalisation works as expected.
; Each operand spans two Z registers, so the lowering goes through a
; stack temporary: both operands are stored contiguously, and the result
; is loaded starting 8 elements (x9 = -8, lsl #2) before the end of %a's
; storage.
define <vscale x 8 x i32> @splice_nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b) #0 {
; CHECK-LABEL: splice_nxv8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-4
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    rdvl x8, #2
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    add x8, x9, x8
; CHECK-NEXT:    mov x9, #-8 // =0xfffffffffffffff8
; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT:    sub x10, x8, #32
; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8, x9, lsl #2]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x10, #1, mul vl]
; CHECK-NEXT:    addvl sp, sp, #4
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %b, i32 -8)
  ret <vscale x 8 x i32> %res
}
897
; Verify splitvec type legalisation works as expected.
; Four-register operands go through a stack temporary. The 68-byte
; trailing offset (17 elements * 4 bytes) is clamped at runtime against
; the size of %a's storage (rdvl #4, cmp/csel) before the load address
; is formed.
define <vscale x 16 x float> @splice_nxv16f32_neg17(<vscale x 16 x float> %a, <vscale x 16 x float> %b) vscale_range(2,16) #0 {
; CHECK-LABEL: splice_nxv16f32_neg17:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-8
; CHECK-NEXT:    rdvl x8, #4
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov w9, #68 // =0x44
; CHECK-NEXT:    cmp x8, #68
; CHECK-NEXT:    mov x10, sp
; CHECK-NEXT:    csel x9, x8, x9, lo
; CHECK-NEXT:    add x8, x10, x8
; CHECK-NEXT:    st1w { z3.s }, p0, [sp, #3, mul vl]
; CHECK-NEXT:    st1w { z2.s }, p0, [sp, #2, mul vl]
; CHECK-NEXT:    sub x8, x8, x9
; CHECK-NEXT:    st1w { z1.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
; CHECK-NEXT:    st1w { z7.s }, p0, [sp, #7, mul vl]
; CHECK-NEXT:    st1w { z4.s }, p0, [sp, #4, mul vl]
; CHECK-NEXT:    st1w { z5.s }, p0, [sp, #5, mul vl]
; CHECK-NEXT:    st1w { z6.s }, p0, [sp, #6, mul vl]
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x8, #1, mul vl]
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x8, #2, mul vl]
; CHECK-NEXT:    ld1w { z3.s }, p0/z, [x8, #3, mul vl]
; CHECK-NEXT:    addvl sp, sp, #8
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b, i32 -17)
  ret <vscale x 16 x float> %res
}
930
931declare <vscale x 2 x i1> @llvm.vector.splice.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
932declare <vscale x 4 x i1> @llvm.vector.splice.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
933declare <vscale x 8 x i1> @llvm.vector.splice.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
934declare <vscale x 16 x i1> @llvm.vector.splice.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
935
936declare <vscale x 2 x i8> @llvm.vector.splice.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, i32)
937declare <vscale x 16 x i8> @llvm.vector.splice.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, i32)
938declare <vscale x 8 x i16> @llvm.vector.splice.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, i32)
939declare <vscale x 4 x i32> @llvm.vector.splice.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i32)
940declare <vscale x 8 x i32> @llvm.vector.splice.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, i32)
941declare <vscale x 2 x i64> @llvm.vector.splice.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, i32)
942
943declare <vscale x 2 x half> @llvm.vector.splice.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, i32)
944declare <vscale x 4 x half> @llvm.vector.splice.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, i32)
945declare <vscale x 8 x half> @llvm.vector.splice.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, i32)
946declare <vscale x 2 x float> @llvm.vector.splice.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, i32)
947declare <vscale x 4 x float> @llvm.vector.splice.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, i32)
948declare <vscale x 16 x float> @llvm.vector.splice.nxv16f32(<vscale x 16 x float>, <vscale x 16 x float>, i32)
949declare <vscale x 2 x double> @llvm.vector.splice.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, i32)
950
951declare <vscale x 2 x bfloat> @llvm.vector.splice.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i32)
952declare <vscale x 4 x bfloat> @llvm.vector.splice.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i32)
953declare <vscale x 8 x bfloat> @llvm.vector.splice.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
954
955attributes #0 = { nounwind "target-features"="+sve" }
956