; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+bf16 < %s | FileCheck %s --check-prefixes=CHECK

define <vscale x 2 x i64> @insert_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
; CHECK-LABEL: insert_v2i64_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.d, p0/m, z1.d
; CHECK-NEXT:    ret
  %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 0)
  ret <vscale x 2 x i64> %retval
}

define <vscale x 2 x i64> @insert_v2i64_nxv2i64_idx2(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind {
; CHECK-LABEL: insert_v2i64_nxv2i64_idx2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntd x8
; CHECK-NEXT:    mov w9, #2 // =0x2
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    sub x8, x8, #2
; CHECK-NEXT:    cmp x8, #2
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    lsl x8, x8, #3
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2)
  ret <vscale x 2 x i64> %retval
}
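
; A worked example of the clamp sequence above (a sketch of the generic SVE
; lowering, reasoning from the emitted code): the element index is clamped to
;   idx' = umin(2, cntd - 2), with cntd = 2 * vscale,
; computed by the sub/cmp/csel triple, and lsl #3 converts i64 elements to
; bytes. For vscale = 1 (128-bit VL) the q-store lands at [sp + 0]; for
; vscale >= 2 it lands at [sp + 16].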

define <vscale x 4 x i32> @insert_v4i32_nxv4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
; CHECK-LABEL: insert_v4i32_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.s, p0/m, z1.s
; CHECK-NEXT:    ret
  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
  ret <vscale x 4 x i32> %retval
}

define <vscale x 4 x i32> @insert_v4i32_nxv4i32_idx4(<vscale x 4 x i32> %vec, <4 x i32> %subvec) nounwind {
; CHECK-LABEL: insert_v4i32_nxv4i32_idx4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cntw x8
; CHECK-NEXT:    mov w9, #4 // =0x4
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    sub x8, x8, #4
; CHECK-NEXT:    cmp x8, #4
; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    lsl x8, x8, #2
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 4)
  ret <vscale x 4 x i32> %retval
}

define <vscale x 8 x i16> @insert_v8i16_nxv8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
; CHECK-LABEL: insert_v8i16_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.h, p0/m, z1.h
; CHECK-NEXT:    ret
  %retval = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 0)
  ret <vscale x 8 x i16> %retval
}

define <vscale x 8 x i16> @insert_v8i16_nxv8i16_idx8(<vscale x 8 x i16> %vec, <8 x i16> %subvec) nounwind {
; CHECK-LABEL: insert_v8i16_nxv8i16_idx8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    cnth x8
; CHECK-NEXT:    mov w9, #8 // =0x8
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    sub x8, x8, #8
; CHECK-NEXT:    cmp x8, #8
; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    lsl x8, x8, #1
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> %vec, <8 x i16> %subvec, i64 8)
  ret <vscale x 8 x i16> %retval
}

define <vscale x 16 x i8> @insert_v16i8_nxv16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
; CHECK-LABEL: insert_v16i8_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.b, p0/m, z1.b
; CHECK-NEXT:    ret
  %retval = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 0)
  ret <vscale x 16 x i8> %retval
}

define <vscale x 16 x i8> @insert_v16i8_nxv16i8_idx16(<vscale x 16 x i8> %vec, <16 x i8> %subvec) nounwind {
; CHECK-LABEL: insert_v16i8_nxv16i8_idx16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    rdvl x8, #1
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    mov w9, #16 // =0x10
; CHECK-NEXT:    sub x8, x8, #16
; CHECK-NEXT:    cmp x8, #16
; CHECK-NEXT:    st1b { z0.b }, p0, [sp]
; CHECK-NEXT:    csel x8, x8, x9, lo
; CHECK-NEXT:    mov x9, sp
; CHECK-NEXT:    str q1, [x9, x8]
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> %vec, <16 x i8> %subvec, i64 16)
  ret <vscale x 16 x i8> %retval
}


; Insert subvectors into illegal vectors

define void @insert_nxv8i64_nxv16i64(<vscale x 8 x i64> %sv0, <vscale x 8 x i64> %sv1, ptr %out) {
; CHECK-LABEL: insert_nxv8i64_nxv16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    st1d { z7.d }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    st1d { z6.d }, p0, [x0, #6, mul vl]
; CHECK-NEXT:    st1d { z5.d }, p0, [x0, #5, mul vl]
; CHECK-NEXT:    st1d { z4.d }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
  %v = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> %v0, <vscale x 8 x i64> %sv1, i64 8)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}
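
; A note on the addressing above (the generic SVE rule, not specific to this
; test): <vscale x 16 x i64> is illegal, so it is split across eight legal
; <vscale x 2 x i64> parts in z0-z7, and the immediate in [x0, #i, mul vl]
; scales by one vector's memory footprint, i.e.
;   byte offset = i * cntd * 8 = i * 16 * vscale,
; so the eight st1d stores tile the destination back to back.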

define void @insert_nxv8i64_nxv16i64_lo(<vscale x 8 x i64> %sv0, ptr %out) {
; CHECK-LABEL: insert_nxv8i64_nxv16i64_lo:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %v = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 0)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}

define void @insert_nxv8i64_nxv16i64_hi(<vscale x 8 x i64> %sv0, ptr %out) {
; CHECK-LABEL: insert_nxv8i64_nxv16i64_hi:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    st1d { z3.d }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #6, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #5, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %v = call <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64> undef, <vscale x 8 x i64> %sv0, i64 8)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}

define void @insert_v2i64_nxv16i64(<2 x i64> %sv0, <2 x i64> %sv1, ptr %out) uwtable {
; CHECK-LABEL: insert_v2i64_nxv16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-4
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    str q1, [sp, #32]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #3, mul vl]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp, #2, mul vl]
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT:    ld1d { z3.d }, p0/z, [sp]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    st1d { z2.d }, p0, [x0, #1, mul vl]
; CHECK-NEXT:    st1d { z3.d }, p0, [x0]
; CHECK-NEXT:    addvl sp, sp, #4
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> %v0, <2 x i64> %sv1, i64 4)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}
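
; A worked offset for the spill above: the fixed <2 x i64> insert at element
; index 4 becomes the plain "str q1, [sp, #32]" (4 * 8 bytes). No runtime
; clamp is needed because index 4 plus two elements always fits within the
; minimum size of <vscale x 16 x i64> (16 elements at vscale = 1).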

define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) {
; CHECK-LABEL: insert_v2i64_nxv16i64_lo0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %sv = load <2 x i64>, ptr %psv
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}

define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) uwtable {
; CHECK-LABEL: insert_v2i64_nxv16i64_lo2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-2
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    str q0, [sp, #16]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [sp]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1, #1, mul vl]
; CHECK-NEXT:    st1d { z1.d }, p0, [x1]
; CHECK-NEXT:    addvl sp, sp, #2
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
  %sv = load <2 x i64>, ptr %psv
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}


; Insert subvectors that need widening

define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_undef() nounwind {
; CHECK-LABEL: insert_nxv1i32_nxv4i32_undef:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z0.s, #1 // =0x1
; CHECK-NEXT:    ret
entry:
  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> undef, <vscale x 1 x i32> splat(i32 1), i64 0)
  ret <vscale x 4 x i32> %retval
}

define <vscale x 6 x i16> @insert_nxv1i16_nxv6i16_undef() nounwind {
; CHECK-LABEL: insert_nxv1i16_nxv6i16_undef:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z0.h, #1 // =0x1
; CHECK-NEXT:    ret
entry:
  %retval = call <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16> undef, <vscale x 1 x i16> splat(i16 1), i64 0)
  ret <vscale x 6 x i16> %retval
}

define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_undef(<vscale x 1 x float> %subvec) nounwind {
; CHECK-LABEL: insert_nxv1f32_nxv4f32_undef:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    ret
entry:
  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> undef, <vscale x 1 x float> %subvec, i64 0)
  ret <vscale x 4 x float> %retval
}

; This tests promotion of the input operand to INSERT_SUBVECTOR.
define <vscale x 8 x i16> @insert_nxv8i16_nxv2i16(<vscale x 8 x i16> %vec, <vscale x 2 x i16> %in) nounwind {
; CHECK-LABEL: insert_nxv8i16_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpklo z2.s, z0.h
; CHECK-NEXT:    uunpkhi z0.s, z0.h
; CHECK-NEXT:    uunpklo z2.d, z2.s
; CHECK-NEXT:    uzp1 z1.s, z2.s, z1.s
; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
; CHECK-NEXT:    ret
  %r = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16> %vec, <vscale x 2 x i16> %in, i64 2)
  ret <vscale x 8 x i16> %r
}

define <vscale x 4 x half> @insert_nxv4f16_nxv2f16_0(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4f16_nxv2f16_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpkhi z0.d, z0.s
; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1, i64 0)
  ret <vscale x 4 x half> %v0
}

define <vscale x 4 x half> @insert_nxv4f16_nxv2f16_2(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4f16_nxv2f16_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half> %sv0, <vscale x 2 x half> %sv1, i64 2)
  ret <vscale x 4 x half> %v0
}

; Test that the index is scaled by vscale if the subvector is scalable.
define <vscale x 8 x half> @insert_nxv8f16_nxv2f16(<vscale x 8 x half> %vec, <vscale x 2 x half> %in) nounwind {
; CHECK-LABEL: insert_nxv8f16_nxv2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    st1h { z0.h }, p0, [sp]
; CHECK-NEXT:    st1h { z1.d }, p1, [sp, #1, mul vl]
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %r = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half> %vec, <vscale x 2 x half> %in, i64 2)
  ret <vscale x 8 x half> %r
}
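
; A worked example of the scaling above (a sketch under the SVE addressing
; rules): the <vscale x 2 x half> subvector occupies cntd halfwords, i.e.
; 4 * vscale bytes, and the immediate in "st1h { z1.d }, p1, [sp, #1, mul vl]"
; scales by that same footprint, so the store lands at byte offset 4 * vscale,
; which is halfword index 2 * vscale: the IR index 2 multiplied by vscale.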

define <vscale x 8 x half> @insert_nxv8f16_nxv4f16_0(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1) nounwind {
; CHECK-LABEL: insert_nxv8f16_nxv4f16_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpkhi z0.s, z0.h
; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
; CHECK-NEXT:    ret
  %v0 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1, i64 0)
  ret <vscale x 8 x half> %v0
}

define <vscale x 8 x half> @insert_nxv8f16_nxv4f16_4(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1) nounwind {
; CHECK-LABEL: insert_nxv8f16_nxv4f16_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
  %v0 = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half> %sv0, <vscale x 4 x half> %sv1, i64 4)
  ret <vscale x 8 x half> %v0
}

; Fixed length clamping

define <vscale x 2 x i64> @insert_fixed_v2i64_nxv2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec) nounwind #0 {
; CHECK-LABEL: insert_fixed_v2i64_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    str q1, [sp, #16]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> %vec, <2 x i64> %subvec, i64 2)
  ret <vscale x 2 x i64> %retval
}

define <vscale x 2 x i64> @insert_fixed_v4i64_nxv2i64(<vscale x 2 x i64> %vec, ptr %ptr) nounwind #0 {
; CHECK-LABEL: insert_fixed_v4i64_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [sp]
; CHECK-NEXT:    st1d { z1.d }, p0, [sp]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [sp]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %subvec = load <4 x i64>, ptr %ptr
  %retval = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> %vec, <4 x i64> %subvec, i64 4)
  ret <vscale x 2 x i64> %retval
}
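
; Why neither function above needs clamp code (an assumption: attribute #0,
; defined later in the file, pins the vector length, e.g. vscale_range(2,2)
; so <vscale x 2 x i64> holds exactly 4 elements): inserting <2 x i64> at
; index 2 fits exactly, giving the plain "str q1, [sp, #16]" (2 * 8 bytes),
; while inserting <4 x i64> at index 4 clamps to umin(4, 4 - 4) = 0, so the
; second function's subvector simply overwrites the spilled register at [sp].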

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;  Unpacked types that need result widening
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32(<vscale x 2 x i32> %sv0) {
; CHECK-LABEL: insert_nxv3i32_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
  ret <vscale x 3 x i32> %v0
}

;; Check that the subvector is not widened, so it does not crash.
define <vscale x 3 x i32> @insert_nxv3i32_nxv2i32_2(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1) {
; CHECK-LABEL: insert_nxv3i32_nxv2i32_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpkhi z0.d, z0.s
; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32> %sv0, <vscale x 2 x i32> %sv1, i64 0)
  ret <vscale x 3 x i32> %v0
}

define <vscale x 3 x float> @insert_nxv3f32_nxv2f32(<vscale x 2 x float> %sv0) nounwind {
; CHECK-LABEL: insert_nxv3f32_nxv2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float> undef, <vscale x 2 x float> %sv0, i64 0)
  ret <vscale x 3 x float> %v0
}

define <vscale x 4 x float> @insert_nxv4f32_nxv2f32_0(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4f32_nxv2f32_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpkhi z0.d, z0.s
; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1, i64 0)
  ret <vscale x 4 x float> %v0
}

define <vscale x 4 x float> @insert_nxv4f32_nxv2f32_2(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4f32_nxv2f32_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float> %sv0, <vscale x 2 x float> %sv1, i64 2)
  ret <vscale x 4 x float> %v0
}

define <vscale x 6 x i32> @insert_nxv6i32_nxv2i32(<vscale x 2 x i32> %sv0, <vscale x 2 x i32> %sv1) nounwind {
; CHECK-LABEL: insert_nxv6i32_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-2
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT:    st1w { z0.s }, p0, [sp]
; CHECK-NEXT:    addvl sp, sp, #2
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %v0 = call <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> undef, <vscale x 2 x i32> %sv0, i64 0)
  %v1 = call <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32> %v0, <vscale x 2 x i32> %sv1, i64 2)
  ret <vscale x 6 x i32> %v1
}

;; This only works because the input vector is undef and the index is zero.
define <vscale x 6 x i32> @insert_nxv6i32_nxv3i32(<vscale x 3 x i32> %sv0) {
; CHECK-LABEL: insert_nxv6i32_nxv3i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %v0 = call <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32> undef, <vscale x 3 x i32> %sv0, i64 0)
  ret <vscale x 6 x i32> %v0
}

define <vscale x 12 x i32> @insert_nxv12i32_nxv4i32(<vscale x 4 x i32> %sv0, <vscale x 4 x i32> %sv1, <vscale x 4 x i32> %sv2) {
; CHECK-LABEL: insert_nxv12i32_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %v0 = call <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> undef, <vscale x 4 x i32> %sv0, i64 0)
  %v1 = call <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v0, <vscale x 4 x i32> %sv1, i64 4)
  %v2 = call <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32> %v1, <vscale x 4 x i32> %sv2, i64 8)
  ret <vscale x 12 x i32> %v2
}

define <vscale x 2 x bfloat> @insert_nxv2bf16_nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv2bf16_nxv2bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.d, z1.d
; CHECK-NEXT:    ret
  %v0 = call <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0)
  ret <vscale x 2 x bfloat> %v0
}

define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_nxv4bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.d, z1.d
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0)
  ret <vscale x 4 x bfloat> %v0
}

define <vscale x 4 x bfloat> @insert_nxv4bf16_v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_v4bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    addpl x8, sp, #4
; CHECK-NEXT:    st1h { z0.s }, p0, [sp, #1, mul vl]
; CHECK-NEXT:    str d1, [x8]
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat> %sv0, <4 x bfloat> %v1, i64 0)
  ret <vscale x 4 x bfloat> %v0
}

define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_nxv8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.d, z1.d
; CHECK-NEXT:    ret
  %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat> %sv0, <vscale x 8 x bfloat> %sv1, i64 0)
  ret <vscale x 8 x bfloat> %v0
}

define <vscale x 8 x bfloat> @insert_nxv8bf16_v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_v8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT:    mov z0.h, p0/m, z1.h
; CHECK-NEXT:    ret
  %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> %sv0, <8 x bfloat> %v1, i64 0)
  ret <vscale x 8 x bfloat> %v0
}

define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv4bf16_0(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_nxv4bf16_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpkhi z0.s, z0.h
; CHECK-NEXT:    uzp1 z0.h, z1.h, z0.h
; CHECK-NEXT:    ret
  %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 0)
  ret <vscale x 8 x bfloat> %v0
}

define <vscale x 8 x bfloat> @insert_nxv8bf16_nxv4bf16_4(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv8bf16_nxv4bf16_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
; CHECK-NEXT:    ret
  %v0 = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat> %sv0, <vscale x 4 x bfloat> %sv1, i64 4)
  ret <vscale x 8 x bfloat> %v0
}

define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv2bf16_0(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_nxv2bf16_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpkhi z0.d, z0.s
; CHECK-NEXT:    uzp1 z0.s, z1.s, z0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 0)
  ret <vscale x 4 x bfloat> %v0
}

define <vscale x 4 x bfloat> @insert_nxv4bf16_nxv2bf16_2(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1) nounwind {
; CHECK-LABEL: insert_nxv4bf16_nxv2bf16_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat> %sv0, <vscale x 2 x bfloat> %sv1, i64 2)
  ret <vscale x 4 x bfloat> %v0
}

; Test predicate inserts of half the size.
define <vscale x 16 x i1> @insert_nxv16i1_nxv8i1_0(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv) {
; CHECK-LABEL: insert_nxv16i1_nxv8i1_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv, i64 0)
  ret <vscale x 16 x i1> %v0
}

define <vscale x 16 x i1> @insert_nxv16i1_nxv8i1_8(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv) {
; CHECK-LABEL: insert_nxv16i1_nxv8i1_8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1> %vec, <vscale x 8 x i1> %sv, i64 8)
  ret <vscale x 16 x i1> %v0
}

; Test predicate inserts of less than half the size.
define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_0(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv) {
; CHECK-LABEL: insert_nxv16i1_nxv4i1_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv, i64 0)
  ret <vscale x 16 x i1> %v0
}

define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_12(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv) {
; CHECK-LABEL: insert_nxv16i1_nxv4i1_12:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> %vec, <vscale x 4 x i1> %sv, i64 12)
  ret <vscale x 16 x i1> %v0
}
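
; A sketch of the pattern used by the predicate inserts above: punpklo/punpkhi
; peel the wide predicate down to the quarter covering the target index, that
; quarter is replaced by the incoming predicate, and uzp1 zips the pieces back
; together. For the index-12 case this reads roughly as
;   p0 = uzp1.b(lo half of p0, uzp1.h(elements 8-11 of p0, p1))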

; Test predicate insert into undef/zero
define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_into_zero(<vscale x 4 x i1> %sv) {
; CHECK-LABEL: insert_nxv16i1_nxv4i1_into_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    pfalse p1.b
; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> zeroinitializer, <vscale x 4 x i1> %sv, i64 0)
  ret <vscale x 16 x i1> %v0
}

define <vscale x 16 x i1> @insert_nxv16i1_nxv4i1_into_poison(<vscale x 4 x i1> %sv) {
; CHECK-LABEL: insert_nxv16i1_nxv4i1_into_poison:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uzp1 p0.h, p0.h, p0.h
; CHECK-NEXT:    uzp1 p0.b, p0.b, p0.b
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1> poison, <vscale x 4 x i1> %sv, i64 0)
  ret <vscale x 16 x i1> %v0
}

; Test constant predicate insert into undef
define <vscale x 2 x i1> @insert_nxv2i1_v8i1_const_true_into_undef() vscale_range(4,8) {
; CHECK-LABEL: insert_nxv2i1_v8i1_const_true_into_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ret
  %v0 = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1> undef, <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0)
  ret <vscale x 2 x i1> %v0
}

define <vscale x 4 x i1> @insert_nxv4i1_v16i1_const_true_into_undef() vscale_range(4,8) {
; CHECK-LABEL: insert_nxv4i1_v16i1_const_true_into_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ret
  %v0 = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1(<vscale x 4 x i1> undef, <16 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0)
  ret <vscale x 4 x i1> %v0
}

define <vscale x 8 x i1> @insert_nxv8i1_v32i1_const_true_into_undef() vscale_range(4,8) {
; CHECK-LABEL: insert_nxv8i1_v32i1_const_true_into_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ret
  %v0 = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1(<vscale x 8 x i1> undef, <32 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0)
  ret <vscale x 8 x i1> %v0
}

define <vscale x 16 x i1> @insert_nxv16i1_v64i1_const_true_into_undef() vscale_range(4,8) {
; CHECK-LABEL: insert_nxv16i1_v64i1_const_true_into_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ret
  %v0 = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1(<vscale x 16 x i1> undef, <64 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, i64 0)
  ret <vscale x 16 x i1> %v0
}
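
; Why a bare ptrue suffices above: the fixed all-true constant fills the low
; lanes and the rest of the result is undef, so the compiler may choose
; all-true for the tail as well. E.g. in the v64i1 case, vscale_range(4,8)
; bounds <vscale x 16 x i1> to 64-128 lanes; the first 64 are known true and
; the remainder are undef, so "ptrue p0.b" is a valid materialization.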

;
; Insert nxv1i1 type into: nxv2i1
;

define <vscale x 2 x i1> @insert_nxv1i1_nxv2i1_0(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv2i1_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    uzp1 p0.d, p1.d, p0.d
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv, i64 0)
  ret <vscale x 2 x i1> %res
}

define <vscale x 2 x i1> @insert_nxv1i1_nxv2i1_1(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv2i1_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    uzp1 p0.d, p0.d, p1.d
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1> %vec, <vscale x 1 x i1> %sv, i64 1)
  ret <vscale x 2 x i1> %res
}

;
; Insert nxv1i1 type into: nxv4i1
;

define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_0(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv4i1_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p2.d
; CHECK-NEXT:    uzp1 p0.s, p1.s, p0.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 0)
  ret <vscale x 4 x i1> %res
}

define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv4i1_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    uzp1 p1.d, p2.d, p1.d
; CHECK-NEXT:    uzp1 p0.s, p1.s, p0.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 1)
  ret <vscale x 4 x i1> %res
}

define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_2(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv4i1_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p2.d
; CHECK-NEXT:    uzp1 p0.s, p0.s, p1.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 2)
  ret <vscale x 4 x i1> %res
}

define <vscale x 4 x i1> @insert_nxv1i1_nxv4i1_3(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv4i1_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    uzp1 p1.d, p2.d, p1.d
; CHECK-NEXT:    uzp1 p0.s, p0.s, p1.s
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1> %vec, <vscale x 1 x i1> %sv, i64 3)
  ret <vscale x 4 x i1> %res
}

;
; Insert nxv1i1 type into: nxv8i1
;

define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_0(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv8i1_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p3.d
; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 0)
  ret <vscale x 8 x i1> %res
}

define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv8i1_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    uzp1 p1.d, p3.d, p1.d
; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 1)
  ret <vscale x 8 x i1> %res
}

define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_2(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv8i1_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p3.d
; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 2)
  ret <vscale x 8 x i1> %res
}

define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_3(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv8i1_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    uzp1 p1.d, p3.d, p1.d
; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
; CHECK-NEXT:    uzp1 p0.h, p1.h, p0.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 3)
  ret <vscale x 8 x i1> %res
}

define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_4(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv8i1_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p3.d
; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 4)
  ret <vscale x 8 x i1> %res
}

define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_5(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv8i1_5:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    uzp1 p1.d, p3.d, p1.d
; CHECK-NEXT:    uzp1 p1.s, p1.s, p2.s
; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 5)
  ret <vscale x 8 x i1> %res
}

define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_6(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv8i1_6:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p3.d
; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 6)
  ret <vscale x 8 x i1> %res
}

define <vscale x 8 x i1> @insert_nxv1i1_nxv8i1_7(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv8i1_7:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    uzp1 p1.d, p3.d, p1.d
; CHECK-NEXT:    uzp1 p1.s, p2.s, p1.s
; CHECK-NEXT:    uzp1 p0.h, p0.h, p1.h
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1> %vec, <vscale x 1 x i1> %sv, i64 7)
  ret <vscale x 8 x i1> %res
}
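
; A note on the nxv16i1 cases below (reasoning from the emitted code): these
; need one more punpklo/punpkhi level than the nxv8i1 cases, so a fourth
; scratch predicate (p4) comes into play; p4 is callee-saved for these
; signatures, hence the predicate spill/reload and the addvl stack frame
; around each function body.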

;
; Insert nxv1i1 type into: nxv16i1
;

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_0(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_0:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpklo p4.h, p3.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    punpkhi p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 0)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpklo p4.h, p3.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    punpklo p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 1)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_2(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpkhi p4.h, p3.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    punpkhi p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 2)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_3(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpkhi p4.h, p3.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    punpklo p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 3)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_4(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpklo p4.h, p3.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    punpkhi p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 4)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_5(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_5:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpklo p4.h, p3.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    punpklo p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 5)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_6(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_6:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpkhi p4.h, p3.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    punpkhi p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 6)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_7(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_7:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpkhi p4.h, p3.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    punpklo p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
; CHECK-NEXT:    uzp1 p0.b, p1.b, p0.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 7)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_8(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpklo p4.h, p3.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    punpkhi p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 8)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_9(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_9:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpklo p4.h, p3.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    punpklo p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 9)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_10(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_10:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpkhi p4.h, p3.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    punpkhi p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 10)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_11(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_11:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    punpkhi p4.h, p3.b
; CHECK-NEXT:    punpklo p3.h, p3.b
; CHECK-NEXT:    punpklo p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
; CHECK-NEXT:    uzp1 p1.h, p1.h, p2.h
; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 11)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_12(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_12:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpklo p4.h, p3.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    punpkhi p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 12)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_13(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_13:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    punpkhi p2.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpkhi p3.h, p2.b
; CHECK-NEXT:    punpklo p2.h, p2.b
; CHECK-NEXT:    punpklo p4.h, p3.b
; CHECK-NEXT:    punpkhi p3.h, p3.b
; CHECK-NEXT:    punpklo p4.h, p4.b
; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    uzp1 p1.s, p1.s, p3.s
; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 13)
  ret <vscale x 16 x i1> %res
}

define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_14(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
; CHECK-LABEL: insert_nxv1i1_nxv16i1_14:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1293; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1294; CHECK-NEXT:    .cfi_offset w29, -16
1295; CHECK-NEXT:    punpkhi p2.h, p0.b
1296; CHECK-NEXT:    punpklo p0.h, p0.b
1297; CHECK-NEXT:    punpkhi p3.h, p2.b
1298; CHECK-NEXT:    punpklo p2.h, p2.b
1299; CHECK-NEXT:    punpkhi p4.h, p3.b
1300; CHECK-NEXT:    punpklo p3.h, p3.b
1301; CHECK-NEXT:    punpkhi p4.h, p4.b
1302; CHECK-NEXT:    uzp1 p1.d, p1.d, p4.d
1303; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1304; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1305; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1306; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1307; CHECK-NEXT:    addvl sp, sp, #1
1308; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1309; CHECK-NEXT:    ret
1310  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 14)
1311  ret <vscale x 16 x i1> %res
1312}
1313
1314define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv) {
1315; CHECK-LABEL: insert_nxv1i1_nxv16i1_15:
1316; CHECK:       // %bb.0:
1317; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1318; CHECK-NEXT:    addvl sp, sp, #-1
1319; CHECK-NEXT:    str p4, [sp, #7, mul vl] // 2-byte Folded Spill
1320; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
1321; CHECK-NEXT:    .cfi_offset w29, -16
1322; CHECK-NEXT:    punpkhi p2.h, p0.b
1323; CHECK-NEXT:    punpklo p0.h, p0.b
1324; CHECK-NEXT:    punpkhi p3.h, p2.b
1325; CHECK-NEXT:    punpklo p2.h, p2.b
1326; CHECK-NEXT:    punpkhi p4.h, p3.b
1327; CHECK-NEXT:    punpklo p3.h, p3.b
1328; CHECK-NEXT:    punpklo p4.h, p4.b
1329; CHECK-NEXT:    uzp1 p1.d, p4.d, p1.d
1330; CHECK-NEXT:    ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
1331; CHECK-NEXT:    uzp1 p1.s, p3.s, p1.s
1332; CHECK-NEXT:    uzp1 p1.h, p2.h, p1.h
1333; CHECK-NEXT:    uzp1 p0.b, p0.b, p1.b
1334; CHECK-NEXT:    addvl sp, sp, #1
1335; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1336; CHECK-NEXT:    ret
1337  %res = call <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1> %vec, <vscale x 1 x i1> %sv, i64 15)
1338  ret <vscale x 16 x i1> %res
1339}
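
; Note (added commentary, not autogenerated): as the CHECK lines above show,
; inserting a <vscale x 1 x i1> fragment at a non-zero index has no
; single-instruction lowering. The nxv16i1 predicate is unpacked with
; punpklo/punpkhi down to nxv2i1 granularity, the fragment (%sv, arriving in
; p1) is spliced in with a chain of uzp1 instructions, and p4 is
; spilled/reloaded because the expansion needs one predicate register beyond
; the caller-saved p0-p3.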

attributes #0 = { vscale_range(2,2) }
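; vscale_range(2,2) pins vscale to exactly 2 (a fixed 256-bit SVE vector
; length) for the functions marked with attribute #0.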

declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)

declare <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16>, <vscale x 1 x i16>, i64)
declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64)
declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)

declare <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
declare <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32>, <vscale x 4 x i32>, i64)
declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)

declare <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i64)
declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat>, <vscale x 2 x bfloat>, i64)
declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i64)
declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64)
declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat>, <vscale x 4 x bfloat>, i64)
declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)

declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
declare <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)

declare <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half>, <vscale x 2 x half>, i64)
declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64)
declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half>, <vscale x 4 x half>, i64)

declare <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float>, <vscale x 1 x float>, i64)
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64)

declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1>, <8 x i1>, i64)
declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1(<vscale x 4 x i1>, <16 x i1>, i64)
declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1(<vscale x 8 x i1>, <32 x i1>, i64)
declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1>, <vscale x 1 x i1>, i64)
declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1>, <vscale x 1 x i1>, i64)
declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64)
declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1>, <vscale x 1 x i1>, i64)
declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i64)
declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1>, <vscale x 8 x i1>, i64)
declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1(<vscale x 16 x i1>, <64 x i1>, i64)