; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA

; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA
; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV32VLA
; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -early-live-intervals -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,VLA,RV64VLA

; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV32VLS %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s
; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV32VLS %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin -riscv-v-vector-bits-max=128 -verify-machineinstrs | FileCheck -check-prefixes=CHECK,VLS,RV64VLS %s

define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v2i32_0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v12, (a0)
; VLA-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
; VLA-NEXT:    vmv.v.v v8, v12
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8i32_v2i32_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v12, (a0)
; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
; VLS-NEXT:    vmv.v.v v8, v12
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v2i32_2(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v2i32_2:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v12, (a0)
; VLA-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
; VLA-NEXT:    vslideup.vi v8, v12, 2
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8i32_v2i32_2:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v12, (a0)
; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT:    vslideup.vi v8, v12, 2
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 2)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v2i32_6(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v2i32_6:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v12, (a0)
; VLA-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
; VLA-NEXT:    vslideup.vi v8, v12, 6
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8i32_v2i32_6:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v12, (a0)
; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT:    vslideup.vi v9, v12, 2
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 6)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v8i32_0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT:    vle32.v v12, (a0)
; VLA-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
; VLA-NEXT:    vmv.v.v v8, v12
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8i32_v8i32_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vl2re32.v v8, (a0)
; VLS-NEXT:    ret
  %sv = load <8 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_v8i32_8(<vscale x 8 x i32> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8i32_v8i32_8:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT:    vle32.v v12, (a0)
; VLA-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
; VLA-NEXT:    vslideup.vi v8, v12, 8
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8i32_v8i32_8:
; VLS:       # %bb.0:
; VLS-NEXT:    vl2re32.v v10, (a0)
; VLS-NEXT:    ret
  %sv = load <8 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 8)
  ret <vscale x 8 x i32> %v
}

define <vscale x 8 x i32> @insert_nxv8i32_undef_v2i32_0(ptr %svp) {
; CHECK-LABEL: insert_nxv8i32_undef_v2i32_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> undef, <2 x i32> %sv, i64 0)
  ret <vscale x 8 x i32> %v
}

define <vscale x 2 x i32> @insert_nxv8i32_v4i32_0(<vscale x 2 x i32> %vec, <4 x i32> %subvec) {
; VLA-LABEL: insert_nxv8i32_v4i32_0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
; VLA-NEXT:    vmv.v.v v8, v9
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8i32_v4i32_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; VLS-NEXT:    vmv1r.v v8, v9
; VLS-NEXT:    ret
  %v = call <vscale x 2 x i32> @llvm.vector.insert.nxv2i32.v4i32(<vscale x 2 x i32> %vec, <4 x i32> %subvec, i64 0)
  ret <vscale x 2 x i32> %v
}


define <4 x i32> @insert_v4i32_v4i32_0(<4 x i32> %vec, <4 x i32> %subvec) {
; CHECK-LABEL: insert_v4i32_v4i32_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT:    vmv1r.v v8, v9
; CHECK-NEXT:    ret
  %v = call <4 x i32> @llvm.vector.insert.v4i32.v4i32(<4 x i32> %vec, <4 x i32> %subvec, i64 0)
  ret <4 x i32> %v
}

define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v4i32_v2i32_0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v8, (a1)
; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLA-NEXT:    vle32.v v9, (a0)
; VLA-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
; VLA-NEXT:    vmv.v.v v9, v8
; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLA-NEXT:    vse32.v v9, (a0)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v4i32_v2i32_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v8, (a1)
; VLS-NEXT:    vl1re32.v v9, (a0)
; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
; VLS-NEXT:    vmv.v.v v9, v8
; VLS-NEXT:    vs1r.v v9, (a0)
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <4 x i32>, ptr %vp
  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 0)
  store <4 x i32> %v, ptr %vp
  ret void
}

define void @insert_v4i32_v2i32_2(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v4i32_v2i32_2:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v8, (a1)
; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLA-NEXT:    vle32.v v9, (a0)
; VLA-NEXT:    vslideup.vi v9, v8, 2
; VLA-NEXT:    vse32.v v9, (a0)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v4i32_v2i32_2:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v8, (a1)
; VLS-NEXT:    vl1re32.v v9, (a0)
; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT:    vslideup.vi v9, v8, 2
; VLS-NEXT:    vs1r.v v9, (a0)
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <4 x i32>, ptr %vp
  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> %vec, <2 x i32> %sv, i64 2)
  store <4 x i32> %v, ptr %vp
  ret void
}

define void @insert_v4i32_undef_v2i32_0(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v4i32_undef_v2i32_0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v8, (a1)
; VLA-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLA-NEXT:    vse32.v v8, (a0)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v4i32_undef_v2i32_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v8, (a1)
; VLS-NEXT:    vs1r.v v8, (a0)
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %sv, i64 0)
  store <4 x i32> %v, ptr %vp
  ret void
}

; This tests the code path in RISCVISelDAGToDAG::Select where we select an
; insert_subvector with a fixed vector and fixed subvector type. The phi here is
; used to prevent the fixed insert_subvector from being combined away into a
; scalable insert_subvector.
define <4 x i32> @insert_v4i32_undef_v2i32_0_phi(<2 x i32> %subvec, i1 %cond) {
; CHECK-LABEL: insert_v4i32_undef_v2i32_0_phi:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    andi a0, a0, 1
; CHECK-NEXT:    bnez a0, .LBB11_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v8, 0
; CHECK-NEXT:  .LBB11_2: # %bar
; CHECK-NEXT:    ret
entry:
  br i1 %cond, label %foo, label %bar
foo:
  %v = call <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32> undef, <2 x i32> %subvec, i64 0)
  br label %bar
bar:
  %w = phi <4 x i32> [%v, %foo], [zeroinitializer, %entry]
  ret <4 x i32> %w
}


define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v8i32_v2i32_0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v8, (a1)
; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT:    vle32.v v10, (a0)
; VLA-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
; VLA-NEXT:    vmv.v.v v10, v8
; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT:    vse32.v v10, (a0)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v8i32_v2i32_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v8, (a1)
; VLS-NEXT:    vl2re32.v v10, (a0)
; VLS-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
; VLS-NEXT:    vmv.v.v v10, v8
; VLS-NEXT:    vs2r.v v10, (a0)
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <8 x i32>, ptr %vp
  %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 0)
  store <8 x i32> %v, ptr %vp
  ret void
}

define void @insert_v8i32_v2i32_2(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v8i32_v2i32_2:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT:    vle32.v v8, (a0)
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v10, (a1)
; VLA-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
; VLA-NEXT:    vslideup.vi v8, v10, 2
; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT:    vse32.v v8, (a0)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v8i32_v2i32_2:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v8, (a1)
; VLS-NEXT:    vl2re32.v v10, (a0)
; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT:    vslideup.vi v10, v8, 2
; VLS-NEXT:    vs2r.v v10, (a0)
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <8 x i32>, ptr %vp
  %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 2)
  store <8 x i32> %v, ptr %vp
  ret void
}

define void @insert_v8i32_v2i32_6(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v8i32_v2i32_6:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT:    vle32.v v8, (a0)
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v10, (a1)
; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT:    vslideup.vi v8, v10, 6
; VLA-NEXT:    vse32.v v8, (a0)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v8i32_v2i32_6:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v8, (a1)
; VLS-NEXT:    vl2re32.v v10, (a0)
; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT:    vslideup.vi v11, v8, 2
; VLS-NEXT:    vs2r.v v10, (a0)
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %vec = load <8 x i32>, ptr %vp
  %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> %vec, <2 x i32> %sv, i64 6)
  store <8 x i32> %v, ptr %vp
  ret void
}

define void @insert_v8i32_undef_v2i32_6(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v8i32_undef_v2i32_6:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLA-NEXT:    vle32.v v8, (a1)
; VLA-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; VLA-NEXT:    vslideup.vi v10, v8, 6
; VLA-NEXT:    vse32.v v10, (a0)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v8i32_undef_v2i32_6:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; VLS-NEXT:    vle32.v v8, (a1)
; VLS-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; VLS-NEXT:    vslideup.vi v9, v8, 2
; VLS-NEXT:    vs2r.v v8, (a0)
; VLS-NEXT:    ret
  %sv = load <2 x i32>, ptr %svp
  %v = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> undef, <2 x i32> %sv, i64 6)
  store <8 x i32> %v, ptr %vp
  ret void
}

define void @insert_v4i16_v2i16_0(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v4i16_v2i16_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v9, (a1)
; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
; CHECK-NEXT:    vmv.v.v v8, v9
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i16>, ptr %vp
  %sv = load <2 x i16>, ptr %svp
  %c = call <4 x i16> @llvm.vector.insert.v2i16.v4i16(<4 x i16> %v, <2 x i16> %sv, i64 0)
  store <4 x i16> %c, ptr %vp
  ret void
}

define void @insert_v4i16_v2i16_2(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v4i16_v2i16_2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v9, (a1)
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 2
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i16>, ptr %vp
  %sv = load <2 x i16>, ptr %svp
  %c = call <4 x i16> @llvm.vector.insert.v2i16.v4i16(<4 x i16> %v, <2 x i16> %sv, i64 2)
  store <4 x i16> %c, ptr %vp
  ret void
}

define void @insert_v32i1_v8i1_0(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v32i1_v8i1_0:
; VLA:       # %bb.0:
; VLA-NEXT:    li a2, 32
; VLA-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
; VLA-NEXT:    vlm.v v8, (a0)
; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; VLA-NEXT:    vlm.v v9, (a1)
; VLA-NEXT:    vsetivli zero, 1, e8, mf4, tu, ma
; VLA-NEXT:    vmv.v.v v8, v9
; VLA-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
; VLA-NEXT:    vsm.v v8, (a0)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v32i1_v8i1_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
; VLS-NEXT:    vlm.v v8, (a0)
; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; VLS-NEXT:    vlm.v v9, (a1)
; VLS-NEXT:    vsetivli zero, 1, e8, mf4, tu, ma
; VLS-NEXT:    vmv.v.v v8, v9
; VLS-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
; VLS-NEXT:    vsm.v v8, (a0)
; VLS-NEXT:    ret
  %v = load <32 x i1>, ptr %vp
  %sv = load <8 x i1>, ptr %svp
  %c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 0)
  store <32 x i1> %c, ptr %vp
  ret void
}

define void @insert_v32i1_v8i1_16(ptr %vp, ptr %svp) {
; VLA-LABEL: insert_v32i1_v8i1_16:
; VLA:       # %bb.0:
; VLA-NEXT:    li a2, 32
; VLA-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
; VLA-NEXT:    vlm.v v8, (a0)
; VLA-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; VLA-NEXT:    vlm.v v9, (a1)
; VLA-NEXT:    vsetivli zero, 3, e8, mf4, tu, ma
; VLA-NEXT:    vslideup.vi v8, v9, 2
; VLA-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
; VLA-NEXT:    vsm.v v8, (a0)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v32i1_v8i1_16:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
; VLS-NEXT:    vlm.v v8, (a0)
; VLS-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; VLS-NEXT:    vlm.v v9, (a1)
; VLS-NEXT:    vsetivli zero, 3, e8, mf4, tu, ma
; VLS-NEXT:    vslideup.vi v8, v9, 2
; VLS-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
; VLS-NEXT:    vsm.v v8, (a0)
; VLS-NEXT:    ret
  %v = load <32 x i1>, ptr %vp
  %sv = load <8 x i1>, ptr %svp
  %c = call <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1> %v, <8 x i1> %sv, i64 16)
  store <32 x i1> %c, ptr %vp
  ret void
}

define void @insert_v8i1_v4i1_0(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v8i1_v4i1_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vlm.v v0, (a0)
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vlm.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
; CHECK-NEXT:    vmv.v.v v9, v8
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmsne.vi v8, v9, 0
; CHECK-NEXT:    vsm.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i1>, ptr %vp
  %sv = load <4 x i1>, ptr %svp
  %c = call <8 x i1> @llvm.vector.insert.v4i1.v8i1(<8 x i1> %v, <4 x i1> %sv, i64 0)
  store <8 x i1> %c, ptr %vp
  ret void
}

define void @insert_v8i1_v4i1_4(ptr %vp, ptr %svp) {
; CHECK-LABEL: insert_v8i1_v4i1_4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vlm.v v0, (a0)
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vlm.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vmerge.vim v9, v9, 1, v0
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vmv1r.v v0, v8
; CHECK-NEXT:    vmerge.vim v8, v10, 1, v0
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vslideup.vi v9, v8, 4
; CHECK-NEXT:    vmsne.vi v8, v9, 0
; CHECK-NEXT:    vsm.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i1>, ptr %vp
  %sv = load <4 x i1>, ptr %svp
  %c = call <8 x i1> @llvm.vector.insert.v4i1.v8i1(<8 x i1> %v, <4 x i1> %sv, i64 4)
  store <8 x i1> %c, ptr %vp
  ret void
}

define <vscale x 2 x i16> @insert_nxv2i16_v2i16_0(<vscale x 2 x i16> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv2i16_v2i16_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
; CHECK-NEXT:    vmv.v.v v8, v9
; CHECK-NEXT:    ret
  %sv = load <2 x i16>, ptr %svp
  %c = call <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 0)
  ret <vscale x 2 x i16> %c
}

define <vscale x 2 x i16> @insert_nxv2i16_v2i16_2(<vscale x 2 x i16> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv2i16_v2i16_2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 6, e16, mf2, tu, ma
; CHECK-NEXT:    vslideup.vi v8, v9, 4
; CHECK-NEXT:    ret
  %sv = load <2 x i16>, ptr %svp
  %c = call <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 4)
  ret <vscale x 2 x i16> %c
}

define <vscale x 2 x i1> @insert_nxv2i1_v4i1_0(<vscale x 2 x i1> %v, ptr %svp) {
; VLA-LABEL: insert_nxv2i1_v4i1_0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; VLA-NEXT:    vlm.v v8, (a0)
; VLA-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
; VLA-NEXT:    vmv.v.i v9, 0
; VLA-NEXT:    vmerge.vim v9, v9, 1, v0
; VLA-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; VLA-NEXT:    vmv.v.i v10, 0
; VLA-NEXT:    vmv1r.v v0, v8
; VLA-NEXT:    vmerge.vim v8, v10, 1, v0
; VLA-NEXT:    vsetvli zero, zero, e8, mf4, tu, ma
; VLA-NEXT:    vmv.v.v v9, v8
; VLA-NEXT:    vsetvli a0, zero, e8, mf4, ta, ma
; VLA-NEXT:    vmsne.vi v0, v9, 0
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv2i1_v4i1_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; VLS-NEXT:    vlm.v v8, (a0)
; VLS-NEXT:    vmv.v.i v9, 0
; VLS-NEXT:    vmerge.vim v10, v9, 1, v0
; VLS-NEXT:    vmv1r.v v0, v8
; VLS-NEXT:    vmerge.vim v8, v9, 1, v0
; VLS-NEXT:    vsetvli zero, zero, e8, mf4, tu, ma
; VLS-NEXT:    vmv.v.v v10, v8
; VLS-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
; VLS-NEXT:    vmsne.vi v0, v10, 0
; VLS-NEXT:    ret
  %sv = load <4 x i1>, ptr %svp
  %c = call <vscale x 2 x i1> @llvm.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1> %v, <4 x i1> %sv, i64 0)
  ret <vscale x 2 x i1> %c
}

define <vscale x 8 x i1> @insert_nxv8i1_v4i1_0(<vscale x 8 x i1> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv8i1_v4i1_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vlm.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, tu, ma
; CHECK-NEXT:    vmv.v.v v0, v8
; CHECK-NEXT:    ret
  %sv = load <8 x i1>, ptr %svp
  %c = call <vscale x 8 x i1> @llvm.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1> %v, <8 x i1> %sv, i64 0)
  ret <vscale x 8 x i1> %c
}

define <vscale x 8 x i1> @insert_nxv8i1_v8i1_16(<vscale x 8 x i1> %v, ptr %svp) {
; CHECK-LABEL: insert_nxv8i1_v8i1_16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vlm.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 3, e8, mf8, tu, ma
; CHECK-NEXT:    vslideup.vi v0, v8, 2
; CHECK-NEXT:    ret
  %sv = load <8 x i1>, ptr %svp
  %c = call <vscale x 8 x i1> @llvm.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1> %v, <8 x i1> %sv, i64 16)
  ret <vscale x 8 x i1> %c
}

declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)

define void @insert_v2i64_nxv16i64(ptr %psv0, ptr %psv1, ptr %out) {
; VLA-LABEL: insert_v2i64_nxv16i64:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; VLA-NEXT:    vle64.v v8, (a0)
; VLA-NEXT:    vle64.v v16, (a1)
; VLA-NEXT:    vsetivli zero, 6, e64, m8, tu, ma
; VLA-NEXT:    vslideup.vi v8, v16, 4
; VLA-NEXT:    vs8r.v v8, (a2)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v2i64_nxv16i64:
; VLS:       # %bb.0:
; VLS-NEXT:    vl1re64.v v8, (a0)
; VLS-NEXT:    vl1re64.v v10, (a1)
; VLS-NEXT:    vs8r.v v8, (a2)
; VLS-NEXT:    ret
  %sv0 = load <2 x i64>, ptr %psv0
  %sv1 = load <2 x i64>, ptr %psv1
  %v0 = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv0, i64 0)
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> %v0, <2 x i64> %sv1, i64 4)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}

define void @insert_v2i64_nxv16i64_lo0(ptr %psv, ptr %out) {
; VLA-LABEL: insert_v2i64_nxv16i64_lo0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; VLA-NEXT:    vle64.v v8, (a0)
; VLA-NEXT:    vs8r.v v8, (a1)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v2i64_nxv16i64_lo0:
; VLS:       # %bb.0:
; VLS-NEXT:    vl1re64.v v8, (a0)
; VLS-NEXT:    vs8r.v v8, (a1)
; VLS-NEXT:    ret
  %sv = load <2 x i64>, ptr %psv
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 0)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}

define void @insert_v2i64_nxv16i64_lo2(ptr %psv, ptr %out) {
; VLA-LABEL: insert_v2i64_nxv16i64_lo2:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; VLA-NEXT:    vle64.v v8, (a0)
; VLA-NEXT:    vsetivli zero, 4, e64, m8, ta, ma
; VLA-NEXT:    vslideup.vi v16, v8, 2
; VLA-NEXT:    vs8r.v v16, (a1)
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_v2i64_nxv16i64_lo2:
; VLS:       # %bb.0:
; VLS-NEXT:    vl1re64.v v9, (a0)
; VLS-NEXT:    vs8r.v v8, (a1)
; VLS-NEXT:    ret
  %sv = load <2 x i64>, ptr %psv
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 2)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}

; Check we don't mistakenly optimize this: we don't know whether this is
; inserted into the low or high split vector.
define void @insert_v2i64_nxv16i64_hi(ptr %psv, ptr %out) {
; RV32-LABEL: insert_v2i64_nxv16i64_hi:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -80
; RV32-NEXT:    .cfi_def_cfa_offset 80
; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset ra, -4
; RV32-NEXT:    .cfi_offset s0, -8
; RV32-NEXT:    addi s0, sp, 80
; RV32-NEXT:    .cfi_def_cfa s0, 0
; RV32-NEXT:    csrr a2, vlenb
; RV32-NEXT:    slli a2, a2, 4
; RV32-NEXT:    sub sp, sp, a2
; RV32-NEXT:    andi sp, sp, -64
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a0, sp, 128
; RV32-NEXT:    vse64.v v8, (a0)
; RV32-NEXT:    csrr a0, vlenb
; RV32-NEXT:    slli a0, a0, 3
; RV32-NEXT:    addi a2, sp, 64
; RV32-NEXT:    add a3, a2, a0
; RV32-NEXT:    vl8re64.v v8, (a3)
; RV32-NEXT:    vl8re64.v v16, (a2)
; RV32-NEXT:    add a0, a1, a0
; RV32-NEXT:    vs8r.v v8, (a0)
; RV32-NEXT:    vs8r.v v16, (a1)
; RV32-NEXT:    addi sp, s0, -80
; RV32-NEXT:    .cfi_def_cfa sp, 80
; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT:    addi sp, sp, 80
; RV32-NEXT:    ret
; RV64-LABEL: insert_v2i64_nxv16i64_hi:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -80
; RV64-NEXT:    .cfi_def_cfa_offset 80
; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset ra, -8
; RV64-NEXT:    .cfi_offset s0, -16
; RV64-NEXT:    addi s0, sp, 80
; RV64-NEXT:    .cfi_def_cfa s0, 0
; RV64-NEXT:    csrr a2, vlenb
; RV64-NEXT:    slli a2, a2, 4
; RV64-NEXT:    sub sp, sp, a2
; RV64-NEXT:    andi sp, sp, -64
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    addi a0, sp, 128
; RV64-NEXT:    vse64.v v8, (a0)
; RV64-NEXT:    csrr a0, vlenb
; RV64-NEXT:    slli a0, a0, 3
; RV64-NEXT:    addi a2, sp, 64
; RV64-NEXT:    add a3, a2, a0
; RV64-NEXT:    vl8re64.v v8, (a3)
; RV64-NEXT:    vl8re64.v v16, (a2)
; RV64-NEXT:    add a0, a1, a0
; RV64-NEXT:    vs8r.v v8, (a0)
; RV64-NEXT:    vs8r.v v16, (a1)
; RV64-NEXT:    addi sp, s0, -80
; RV64-NEXT:    .cfi_def_cfa sp, 80
; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT:    addi sp, sp, 80
; RV64-NEXT:    ret
; RV32VLA-LABEL: insert_v2i64_nxv16i64_hi:
; RV32VLA:       # %bb.0:
; RV32VLA-NEXT:    addi sp, sp, -80
; RV32VLA-NEXT:    .cfi_def_cfa_offset 80
; RV32VLA-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
; RV32VLA-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
; RV32VLA-NEXT:    .cfi_offset ra, -4
; RV32VLA-NEXT:    .cfi_offset s0, -8
; RV32VLA-NEXT:    addi s0, sp, 80
; RV32VLA-NEXT:    .cfi_def_cfa s0, 0
; RV32VLA-NEXT:    csrr a2, vlenb
; RV32VLA-NEXT:    slli a2, a2, 4
; RV32VLA-NEXT:    sub sp, sp, a2
; RV32VLA-NEXT:    andi sp, sp, -64
; RV32VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32VLA-NEXT:    vle64.v v8, (a0)
; RV32VLA-NEXT:    addi a0, sp, 128
; RV32VLA-NEXT:    csrr a2, vlenb
; RV32VLA-NEXT:    addi a3, sp, 64
; RV32VLA-NEXT:    slli a2, a2, 3
; RV32VLA-NEXT:    vse64.v v8, (a0)
; RV32VLA-NEXT:    add a0, a3, a2
; RV32VLA-NEXT:    vl8re64.v v8, (a0)
; RV32VLA-NEXT:    vl8re64.v v16, (a3)
; RV32VLA-NEXT:    add a2, a1, a2
; RV32VLA-NEXT:    vs8r.v v8, (a2)
; RV32VLA-NEXT:    vs8r.v v16, (a1)
; RV32VLA-NEXT:    addi sp, s0, -80
; RV32VLA-NEXT:    .cfi_def_cfa sp, 80
; RV32VLA-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
; RV32VLA-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
; RV32VLA-NEXT:    .cfi_restore ra
; RV32VLA-NEXT:    .cfi_restore s0
; RV32VLA-NEXT:    addi sp, sp, 80
; RV32VLA-NEXT:    .cfi_def_cfa_offset 0
; RV32VLA-NEXT:    ret
;
; RV64VLA-LABEL: insert_v2i64_nxv16i64_hi:
; RV64VLA:       # %bb.0:
; RV64VLA-NEXT:    addi sp, sp, -80
; RV64VLA-NEXT:    .cfi_def_cfa_offset 80
; RV64VLA-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
; RV64VLA-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
; RV64VLA-NEXT:    .cfi_offset ra, -8
; RV64VLA-NEXT:    .cfi_offset s0, -16
; RV64VLA-NEXT:    addi s0, sp, 80
; RV64VLA-NEXT:    .cfi_def_cfa s0, 0
; RV64VLA-NEXT:    csrr a2, vlenb
; RV64VLA-NEXT:    slli a2, a2, 4
; RV64VLA-NEXT:    sub sp, sp, a2
; RV64VLA-NEXT:    andi sp, sp, -64
; RV64VLA-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64VLA-NEXT:    vle64.v v8, (a0)
; RV64VLA-NEXT:    addi a0, sp, 128
; RV64VLA-NEXT:    csrr a2, vlenb
; RV64VLA-NEXT:    addi a3, sp, 64
; RV64VLA-NEXT:    slli a2, a2, 3
; RV64VLA-NEXT:    vse64.v v8, (a0)
; RV64VLA-NEXT:    add a0, a3, a2
; RV64VLA-NEXT:    vl8re64.v v8, (a0)
; RV64VLA-NEXT:    vl8re64.v v16, (a3)
; RV64VLA-NEXT:    add a2, a1, a2
; RV64VLA-NEXT:    vs8r.v v8, (a2)
; RV64VLA-NEXT:    vs8r.v v16, (a1)
; RV64VLA-NEXT:    addi sp, s0, -80
; RV64VLA-NEXT:    .cfi_def_cfa sp, 80
; RV64VLA-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
; RV64VLA-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
; RV64VLA-NEXT:    .cfi_restore ra
; RV64VLA-NEXT:    .cfi_restore s0
; RV64VLA-NEXT:    addi sp, sp, 80
; RV64VLA-NEXT:    .cfi_def_cfa_offset 0
; RV64VLA-NEXT:    ret
;
; RV32VLS-LABEL: insert_v2i64_nxv16i64_hi:
; RV32VLS:       # %bb.0:
; RV32VLS-NEXT:    addi sp, sp, -80
; RV32VLS-NEXT:    .cfi_def_cfa_offset 80
; RV32VLS-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
; RV32VLS-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
; RV32VLS-NEXT:    .cfi_offset ra, -4
; RV32VLS-NEXT:    .cfi_offset s0, -8
; RV32VLS-NEXT:    addi s0, sp, 80
; RV32VLS-NEXT:    .cfi_def_cfa s0, 0
; RV32VLS-NEXT:    addi sp, sp, -256
; RV32VLS-NEXT:    andi sp, sp, -64
; RV32VLS-NEXT:    vl1re64.v v8, (a0)
; RV32VLS-NEXT:    addi a0, sp, 128
; RV32VLS-NEXT:    vs1r.v v8, (a0)
; RV32VLS-NEXT:    addi a0, sp, 192
; RV32VLS-NEXT:    vl8re64.v v8, (a0)
; RV32VLS-NEXT:    addi a0, sp, 64
; RV32VLS-NEXT:    vl8re64.v v16, (a0)
; RV32VLS-NEXT:    addi a0, a1, 128
; RV32VLS-NEXT:    vs8r.v v8, (a0)
; RV32VLS-NEXT:    vs8r.v v16, (a1)
; RV32VLS-NEXT:    addi sp, s0, -80
; RV32VLS-NEXT:    .cfi_def_cfa sp, 80
; RV32VLS-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
; RV32VLS-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
; RV32VLS-NEXT:    .cfi_restore ra
; RV32VLS-NEXT:    .cfi_restore s0
; RV32VLS-NEXT:    addi sp, sp, 80
; RV32VLS-NEXT:    .cfi_def_cfa_offset 0
; RV32VLS-NEXT:    ret
;
; RV64VLS-LABEL: insert_v2i64_nxv16i64_hi:
; RV64VLS:       # %bb.0:
; RV64VLS-NEXT:    addi sp, sp, -80
; RV64VLS-NEXT:    .cfi_def_cfa_offset 80
; RV64VLS-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
; RV64VLS-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
; RV64VLS-NEXT:    .cfi_offset ra, -8
; RV64VLS-NEXT:    .cfi_offset s0, -16
; RV64VLS-NEXT:    addi s0, sp, 80
; RV64VLS-NEXT:    .cfi_def_cfa s0, 0
; RV64VLS-NEXT:    addi sp, sp, -256
; RV64VLS-NEXT:    andi sp, sp, -64
; RV64VLS-NEXT:    vl1re64.v v8, (a0)
; RV64VLS-NEXT:    addi a0, sp, 128
; RV64VLS-NEXT:    vs1r.v v8, (a0)
; RV64VLS-NEXT:    addi a0, sp, 192
; RV64VLS-NEXT:    vl8re64.v v8, (a0)
; RV64VLS-NEXT:    addi a0, sp, 64
; RV64VLS-NEXT:    vl8re64.v v16, (a0)
; RV64VLS-NEXT:    addi a0, a1, 128
; RV64VLS-NEXT:    vs8r.v v8, (a0)
; RV64VLS-NEXT:    vs8r.v v16, (a1)
; RV64VLS-NEXT:    addi sp, s0, -80
; RV64VLS-NEXT:    .cfi_def_cfa sp, 80
; RV64VLS-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
; RV64VLS-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
; RV64VLS-NEXT:    .cfi_restore ra
; RV64VLS-NEXT:    .cfi_restore s0
; RV64VLS-NEXT:    addi sp, sp, 80
; RV64VLS-NEXT:    .cfi_def_cfa_offset 0
; RV64VLS-NEXT:    ret
  %sv = load <2 x i64>, ptr %psv
  %v = call <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64> undef, <2 x i64> %sv, i64 8)
  store <vscale x 16 x i64> %v, ptr %out
  ret void
}

define <vscale x 8 x bfloat> @insert_nxv8bf16_v2bf16_0(<vscale x 8 x bfloat> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8bf16_v2bf16_0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; VLA-NEXT:    vle16.v v10, (a0)
; VLA-NEXT:    vsetivli zero, 2, e16, m2, tu, ma
; VLA-NEXT:    vmv.v.v v8, v10
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8bf16_v2bf16_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; VLS-NEXT:    vle16.v v10, (a0)
; VLS-NEXT:    vsetivli zero, 2, e16, m1, tu, ma
; VLS-NEXT:    vmv.v.v v8, v10
; VLS-NEXT:    ret
  %sv = load <2 x bfloat>, ptr %svp
  %v = call <vscale x 8 x bfloat> @llvm.vector.insert.v2bf16.nxv8bf16(<vscale x 8 x bfloat> %vec, <2 x bfloat> %sv, i64 0)
  ret <vscale x 8 x bfloat> %v
}

define <vscale x 8 x bfloat> @insert_nxv8bf16_v2bf16_2(<vscale x 8 x bfloat> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8bf16_v2bf16_2:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; VLA-NEXT:    vle16.v v10, (a0)
; VLA-NEXT:    vsetivli zero, 4, e16, m2, tu, ma
; VLA-NEXT:    vslideup.vi v8, v10, 2
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8bf16_v2bf16_2:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; VLS-NEXT:    vle16.v v10, (a0)
; VLS-NEXT:    vsetivli zero, 4, e16, m1, tu, ma
; VLS-NEXT:    vslideup.vi v8, v10, 2
; VLS-NEXT:    ret
  %sv = load <2 x bfloat>, ptr %svp
  %v = call <vscale x 8 x bfloat> @llvm.vector.insert.v2bf16.nxv8bf16(<vscale x 8 x bfloat> %vec, <2 x bfloat> %sv, i64 2)
  ret <vscale x 8 x bfloat> %v
}

define <vscale x 8 x half> @insert_nxv8f16_v2f16_0(<vscale x 8 x half> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8f16_v2f16_0:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; VLA-NEXT:    vle16.v v10, (a0)
; VLA-NEXT:    vsetivli zero, 2, e16, m2, tu, ma
; VLA-NEXT:    vmv.v.v v8, v10
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8f16_v2f16_0:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; VLS-NEXT:    vle16.v v10, (a0)
; VLS-NEXT:    vsetivli zero, 2, e16, m1, tu, ma
; VLS-NEXT:    vmv.v.v v8, v10
; VLS-NEXT:    ret
  %sv = load <2 x half>, ptr %svp
  %v = call <vscale x 8 x half> @llvm.vector.insert.v2f16.nxv8f16(<vscale x 8 x half> %vec, <2 x half> %sv, i64 0)
  ret <vscale x 8 x half> %v
}

define <vscale x 8 x half> @insert_nxv8f16_v2f16_2(<vscale x 8 x half> %vec, ptr %svp) {
; VLA-LABEL: insert_nxv8f16_v2f16_2:
; VLA:       # %bb.0:
; VLA-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; VLA-NEXT:    vle16.v v10, (a0)
; VLA-NEXT:    vsetivli zero, 4, e16, m2, tu, ma
; VLA-NEXT:    vslideup.vi v8, v10, 2
; VLA-NEXT:    ret
;
; VLS-LABEL: insert_nxv8f16_v2f16_2:
; VLS:       # %bb.0:
; VLS-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; VLS-NEXT:    vle16.v v10, (a0)
; VLS-NEXT:    vsetivli zero, 4, e16, m1, tu, ma
; VLS-NEXT:    vslideup.vi v8, v10, 2
; VLS-NEXT:    ret
  %sv = load <2 x half>, ptr %svp
  %v = call <vscale x 8 x half> @llvm.vector.insert.v2f16.nxv8f16(<vscale x 8 x half> %vec, <2 x half> %sv, i64 2)
  ret <vscale x 8 x half> %v
}

declare <8 x i1> @llvm.vector.insert.v4i1.v8i1(<8 x i1>, <4 x i1>, i64)
declare <32 x i1> @llvm.vector.insert.v8i1.v32i1(<32 x i1>, <8 x i1>, i64)

declare <4 x i16> @llvm.vector.insert.v2i16.v4i16(<4 x i16>, <2 x i16>, i64)

declare <4 x i32> @llvm.vector.insert.v2i32.v4i32(<4 x i32>, <2 x i32>, i64)
declare <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32>, <2 x i32>, i64)

declare <vscale x 2 x i1> @llvm.vector.insert.v4i1.nxv2i1(<vscale x 2 x i1>, <4 x i1>, i64)
declare <vscale x 8 x i1> @llvm.vector.insert.v8i1.nxv8i1(<vscale x 8 x i1>, <8 x i1>, i64)

declare <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16>, <2 x i16>, i64)

declare <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32>, <2 x i32>, i64)
declare <vscale x 8 x i32> @llvm.vector.insert.v4i32.nxv8i32(<vscale x 8 x i32>, <4 x i32>, i64)
declare <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32>, <8 x i32>, i64)

; We emit insert_subvectors of fixed vectors at index 0 into undefs as a
; copy_to_regclass or insert_subreg, depending on the register classes of the
; vector types. Make sure that we use the correct type and not the shrunken
; LMUL=1 type, otherwise we will end up with an invalid extract_subvector when
; converting it from scalable->fixed, e.g. we get this for VLEN=128:
;
;   t14: nxv2i32 = insert_subvector undef:nxv2i32, t4, Constant:i64<0>
; t15: v8i32 = extract_subvector t14, Constant:i64<0>
declare <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32>, i64)
define <4 x i32> @insert_extract_v8i32_v2i32_0(<2 x i32> %v) {
; CHECK-LABEL: insert_extract_v8i32_v2i32_0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret
  %1 = call <8 x i32> @llvm.vector.insert.v2i32.v8i32(<8 x i32> poison, <2 x i32> %v, i64 0)
  %2 = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %1, i64 0)
  ret <4 x i32> %2
}