; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+bf16 -force-streaming < %s | FileCheck %s
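;
; Check codegen for the multi-vector (x2/x4) predicate-as-counter store
; intrinsics llvm.aarch64.sve.st1.pn.* and llvm.aarch64.sve.stnt1.pn.*,
; under both SVE2p1 and streaming SVE (SME2).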

; == Normal Multi-Vector Consecutive Stores ==
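; st1b/st1h/st1w/st1d of two or four consecutive Z registers, predicated by a
; predicate-as-counter (pn) register.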

define void @st1_x2_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    st1b { z2.b, z3.b }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x2_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    st1h { z2.h, z3.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x2_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    st1w { z2.s, z3.s }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x2_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    st1d { z2.d, z3.d }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x2_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    st1h { z2.h, z3.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x2_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    st1h { z2.h, z3.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x2.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x2_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    st1w { z2.s, z3.s }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x2_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    st1d { z2.d, z3.d }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x4_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x4_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    st1b { z4.b - z7.b }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x4_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x4_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    st1h { z4.h - z7.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x4_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    st1w { z4.s - z7.s }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x4_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    st1d { z4.d - z7.d }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x4_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    st1h { z4.h - z7.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x4_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    st1h { z4.h - z7.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x4.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x4_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    st1w { z4.s - z7.s }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @st1_x4_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: st1_x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    st1d { z4.d - z7.d }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

; == Non-temporal Multi-Vector Consecutive Stores ==
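; stnt1b/stnt1h/stnt1w/stnt1d variants of the stores above, carrying the
; non-temporal hint.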

define void @stnt1_x2_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x2_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    stnt1b { z2.b, z3.b }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x2_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x2_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    stnt1h { z2.h, z3.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x2_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    stnt1w { z2.s, z3.s }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x2_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    stnt1d { z2.d, z3.d }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x2_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    stnt1h { z2.h, z3.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x2_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    stnt1h { z2.h, z3.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x2_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    stnt1w { z2.s, z3.s }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x2_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z2.d, z1.d
; CHECK-NEXT:    stnt1d { z2.d, z3.d }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x4_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x4_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    stnt1b { z4.b - z7.b }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x4_i16(<vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x4_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    stnt1h { z4.h - z7.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x4_i32(<vscale x 16 x i8> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    stnt1w { z4.s - z7.s }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x4_i64(<vscale x 16 x i8> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    stnt1d { z4.d - z7.d }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x4_f16(<vscale x 16 x i8> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    stnt1h { z4.h - z7.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x4_bf16(<vscale x 16 x i8> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    stnt1h { z4.h - z7.h }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x4_f32(<vscale x 16 x i8> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    stnt1w { z4.s - z7.s }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

define void @stnt1_x4_f64(<vscale x 16 x i8> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
; CHECK-LABEL: stnt1_x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    mov z7.d, z4.d
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    mov p8.b, p0.b
; CHECK-NEXT:    mov z6.d, z3.d
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    stnt1d { z4.d - z7.d }, pn8, [x0]
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
  ret void
}

declare void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x2.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)


declare void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x4.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)