xref: /llvm-project/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll (revision 7f0c5b0502b462d2afad32d3681b37cfc15ba844)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -force-streaming -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED
3; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS
4
; Two-vector non-temporal byte load: calls @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8
; with the svcount predicate %pn and the address %ptr, then packs the two
; loaded vectors into one <vscale x 32 x i8> result (element offsets 0 and 16)
; using llvm.vector.insert. %unused and %z1 are never referenced in the body.
; Per the assertions below, the +sme2 streaming run is expected to load into
; the register pair { z0, z8 } and copy z8 into z1 afterwards, while the
; +sve2p1 run is expected to load into { z0, z1 } and save/restore both
; registers around the inline asm.
5define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
6; STRIDED-LABEL: ldnt1_x2_i8_z0_z8:
7; STRIDED:       // %bb.0:
8; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
9; STRIDED-NEXT:    addvl sp, sp, #-17
10; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
11; STRIDED-NEXT:    mov p8.b, p0.b
12; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
13; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
14; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
15; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
16; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
17; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
18; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
19; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
20; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
21; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
22; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
23; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
24; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
25; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
26; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
27; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
28; STRIDED-NEXT:    ldnt1b { z0.b, z8.b }, pn8/z, [x0]
29; STRIDED-NEXT:    //APP
30; STRIDED-NEXT:    nop
31; STRIDED-NEXT:    //NO_APP
32; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
33; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
34; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
35; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
36; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
37; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
38; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
39; STRIDED-NEXT:    mov z1.d, z8.d
40; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
41; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
42; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
43; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
44; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
45; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
46; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
47; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
48; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
49; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
50; STRIDED-NEXT:    addvl sp, sp, #17
51; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
52; STRIDED-NEXT:    ret
53;
54; CONTIGUOUS-LABEL: ldnt1_x2_i8_z0_z8:
55; CONTIGUOUS:       // %bb.0:
56; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
57; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
58; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
59; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
60; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
61; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
62; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
63; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
64; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
65; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
66; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
67; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
68; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
69; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
70; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
71; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
72; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
73; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
74; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
75; CONTIGUOUS-NEXT:    mov p8.b, p0.b
76; CONTIGUOUS-NEXT:    ldnt1b { z0.b, z1.b }, pn8/z, [x0]
77; CONTIGUOUS-NEXT:    str z0, [sp]
78; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
79; CONTIGUOUS-NEXT:    //APP
80; CONTIGUOUS-NEXT:    nop
81; CONTIGUOUS-NEXT:    //NO_APP
82; CONTIGUOUS-NEXT:    ldr z0, [sp]
83; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
84; CONTIGUOUS-NEXT:    addvl sp, sp, #2
85; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
86; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
87; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
88; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
89; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
90; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
91; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
92; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
93; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
94; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
95; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
96; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
97; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
98; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
99; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
100; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
101; CONTIGUOUS-NEXT:    addvl sp, sp, #16
102; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
103; CONTIGUOUS-NEXT:    ret
104  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr)
; The asm clobbers z1-z7 and z9-z31 (every Z register except z0 and z8) while
; both halves of %res are still live.
; NOTE(review): presumably this is what steers the streaming run towards the
; z0/z8 pair named in the function -- confirm.
105  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
106  %res.v0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
107  %v0 = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> %res.v0, i64 0)
108  %res.v1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 1
109  %v1 = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> %v0, <vscale x 16 x i8> %res.v1, i64 16)
110  ret <vscale x 32 x i8> %v1
111}
112
; Two-vector non-temporal byte load with a scalar index: the address is
; %ptr advanced by %index i8 elements (getelementptr i8), passed to
; @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8 together with the svcount predicate
; %pn. The two loaded vectors are packed into one <vscale x 32 x i8> result
; (element offsets 0 and 16). %unused and %z1 are never referenced in the body.
; Per the assertions below, both runs are expected to use the register-offset
; form [x0, x1]; the +sme2 streaming run loads into { z0, z8 } and copies z8
; into z1, while the +sve2p1 run loads into { z0, z1 } and saves/restores both
; registers around the inline asm.
113define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
114; STRIDED-LABEL: ldnt1_x2_i8_z0_z8_scalar:
115; STRIDED:       // %bb.0:
116; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
117; STRIDED-NEXT:    addvl sp, sp, #-17
118; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
119; STRIDED-NEXT:    mov p8.b, p0.b
120; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
121; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
122; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
123; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
124; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
125; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
126; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
127; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
128; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
129; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
130; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
131; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
132; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
133; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
134; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
135; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
136; STRIDED-NEXT:    ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1]
137; STRIDED-NEXT:    //APP
138; STRIDED-NEXT:    nop
139; STRIDED-NEXT:    //NO_APP
140; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
141; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
142; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
143; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
144; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
145; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
146; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
147; STRIDED-NEXT:    mov z1.d, z8.d
148; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
149; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
150; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
151; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
152; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
153; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
154; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
155; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
156; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
157; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
158; STRIDED-NEXT:    addvl sp, sp, #17
159; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
160; STRIDED-NEXT:    ret
161;
162; CONTIGUOUS-LABEL: ldnt1_x2_i8_z0_z8_scalar:
163; CONTIGUOUS:       // %bb.0:
164; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
165; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
166; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
167; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
168; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
169; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
170; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
171; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
172; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
173; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
174; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
175; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
176; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
177; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
178; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
179; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
180; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
181; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
182; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
183; CONTIGUOUS-NEXT:    mov p8.b, p0.b
184; CONTIGUOUS-NEXT:    ldnt1b { z0.b, z1.b }, pn8/z, [x0, x1]
185; CONTIGUOUS-NEXT:    str z0, [sp]
186; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
187; CONTIGUOUS-NEXT:    //APP
188; CONTIGUOUS-NEXT:    nop
189; CONTIGUOUS-NEXT:    //NO_APP
190; CONTIGUOUS-NEXT:    ldr z0, [sp]
191; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
192; CONTIGUOUS-NEXT:    addvl sp, sp, #2
193; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
194; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
195; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
196; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
197; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
198; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
199; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
200; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
201; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
202; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
203; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
204; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
205; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
206; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
207; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
208; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
209; CONTIGUOUS-NEXT:    addvl sp, sp, #16
210; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
211; CONTIGUOUS-NEXT:    ret
212  %base = getelementptr i8, ptr %ptr, i64 %index
213  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %base)
; The asm clobbers z1-z7 and z9-z31 (every Z register except z0 and z8) while
; both halves of %res are still live.
; NOTE(review): presumably this is what steers the streaming run towards the
; z0/z8 pair named in the function -- confirm.
214  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
215  %res.v0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
216  %v0 = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> poison, <vscale x 16 x i8> %res.v0, i64 0)
217  %res.v1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 1
218  %v1 = call <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8> %v0, <vscale x 16 x i8> %res.v1, i64 16)
219  ret <vscale x 32 x i8> %v1
220}
221
; Two-vector non-temporal halfword load: calls
; @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16 with the svcount predicate %pn and the
; address %ptr, then packs the two loaded vectors into one
; <vscale x 16 x i16> result (element offsets 0 and 8) using
; llvm.vector.insert. %unused and %z1 are never referenced in the body.
; Per the assertions below, the +sme2 streaming run is expected to load into
; the register pair { z0, z8 } and copy z8 into z1 afterwards, while the
; +sve2p1 run is expected to load into { z0, z1 } and save/restore both
; registers around the inline asm.
222define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
223; STRIDED-LABEL: ldnt1_x2_i16_z0_z8:
224; STRIDED:       // %bb.0:
225; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
226; STRIDED-NEXT:    addvl sp, sp, #-17
227; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
228; STRIDED-NEXT:    mov p8.b, p0.b
229; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
230; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
231; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
232; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
233; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
234; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
235; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
236; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
237; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
238; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
239; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
240; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
241; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
242; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
243; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
244; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
245; STRIDED-NEXT:    ldnt1h { z0.h, z8.h }, pn8/z, [x0]
246; STRIDED-NEXT:    //APP
247; STRIDED-NEXT:    nop
248; STRIDED-NEXT:    //NO_APP
249; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
250; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
251; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
252; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
253; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
254; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
255; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
256; STRIDED-NEXT:    mov z1.d, z8.d
257; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
258; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
259; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
260; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
261; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
262; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
263; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
264; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
265; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
266; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
267; STRIDED-NEXT:    addvl sp, sp, #17
268; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
269; STRIDED-NEXT:    ret
270;
271; CONTIGUOUS-LABEL: ldnt1_x2_i16_z0_z8:
272; CONTIGUOUS:       // %bb.0:
273; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
274; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
275; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
276; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
277; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
278; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
279; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
280; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
281; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
282; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
283; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
284; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
285; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
286; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
287; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
288; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
289; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
290; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
291; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
292; CONTIGUOUS-NEXT:    mov p8.b, p0.b
293; CONTIGUOUS-NEXT:    ldnt1h { z0.h, z1.h }, pn8/z, [x0]
294; CONTIGUOUS-NEXT:    str z0, [sp]
295; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
296; CONTIGUOUS-NEXT:    //APP
297; CONTIGUOUS-NEXT:    nop
298; CONTIGUOUS-NEXT:    //NO_APP
299; CONTIGUOUS-NEXT:    ldr z0, [sp]
300; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
301; CONTIGUOUS-NEXT:    addvl sp, sp, #2
302; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
303; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
304; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
305; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
306; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
307; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
308; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
309; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
310; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
311; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
312; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
313; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
314; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
315; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
316; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
317; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
318; CONTIGUOUS-NEXT:    addvl sp, sp, #16
319; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
320; CONTIGUOUS-NEXT:    ret
321  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr)
; The asm clobbers z1-z7 and z9-z31 (every Z register except z0 and z8) while
; both halves of %res are still live.
; NOTE(review): presumably this is what steers the streaming run towards the
; z0/z8 pair named in the function -- confirm.
322  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
323  %res.v0 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
324  %v0 = call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> %res.v0, i64 0)
325  %res.v1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 1
326  %v1 = call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> %v0, <vscale x 8 x i16> %res.v1, i64 8)
327  ret <vscale x 16 x i16> %v1
328}
329
; Two-vector non-temporal halfword load with a scalar index: the address is
; %ptr advanced by %index i16 elements (getelementptr i16), passed to
; @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16 together with the svcount predicate
; %pn. The two loaded vectors are packed into one <vscale x 16 x i16> result
; (element offsets 0 and 8). %unused and %z1 are never referenced in the body.
; Per the assertions below, both runs are expected to use the scaled
; register-offset form [x0, x1, lsl #1]; the +sme2 streaming run loads into
; { z0, z8 } and copies z8 into z1, while the +sve2p1 run loads into
; { z0, z1 } and saves/restores both registers around the inline asm.
330define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
331; STRIDED-LABEL: ldnt1_x2_i16_z0_z8_scalar:
332; STRIDED:       // %bb.0:
333; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
334; STRIDED-NEXT:    addvl sp, sp, #-17
335; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
336; STRIDED-NEXT:    mov p8.b, p0.b
337; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
338; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
339; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
340; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
341; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
342; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
343; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
344; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
345; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
346; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
347; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
348; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
349; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
350; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
351; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
352; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
353; STRIDED-NEXT:    ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1]
354; STRIDED-NEXT:    //APP
355; STRIDED-NEXT:    nop
356; STRIDED-NEXT:    //NO_APP
357; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
358; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
359; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
360; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
361; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
362; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
363; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
364; STRIDED-NEXT:    mov z1.d, z8.d
365; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
366; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
367; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
368; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
369; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
370; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
371; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
372; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
373; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
374; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
375; STRIDED-NEXT:    addvl sp, sp, #17
376; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
377; STRIDED-NEXT:    ret
378;
379; CONTIGUOUS-LABEL: ldnt1_x2_i16_z0_z8_scalar:
380; CONTIGUOUS:       // %bb.0:
381; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
382; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
383; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
384; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
385; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
386; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
387; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
388; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
389; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
390; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
391; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
392; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
393; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
394; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
395; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
396; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
397; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
398; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
399; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
400; CONTIGUOUS-NEXT:    mov p8.b, p0.b
401; CONTIGUOUS-NEXT:    ldnt1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1]
402; CONTIGUOUS-NEXT:    str z0, [sp]
403; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
404; CONTIGUOUS-NEXT:    //APP
405; CONTIGUOUS-NEXT:    nop
406; CONTIGUOUS-NEXT:    //NO_APP
407; CONTIGUOUS-NEXT:    ldr z0, [sp]
408; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
409; CONTIGUOUS-NEXT:    addvl sp, sp, #2
410; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
411; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
412; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
413; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
414; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
415; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
416; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
417; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
418; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
419; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
420; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
421; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
422; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
423; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
424; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
425; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
426; CONTIGUOUS-NEXT:    addvl sp, sp, #16
427; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
428; CONTIGUOUS-NEXT:    ret
429  %base = getelementptr i16, ptr %ptr, i64 %index
430  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %base)
; The asm clobbers z1-z7 and z9-z31 (every Z register except z0 and z8) while
; both halves of %res are still live.
; NOTE(review): presumably this is what steers the streaming run towards the
; z0/z8 pair named in the function -- confirm.
431  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
432  %res.v0 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
433  %v0 = call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> poison, <vscale x 8 x i16> %res.v0, i64 0)
434  %res.v1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 1
435  %v1 = call <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16> %v0, <vscale x 8 x i16> %res.v1, i64 8)
436  ret <vscale x 16 x i16> %v1
437}
438
439define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
440; STRIDED-LABEL: ldnt1_x2_i32_z0_z8:
441; STRIDED:       // %bb.0:
442; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
443; STRIDED-NEXT:    addvl sp, sp, #-17
444; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
445; STRIDED-NEXT:    mov p8.b, p0.b
446; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
447; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
448; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
449; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
450; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
451; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
452; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
453; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
454; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
455; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
456; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
457; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
458; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
459; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
460; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
461; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
462; STRIDED-NEXT:    ldnt1w { z0.s, z8.s }, pn8/z, [x0]
463; STRIDED-NEXT:    //APP
464; STRIDED-NEXT:    nop
465; STRIDED-NEXT:    //NO_APP
466; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
467; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
468; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
469; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
470; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
471; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
472; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
473; STRIDED-NEXT:    mov z1.d, z8.d
474; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
475; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
476; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
477; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
478; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
479; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
480; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
481; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
482; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
483; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
484; STRIDED-NEXT:    addvl sp, sp, #17
485; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
486; STRIDED-NEXT:    ret
487;
488; CONTIGUOUS-LABEL: ldnt1_x2_i32_z0_z8:
489; CONTIGUOUS:       // %bb.0:
490; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
491; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
492; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
493; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
494; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
495; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
496; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
497; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
498; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
499; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
500; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
501; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
502; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
503; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
504; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
505; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
506; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
507; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
508; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
509; CONTIGUOUS-NEXT:    mov p8.b, p0.b
510; CONTIGUOUS-NEXT:    ldnt1w { z0.s, z1.s }, pn8/z, [x0]
511; CONTIGUOUS-NEXT:    str z0, [sp]
512; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
513; CONTIGUOUS-NEXT:    //APP
514; CONTIGUOUS-NEXT:    nop
515; CONTIGUOUS-NEXT:    //NO_APP
516; CONTIGUOUS-NEXT:    ldr z0, [sp]
517; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
518; CONTIGUOUS-NEXT:    addvl sp, sp, #2
519; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
520; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
521; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
522; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
523; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
524; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
525; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
526; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
527; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
528; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
529; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
530; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
531; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
532; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
533; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
534; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
535; CONTIGUOUS-NEXT:    addvl sp, sp, #16
536; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
537; CONTIGUOUS-NEXT:    ret
538  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr)
539  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
540  %res.v0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
541  %v0 = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> %res.v0, i64 0)
542  %res.v1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 1
543  %v1 = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> %v0, <vscale x 4 x i32> %res.v1, i64 4)
544  ret <vscale x 8 x i32> %v1
545}
546
; ldnt1_x2_i32_z0_z8_scalar: two-vector non-temporal i32 load through
; @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32, addressed %index i32 elements past
; %ptr. Both runs expect the register-offset form [x0, x1, lsl #2].
; The inline asm placed after the load clobbers z1-z7 and z9-z31, i.e. every
; Z register except z0 and z8:
;  - the STRIDED run (+sme2, -force-streaming) expects the strided tuple
;    { z0.s, z8.s }, followed later by `mov z1.d, z8.d`;
;  - the CONTIGUOUS run (+sve2p1) expects the consecutive tuple { z0.s, z1.s },
;    stored to the stack before the asm and reloaded after it.
; %unused and %z1 are never referenced in the body; presumably they only occupy
; the first two vector argument registers -- TODO confirm.
; The check lines below are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing by hand.
547define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
548; STRIDED-LABEL: ldnt1_x2_i32_z0_z8_scalar:
549; STRIDED:       // %bb.0:
550; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
551; STRIDED-NEXT:    addvl sp, sp, #-17
552; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
553; STRIDED-NEXT:    mov p8.b, p0.b
554; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
555; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
556; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
557; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
558; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
559; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
560; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
561; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
562; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
563; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
564; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
565; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
566; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
567; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
568; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
569; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
570; STRIDED-NEXT:    ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2]
571; STRIDED-NEXT:    //APP
572; STRIDED-NEXT:    nop
573; STRIDED-NEXT:    //NO_APP
574; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
575; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
576; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
577; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
578; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
579; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
580; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
581; STRIDED-NEXT:    mov z1.d, z8.d
582; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
583; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
584; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
585; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
586; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
587; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
588; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
589; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
590; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
591; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
592; STRIDED-NEXT:    addvl sp, sp, #17
593; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
594; STRIDED-NEXT:    ret
595;
596; CONTIGUOUS-LABEL: ldnt1_x2_i32_z0_z8_scalar:
597; CONTIGUOUS:       // %bb.0:
598; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
599; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
600; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
601; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
602; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
603; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
604; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
605; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
606; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
607; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
608; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
609; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
610; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
611; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
612; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
613; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
614; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
615; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
616; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
617; CONTIGUOUS-NEXT:    mov p8.b, p0.b
618; CONTIGUOUS-NEXT:    ldnt1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2]
619; CONTIGUOUS-NEXT:    str z0, [sp]
620; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
621; CONTIGUOUS-NEXT:    //APP
622; CONTIGUOUS-NEXT:    nop
623; CONTIGUOUS-NEXT:    //NO_APP
624; CONTIGUOUS-NEXT:    ldr z0, [sp]
625; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
626; CONTIGUOUS-NEXT:    addvl sp, sp, #2
627; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
628; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
629; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
630; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
631; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
632; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
633; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
634; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
635; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
636; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
637; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
638; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
639; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
640; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
641; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
642; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
643; CONTIGUOUS-NEXT:    addvl sp, sp, #16
644; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
645; CONTIGUOUS-NEXT:    ret
  ; Address of element %index in the i32 array starting at %ptr.
646  %base = getelementptr i32, ptr %ptr, i64 %index
647  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %base)
  ; The clobber list names z1-z7 and z9-z31; only z0 and z8 are left out, and
  ; %res is still live across this asm.
648  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
  ; Concatenate the two nxv4i32 results (element offsets 0 and 4) into the
  ; nxv8i32 return value.
649  %res.v0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
650  %v0 = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> %res.v0, i64 0)
651  %res.v1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 1
652  %v1 = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> %v0, <vscale x 4 x i32> %res.v1, i64 4)
653  ret <vscale x 8 x i32> %v1
654}
655
; ldnt1_x2_i64_z0_z8: two-vector non-temporal i64 load through
; @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64, addressed directly by %ptr. Both runs
; expect the plain base form [x0].
; The inline asm placed after the load clobbers z1-z7 and z9-z31, i.e. every
; Z register except z0 and z8:
;  - the STRIDED run (+sme2, -force-streaming) expects the strided tuple
;    { z0.d, z8.d }, followed later by `mov z1.d, z8.d`;
;  - the CONTIGUOUS run (+sve2p1) expects the consecutive tuple { z0.d, z1.d },
;    stored to the stack before the asm and reloaded after it.
; %unused and %z1 are never referenced in the body; presumably they only occupy
; the first two vector argument registers -- TODO confirm.
; The check lines below are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing by hand.
656define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
657; STRIDED-LABEL: ldnt1_x2_i64_z0_z8:
658; STRIDED:       // %bb.0:
659; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
660; STRIDED-NEXT:    addvl sp, sp, #-17
661; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
662; STRIDED-NEXT:    mov p8.b, p0.b
663; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
664; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
665; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
666; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
667; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
668; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
669; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
670; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
671; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
672; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
673; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
674; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
675; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
676; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
677; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
678; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
679; STRIDED-NEXT:    ldnt1d { z0.d, z8.d }, pn8/z, [x0]
680; STRIDED-NEXT:    //APP
681; STRIDED-NEXT:    nop
682; STRIDED-NEXT:    //NO_APP
683; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
684; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
685; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
686; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
687; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
688; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
689; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
690; STRIDED-NEXT:    mov z1.d, z8.d
691; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
692; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
693; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
694; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
695; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
696; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
697; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
698; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
699; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
700; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
701; STRIDED-NEXT:    addvl sp, sp, #17
702; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
703; STRIDED-NEXT:    ret
704;
705; CONTIGUOUS-LABEL: ldnt1_x2_i64_z0_z8:
706; CONTIGUOUS:       // %bb.0:
707; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
708; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
709; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
710; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
711; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
712; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
713; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
714; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
715; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
716; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
717; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
718; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
719; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
720; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
721; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
722; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
723; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
724; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
725; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
726; CONTIGUOUS-NEXT:    mov p8.b, p0.b
727; CONTIGUOUS-NEXT:    ldnt1d { z0.d, z1.d }, pn8/z, [x0]
728; CONTIGUOUS-NEXT:    str z0, [sp]
729; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
730; CONTIGUOUS-NEXT:    //APP
731; CONTIGUOUS-NEXT:    nop
732; CONTIGUOUS-NEXT:    //NO_APP
733; CONTIGUOUS-NEXT:    ldr z0, [sp]
734; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
735; CONTIGUOUS-NEXT:    addvl sp, sp, #2
736; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
737; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
738; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
739; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
740; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
741; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
742; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
743; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
744; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
745; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
746; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
747; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
748; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
749; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
750; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
751; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
752; CONTIGUOUS-NEXT:    addvl sp, sp, #16
753; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
754; CONTIGUOUS-NEXT:    ret
755  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr)
  ; The clobber list names z1-z7 and z9-z31; only z0 and z8 are left out, and
  ; %res is still live across this asm.
756  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
  ; Concatenate the two nxv2i64 results (element offsets 0 and 2) into the
  ; nxv4i64 return value.
757  %res.v0 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
758  %v0 = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> %res.v0, i64 0)
759  %res.v1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 1
760  %v1 = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> %v0, <vscale x 2 x i64> %res.v1, i64 2)
761  ret <vscale x 4 x i64> %v1
762}
763
; ldnt1_x2_i64_z0_z8_scalar: two-vector non-temporal i64 load through
; @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64, addressed %index i64 elements past
; %ptr. Both runs expect the register-offset form [x0, x1, lsl #3].
; The inline asm placed after the load clobbers z1-z7 and z9-z31, i.e. every
; Z register except z0 and z8:
;  - the STRIDED run (+sme2, -force-streaming) expects the strided tuple
;    { z0.d, z8.d }, followed later by `mov z1.d, z8.d`;
;  - the CONTIGUOUS run (+sve2p1) expects the consecutive tuple { z0.d, z1.d },
;    stored to the stack before the asm and reloaded after it.
; %unused and %z1 are never referenced in the body; presumably they only occupy
; the first two vector argument registers -- TODO confirm.
; The check lines below are autogenerated (utils/update_llc_test_checks.py);
; regenerate them instead of editing by hand.
764define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
765; STRIDED-LABEL: ldnt1_x2_i64_z0_z8_scalar:
766; STRIDED:       // %bb.0:
767; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
768; STRIDED-NEXT:    addvl sp, sp, #-17
769; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
770; STRIDED-NEXT:    mov p8.b, p0.b
771; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
772; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
773; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
774; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
775; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
776; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
777; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
778; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
779; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
780; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
781; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
782; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
783; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
784; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
785; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
786; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
787; STRIDED-NEXT:    ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3]
788; STRIDED-NEXT:    //APP
789; STRIDED-NEXT:    nop
790; STRIDED-NEXT:    //NO_APP
791; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
792; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
793; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
794; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
795; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
796; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
797; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
798; STRIDED-NEXT:    mov z1.d, z8.d
799; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
800; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
801; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
802; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
803; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
804; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
805; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
806; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
807; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
808; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
809; STRIDED-NEXT:    addvl sp, sp, #17
810; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
811; STRIDED-NEXT:    ret
812;
813; CONTIGUOUS-LABEL: ldnt1_x2_i64_z0_z8_scalar:
814; CONTIGUOUS:       // %bb.0:
815; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
816; CONTIGUOUS-NEXT:    addvl sp, sp, #-16
817; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
818; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
819; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
820; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
821; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
822; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
823; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
824; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
825; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
826; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
827; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
828; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
829; CONTIGUOUS-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
830; CONTIGUOUS-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
831; CONTIGUOUS-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
832; CONTIGUOUS-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
833; CONTIGUOUS-NEXT:    addvl sp, sp, #-2
834; CONTIGUOUS-NEXT:    mov p8.b, p0.b
835; CONTIGUOUS-NEXT:    ldnt1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3]
836; CONTIGUOUS-NEXT:    str z0, [sp]
837; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
838; CONTIGUOUS-NEXT:    //APP
839; CONTIGUOUS-NEXT:    nop
840; CONTIGUOUS-NEXT:    //NO_APP
841; CONTIGUOUS-NEXT:    ldr z0, [sp]
842; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
843; CONTIGUOUS-NEXT:    addvl sp, sp, #2
844; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
845; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
846; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
847; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
848; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
849; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
850; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
851; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
852; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
853; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
854; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
855; CONTIGUOUS-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
856; CONTIGUOUS-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
857; CONTIGUOUS-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
858; CONTIGUOUS-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
859; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
860; CONTIGUOUS-NEXT:    addvl sp, sp, #16
861; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
862; CONTIGUOUS-NEXT:    ret
  ; Address of element %index in the i64 array starting at %ptr.
863  %base = getelementptr i64, ptr %ptr, i64 %index
864  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %base)
  ; The clobber list names z1-z7 and z9-z31; only z0 and z8 are left out, and
  ; %res is still live across this asm.
865  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
  ; Concatenate the two nxv2i64 results (element offsets 0 and 2) into the
  ; nxv4i64 return value.
866  %res.v0 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
867  %v0 = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> %res.v0, i64 0)
868  %res.v1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 1
869  %v1 = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> %v0, <vscale x 2 x i64> %res.v1, i64 2)
870  ret <vscale x 4 x i64> %v1
871}
872
873define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
874; STRIDED-LABEL: ldnt1_x4_i8_z0_z4_z8_z12:
875; STRIDED:       // %bb.0:
876; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
877; STRIDED-NEXT:    addvl sp, sp, #-17
878; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
879; STRIDED-NEXT:    mov p8.b, p0.b
880; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
881; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
882; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
883; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
884; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
885; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
886; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
887; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
888; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
889; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
890; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
891; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
892; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
893; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
894; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
895; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
896; STRIDED-NEXT:    ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
897; STRIDED-NEXT:    //APP
898; STRIDED-NEXT:    nop
899; STRIDED-NEXT:    //NO_APP
900; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
901; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
902; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
903; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
904; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
905; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
906; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
907; STRIDED-NEXT:    mov z2.d, z8.d
908; STRIDED-NEXT:    mov z3.d, z12.d
909; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
910; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
911; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
912; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
913; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
914; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
915; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
916; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
917; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
918; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
919; STRIDED-NEXT:    mov z1.d, z4.d
920; STRIDED-NEXT:    addvl sp, sp, #17
921; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
922; STRIDED-NEXT:    ret
923;
924; CONTIGUOUS-LABEL: ldnt1_x4_i8_z0_z4_z8_z12:
925; CONTIGUOUS:       // %bb.0:
926; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
927; CONTIGUOUS-NEXT:    addvl sp, sp, #-15
928; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
929; CONTIGUOUS-NEXT:    ptrue pn8.b
930; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
931; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
932; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
933; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
934; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
935; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
936; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
937; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
938; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
939; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
940; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
941; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
942; CONTIGUOUS-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
943; CONTIGUOUS-NEXT:    addvl sp, sp, #-4
944; CONTIGUOUS-NEXT:    mov p8.b, p0.b
945; CONTIGUOUS-NEXT:    ldnt1b { z0.b - z3.b }, pn8/z, [x0]
946; CONTIGUOUS-NEXT:    str z0, [sp]
947; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
948; CONTIGUOUS-NEXT:    str z2, [sp, #2, mul vl]
949; CONTIGUOUS-NEXT:    str z3, [sp, #3, mul vl]
950; CONTIGUOUS-NEXT:    //APP
951; CONTIGUOUS-NEXT:    nop
952; CONTIGUOUS-NEXT:    //NO_APP
953; CONTIGUOUS-NEXT:    ldr z0, [sp]
954; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
955; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
956; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
957; CONTIGUOUS-NEXT:    addvl sp, sp, #4
958; CONTIGUOUS-NEXT:    ptrue pn8.b
959; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
960; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
961; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
962; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
963; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
964; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
965; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
966; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
967; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
968; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
969; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
970; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
971; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
972; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
973; CONTIGUOUS-NEXT:    addvl sp, sp, #15
974; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
975; CONTIGUOUS-NEXT:    ret
976  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr)
977  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
978  %res.v0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
979  %v0 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> %res.v0, i64 0)
980  %res.v1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 1
981  %v1 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %v0, <vscale x 16 x i8> %res.v1, i64 16)
982  %res.v2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 2
983  %v2 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %v1, <vscale x 16 x i8> %res.v2, i64 32)
984  %res.v3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 3
985  %v3 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %v2, <vscale x 16 x i8> %res.v3, i64 48)
986  ret <vscale x 64 x i8> %v3
987}
988
; Four-vector non-temporal byte load (llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8)
; from %ptr + %index, with the four results concatenated into a single
; <vscale x 64 x i8> return value. %unused and %z1 are never referenced in
; the body (presumably they only occupy the first two vector argument
; registers -- confirm against the calling convention).
; The inline asm clobbers every Z register except z0, z4, z8 and z12.
; Per the autogenerated checks below, the +sme2 streaming run loads directly
; into { z0, z4, z8, z12 } with the [x0, x1] register-offset form, whereas the
; +sve2p1 run loads into z0-z3 and spills/reloads them around the asm.
989define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
990; STRIDED-LABEL: ldnt1_x4_i8_z0_z4_z8_z12_scalar:
991; STRIDED:       // %bb.0:
992; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
993; STRIDED-NEXT:    addvl sp, sp, #-17
994; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
995; STRIDED-NEXT:    mov p8.b, p0.b
996; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
997; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
998; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
999; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1000; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1001; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1002; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1003; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1004; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1005; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1006; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1007; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
1008; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
1009; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
1010; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
1011; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
1012; STRIDED-NEXT:    ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1]
1013; STRIDED-NEXT:    //APP
1014; STRIDED-NEXT:    nop
1015; STRIDED-NEXT:    //NO_APP
1016; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1017; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1018; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1019; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1020; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1021; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1022; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1023; STRIDED-NEXT:    mov z2.d, z8.d
1024; STRIDED-NEXT:    mov z3.d, z12.d
1025; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1026; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1027; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1028; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1029; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1030; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
1031; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
1032; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
1033; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
1034; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
1035; STRIDED-NEXT:    mov z1.d, z4.d
1036; STRIDED-NEXT:    addvl sp, sp, #17
1037; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1038; STRIDED-NEXT:    ret
1039;
1040; CONTIGUOUS-LABEL: ldnt1_x4_i8_z0_z4_z8_z12_scalar:
1041; CONTIGUOUS:       // %bb.0:
1042; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1043; CONTIGUOUS-NEXT:    addvl sp, sp, #-15
1044; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1045; CONTIGUOUS-NEXT:    ptrue pn8.b
1046; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1047; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
1048; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1049; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1050; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1051; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1052; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1053; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1054; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1055; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1056; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1057; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1058; CONTIGUOUS-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
1059; CONTIGUOUS-NEXT:    addvl sp, sp, #-4
1060; CONTIGUOUS-NEXT:    mov p8.b, p0.b
1061; CONTIGUOUS-NEXT:    ldnt1b { z0.b - z3.b }, pn8/z, [x0, x1]
1062; CONTIGUOUS-NEXT:    str z0, [sp]
1063; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
1064; CONTIGUOUS-NEXT:    str z2, [sp, #2, mul vl]
1065; CONTIGUOUS-NEXT:    str z3, [sp, #3, mul vl]
1066; CONTIGUOUS-NEXT:    //APP
1067; CONTIGUOUS-NEXT:    nop
1068; CONTIGUOUS-NEXT:    //NO_APP
1069; CONTIGUOUS-NEXT:    ldr z0, [sp]
1070; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
1071; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
1072; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
1073; CONTIGUOUS-NEXT:    addvl sp, sp, #4
1074; CONTIGUOUS-NEXT:    ptrue pn8.b
1075; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1076; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1077; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1078; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1079; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
1080; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1081; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1082; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1083; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1084; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1085; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1086; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1087; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
1088; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1089; CONTIGUOUS-NEXT:    addvl sp, sp, #15
1090; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1091; CONTIGUOUS-NEXT:    ret
  ; i8 element type, so %index is an unscaled byte offset from %ptr.
1092  %base = getelementptr i8, ptr %ptr, i64 %index
1093  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %base)
  ; The clobber list names z1-z3, z5-z7, z9-z11 and z13-z31; only z0, z4, z8
  ; and z12 are left untouched across the asm.
1094  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
  ; Concatenate the four loaded vectors: part k is inserted at element
  ; index 16*k of the wide result.
1095  %res.v0 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 0
1096  %v0 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> poison, <vscale x 16 x i8> %res.v0, i64 0)
1097  %res.v1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 1
1098  %v1 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %v0, <vscale x 16 x i8> %res.v1, i64 16)
1099  %res.v2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 2
1100  %v2 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %v1, <vscale x 16 x i8> %res.v2, i64 32)
1101  %res.v3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res, 3
1102  %v3 = call <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8> %v2, <vscale x 16 x i8> %res.v3, i64 48)
1103  ret <vscale x 64 x i8> %v3
1104}
1105
; Four-vector non-temporal halfword load (llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16)
; directly from %ptr, with the four results concatenated into a single
; <vscale x 32 x i16> return value. %unused and %z1 are never referenced in
; the body (presumably they only occupy the first two vector argument
; registers -- confirm against the calling convention).
; The inline asm clobbers every Z register except z0, z4, z8 and z12.
; Per the autogenerated checks below, the +sme2 streaming run loads directly
; into { z0, z4, z8, z12 } from [x0], whereas the +sve2p1 run loads into
; z0-z3 and spills/reloads them around the asm.
1106define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
1107; STRIDED-LABEL: ldnt1_x4_i16_z0_z4_z8_z12:
1108; STRIDED:       // %bb.0:
1109; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1110; STRIDED-NEXT:    addvl sp, sp, #-17
1111; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1112; STRIDED-NEXT:    mov p8.b, p0.b
1113; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1114; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1115; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1116; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1117; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1118; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1119; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1120; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1121; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1122; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1123; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1124; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
1125; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
1126; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
1127; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
1128; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
1129; STRIDED-NEXT:    ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0]
1130; STRIDED-NEXT:    //APP
1131; STRIDED-NEXT:    nop
1132; STRIDED-NEXT:    //NO_APP
1133; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1134; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1135; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1136; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1137; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1138; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1139; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1140; STRIDED-NEXT:    mov z2.d, z8.d
1141; STRIDED-NEXT:    mov z3.d, z12.d
1142; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1143; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1144; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1145; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1146; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1147; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
1148; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
1149; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
1150; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
1151; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
1152; STRIDED-NEXT:    mov z1.d, z4.d
1153; STRIDED-NEXT:    addvl sp, sp, #17
1154; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1155; STRIDED-NEXT:    ret
1156;
1157; CONTIGUOUS-LABEL: ldnt1_x4_i16_z0_z4_z8_z12:
1158; CONTIGUOUS:       // %bb.0:
1159; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1160; CONTIGUOUS-NEXT:    addvl sp, sp, #-15
1161; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1162; CONTIGUOUS-NEXT:    ptrue pn8.b
1163; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1164; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
1165; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1166; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1167; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1168; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1169; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1170; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1171; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1172; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1173; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1174; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1175; CONTIGUOUS-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
1176; CONTIGUOUS-NEXT:    addvl sp, sp, #-4
1177; CONTIGUOUS-NEXT:    mov p8.b, p0.b
1178; CONTIGUOUS-NEXT:    ldnt1h { z0.h - z3.h }, pn8/z, [x0]
1179; CONTIGUOUS-NEXT:    str z0, [sp]
1180; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
1181; CONTIGUOUS-NEXT:    str z2, [sp, #2, mul vl]
1182; CONTIGUOUS-NEXT:    str z3, [sp, #3, mul vl]
1183; CONTIGUOUS-NEXT:    //APP
1184; CONTIGUOUS-NEXT:    nop
1185; CONTIGUOUS-NEXT:    //NO_APP
1186; CONTIGUOUS-NEXT:    ldr z0, [sp]
1187; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
1188; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
1189; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
1190; CONTIGUOUS-NEXT:    addvl sp, sp, #4
1191; CONTIGUOUS-NEXT:    ptrue pn8.b
1192; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1193; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1194; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1195; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1196; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
1197; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1198; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1199; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1200; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1201; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1202; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1203; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1204; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
1205; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1206; CONTIGUOUS-NEXT:    addvl sp, sp, #15
1207; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1208; CONTIGUOUS-NEXT:    ret
1209  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr)
  ; The clobber list names z1-z3, z5-z7, z9-z11 and z13-z31; only z0, z4, z8
  ; and z12 are left untouched across the asm.
1210  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
  ; Concatenate the four loaded vectors: part k is inserted at element
  ; index 8*k of the wide result.
1211  %res.v0 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
1212  %v0 = call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> %res.v0, i64 0)
1213  %res.v1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 1
1214  %v1 = call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> %v0, <vscale x 8 x i16> %res.v1, i64 8)
1215  %res.v2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 2
1216  %v2 = call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> %v1, <vscale x 8 x i16> %res.v2, i64 16)
1217  %res.v3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 3
1218  %v3 = call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> %v2, <vscale x 8 x i16> %res.v3, i64 24)
1219  ret <vscale x 32 x i16> %v3
1220}
1221
; Four-vector non-temporal halfword load (llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16)
; from %ptr indexed by %index in i16 elements, with the four results
; concatenated into a single <vscale x 32 x i16> return value. %unused and
; %z1 are never referenced in the body (presumably they only occupy the first
; two vector argument registers -- confirm against the calling convention).
; The inline asm clobbers every Z register except z0, z4, z8 and z12.
; Per the autogenerated checks below, the +sme2 streaming run loads directly
; into { z0, z4, z8, z12 } with the scaled register-offset form
; [x0, x1, lsl #1], whereas the +sve2p1 run loads into z0-z3 and
; spills/reloads them around the asm.
1222define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
1223; STRIDED-LABEL: ldnt1_x4_i16_z0_z4_z8_z12_scalar:
1224; STRIDED:       // %bb.0:
1225; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1226; STRIDED-NEXT:    addvl sp, sp, #-17
1227; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1228; STRIDED-NEXT:    mov p8.b, p0.b
1229; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1230; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1231; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1232; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1233; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1234; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1235; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1236; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1237; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1238; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1239; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1240; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
1241; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
1242; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
1243; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
1244; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
1245; STRIDED-NEXT:    ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1]
1246; STRIDED-NEXT:    //APP
1247; STRIDED-NEXT:    nop
1248; STRIDED-NEXT:    //NO_APP
1249; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1250; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1251; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1252; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1253; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1254; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1255; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1256; STRIDED-NEXT:    mov z2.d, z8.d
1257; STRIDED-NEXT:    mov z3.d, z12.d
1258; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1259; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1260; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1261; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1262; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1263; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
1264; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
1265; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
1266; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
1267; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
1268; STRIDED-NEXT:    mov z1.d, z4.d
1269; STRIDED-NEXT:    addvl sp, sp, #17
1270; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1271; STRIDED-NEXT:    ret
1272;
1273; CONTIGUOUS-LABEL: ldnt1_x4_i16_z0_z4_z8_z12_scalar:
1274; CONTIGUOUS:       // %bb.0:
1275; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1276; CONTIGUOUS-NEXT:    addvl sp, sp, #-15
1277; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1278; CONTIGUOUS-NEXT:    ptrue pn8.b
1279; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1280; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
1281; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1282; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1283; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1284; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1285; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1286; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1287; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1288; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1289; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1290; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1291; CONTIGUOUS-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
1292; CONTIGUOUS-NEXT:    addvl sp, sp, #-4
1293; CONTIGUOUS-NEXT:    mov p8.b, p0.b
1294; CONTIGUOUS-NEXT:    ldnt1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1]
1295; CONTIGUOUS-NEXT:    str z0, [sp]
1296; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
1297; CONTIGUOUS-NEXT:    str z2, [sp, #2, mul vl]
1298; CONTIGUOUS-NEXT:    str z3, [sp, #3, mul vl]
1299; CONTIGUOUS-NEXT:    //APP
1300; CONTIGUOUS-NEXT:    nop
1301; CONTIGUOUS-NEXT:    //NO_APP
1302; CONTIGUOUS-NEXT:    ldr z0, [sp]
1303; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
1304; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
1305; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
1306; CONTIGUOUS-NEXT:    addvl sp, sp, #4
1307; CONTIGUOUS-NEXT:    ptrue pn8.b
1308; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1309; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1310; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1311; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1312; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
1313; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1314; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1315; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1316; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1317; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1318; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1319; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1320; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
1321; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1322; CONTIGUOUS-NEXT:    addvl sp, sp, #15
1323; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1324; CONTIGUOUS-NEXT:    ret
  ; i16 element type, so %index counts halfwords; both runs fold this into
  ; the load as an "lsl #1" scaled register offset.
1325  %base = getelementptr i16, ptr %ptr, i64 %index
1326  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %base)
  ; The clobber list names z1-z3, z5-z7, z9-z11 and z13-z31; only z0, z4, z8
  ; and z12 are left untouched across the asm.
1327  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
  ; Concatenate the four loaded vectors: part k is inserted at element
  ; index 8*k of the wide result.
1328  %res.v0 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 0
1329  %v0 = call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> poison, <vscale x 8 x i16> %res.v0, i64 0)
1330  %res.v1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 1
1331  %v1 = call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> %v0, <vscale x 8 x i16> %res.v1, i64 8)
1332  %res.v2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 2
1333  %v2 = call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> %v1, <vscale x 8 x i16> %res.v2, i64 16)
1334  %res.v3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res, 3
1335  %v3 = call <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16> %v2, <vscale x 8 x i16> %res.v3, i64 24)
1336  ret <vscale x 32 x i16> %v3
1337}
1338
1339define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
1340; STRIDED-LABEL: ldnt1_x4_i32_z0_z4_z8_z12:
1341; STRIDED:       // %bb.0:
1342; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1343; STRIDED-NEXT:    addvl sp, sp, #-17
1344; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1345; STRIDED-NEXT:    mov p8.b, p0.b
1346; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1347; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1348; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1349; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1350; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1351; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1352; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1353; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1354; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1355; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1356; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1357; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
1358; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
1359; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
1360; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
1361; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
1362; STRIDED-NEXT:    ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0]
1363; STRIDED-NEXT:    //APP
1364; STRIDED-NEXT:    nop
1365; STRIDED-NEXT:    //NO_APP
1366; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1367; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1368; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1369; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1370; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1371; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1372; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1373; STRIDED-NEXT:    mov z2.d, z8.d
1374; STRIDED-NEXT:    mov z3.d, z12.d
1375; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1376; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1377; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1378; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1379; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1380; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
1381; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
1382; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
1383; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
1384; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
1385; STRIDED-NEXT:    mov z1.d, z4.d
1386; STRIDED-NEXT:    addvl sp, sp, #17
1387; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1388; STRIDED-NEXT:    ret
1389;
1390; CONTIGUOUS-LABEL: ldnt1_x4_i32_z0_z4_z8_z12:
1391; CONTIGUOUS:       // %bb.0:
1392; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1393; CONTIGUOUS-NEXT:    addvl sp, sp, #-15
1394; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1395; CONTIGUOUS-NEXT:    ptrue pn8.b
1396; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1397; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
1398; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1399; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1400; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1401; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1402; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1403; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1404; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1405; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1406; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1407; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1408; CONTIGUOUS-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
1409; CONTIGUOUS-NEXT:    addvl sp, sp, #-4
1410; CONTIGUOUS-NEXT:    mov p8.b, p0.b
1411; CONTIGUOUS-NEXT:    ldnt1w { z0.s - z3.s }, pn8/z, [x0]
1412; CONTIGUOUS-NEXT:    str z0, [sp]
1413; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
1414; CONTIGUOUS-NEXT:    str z2, [sp, #2, mul vl]
1415; CONTIGUOUS-NEXT:    str z3, [sp, #3, mul vl]
1416; CONTIGUOUS-NEXT:    //APP
1417; CONTIGUOUS-NEXT:    nop
1418; CONTIGUOUS-NEXT:    //NO_APP
1419; CONTIGUOUS-NEXT:    ldr z0, [sp]
1420; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
1421; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
1422; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
1423; CONTIGUOUS-NEXT:    addvl sp, sp, #4
1424; CONTIGUOUS-NEXT:    ptrue pn8.b
1425; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1426; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1427; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1428; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1429; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
1430; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1431; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1432; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1433; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1434; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1435; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1436; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1437; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
1438; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1439; CONTIGUOUS-NEXT:    addvl sp, sp, #15
1440; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1441; CONTIGUOUS-NEXT:    ret
1442  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr)
1443  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
1444  %res.v0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
1445  %v0 = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> %res.v0, i64 0)
1446  %res.v1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 1
1447  %v1 = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v0, <vscale x 4 x i32> %res.v1, i64 4)
1448  %res.v2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 2
1449  %v2 = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %res.v2, i64 8)
1450  %res.v3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 3
1451  %v3 = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v2, <vscale x 4 x i32> %res.v3, i64 12)
1452  ret <vscale x 16 x i32> %v3
1453}
1454
; Test: four-vector non-temporal load of i32 elements through
; @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32, addressed as %ptr plus a scalar i64
; %index (scaled by the i32 element size via getelementptr).
;
; The inline asm after the load clobbers every Z register except z0, z4, z8
; and z12, so those four are the only registers in which the loaded tuple can
; stay live across it.
;   * STRIDED checks expect the load to target { z0, z4, z8, z12 } directly
;     with the [x0, x1, lsl #2] addressing mode.
;   * CONTIGUOUS checks expect a consecutive { z0 - z3 } load whose results
;     are stored to the stack before the asm and reloaded after it.
; The CHECK lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
1455define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
1456; STRIDED-LABEL: ldnt1_x4_i32_z0_z4_z8_z12_scalar:
1457; STRIDED:       // %bb.0:
1458; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1459; STRIDED-NEXT:    addvl sp, sp, #-17
1460; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1461; STRIDED-NEXT:    mov p8.b, p0.b
1462; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1463; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1464; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1465; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1466; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1467; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1468; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1469; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1470; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1471; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1472; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1473; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
1474; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
1475; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
1476; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
1477; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
1478; STRIDED-NEXT:    ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2]
1479; STRIDED-NEXT:    //APP
1480; STRIDED-NEXT:    nop
1481; STRIDED-NEXT:    //NO_APP
1482; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1483; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1484; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1485; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1486; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1487; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1488; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1489; STRIDED-NEXT:    mov z2.d, z8.d
1490; STRIDED-NEXT:    mov z3.d, z12.d
1491; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1492; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1493; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1494; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1495; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1496; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
1497; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
1498; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
1499; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
1500; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
1501; STRIDED-NEXT:    mov z1.d, z4.d
1502; STRIDED-NEXT:    addvl sp, sp, #17
1503; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1504; STRIDED-NEXT:    ret
1505;
1506; CONTIGUOUS-LABEL: ldnt1_x4_i32_z0_z4_z8_z12_scalar:
1507; CONTIGUOUS:       // %bb.0:
1508; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1509; CONTIGUOUS-NEXT:    addvl sp, sp, #-15
1510; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1511; CONTIGUOUS-NEXT:    ptrue pn8.b
1512; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1513; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
1514; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1515; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1516; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1517; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1518; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1519; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1520; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1521; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1522; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1523; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1524; CONTIGUOUS-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
1525; CONTIGUOUS-NEXT:    addvl sp, sp, #-4
1526; CONTIGUOUS-NEXT:    mov p8.b, p0.b
1527; CONTIGUOUS-NEXT:    ldnt1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2]
1528; CONTIGUOUS-NEXT:    str z0, [sp]
1529; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
1530; CONTIGUOUS-NEXT:    str z2, [sp, #2, mul vl]
1531; CONTIGUOUS-NEXT:    str z3, [sp, #3, mul vl]
1532; CONTIGUOUS-NEXT:    //APP
1533; CONTIGUOUS-NEXT:    nop
1534; CONTIGUOUS-NEXT:    //NO_APP
1535; CONTIGUOUS-NEXT:    ldr z0, [sp]
1536; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
1537; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
1538; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
1539; CONTIGUOUS-NEXT:    addvl sp, sp, #4
1540; CONTIGUOUS-NEXT:    ptrue pn8.b
1541; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1542; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1543; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1544; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1545; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
1546; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1547; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1548; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1549; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1550; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1551; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1552; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1553; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
1554; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1555; CONTIGUOUS-NEXT:    addvl sp, sp, #15
1556; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1557; CONTIGUOUS-NEXT:    ret
  ; Base address = %ptr + %index i32 elements; the checks above expect this to
  ; fold into the load's register-offset addressing mode.
1558  %base = getelementptr i32, ptr %ptr, i64 %index
1559  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %base)
  ; Clobber list omits only z0, z4, z8 and z12, constraining where %res can
  ; live across this point.
1560  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
  ; Concatenate the four loaded parts into one <vscale x 16 x i32> at element
  ; offsets 0, 4, 8 and 12.
1561  %res.v0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 0
1562  %v0 = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> poison, <vscale x 4 x i32> %res.v0, i64 0)
1563  %res.v1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 1
1564  %v1 = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v0, <vscale x 4 x i32> %res.v1, i64 4)
1565  %res.v2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 2
1566  %v2 = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v1, <vscale x 4 x i32> %res.v2, i64 8)
1567  %res.v3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res, 3
1568  %v3 = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> %v2, <vscale x 4 x i32> %res.v3, i64 12)
1569  ret <vscale x 16 x i32> %v3
1570}
1571
; Test: four-vector non-temporal load of i64 elements through
; @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64, addressed directly by %ptr.
;
; The inline asm after the load clobbers every Z register except z0, z4, z8
; and z12, so those four are the only registers in which the loaded tuple can
; stay live across it.
;   * STRIDED checks expect the load to target { z0, z4, z8, z12 } directly.
;   * CONTIGUOUS checks expect a consecutive { z0 - z3 } load whose results
;     are stored to the stack before the asm and reloaded after it.
; The CHECK lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
1572define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
1573; STRIDED-LABEL: ldnt1_x4_i64_z0_z4_z8_z12:
1574; STRIDED:       // %bb.0:
1575; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1576; STRIDED-NEXT:    addvl sp, sp, #-17
1577; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1578; STRIDED-NEXT:    mov p8.b, p0.b
1579; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1580; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1581; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1582; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1583; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1584; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1585; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1586; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1587; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1588; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1589; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1590; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
1591; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
1592; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
1593; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
1594; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
1595; STRIDED-NEXT:    ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0]
1596; STRIDED-NEXT:    //APP
1597; STRIDED-NEXT:    nop
1598; STRIDED-NEXT:    //NO_APP
1599; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1600; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1601; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1602; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1603; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1604; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1605; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1606; STRIDED-NEXT:    mov z2.d, z8.d
1607; STRIDED-NEXT:    mov z3.d, z12.d
1608; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1609; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1610; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1611; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1612; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1613; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
1614; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
1615; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
1616; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
1617; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
1618; STRIDED-NEXT:    mov z1.d, z4.d
1619; STRIDED-NEXT:    addvl sp, sp, #17
1620; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1621; STRIDED-NEXT:    ret
1622;
1623; CONTIGUOUS-LABEL: ldnt1_x4_i64_z0_z4_z8_z12:
1624; CONTIGUOUS:       // %bb.0:
1625; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1626; CONTIGUOUS-NEXT:    addvl sp, sp, #-15
1627; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1628; CONTIGUOUS-NEXT:    ptrue pn8.b
1629; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1630; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
1631; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1632; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1633; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1634; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1635; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1636; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1637; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1638; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1639; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1640; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1641; CONTIGUOUS-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
1642; CONTIGUOUS-NEXT:    addvl sp, sp, #-4
1643; CONTIGUOUS-NEXT:    mov p8.b, p0.b
1644; CONTIGUOUS-NEXT:    ldnt1d { z0.d - z3.d }, pn8/z, [x0]
1645; CONTIGUOUS-NEXT:    str z0, [sp]
1646; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
1647; CONTIGUOUS-NEXT:    str z2, [sp, #2, mul vl]
1648; CONTIGUOUS-NEXT:    str z3, [sp, #3, mul vl]
1649; CONTIGUOUS-NEXT:    //APP
1650; CONTIGUOUS-NEXT:    nop
1651; CONTIGUOUS-NEXT:    //NO_APP
1652; CONTIGUOUS-NEXT:    ldr z0, [sp]
1653; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
1654; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
1655; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
1656; CONTIGUOUS-NEXT:    addvl sp, sp, #4
1657; CONTIGUOUS-NEXT:    ptrue pn8.b
1658; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1659; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1660; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1661; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1662; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
1663; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1664; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1665; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1666; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1667; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1668; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1669; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1670; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
1671; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1672; CONTIGUOUS-NEXT:    addvl sp, sp, #15
1673; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1674; CONTIGUOUS-NEXT:    ret
1675  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr)
  ; Clobber list omits only z0, z4, z8 and z12, constraining where %res can
  ; live across this point.
1676  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
  ; Concatenate the four loaded parts into one <vscale x 8 x i64> at element
  ; offsets 0, 2, 4 and 6.
1677  %res.v0 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
1678  %v0 = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> %res.v0, i64 0)
1679  %res.v1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 1
1680  %v1 = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> %v0, <vscale x 2 x i64> %res.v1, i64 2)
1681  %res.v2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 2
1682  %v2 = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> %v1, <vscale x 2 x i64> %res.v2, i64 4)
1683  %res.v3 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 3
1684  %v3 = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> %v2, <vscale x 2 x i64> %res.v3, i64 6)
1685  ret <vscale x 8 x i64> %v3
1686}
1687
; Test: four-vector non-temporal load of i64 elements through
; @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64, addressed as %ptr plus a scalar i64
; %index (scaled by the i64 element size via getelementptr).
;
; The inline asm after the load clobbers every Z register except z0, z4, z8
; and z12, so those four are the only registers in which the loaded tuple can
; stay live across it.
;   * STRIDED checks expect the load to target { z0, z4, z8, z12 } directly
;     with the [x0, x1, lsl #3] addressing mode.
;   * CONTIGUOUS checks expect a consecutive { z0 - z3 } load whose results
;     are stored to the stack before the asm and reloaded after it.
; The CHECK lines below are autogenerated (see the NOTE at the top of the
; file); regenerate them rather than editing by hand.
1688define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind {
1689; STRIDED-LABEL: ldnt1_x4_i64_z0_z4_z8_z12_scalar:
1690; STRIDED:       // %bb.0:
1691; STRIDED-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1692; STRIDED-NEXT:    addvl sp, sp, #-17
1693; STRIDED-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1694; STRIDED-NEXT:    mov p8.b, p0.b
1695; STRIDED-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1696; STRIDED-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1697; STRIDED-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1698; STRIDED-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1699; STRIDED-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1700; STRIDED-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1701; STRIDED-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1702; STRIDED-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1703; STRIDED-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1704; STRIDED-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1705; STRIDED-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1706; STRIDED-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
1707; STRIDED-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
1708; STRIDED-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
1709; STRIDED-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
1710; STRIDED-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
1711; STRIDED-NEXT:    ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3]
1712; STRIDED-NEXT:    //APP
1713; STRIDED-NEXT:    nop
1714; STRIDED-NEXT:    //NO_APP
1715; STRIDED-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1716; STRIDED-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1717; STRIDED-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1718; STRIDED-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1719; STRIDED-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1720; STRIDED-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1721; STRIDED-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1722; STRIDED-NEXT:    mov z2.d, z8.d
1723; STRIDED-NEXT:    mov z3.d, z12.d
1724; STRIDED-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1725; STRIDED-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1726; STRIDED-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1727; STRIDED-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1728; STRIDED-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1729; STRIDED-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
1730; STRIDED-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
1731; STRIDED-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
1732; STRIDED-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
1733; STRIDED-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
1734; STRIDED-NEXT:    mov z1.d, z4.d
1735; STRIDED-NEXT:    addvl sp, sp, #17
1736; STRIDED-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1737; STRIDED-NEXT:    ret
1738;
1739; CONTIGUOUS-LABEL: ldnt1_x4_i64_z0_z4_z8_z12_scalar:
1740; CONTIGUOUS:       // %bb.0:
1741; CONTIGUOUS-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1742; CONTIGUOUS-NEXT:    addvl sp, sp, #-15
1743; CONTIGUOUS-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1744; CONTIGUOUS-NEXT:    ptrue pn8.b
1745; CONTIGUOUS-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
1746; CONTIGUOUS-NEXT:    st1b { z10.b, z11.b }, pn8, [sp, #12, mul vl] // 32-byte Folded Spill
1747; CONTIGUOUS-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
1748; CONTIGUOUS-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
1749; CONTIGUOUS-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
1750; CONTIGUOUS-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
1751; CONTIGUOUS-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
1752; CONTIGUOUS-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
1753; CONTIGUOUS-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
1754; CONTIGUOUS-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
1755; CONTIGUOUS-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
1756; CONTIGUOUS-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
1757; CONTIGUOUS-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
1758; CONTIGUOUS-NEXT:    addvl sp, sp, #-4
1759; CONTIGUOUS-NEXT:    mov p8.b, p0.b
1760; CONTIGUOUS-NEXT:    ldnt1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3]
1761; CONTIGUOUS-NEXT:    str z0, [sp]
1762; CONTIGUOUS-NEXT:    str z1, [sp, #1, mul vl]
1763; CONTIGUOUS-NEXT:    str z2, [sp, #2, mul vl]
1764; CONTIGUOUS-NEXT:    str z3, [sp, #3, mul vl]
1765; CONTIGUOUS-NEXT:    //APP
1766; CONTIGUOUS-NEXT:    nop
1767; CONTIGUOUS-NEXT:    //NO_APP
1768; CONTIGUOUS-NEXT:    ldr z0, [sp]
1769; CONTIGUOUS-NEXT:    ldr z1, [sp, #1, mul vl]
1770; CONTIGUOUS-NEXT:    ldr z2, [sp, #2, mul vl]
1771; CONTIGUOUS-NEXT:    ldr z3, [sp, #3, mul vl]
1772; CONTIGUOUS-NEXT:    addvl sp, sp, #4
1773; CONTIGUOUS-NEXT:    ptrue pn8.b
1774; CONTIGUOUS-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
1775; CONTIGUOUS-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
1776; CONTIGUOUS-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
1777; CONTIGUOUS-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
1778; CONTIGUOUS-NEXT:    ld1b { z10.b, z11.b }, pn8/z, [sp, #12, mul vl] // 32-byte Folded Reload
1779; CONTIGUOUS-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
1780; CONTIGUOUS-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
1781; CONTIGUOUS-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
1782; CONTIGUOUS-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
1783; CONTIGUOUS-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
1784; CONTIGUOUS-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
1785; CONTIGUOUS-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
1786; CONTIGUOUS-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
1787; CONTIGUOUS-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1788; CONTIGUOUS-NEXT:    addvl sp, sp, #15
1789; CONTIGUOUS-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1790; CONTIGUOUS-NEXT:    ret
  ; Base address = %ptr + %index i64 elements; the checks above expect this to
  ; fold into the load's register-offset addressing mode.
1791  %base = getelementptr i64, ptr %ptr, i64 %index
1792  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %base)
  ; Clobber list omits only z0, z4, z8 and z12, constraining where %res can
  ; live across this point.
1793  call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind
  ; Concatenate the four loaded parts into one <vscale x 8 x i64> at element
  ; offsets 0, 2, 4 and 6.
1794  %res.v0 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 0
1795  %v0 = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> poison, <vscale x 2 x i64> %res.v0, i64 0)
1796  %res.v1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 1
1797  %v1 = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> %v0, <vscale x 2 x i64> %res.v1, i64 2)
1798  %res.v2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 2
1799  %v2 = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> %v1, <vscale x 2 x i64> %res.v2, i64 4)
1800  %res.v3 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res, 3
1801  %v3 = call <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64> %v2, <vscale x 2 x i64> %res.v3, i64 6)
1802  ret <vscale x 8 x i64> %v3
1803}
1804
; llvm.vector.insert overloads: insert a single-register-sized scalable
; sub-vector into a 2x-wide (first four) or 4x-wide (last four) scalable
; vector at a given element index. The tests use them to pack the loaded
; parts into one return value.
1805declare <vscale x 32 x i8> @llvm.vector.insert.nxv32i8.nxv16i8(<vscale x 32 x i8>, <vscale x 16 x i8>, i64)
1806declare <vscale x 16 x i16> @llvm.vector.insert.nxv16i16.nxv8i16(<vscale x 16 x i16>, <vscale x 8 x i16>, i64)
1807declare <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32>, <vscale x 4 x i32>, i64)
1808declare <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64>, <vscale x 2 x i64>, i64)
1809declare <vscale x 64 x i8> @llvm.vector.insert.nxv64i8.nxv16i8(<vscale x 64 x i8>, <vscale x 16 x i8>, i64)
1810declare <vscale x 32 x i16> @llvm.vector.insert.nxv32i16.nxv8i16(<vscale x 32 x i16>, <vscale x 8 x i16>, i64)
1811declare <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32>, <vscale x 4 x i32>, i64)
1812declare <vscale x 8 x i64> @llvm.vector.insert.nxv8i64.nxv2i64(<vscale x 8 x i64>, <vscale x 2 x i64>, i64)
; Intrinsics under test: ldnt1 with a target("aarch64.svcount") operand and a
; pointer. The x2 forms return two vectors, the x4 forms return four, for
; i8/i16/i32/i64 element types.
1813declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr)
1814declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr)
1815declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr)
1816declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr)
1817declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr)
1818declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr)
1819declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr)
1820declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr)
1820declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr)
1821