; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-f64f64 -force-streaming -verify-machineinstrs | FileCheck %s
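; This file tests lowering of the SME2 FMLA/FMLS multi-vector intrinsics to
; the corresponding ZA-accumulating instructions; +sme-f64f64 enables the
; double-precision (za.d) variants exercised alongside the single-precision
; ones.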

; FMLA (SINGLE)
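;
; Each 'single' intrinsic multiplies a group of 2 or 4 vectors by one shared
; vector and accumulates the products into ZA, e.g.:
;   fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; The ZA slice is selected by a 32-bit base register plus an immediate; each
; test repeats the call with a +7 offset to cover the maximum immediate.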

define void @multi_vector_add_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32 %slice.7,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_add_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32 %slice.7,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zm)
  ret void
}

define void @multi_vector_add_single_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_single_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    ret
                                             <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                        <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32 %slice.7,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                        <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_add_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_add_single_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    ret
                                             <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                        <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32 %slice.7,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                        <vscale x 2 x double> %zm)
  ret void
}

; FMLS (SINGLE)
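;
; The 'fmls' intrinsics are the subtracting counterparts of 'fmla': the
; products are subtracted from the selected ZA slices. The operand forms
; below mirror the FMLA (SINGLE) tests above.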

define void @multi_vector_sub_single_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32 %slice.7,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_sub_single_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32 %slice.7,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zm)
  ret void
}

define void @multi_vector_sub_single_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_sub_single_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    ret
                                             <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                        <vscale x 4 x float> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32 %slice.7,
                                                        <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                        <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                        <vscale x 4 x float> %zm)
  ret void
}

define void @multi_vector_sub_single_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_sub_single_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    ret
                                             <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                        <vscale x 2 x double> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32 %slice.7,
                                                        <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                        <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                        <vscale x 2 x double> %zm)
  ret void
}

; FMLA (MULTI)
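;
; The 'multi' intrinsics multiply two vector groups element-wise, e.g.:
;   fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; Both operand groups must be allocated to consecutive registers with a
; suitably aligned starting register (see the regclass tests below).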

define void @multi_vector_add_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
; CHECK-LABEL: multi_vector_add_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    ret
                                      <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice.7,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  ret void
}

define void @multi_vector_add_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
; CHECK-LABEL: multi_vector_add_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    ret
                                      <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32 %slice.7,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  ret void
}

; Test to ensure the correct register class is used (the number of the first register in the list should be a multiple of 2)
define void @multi_vector_add_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
; CHECK-LABEL: multi_vector_add_vg1x2_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z4.d, z3.d
; CHECK-NEXT:    mov z6.d, z1.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z7.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z6.s, z7.s }, { z4.s, z5.s }
; CHECK-NEXT:    ret
                                               <vscale x 4 x float> %zm0, <vscale x 4 x float> %zm1) {
  call void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn0,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm0)
  ret void
}

define void @multi_vector_add_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    ret
                                      <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice.7,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  ret void
}

define void @multi_vector_add_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_add_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    ret
                                      <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32 %slice.7,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  ret void
}

; Test to ensure the correct register class is used (the number of the first register in the list should be a multiple of 4)
define void @multi_vector_add_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_vg1x4_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z26.d, z7.d
; CHECK-NEXT:    mov z30.d, z3.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z25.d, z6.d
; CHECK-NEXT:    mov z29.d, z2.d
; CHECK-NEXT:    mov z24.d, z5.d
; CHECK-NEXT:    mov z28.d, z1.d
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z31.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z28.s - z31.s }, { z24.s - z27.s }
; CHECK-NEXT:    ret
                                               <vscale x 4 x float> %zm0, <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3) {
  call void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn0,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm0)
  ret void
}

; FMLS (MULTI)
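;
; As for FMLA (MULTI), but the products of the two vector groups are
; subtracted from ZA.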

define void @multi_vector_sub_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
; CHECK-LABEL: multi_vector_sub_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    ret
                                      <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2) {
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32 %slice.7,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2)
  ret void
}

define void @multi_vector_sub_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
; CHECK-LABEL: multi_vector_sub_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    ret
                                      <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2) {
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32 %slice.7,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2)
  ret void
}

define void @multi_vector_sub_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_sub_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    ret
                                      <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4) {
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32 %slice.7,
                                                 <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                 <vscale x 4 x float> %zm1, <vscale x 4 x float> %zm2, <vscale x 4 x float> %zm3, <vscale x 4 x float> %zm4)
  ret void
}

define void @multi_vector_sub_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_sub_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    ret
                                      <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4) {
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32 %slice.7,
                                                 <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                 <vscale x 2 x double> %zm1, <vscale x 2 x double> %zm2, <vscale x 2 x double> %zm3, <vscale x 2 x double> %zm4)
  ret void
}

; FMLA (INDEXED)
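;
; The 'lane' intrinsics multiply each vector in the group by a single element
; of the last operand, selected by an immediate index, e.g.:
;   fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
; The tests use the maximum index for each element type (3 for .s, 1 for .d).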

define void @multi_vector_add_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    fmla za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice.7,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_add_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmla za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    fmla za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32 %slice.7,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zm, i32 1)
  ret void
}

; Test to ensure the correct register class is used (the number of the first register in the list should be a multiple of 2)
define void @multi_vector_add_lane_vg1x2_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg1x2_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z5.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx2], { z4.s, z5.s }, z2.s[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn0,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_add_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_lane_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    fmla za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    ret
                                           <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                      <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice.7,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_add_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_add_lane_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmla za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    fmla za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    ret
                                           <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                      <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32 %slice.7,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                      <vscale x 2 x double> %zm, i32 1)
  ret void
}

; Test to ensure the correct register class is used (the number of the first register in the list should be a multiple of 4)
define void @multi_vector_add_lane_vg1x4_s_regclass(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_add_lane_vg1x4_s_regclass:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    mov z27.d, z0.d
; CHECK-NEXT:    fmla za.s[w8, 0, vgx4], { z24.s - z27.s }, z4.s[3]
; CHECK-NEXT:    ret
                                                    <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2,
                                                      <vscale x 4 x float> %zn3, <vscale x 4 x float> %zn0,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

; FMLS (INDEXED)
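;
; As for FMLA (INDEXED), but the indexed products are subtracted from ZA.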

define void @multi_vector_sub_lane_vg1x2_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    fmls za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s[3]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32 %slice.7,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_sub_lane_vg1x2_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmls za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    fmls za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d[1]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32 %slice.7,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zm, i32 1)
  ret void
}

define void @multi_vector_sub_lane_vg1x4_s(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
; CHECK-LABEL: multi_vector_sub_lane_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    fmls za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s[3]
; CHECK-NEXT:    ret
                                           <vscale x 4 x float> %zm) {
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                      <vscale x 4 x float> %zm, i32 3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32 %slice.7,
                                                      <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                                                      <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3,
                                                      <vscale x 4 x float> %zm, i32 3)
  ret void
}

define void @multi_vector_sub_lane_vg1x4_d(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
; CHECK-LABEL: multi_vector_sub_lane_vg1x4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmls za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    fmls za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d[1]
; CHECK-NEXT:    ret
                                           <vscale x 2 x double> %zm) {
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                      <vscale x 2 x double> %zm, i32 1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32 %slice.7,
                                                      <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                                                      <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3,
                                                      <vscale x 2 x double> %zm, i32 1)
  ret void
}

declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.single.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.single.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmla.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmla.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
                                                       <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmla.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
                                                       <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmls.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.fmls.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>,
                                                       <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.fmls.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>,
                                                       <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)

declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmla.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmla.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)

declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmls.lane.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)
declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, i32)
declare void @llvm.aarch64.sme.fmls.lane.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, i32)