; xref: /llvm-project/llvm/test/CodeGen/AArch64/sme2-intrinsics-mlals.ll (revision 62baf21daa377c4ec1a641b26931063c1117d262)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+bf16 -force-streaming -verify-machineinstrs < %s | FileCheck %s

;
; BF/F/S/UMLAL x1 (SINGLE)
;

; BFMLAL (single ZA vector): check the base slice 0:1 and the maximum
; in-range immediate offset 14:15 are both selected via w8.
define void @multi_vector_add_single_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x1_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    bfmlal za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    bfmlal za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  ret void
}

; FMLAL (single ZA vector): slice offsets 0:1 and 14:15.
define void @multi_vector_add_single_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x1_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    fmlal za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    fmlal za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  ret void
}

; SMLAL (single ZA vector): slice offsets 0:1 and 14:15.
define void @multi_vector_add_single_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    smlal za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    smlal za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

; UMLAL (single ZA vector): slice offsets 0:1 and 14:15.
define void @multi_vector_add_single_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x1_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    umlal za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    umlal za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLSL x1 (SINGLE)
;

; BFMLSL (single ZA vector): slice offsets 0:1 and 14:15.
define void @multi_vector_sub_single_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x1_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    bfmlsl za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm)
  ret void
}

; FMLSL (single ZA vector): slice offsets 0:1 and 14:15.
define void @multi_vector_sub_single_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x1_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    fmlsl za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    fmlsl za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
  ret void
}

; SMLSL (single ZA vector): slice offsets 0:1 and 14:15.
define void @multi_vector_sub_single_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    smlsl za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    smlsl za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

; UMLSL (single ZA vector): slice offsets 0:1 and 14:15.
define void @multi_vector_sub_single_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x1_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    umlsl za.s[w8, 0:1], z0.h, z1.h
; CHECK-NEXT:    umlsl za.s[w8, 14:15], z0.h, z1.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLAL x2 (SINGLE)
;

; BFMLAL (two ZA vectors, single Zm): vgx2 form, offsets 0:1 and 6:7.
define void @multi_vector_add_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
  ret void
}

; FMLAL (two ZA vectors, single Zm): vgx2 form, offsets 0:1 and 6:7.
define void @multi_vector_add_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
  ret void
}

; SMLAL (two ZA vectors, single Zm): vgx2 form, offsets 0:1 and 6:7.
define void @multi_vector_add_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

; UMLAL (two ZA vectors, single Zm): vgx2 form, offsets 0:1 and 6:7.
define void @multi_vector_add_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLSL x2 (SINGLE)
;

; BFMLSL (two ZA vectors, single Zm): vgx2 form, offsets 0:1 and 6:7.
define void @multi_vector_sub_single_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm)
  ret void
}

; FMLSL (two ZA vectors, single Zm): vgx2 form, offsets 0:1 and 6:7.
define void @multi_vector_sub_single_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm)
  ret void
}

; SMLSL (two ZA vectors, single Zm): vgx2 form, offsets 0:1 and 6:7.
define void @multi_vector_sub_single_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

; UMLSL (two ZA vectors, single Zm): vgx2 form, offsets 0:1 and 6:7.
define void @multi_vector_sub_single_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLAL x4 (SINGLE)
;

; BFMLAL (four ZA vectors, single Zm): vgx4 form, offsets 0:1 and 6:7.
define void @multi_vector_add_single_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice,
                                                          <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                          <vscale x 8 x bfloat> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32 %slice.6,
                                                          <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                          <vscale x 8 x bfloat> %zm)
  ret void
}

; FMLAL (four ZA vectors, single Zm): vgx4 form, offsets 0:1 and 6:7.
; NOTE(review): the calls previously passed %zn2 twice (copy-paste bug),
; leaving %zn3 unused and forcing an extra "mov z3.d, z2.d"; fixed to pass
; %zn3 like every sibling vg2x4 test. CHECK lines updated to the sibling
; pattern — regenerate with utils/update_llc_test_checks.py to confirm.
define void @multi_vector_add_single_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice,
                                                         <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                         <vscale x 8 x half> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32 %slice.6,
                                                         <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                         <vscale x 8 x half> %zm)
  ret void
}

; SMLAL (four ZA vectors, single Zm): vgx4 form, offsets 0:1 and 6:7.
define void @multi_vector_add_single_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice,
                                                         <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                         <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32 %slice.6,
                                                         <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                         <vscale x 8 x i16> %zm)
  ret void
}

; UMLAL (four ZA vectors, single Zm): vgx4 form, offsets 0:1 and 6:7.
define void @multi_vector_add_single_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_single_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice,
                                                         <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                         <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32 %slice.6,
                                                         <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                         <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLSL x4 (SINGLE)
;

; BFMLSL (four ZA vectors, single Zm): vgx4 form, offsets 0:1 and 6:7.
define void @multi_vector_sub_single_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice,
                                                          <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                          <vscale x 8 x bfloat> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32 %slice.6,
                                                          <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                          <vscale x 8 x bfloat> %zm)
  ret void
}

; FMLSL (four ZA vectors, single Zm): vgx4 form, offsets 0:1 and 6:7.
define void @multi_vector_sub_single_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice,
                                                         <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                         <vscale x 8 x half> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32 %slice.6,
                                                         <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                         <vscale x 8 x half> %zm)
  ret void
}

; SMLSL (four ZA vectors, single Zm): vgx4 form, offsets 0:1 and 6:7.
define void @multi_vector_sub_single_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice,
                                                         <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                         <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32 %slice.6,
                                                         <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                         <vscale x 8 x i16> %zm)
  ret void
}

; UMLSL (four ZA vectors, single Zm): vgx4 form, offsets 0:1 and 6:7.
define void @multi_vector_sub_single_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_single_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice,
                                                         <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                         <vscale x 8 x i16> %zm)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32 %slice.6,
                                                         <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                         <vscale x 8 x i16> %zm)
  ret void
}

;
; BF/F/S/UMLAL x2 (MULTI)
;

; BFMLAL (two ZA vectors, two Zm vectors): vgx2 multi form, offsets 0:1 and 6:7.
define void @multi_vector_add_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1) {
; CHECK-LABEL: multi_vector_add_multi_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
                                                               <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
                                                                 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
  ret void
}

; FMLAL (two ZA vectors, two Zm vectors): vgx2 multi form, offsets 0:1 and 6:7.
define void @multi_vector_add_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
; CHECK-LABEL: multi_vector_add_multi_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
                                                              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
                                                                <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
  ret void
}

; SMLAL (two ZA vectors, two Zm vectors): vgx2 multi form, offsets 0:1 and 6:7.
define void @multi_vector_add_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_add_multi_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
                                                              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
                                                                <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

; UMLAL (two ZA vectors, two Zm vectors): vgx2 multi form, offsets 0:1 and 6:7.
define void @multi_vector_add_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_add_multi_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
                                                              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
                                                                <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}

;
; BF/F/S/UMLSL x2 (MULTI)
;

; BFMLSL (two ZA vectors, two Zm vectors): vgx2 multi form, offsets 0:1 and 6:7.
define void @multi_vector_sub_multi_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1) {
; CHECK-LABEL: multi_vector_sub_multi_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
                                                               <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32 %slice.6, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1,
                                                                 <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1)
  ret void
}

; FMLSL (two ZA vectors, two Zm vectors): vgx2 multi form, offsets 0:1 and 6:7.
define void @multi_vector_sub_multi_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1) {
; CHECK-LABEL: multi_vector_sub_multi_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
                                                              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32 %slice.6, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1,
                                                                <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1)
  ret void
}

; Check that llvm.aarch64.sme.smlsl.vg2x2 on i16 lowers to SMLSL (vgx2),
; with the slice+6 offset folded into the instruction's immediate.
define void @multi_vector_sub_multi_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_sub_multi_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
                                                              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
                                                                <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}
560
; Check that llvm.aarch64.sme.umlsl.vg2x2 on i16 lowers to UMLSL (vgx2),
; with the slice+6 offset folded into the instruction's immediate.
define void @multi_vector_sub_multi_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
; CHECK-LABEL: multi_vector_sub_multi_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, { z2.h, z3.h }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
                                                              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32 %slice.6, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1,
                                                                <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
  ret void
}
579
580;
581; BF/F/S/UMLAL x4 (MULTI)
582;
583
; Check that llvm.aarch64.sme.fmlal.vg2x4 on bfloat lowers to BFMLAL with a
; 4-vector group (vgx4: z0-z3 and z4-z7 register quads), folding slice+6
; into the immediate. Note: the trailing %zm args of the define appear after
; the CHECK block because update_llc_test_checks.py inserts its assertions
; directly after the first line of the (wrapped) function signature.
define void @multi_vector_add_multi_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
; CHECK-LABEL: multi_vector_add_multi_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                               <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) {
  call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice,
                                                   <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                   <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32 %slice.6,
                                                   <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                   <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
  ret void
}
609
; Check that llvm.aarch64.sme.fmlal.vg2x4 on half lowers to FMLAL (vgx4),
; with the slice+6 offset folded into the instruction's immediate.
define void @multi_vector_add_multi_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
; CHECK-LABEL: multi_vector_add_multi_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) {
  call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice,
                                                  <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                  <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32 %slice.6,
                                                  <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                  <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
  ret void
}
635
; Check that llvm.aarch64.sme.smlal.vg2x4 on i16 lowers to SMLAL (vgx4),
; with the slice+6 offset folded into the instruction's immediate.
define void @multi_vector_add_multi_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: multi_vector_add_multi_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
  call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice,
                                                  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                  <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32 %slice.6,
                                                  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                  <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}
661
; Check that llvm.aarch64.sme.umlal.vg2x4 on i16 lowers to UMLAL (vgx4),
; with the slice+6 offset folded into the instruction's immediate.
define void @multi_vector_add_multi_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: multi_vector_add_multi_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
  call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice,
                                                  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                  <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32 %slice.6,
                                                  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                  <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}
687
688;
689; BF/F/S/UMLSL x4 (MULTI)
690;
691
; Check that llvm.aarch64.sme.fmlsl.vg2x4 on bfloat lowers to BFMLSL (vgx4),
; with the slice+6 offset folded into the instruction's immediate.
define void @multi_vector_sub_multi_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
; CHECK-LABEL: multi_vector_sub_multi_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                               <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3) {
  call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice,
                                                   <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                   <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32 %slice.6,
                                                   <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                   <vscale x 8 x bfloat> %zm0, <vscale x 8 x bfloat> %zm1, <vscale x 8 x bfloat> %zm2, <vscale x 8 x bfloat> %zm3)
  ret void
}
717
; Check that llvm.aarch64.sme.fmlsl.vg2x4 on half lowers to FMLSL (vgx4),
; with the slice+6 offset folded into the instruction's immediate.
define void @multi_vector_sub_multi_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
; CHECK-LABEL: multi_vector_sub_multi_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                              <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3) {
  call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice,
                                                  <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                  <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32 %slice.6,
                                                  <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                  <vscale x 8 x half> %zm0, <vscale x 8 x half> %zm1, <vscale x 8 x half> %zm2, <vscale x 8 x half> %zm3)
  ret void
}
743
; Check that llvm.aarch64.sme.smlsl.vg2x4 on i16 lowers to SMLSL (vgx4),
; with the slice+6 offset folded into the instruction's immediate.
define void @multi_vector_sub_multi_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: multi_vector_sub_multi_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
  call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice,
                                                  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                  <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32 %slice.6,
                                                  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                  <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}
769
; Check that llvm.aarch64.sme.umlsl.vg2x4 on i16 lowers to UMLSL (vgx4),
; with the slice+6 offset folded into the instruction's immediate.
define void @multi_vector_sub_multi_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
; CHECK-LABEL: multi_vector_sub_multi_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT:    ret
                                              <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
  call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice,
                                                  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                  <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32 %slice.6,
                                                  <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                  <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
  ret void
}
795
796;
797; BF/F/S/UMLAL x1 (INDEXED)
798;
799
; Check that the indexed llvm.aarch64.sme.fmlal.lane.vg2x1 intrinsic on half
; lowers to FMLAL with a lane selector; covers lane 0 with slice offset 0 and
; lane 7 with slice offset 14 (za.s[w8, 14:15]).
define void @multi_vector_add_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x1_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    fmlal za.s[w8, 0:1], z0.h, z1.h[0]
; CHECK-NEXT:    fmlal za.s[w8, 14:15], z0.h, z1.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
  ret void
}
812
; Check that the indexed llvm.aarch64.sme.fmlal.lane.vg2x1 intrinsic on bfloat
; lowers to BFMLAL; covers lane 0 / offset 0 and lane 7 / offset 14.
define void @multi_vector_add_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x1_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    bfmlal za.s[w8, 0:1], z0.h, z1.h[0]
; CHECK-NEXT:    bfmlal za.s[w8, 14:15], z0.h, z1.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
  ret void
}
825
; Check that the indexed llvm.aarch64.sme.smlal.lane.vg2x1 intrinsic on i16
; lowers to SMLAL; covers lane 0 / offset 0 and lane 7 / offset 14.
define void @multi_vector_add_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    smlal za.s[w8, 0:1], z0.h, z1.h[0]
; CHECK-NEXT:    smlal za.s[w8, 14:15], z0.h, z1.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
838
; Check that the indexed llvm.aarch64.sme.umlal.lane.vg2x1 intrinsic on i16
; lowers to UMLAL; covers lane 0 / offset 0 and lane 7 / offset 14.
define void @multi_vector_add_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x1_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    umlal za.s[w8, 0:1], z0.h, z1.h[0]
; CHECK-NEXT:    umlal za.s[w8, 14:15], z0.h, z1.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
851
852;
853; BF/F/S/UMLSL x1 (INDEXED)
854;
855
; Check that the indexed llvm.aarch64.sme.fmlsl.lane.vg2x1 intrinsic on half
; lowers to FMLSL; covers lane 0 / offset 0 and lane 7 / offset 14.
define void @multi_vector_sub_lane_vg2x1_f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x1_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    fmlsl za.s[w8, 0:1], z0.h, z1.h[0]
; CHECK-NEXT:    fmlsl za.s[w8, 14:15], z0.h, z1.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 0)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32 %slice.14, <vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
  ret void
}
868
; Check that the indexed llvm.aarch64.sme.fmlsl.lane.vg2x1 intrinsic on bfloat
; lowers to BFMLSL; covers lane 0 / offset 0 and lane 7 / offset 14.
define void @multi_vector_sub_lane_vg2x1_bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x1_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1], z0.h, z1.h[0]
; CHECK-NEXT:    bfmlsl za.s[w8, 14:15], z0.h, z1.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 0)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32 %slice.14, <vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 7)
  ret void
}
881
; Check that the indexed llvm.aarch64.sme.smlsl.lane.vg2x1 intrinsic on i16
; lowers to SMLSL; covers lane 0 / offset 0 and lane 7 / offset 14.
define void @multi_vector_sub_lane_vg2x1_s16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x1_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    smlsl za.s[w8, 0:1], z0.h, z1.h[0]
; CHECK-NEXT:    smlsl za.s[w8, 14:15], z0.h, z1.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
894
; Check that the indexed llvm.aarch64.sme.umlsl.lane.vg2x1 intrinsic on i16
; lowers to UMLSL; covers lane 0 / offset 0 and lane 7 / offset 14.
define void @multi_vector_sub_lane_vg2x1_u16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x1_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    umlsl za.s[w8, 0:1], z0.h, z1.h[0]
; CHECK-NEXT:    umlsl za.s[w8, 14:15], z0.h, z1.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
  %slice.14 = add i32 %slice, 14
  call void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32 %slice.14, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
907
908;
909; BF/F/S/UMLAL x2 (INDEXED)
910;
911
; Check that the indexed llvm.aarch64.sme.fmlal.lane.vg2x2 intrinsic on half
; lowers to FMLAL with a 2-vector group and lane selector; covers lane 0 /
; offset 0 and lane 7 / offset 6.
define void @multi_vector_add_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice,
                                                       <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32 %slice.6,
                                                       <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7)
  ret void
}
928
; Check that the indexed llvm.aarch64.sme.fmlal.lane.vg2x2 intrinsic on bfloat
; lowers to BFMLAL (vgx2); covers lane 0 / offset 0 and lane 7 / offset 6.
define void @multi_vector_add_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice,
                                                         <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32 %slice.6,
                                                         <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7)
  ret void
}
945
; Check that the indexed llvm.aarch64.sme.smlal.lane.vg2x2 intrinsic on i16
; lowers to SMLAL (vgx2); covers lane 0 / offset 0 and lane 7 / offset 6.
define void @multi_vector_add_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32 %slice.6,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
962
; Check that the indexed llvm.aarch64.sme.umlal.lane.vg2x2 intrinsic on i16
; lowers to UMLAL (vgx2); covers lane 0 / offset 0 and lane 7 / offset 6.
define void @multi_vector_add_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32 %slice.6,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
979
980;
981; BF/F/S/UMLSL x2 (INDEXED)
982;
983
; Check that the indexed llvm.aarch64.sme.fmlsl.lane.vg2x2 intrinsic on half
; lowers to FMLSL (vgx2); covers lane 0 / offset 0 and lane 7 / offset 6.
define void @multi_vector_sub_lane_vg2x2_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice,
                                                       <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32 %slice.6,
                                                       <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zm, i32 7)
  ret void
}
1000
; Check that the indexed llvm.aarch64.sme.fmlsl.lane.vg2x2 intrinsic on bfloat
; lowers to BFMLSL (vgx2); covers lane 0 / offset 0 and lane 7 / offset 6.
define void @multi_vector_sub_lane_vg2x2_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice,
                                                         <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 0)
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32 %slice.6,
                                                         <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zm, i32 7)
  ret void
}
1017
define void @multi_vector_sub_lane_vg2x2_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
; CHECK-NEXT:    ret
  ; Signed-integer variant: smlsl.lane intrinsic selects SMLSL.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32 %slice.6,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
1034
define void @multi_vector_sub_lane_vg2x2_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x2_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx2], { z0.h, z1.h }, z2.h[0]
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx2], { z0.h, z1.h }, z2.h[7]
; CHECK-NEXT:    ret
  ; Unsigned-integer variant: umlsl.lane intrinsic selects UMLSL.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32 %slice.6,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
  ret void
}
1051
1052;
1053; BF/F/S/UMLAL x4 (INDEXED)
1054;
1055
define void @multi_vector_add_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    fmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  ; Four-register group (vgx4): zn0..zn3 are coalesced into Z0-Z3 for FMLAL.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice,
                                                       <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                       <vscale x 8 x half> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32 %slice.6,
                                                       <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                       <vscale x 8 x half> %zm, i32 7)
  ret void
}
1076
define void @multi_vector_add_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    bfmlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  ; bfloat overload of fmlal.lane vgx4 selects BFMLAL.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice,
                                                         <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                         <vscale x 8 x bfloat> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32 %slice.6,
                                                         <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                         <vscale x 8 x bfloat> %zm, i32 7)
  ret void
}
1097
define void @multi_vector_add_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    smlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  ; Signed-integer vgx4 variant: smlal.lane intrinsic selects SMLAL.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                       <vscale x 8 x i16> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32 %slice.6,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                       <vscale x 8 x i16> %zm, i32 7)
  ret void
}
1118
define void @multi_vector_add_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_add_lane_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    umlal za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    umlal za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  ; Unsigned-integer vgx4 variant: umlal.lane intrinsic selects UMLAL.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                       <vscale x 8 x i16> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32 %slice.6,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                       <vscale x 8 x i16> %zm, i32 7)
  ret void
}
1139
1140;
1141; BF/F/S/UMLSL x4 (INDEXED)
1142;
1143
define void @multi_vector_sub_lane_vg2x4_f16(i32 %slice, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, <vscale x 8 x half> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    fmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  ; Multiply-subtract (fmlsl) vgx4 form selects FMLSL over the Z0-Z3 group.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice,
                                                       <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                       <vscale x 8 x half> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32 %slice.6,
                                                       <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3,
                                                       <vscale x 8 x half> %zm, i32 7)
  ret void
}
1164
define void @multi_vector_sub_lane_vg2x4_bf16(i32 %slice, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, <vscale x 8 x bfloat> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    bfmlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    bfmlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  ; bfloat overload of fmlsl.lane vgx4 selects BFMLSL.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice,
                                                         <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                         <vscale x 8 x bfloat> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32 %slice.6,
                                                         <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3,
                                                         <vscale x 8 x bfloat> %zm, i32 7)
  ret void
}
1185
define void @multi_vector_sub_lane_vg2x4_s16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x4_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    smlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    smlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  ; Signed-integer vgx4 variant: smlsl.lane intrinsic selects SMLSL.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                       <vscale x 8 x i16> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                       <vscale x 8 x i16> %zm, i32 7)
  ret void
}
1206
define void @multi_vector_sub_lane_vg2x4_u16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vector_sub_lane_vg2x4_u16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    umlsl za.s[w8, 0:1, vgx4], { z0.h - z3.h }, z4.h[0]
; CHECK-NEXT:    umlsl za.s[w8, 6:7, vgx4], { z0.h - z3.h }, z4.h[7]
; CHECK-NEXT:    ret
  ; Unsigned-integer vgx4 variant: umlsl.lane intrinsic selects UMLSL.
  ; Minimum immediates: slice offset 0, lane index 0.
  call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                       <vscale x 8 x i16> %zm, i32 0)
  ; Maximum immediates: slice offset 6 (-> ZA slices 6:7), lane index 7.
  %slice.6 = add i32 %slice, 6
  call void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32 %slice.6,
                                                       <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
                                                       <vscale x 8 x i16> %zm, i32 7)
  ret void
}
1227
1228declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1229declare void @llvm.aarch64.sme.fmlal.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
1230declare void @llvm.aarch64.sme.smlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1231declare void @llvm.aarch64.sme.umlal.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1232
1233declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1234declare void @llvm.aarch64.sme.fmlsl.single.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>)
1235declare void @llvm.aarch64.sme.smlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1236declare void @llvm.aarch64.sme.umlsl.single.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1237
1238declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1239declare void @llvm.aarch64.sme.fmlal.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1240declare void @llvm.aarch64.sme.smlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1241declare void @llvm.aarch64.sme.umlal.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1242
1243declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1244declare void @llvm.aarch64.sme.fmlsl.single.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1245declare void @llvm.aarch64.sme.smlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1246declare void @llvm.aarch64.sme.umlsl.single.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1247
1248declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
1249                                                                 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1250declare void @llvm.aarch64.sme.fmlal.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>,
1251                                                               <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1252declare void @llvm.aarch64.sme.smlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
1253                                                               <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1254declare void @llvm.aarch64.sme.umlal.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
1255                                                               <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1256
1257declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
1258                                                                 <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1259declare void @llvm.aarch64.sme.fmlsl.single.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>,
1260                                                               <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1261declare void @llvm.aarch64.sme.smlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
1262                                                               <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1263declare void @llvm.aarch64.sme.umlsl.single.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>,
1264                                                               <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1265
1266declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1267declare void @llvm.aarch64.sme.fmlal.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1268declare void @llvm.aarch64.sme.smlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1269declare void @llvm.aarch64.sme.umlal.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1270
1271declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1272declare void @llvm.aarch64.sme.fmlsl.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1273declare void @llvm.aarch64.sme.smlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1274declare void @llvm.aarch64.sme.umlsl.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1275
1276declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
1277                                                         <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1278declare void @llvm.aarch64.sme.fmlal.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
1279                                                        <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1280declare void @llvm.aarch64.sme.smlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1281                                                        <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1282declare void @llvm.aarch64.sme.umlal.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1283                                                        <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1284
1285declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>,
1286                                                         <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>)
1287declare void @llvm.aarch64.sme.fmlsl.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>,
1288                                                        <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>)
1289declare void @llvm.aarch64.sme.smlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1290                                                        <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1291declare void @llvm.aarch64.sme.umlsl.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1292                                                        <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1293
1294declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1295declare void @llvm.aarch64.sme.fmlal.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1296declare void @llvm.aarch64.sme.smlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1297declare void @llvm.aarch64.sme.umlal.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1298
1299declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1300declare void @llvm.aarch64.sme.fmlsl.lane.vg2x1.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1301declare void @llvm.aarch64.sme.smlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1302declare void @llvm.aarch64.sme.umlsl.lane.vg2x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1303
1304declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1305declare void @llvm.aarch64.sme.fmlal.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1306declare void @llvm.aarch64.sme.smlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1307declare void @llvm.aarch64.sme.umlal.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1308
1309declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1310declare void @llvm.aarch64.sme.fmlsl.lane.vg2x2.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1311declare void @llvm.aarch64.sme.smlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1312declare void @llvm.aarch64.sme.umlsl.lane.vg2x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1313
1314declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1315declare void @llvm.aarch64.sme.fmlal.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1316declare void @llvm.aarch64.sme.smlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1317declare void @llvm.aarch64.sme.umlal.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1318
1319declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8bf16(i32, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i32)
1320declare void @llvm.aarch64.sme.fmlsl.lane.vg2x4.nxv8f16(i32, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, i32)
1321declare void @llvm.aarch64.sme.smlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1322declare void @llvm.aarch64.sme.umlsl.lane.vg2x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1323