1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-i16i64 -force-streaming -verify-machineinstrs < %s | FileCheck %s
3
4;
5; SMLALL
6;
7
8; Single x1
9
10define void @multi_vector_mul_add_single_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
11; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_s8:
12; CHECK:       // %bb.0:
13; CHECK-NEXT:    mov w8, w0
14; CHECK-NEXT:    smlall za.s[w8, 0:3], z1.b, z2.b
15; CHECK-NEXT:    smlall za.s[w8, 12:15], z1.b, z2.b
16; CHECK-NEXT:    ret
17  call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
18  %slice.12 = add i32 %slice, 12
19  call void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
20  ret void
21}
22
23define void @multi_vector_mul_add_single_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
24; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_s16:
25; CHECK:       // %bb.0:
26; CHECK-NEXT:    mov w8, w0
27; CHECK-NEXT:    smlall za.d[w8, 0:3], z1.h, z2.h
28; CHECK-NEXT:    smlall za.d[w8, 12:15], z1.h, z2.h
29; CHECK-NEXT:    ret
30  call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
31  %slice.12 = add i32 %slice, 12
32  call void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
33  ret void
34}
35
36; Single x2
37
38define void @multi_vector_mul_add_single_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
39; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_s8:
40; CHECK:       // %bb.0:
41; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
42; CHECK-NEXT:    mov w8, w0
43; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
44; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
45; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
46; CHECK-NEXT:    ret
47  call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
48  %slice.4 = add i32 %slice, 4
49  call void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
50  ret void
51}
52
53define void @multi_vector_mul_add_single_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
54; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_s16:
55; CHECK:       // %bb.0:
56; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
57; CHECK-NEXT:    mov w8, w0
58; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
59; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
60; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
61; CHECK-NEXT:    ret
62  call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
63  %slice.4 = add i32 %slice, 4
64  call void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
65  ret void
66}
67
68; Single x4
69
70define void @multi_vector_mul_add_single_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
71; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_s8:
72; CHECK:       // %bb.0:
73; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
74; CHECK-NEXT:    mov w8, w0
75; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
76; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
77; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
78; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
79; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
80; CHECK-NEXT:    ret
81  call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
82  %slice.4 = add i32 %slice, 4
83  call void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
84  ret void
85}
86
87define void @multi_vector_mul_add_single_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
88; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_s16:
89; CHECK:       // %bb.0:
90; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
91; CHECK-NEXT:    mov w8, w0
92; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
93; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
94; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
95; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
96; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
97; CHECK-NEXT:    ret
98  call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
99  %slice.4 = add i32 %slice, 4
100  call void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
101  ret void
102}
103
104; Multi x2
105
106define void @multi_vector_mul_add_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
107; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s8:
108; CHECK:       // %bb.0:
109; CHECK-NEXT:    mov z5.d, z4.d
110; CHECK-NEXT:    mov z7.d, z2.d
111; CHECK-NEXT:    mov w8, w0
112; CHECK-NEXT:    mov z4.d, z3.d
113; CHECK-NEXT:    mov z6.d, z1.d
114; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
115; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
116; CHECK-NEXT:    ret
117  call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
118  %slice.4 = add i32 %slice, 4
119  call void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
120  ret void
121}
122
123define void @multi_vector_mul_add_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
124; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_s16:
125; CHECK:       // %bb.0:
126; CHECK-NEXT:    mov z5.d, z4.d
127; CHECK-NEXT:    mov z7.d, z2.d
128; CHECK-NEXT:    mov w8, w0
129; CHECK-NEXT:    mov z4.d, z3.d
130; CHECK-NEXT:    mov z6.d, z1.d
131; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
132; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
133; CHECK-NEXT:    ret
134  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
135  %slice.4 = add i32 %slice, 4
136  call void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
137  ret void
138}
139
140; Multi x4
141
142define void @multi_vector_mul_add_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
143; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s8:
144; CHECK:       // %bb.0:
145; CHECK-NEXT:    mov z26.d, z7.d
146; CHECK-NEXT:    mov z31.d, z4.d
147; CHECK-NEXT:    mov w8, w0
148; CHECK-NEXT:    ptrue p0.b
149; CHECK-NEXT:    mov z25.d, z6.d
150; CHECK-NEXT:    mov z30.d, z3.d
151; CHECK-NEXT:    mov z24.d, z5.d
152; CHECK-NEXT:    mov z29.d, z2.d
153; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
154; CHECK-NEXT:    mov z28.d, z1.d
155; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
156; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
157; CHECK-NEXT:    ret
158  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
159  %slice.4 = add i32 %slice, 4
160  call void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
161  ret void
162}
163
164define void @multi_vector_mul_add_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
165; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_s16:
166; CHECK:       // %bb.0:
167; CHECK-NEXT:    mov z26.d, z7.d
168; CHECK-NEXT:    mov z31.d, z4.d
169; CHECK-NEXT:    mov w8, w0
170; CHECK-NEXT:    ptrue p0.h
171; CHECK-NEXT:    mov z25.d, z6.d
172; CHECK-NEXT:    mov z30.d, z3.d
173; CHECK-NEXT:    mov z24.d, z5.d
174; CHECK-NEXT:    mov z29.d, z2.d
175; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
176; CHECK-NEXT:    mov z28.d, z1.d
177; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
178; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
179; CHECK-NEXT:    ret
180  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
181  %slice.4 = add i32 %slice, 4
182  call void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
183  ret void
184}
185
186; Indexed x1
187
188define void @multi_vector_mul_add_lane_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
189; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_s8:
190; CHECK:       // %bb.0:
191; CHECK-NEXT:    mov w8, w0
192; CHECK-NEXT:    smlall za.s[w8, 0:3], z1.b, z2.b[0]
193; CHECK-NEXT:    smlall za.s[w8, 12:15], z1.b, z2.b[15]
194; CHECK-NEXT:    ret
195  call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
196  %slice.12 = add i32 %slice, 12
197  call void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
198  ret void
199}
200
201define void @multi_vector_mul_add_lane_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
202; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_s16:
203; CHECK:       // %bb.0:
204; CHECK-NEXT:    mov w8, w0
205; CHECK-NEXT:    smlall za.d[w8, 0:3], z1.h, z2.h[0]
206; CHECK-NEXT:    smlall za.d[w8, 12:15], z1.h, z2.h[7]
207; CHECK-NEXT:    ret
208  call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
209  %slice.12 = add i32 %slice, 12
210  call void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
211  ret void
212}
213
214; Indexed x2
215
216define void @multi_vector_mul_add_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
217; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_s8:
218; CHECK:       // %bb.0:
219; CHECK-NEXT:    mov z5.d, z2.d
220; CHECK-NEXT:    mov w8, w0
221; CHECK-NEXT:    mov z4.d, z1.d
222; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
223; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
224; CHECK-NEXT:    ret
225  call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
226  %slice.4 = add i32 %slice, 4
227  call void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
228  ret void
229}
230
231define void @multi_vector_mul_add_lane_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
232; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_s16:
233; CHECK:       // %bb.0:
234; CHECK-NEXT:    mov z5.d, z2.d
235; CHECK-NEXT:    mov w8, w0
236; CHECK-NEXT:    mov z4.d, z1.d
237; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
238; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
239; CHECK-NEXT:    ret
240  call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
241  %slice.4 = add i32 %slice, 4
242  call void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
243  ret void
244}
245
246; Indexed x4
247
248define void @multi_vector_mul_add_lane_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
249; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_s8:
250; CHECK:       // %bb.0:
251; CHECK-NEXT:    mov z27.d, z4.d
252; CHECK-NEXT:    mov w8, w0
253; CHECK-NEXT:    mov z26.d, z3.d
254; CHECK-NEXT:    mov z25.d, z2.d
255; CHECK-NEXT:    mov z24.d, z1.d
256; CHECK-NEXT:    smlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
257; CHECK-NEXT:    smlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
258; CHECK-NEXT:    ret
259  call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
260  %slice.4 = add i32 %slice, 4
261  call void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
262  ret void
263}
264
265define void @multi_vector_mul_add_lane_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
266; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_s16:
267; CHECK:       // %bb.0:
268; CHECK-NEXT:    mov z27.d, z4.d
269; CHECK-NEXT:    mov w8, w0
270; CHECK-NEXT:    mov z26.d, z3.d
271; CHECK-NEXT:    mov z25.d, z2.d
272; CHECK-NEXT:    mov z24.d, z1.d
273; CHECK-NEXT:    smlall za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
274; CHECK-NEXT:    smlall za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
275; CHECK-NEXT:    ret
276  call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
277  %slice.4 = add i32 %slice, 4
278  call void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
279  ret void
280}
281
282; UMLALL
283
284; Single x1
285
286define void @multi_vector_mul_add_single_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
287; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_u8:
288; CHECK:       // %bb.0:
289; CHECK-NEXT:    mov w8, w0
290; CHECK-NEXT:    umlall za.s[w8, 0:3], z1.b, z2.b
291; CHECK-NEXT:    umlall za.s[w8, 12:15], z1.b, z2.b
292; CHECK-NEXT:    ret
293  call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
294  %slice.12 = add i32 %slice, 12
295  call void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
296  ret void
297}
298
299define void @multi_vector_mul_add_single_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
300; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x1_u16:
301; CHECK:       // %bb.0:
302; CHECK-NEXT:    mov w8, w0
303; CHECK-NEXT:    umlall za.d[w8, 0:3], z1.h, z2.h
304; CHECK-NEXT:    umlall za.d[w8, 12:15], z1.h, z2.h
305; CHECK-NEXT:    ret
306  call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
307  %slice.12 = add i32 %slice, 12
308  call void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
309  ret void
310}
311
312; Single x2
313
314define void @multi_vector_mul_add_single_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
315; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_u8:
316; CHECK:       // %bb.0:
317; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
318; CHECK-NEXT:    mov w8, w0
319; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
320; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
321; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
322; CHECK-NEXT:    ret
323  call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
324  %slice.4 = add i32 %slice, 4
325  call void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
326  ret void
327}
328
329define void @multi_vector_mul_add_single_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
330; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x2_u16:
331; CHECK:       // %bb.0:
332; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
333; CHECK-NEXT:    mov w8, w0
334; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
335; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
336; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
337; CHECK-NEXT:    ret
338  call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
339  %slice.4 = add i32 %slice, 4
340  call void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
341  ret void
342}
343
344; Single x4
345
346define void @multi_vector_mul_add_single_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
347; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_u8:
348; CHECK:       // %bb.0:
349; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
350; CHECK-NEXT:    mov w8, w0
351; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
352; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
353; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
354; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
355; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
356; CHECK-NEXT:    ret
357  call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
358  %slice.4 = add i32 %slice, 4
359  call void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
360  ret void
361}
362
363define void @multi_vector_mul_add_single_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
364; CHECK-LABEL: multi_vector_mul_add_single_long_vg4x4_u16:
365; CHECK:       // %bb.0:
366; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
367; CHECK-NEXT:    mov w8, w0
368; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
369; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
370; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
371; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
372; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
373; CHECK-NEXT:    ret
374  call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
375  %slice.4 = add i32 %slice, 4
376  call void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
377  ret void
378}
379
380; Multi x2
381
382define void @multi_vector_mul_add_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
383; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u8:
384; CHECK:       // %bb.0:
385; CHECK-NEXT:    mov z5.d, z4.d
386; CHECK-NEXT:    mov z7.d, z2.d
387; CHECK-NEXT:    mov w8, w0
388; CHECK-NEXT:    mov z4.d, z3.d
389; CHECK-NEXT:    mov z6.d, z1.d
390; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
391; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
392; CHECK-NEXT:    ret
393  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
394  %slice.4 = add i32 %slice, 4
395  call void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
396  ret void
397}
398
399define void @multi_vector_mul_add_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
400; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x2_u16:
401; CHECK:       // %bb.0:
402; CHECK-NEXT:    mov z5.d, z4.d
403; CHECK-NEXT:    mov z7.d, z2.d
404; CHECK-NEXT:    mov w8, w0
405; CHECK-NEXT:    mov z4.d, z3.d
406; CHECK-NEXT:    mov z6.d, z1.d
407; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
408; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
409; CHECK-NEXT:    ret
410  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
411  %slice.4 = add i32 %slice, 4
412  call void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
413  ret void
414}
415
416; Multi x4
417
418define void @multi_vector_mul_add_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
419; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u8:
420; CHECK:       // %bb.0:
421; CHECK-NEXT:    mov z26.d, z7.d
422; CHECK-NEXT:    mov z31.d, z4.d
423; CHECK-NEXT:    mov w8, w0
424; CHECK-NEXT:    ptrue p0.b
425; CHECK-NEXT:    mov z25.d, z6.d
426; CHECK-NEXT:    mov z30.d, z3.d
427; CHECK-NEXT:    mov z24.d, z5.d
428; CHECK-NEXT:    mov z29.d, z2.d
429; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
430; CHECK-NEXT:    mov z28.d, z1.d
431; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
432; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
433; CHECK-NEXT:    ret
434  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
435  %slice.4 = add i32 %slice, 4
436  call void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
437  ret void
438}
439
440define void @multi_vector_mul_add_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
441; CHECK-LABEL: multi_vector_mul_add_multi_long_vg4x4_u16:
442; CHECK:       // %bb.0:
443; CHECK-NEXT:    mov z26.d, z7.d
444; CHECK-NEXT:    mov z31.d, z4.d
445; CHECK-NEXT:    mov w8, w0
446; CHECK-NEXT:    ptrue p0.h
447; CHECK-NEXT:    mov z25.d, z6.d
448; CHECK-NEXT:    mov z30.d, z3.d
449; CHECK-NEXT:    mov z24.d, z5.d
450; CHECK-NEXT:    mov z29.d, z2.d
451; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
452; CHECK-NEXT:    mov z28.d, z1.d
453; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
454; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
455; CHECK-NEXT:    ret
456  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
457  %slice.4 = add i32 %slice, 4
458  call void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
459  ret void
460}
461
462; Indexed x1
463
464define void @multi_vector_mul_add_lane_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
465; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_u8:
466; CHECK:       // %bb.0:
467; CHECK-NEXT:    mov w8, w0
468; CHECK-NEXT:    umlall za.s[w8, 0:3], z1.b, z2.b[0]
469; CHECK-NEXT:    umlall za.s[w8, 12:15], z1.b, z2.b[15]
470; CHECK-NEXT:    ret
471  call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
472  %slice.12 = add i32 %slice, 12
473  call void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
474  ret void
475}
476
477define void @multi_vector_mul_add_lane_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
478; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x1_u16:
479; CHECK:       // %bb.0:
480; CHECK-NEXT:    mov w8, w0
481; CHECK-NEXT:    umlall za.d[w8, 0:3], z1.h, z2.h[0]
482; CHECK-NEXT:    umlall za.d[w8, 12:15], z1.h, z2.h[7]
483; CHECK-NEXT:    ret
484  call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
485  %slice.12 = add i32 %slice, 12
486  call void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
487  ret void
488}
489
490; Indexed x2
491
492define void @multi_vector_mul_add_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
493; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_u8:
494; CHECK:       // %bb.0:
495; CHECK-NEXT:    mov z5.d, z2.d
496; CHECK-NEXT:    mov w8, w0
497; CHECK-NEXT:    mov z4.d, z1.d
498; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
499; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
500; CHECK-NEXT:    ret
501  call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
502  %slice.4 = add i32 %slice, 4
503  call void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
504  ret void
505}
506
507define void @multi_vector_mul_add_lane_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
508; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x2_u16:
509; CHECK:       // %bb.0:
510; CHECK-NEXT:    mov z5.d, z2.d
511; CHECK-NEXT:    mov w8, w0
512; CHECK-NEXT:    mov z4.d, z1.d
513; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
514; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
515; CHECK-NEXT:    ret
516  call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
517  %slice.4 = add i32 %slice, 4
518  call void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
519  ret void
520}
521
522; Indexed x4
523
524define void @multi_vector_mul_add_lane_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
525; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_u8:
526; CHECK:       // %bb.0:
527; CHECK-NEXT:    mov z27.d, z4.d
528; CHECK-NEXT:    mov w8, w0
529; CHECK-NEXT:    mov z26.d, z3.d
530; CHECK-NEXT:    mov z25.d, z2.d
531; CHECK-NEXT:    mov z24.d, z1.d
532; CHECK-NEXT:    umlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
533; CHECK-NEXT:    umlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
534; CHECK-NEXT:    ret
535  call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
536  %slice.4 = add i32 %slice, 4
537  call void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
538  ret void
539}
540
541define void @multi_vector_mul_add_lane_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
542; CHECK-LABEL: multi_vector_mul_add_lane_long_vg4x4_u16:
543; CHECK:       // %bb.0:
544; CHECK-NEXT:    mov z27.d, z4.d
545; CHECK-NEXT:    mov w8, w0
546; CHECK-NEXT:    mov z26.d, z3.d
547; CHECK-NEXT:    mov z25.d, z2.d
548; CHECK-NEXT:    mov z24.d, z1.d
549; CHECK-NEXT:    umlall za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
550; CHECK-NEXT:    umlall za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
551; CHECK-NEXT:    ret
552  call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
553  %slice.4 = add i32 %slice, 4
554  call void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
555  ret void
556}
557
558; SMLSLL
559
560; Single x1
561
562define void @multi_vector_mul_sub_single_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
563; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_s8:
564; CHECK:       // %bb.0:
565; CHECK-NEXT:    mov w8, w0
566; CHECK-NEXT:    smlsll za.s[w8, 0:3], z1.b, z2.b
567; CHECK-NEXT:    smlsll za.s[w8, 12:15], z1.b, z2.b
568; CHECK-NEXT:    ret
569  call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
570  %slice.12 = add i32 %slice, 12
571  call void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
572  ret void
573}
574
575define void @multi_vector_mul_sub_single_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
576; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_s16:
577; CHECK:       // %bb.0:
578; CHECK-NEXT:    mov w8, w0
579; CHECK-NEXT:    smlsll za.d[w8, 0:3], z1.h, z2.h
580; CHECK-NEXT:    smlsll za.d[w8, 12:15], z1.h, z2.h
581; CHECK-NEXT:    ret
582  call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
583  %slice.12 = add i32 %slice, 12
584  call void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
585  ret void
586}
587
588; Single x2
589
590define void @multi_vector_mul_sub_single_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
591; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_s8:
592; CHECK:       // %bb.0:
593; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
594; CHECK-NEXT:    mov w8, w0
595; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
596; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
597; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
598; CHECK-NEXT:    ret
599  call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
600  %slice.4 = add i32 %slice, 4
601  call void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
602  ret void
603}
604
605define void @multi_vector_mul_sub_single_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
606; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_s16:
607; CHECK:       // %bb.0:
608; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
609; CHECK-NEXT:    mov w8, w0
610; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
611; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
612; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
613; CHECK-NEXT:    ret
614  call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
615  %slice.4 = add i32 %slice, 4
616  call void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
617  ret void
618}
619
620; Single x4
621
622define void @multi_vector_mul_sub_single_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
623; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_s8:
624; CHECK:       // %bb.0:
625; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
626; CHECK-NEXT:    mov w8, w0
627; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
628; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
629; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
630; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
631; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
632; CHECK-NEXT:    ret
633  call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
634  %slice.4 = add i32 %slice, 4
635  call void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
636  ret void
637}
638
639define void @multi_vector_mul_sub_single_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
640; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_s16:
641; CHECK:       // %bb.0:
642; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
643; CHECK-NEXT:    mov w8, w0
644; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
645; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
646; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
647; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
648; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
649; CHECK-NEXT:    ret
650  call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
651  %slice.4 = add i32 %slice, 4
652  call void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
653  ret void
654}
655
656; Multi x2
657
658define void @multi_vector_mul_sub_multi_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
659; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s8:
660; CHECK:       // %bb.0:
661; CHECK-NEXT:    mov z5.d, z4.d
662; CHECK-NEXT:    mov z7.d, z2.d
663; CHECK-NEXT:    mov w8, w0
664; CHECK-NEXT:    mov z4.d, z3.d
665; CHECK-NEXT:    mov z6.d, z1.d
666; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
667; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
668; CHECK-NEXT:    ret
669  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
670  %slice.4 = add i32 %slice, 4
671  call void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
672  ret void
673}
674
675define void @multi_vector_mul_sub_multi_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
676; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_s16:
677; CHECK:       // %bb.0:
678; CHECK-NEXT:    mov z5.d, z4.d
679; CHECK-NEXT:    mov z7.d, z2.d
680; CHECK-NEXT:    mov w8, w0
681; CHECK-NEXT:    mov z4.d, z3.d
682; CHECK-NEXT:    mov z6.d, z1.d
683; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
684; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
685; CHECK-NEXT:    ret
686  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
687  %slice.4 = add i32 %slice, 4
688  call void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
689  ret void
690}
691
692; Multi x4
693
694define void @multi_vector_mul_sub_multi_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
695; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s8:
696; CHECK:       // %bb.0:
697; CHECK-NEXT:    mov z26.d, z7.d
698; CHECK-NEXT:    mov z31.d, z4.d
699; CHECK-NEXT:    mov w8, w0
700; CHECK-NEXT:    ptrue p0.b
701; CHECK-NEXT:    mov z25.d, z6.d
702; CHECK-NEXT:    mov z30.d, z3.d
703; CHECK-NEXT:    mov z24.d, z5.d
704; CHECK-NEXT:    mov z29.d, z2.d
705; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
706; CHECK-NEXT:    mov z28.d, z1.d
707; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
708; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
709; CHECK-NEXT:    ret
710  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
711  %slice.4 = add i32 %slice, 4
712  call void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
713  ret void
714}
715
716define void @multi_vector_mul_sub_multi_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
717; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_s16:
718; CHECK:       // %bb.0:
719; CHECK-NEXT:    mov z26.d, z7.d
720; CHECK-NEXT:    mov z31.d, z4.d
721; CHECK-NEXT:    mov w8, w0
722; CHECK-NEXT:    ptrue p0.h
723; CHECK-NEXT:    mov z25.d, z6.d
724; CHECK-NEXT:    mov z30.d, z3.d
725; CHECK-NEXT:    mov z24.d, z5.d
726; CHECK-NEXT:    mov z29.d, z2.d
727; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
728; CHECK-NEXT:    mov z28.d, z1.d
729; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
730; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
731; CHECK-NEXT:    ret
732  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
733  %slice.4 = add i32 %slice, 4
734  call void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
735  ret void
736}
737
738; Indexed x1
739
740define void @multi_vector_mul_sub_lane_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
741; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_s8:
742; CHECK:       // %bb.0:
743; CHECK-NEXT:    mov w8, w0
744; CHECK-NEXT:    smlsll za.s[w8, 0:3], z1.b, z2.b[0]
745; CHECK-NEXT:    smlsll za.s[w8, 12:15], z1.b, z2.b[15]
746; CHECK-NEXT:    ret
747  call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
748  %slice.12 = add i32 %slice, 12
749  call void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
750  ret void
751}
752
753define void @multi_vector_mul_sub_lane_long_vg4x1_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
754; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_s16:
755; CHECK:       // %bb.0:
756; CHECK-NEXT:    mov w8, w0
757; CHECK-NEXT:    smlsll za.d[w8, 0:3], z1.h, z2.h[0]
758; CHECK-NEXT:    smlsll za.d[w8, 12:15], z1.h, z2.h[7]
759; CHECK-NEXT:    ret
760  call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
761  %slice.12 = add i32 %slice, 12
762  call void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
763  ret void
764}
765
766; Indexed x2
767
768define void @multi_vector_mul_sub_lane_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
769; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_s8:
770; CHECK:       // %bb.0:
771; CHECK-NEXT:    mov z5.d, z2.d
772; CHECK-NEXT:    mov w8, w0
773; CHECK-NEXT:    mov z4.d, z1.d
774; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
775; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
776; CHECK-NEXT:    ret
777  call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
778  %slice.4 = add i32 %slice, 4
779  call void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
780  ret void
781}
782
783define void @multi_vector_mul_sub_lane_long_vg4x2_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
784; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_s16:
785; CHECK:       // %bb.0:
786; CHECK-NEXT:    mov z5.d, z2.d
787; CHECK-NEXT:    mov w8, w0
788; CHECK-NEXT:    mov z4.d, z1.d
789; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
790; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
791; CHECK-NEXT:    ret
792  call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
793  %slice.4 = add i32 %slice, 4
794  call void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
795  ret void
796}
797
798; Indexed x4
799
800define void @multi_vector_mul_sub_lane_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
801; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_s8:
802; CHECK:       // %bb.0:
803; CHECK-NEXT:    mov z27.d, z4.d
804; CHECK-NEXT:    mov w8, w0
805; CHECK-NEXT:    mov z26.d, z3.d
806; CHECK-NEXT:    mov z25.d, z2.d
807; CHECK-NEXT:    mov z24.d, z1.d
808; CHECK-NEXT:    smlsll za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
809; CHECK-NEXT:    smlsll za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
810; CHECK-NEXT:    ret
811  call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
812  %slice.4 = add i32 %slice, 4
813  call void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
814  ret void
815}
816
817define void @multi_vector_mul_sub_lane_long_vg4x4_s16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
818; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_s16:
819; CHECK:       // %bb.0:
820; CHECK-NEXT:    mov z27.d, z4.d
821; CHECK-NEXT:    mov w8, w0
822; CHECK-NEXT:    mov z26.d, z3.d
823; CHECK-NEXT:    mov z25.d, z2.d
824; CHECK-NEXT:    mov z24.d, z1.d
825; CHECK-NEXT:    smlsll za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
826; CHECK-NEXT:    smlsll za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
827; CHECK-NEXT:    ret
828  call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
829  %slice.4 = add i32 %slice, 4
830  call void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
831  ret void
832}
833
834; UMLSLL
835
836; Single x1
837
838define void @multi_vector_mul_sub_single_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
839; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_u8:
840; CHECK:       // %bb.0:
841; CHECK-NEXT:    mov w8, w0
842; CHECK-NEXT:    umlsll za.s[w8, 0:3], z1.b, z2.b
843; CHECK-NEXT:    umlsll za.s[w8, 12:15], z1.b, z2.b
844; CHECK-NEXT:    ret
845  call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
846  %slice.12 = add i32 %slice, 12
847  call void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
848  ret void
849}
850
851define void @multi_vector_mul_sub_single_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
852; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x1_u16:
853; CHECK:       // %bb.0:
854; CHECK-NEXT:    mov w8, w0
855; CHECK-NEXT:    umlsll za.d[w8, 0:3], z1.h, z2.h
856; CHECK-NEXT:    umlsll za.d[w8, 12:15], z1.h, z2.h
857; CHECK-NEXT:    ret
858  call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
859  %slice.12 = add i32 %slice, 12
860  call void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
861  ret void
862}
863
864; Single x2
865
866define void @multi_vector_mul_sub_single_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
867; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_u8:
868; CHECK:       // %bb.0:
869; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
870; CHECK-NEXT:    mov w8, w0
871; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
872; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
873; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
874; CHECK-NEXT:    ret
875  call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
876  %slice.4 = add i32 %slice, 4
877  call void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
878  ret void
879}
880
881define void @multi_vector_mul_sub_single_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
882; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x2_u16:
883; CHECK:       // %bb.0:
884; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
885; CHECK-NEXT:    mov w8, w0
886; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
887; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx2], { z1.h, z2.h }, z3.h
888; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx2], { z1.h, z2.h }, z3.h
889; CHECK-NEXT:    ret
890  call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
891  %slice.4 = add i32 %slice, 4
892  call void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm)
893  ret void
894}
895
896; Single x4
897
898define void @multi_vector_mul_sub_single_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
899; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_u8:
900; CHECK:       // %bb.0:
901; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
902; CHECK-NEXT:    mov w8, w0
903; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
904; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
905; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
906; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
907; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
908; CHECK-NEXT:    ret
909  call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
910  %slice.4 = add i32 %slice, 4
911  call void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
912  ret void
913}
914
915define void @multi_vector_mul_sub_single_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
916; CHECK-LABEL: multi_vector_mul_sub_single_long_vg4x4_u16:
917; CHECK:       // %bb.0:
918; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
919; CHECK-NEXT:    mov w8, w0
920; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
921; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
922; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
923; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx4], { z1.h - z4.h }, z5.h
924; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx4], { z1.h - z4.h }, z5.h
925; CHECK-NEXT:    ret
926  call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
927  %slice.4 = add i32 %slice, 4
928  call void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm)
929  ret void
930}
931
932; Multi x2
933
934define void @multi_vector_mul_sub_multi_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
935; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u8:
936; CHECK:       // %bb.0:
937; CHECK-NEXT:    mov z5.d, z4.d
938; CHECK-NEXT:    mov z7.d, z2.d
939; CHECK-NEXT:    mov w8, w0
940; CHECK-NEXT:    mov z4.d, z3.d
941; CHECK-NEXT:    mov z6.d, z1.d
942; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
943; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
944; CHECK-NEXT:    ret
945  call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
946  %slice.4 = add i32 %slice, 4
947  call void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
948  ret void
949}
950
951define void @multi_vector_mul_sub_multi_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1) {
952; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x2_u16:
953; CHECK:       // %bb.0:
954; CHECK-NEXT:    mov z5.d, z4.d
955; CHECK-NEXT:    mov z7.d, z2.d
956; CHECK-NEXT:    mov w8, w0
957; CHECK-NEXT:    mov z4.d, z3.d
958; CHECK-NEXT:    mov z6.d, z1.d
959; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
960; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
961; CHECK-NEXT:    ret
962  call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
963  %slice.4 = add i32 %slice, 4
964  call void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1)
965  ret void
966}
967
968; Multi x4
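; These functions take nine scalable-vector arguments, one more than fits in
; z0-z7 under the AAPCS64, so the final operand is passed indirectly and is
; reloaded via the ld1b/ld1h from [x1] seen in the expected output.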
969
970define void @multi_vector_mul_sub_multi_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
971; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u8:
972; CHECK:       // %bb.0:
973; CHECK-NEXT:    mov z26.d, z7.d
974; CHECK-NEXT:    mov z31.d, z4.d
975; CHECK-NEXT:    mov w8, w0
976; CHECK-NEXT:    ptrue p0.b
977; CHECK-NEXT:    mov z25.d, z6.d
978; CHECK-NEXT:    mov z30.d, z3.d
979; CHECK-NEXT:    mov z24.d, z5.d
980; CHECK-NEXT:    mov z29.d, z2.d
981; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
982; CHECK-NEXT:    mov z28.d, z1.d
983; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
984; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
985; CHECK-NEXT:    ret
986  call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
987  %slice.4 = add i32 %slice, 4
988  call void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
989  ret void
990}
991
992define void @multi_vector_mul_sub_multi_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3) {
993; CHECK-LABEL: multi_vector_mul_sub_multi_long_vg4x4_u16:
994; CHECK:       // %bb.0:
995; CHECK-NEXT:    mov z26.d, z7.d
996; CHECK-NEXT:    mov z31.d, z4.d
997; CHECK-NEXT:    mov w8, w0
998; CHECK-NEXT:    ptrue p0.h
999; CHECK-NEXT:    mov z25.d, z6.d
1000; CHECK-NEXT:    mov z30.d, z3.d
1001; CHECK-NEXT:    mov z24.d, z5.d
1002; CHECK-NEXT:    mov z29.d, z2.d
1003; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
1004; CHECK-NEXT:    mov z28.d, z1.d
1005; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
1006; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx4], { z28.h - z31.h }, { z24.h - z27.h }
1007; CHECK-NEXT:    ret
1008  call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
1009  %slice.4 = add i32 %slice, 4
1010  call void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm0, <vscale x 8 x i16> %zm1, <vscale x 8 x i16> %zm2, <vscale x 8 x i16> %zm3)
1011  ret void
1012}
1013
1014; Indexed x1
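; The lane immediates below cover both ends of the legal range: 0-15 for the
; 8-bit (.b) forms and 0-7 for the 16-bit (.h) forms.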
1015
1016define void @multi_vector_mul_sub_lane_long_vg4x1_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
1017; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_u8:
1018; CHECK:       // %bb.0:
1019; CHECK-NEXT:    mov w8, w0
1020; CHECK-NEXT:    umlsll za.s[w8, 0:3], z1.b, z2.b[0]
1021; CHECK-NEXT:    umlsll za.s[w8, 12:15], z1.b, z2.b[15]
1022; CHECK-NEXT:    ret
1023  call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
1024  %slice.12 = add i32 %slice, 12
1025  call void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
1026  ret void
1027}
1028
1029define void @multi_vector_mul_sub_lane_long_vg4x1_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
1030; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x1_u16:
1031; CHECK:       // %bb.0:
1032; CHECK-NEXT:    mov w8, w0
1033; CHECK-NEXT:    umlsll za.d[w8, 0:3], z1.h, z2.h[0]
1034; CHECK-NEXT:    umlsll za.d[w8, 12:15], z1.h, z2.h[7]
1035; CHECK-NEXT:    ret
1036  call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 0)
1037  %slice.12 = add i32 %slice, 12
1038  call void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32 %slice.12, <vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 7)
1039  ret void
1040}
1041
1042; Indexed x2
1043
1044define void @multi_vector_mul_sub_lane_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1045; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_u8:
1046; CHECK:       // %bb.0:
1047; CHECK-NEXT:    mov z5.d, z2.d
1048; CHECK-NEXT:    mov w8, w0
1049; CHECK-NEXT:    mov z4.d, z1.d
1050; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
1051; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
1052; CHECK-NEXT:    ret
1053  call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
1054  %slice.4 = add i32 %slice, 4
1055  call void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
1056  ret void
1057}
1058
1059define void @multi_vector_mul_sub_lane_long_vg4x2_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm) {
1060; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x2_u16:
1061; CHECK:       // %bb.0:
1062; CHECK-NEXT:    mov z5.d, z2.d
1063; CHECK-NEXT:    mov w8, w0
1064; CHECK-NEXT:    mov z4.d, z1.d
1065; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx2], { z4.h, z5.h }, z3.h[0]
1066; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx2], { z4.h, z5.h }, z3.h[7]
1067; CHECK-NEXT:    ret
1068  call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 0)
1069  %slice.4 = add i32 %slice, 4
1070  call void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zm, i32 7)
1071  ret void
1072}
1073
1074; Indexed x4
1075
1076define void @multi_vector_mul_sub_lane_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1077; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_u8:
1078; CHECK:       // %bb.0:
1079; CHECK-NEXT:    mov z27.d, z4.d
1080; CHECK-NEXT:    mov w8, w0
1081; CHECK-NEXT:    mov z26.d, z3.d
1082; CHECK-NEXT:    mov z25.d, z2.d
1083; CHECK-NEXT:    mov z24.d, z1.d
1084; CHECK-NEXT:    umlsll za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
1085; CHECK-NEXT:    umlsll za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
1086; CHECK-NEXT:    ret
1087  call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
1088  %slice.4 = add i32 %slice, 4
1089  call void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
1090  ret void
1091}
1092
1093define void @multi_vector_mul_sub_lane_long_vg4x4_u16(i32 %slice, <vscale x 8 x i16> %dummy, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm) {
1094; CHECK-LABEL: multi_vector_mul_sub_lane_long_vg4x4_u16:
1095; CHECK:       // %bb.0:
1096; CHECK-NEXT:    mov z27.d, z4.d
1097; CHECK-NEXT:    mov w8, w0
1098; CHECK-NEXT:    mov z26.d, z3.d
1099; CHECK-NEXT:    mov z25.d, z2.d
1100; CHECK-NEXT:    mov z24.d, z1.d
1101; CHECK-NEXT:    umlsll za.d[w8, 0:3, vgx4], { z24.h - z27.h }, z5.h[0]
1102; CHECK-NEXT:    umlsll za.d[w8, 4:7, vgx4], { z24.h - z27.h }, z5.h[7]
1103; CHECK-NEXT:    ret
1104  call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 0)
1105  %slice.4 = add i32 %slice, 4
1106  call void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32 %slice.4, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zm, i32 7)
1107  ret void
1108}
1109
1110;
1111; SUMLALL
1112;
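; SUMLALL is the signed-by-unsigned widening multiply-add (zn treated as
; signed, zm as unsigned); only the 8-bit to 32-bit (za32) variants are
; exercised below.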
1113
1114; Single x2
1115
1116define void @multi_vector_mul_add_single_signed_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1117; CHECK-LABEL: multi_vector_mul_add_single_signed_long_vg4x2_s8:
1118; CHECK:       // %bb.0:
1119; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
1120; CHECK-NEXT:    mov w8, w0
1121; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
1122; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
1123; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
1124; CHECK-NEXT:    ret
1125  call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
1126  %slice.4 = add i32 %slice, 4
1127  call void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
1128  ret void
1129}
1130
1131; Single x4
1132
1133define void @multi_vector_mul_add_single_signed_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1134; CHECK-LABEL: multi_vector_mul_add_single_signed_long_vg4x4_s8:
1135; CHECK:       // %bb.0:
1136; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1137; CHECK-NEXT:    mov w8, w0
1138; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1139; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1140; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1141; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
1142; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
1143; CHECK-NEXT:    ret
1144  call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
1145  %slice.4 = add i32 %slice, 4
1146  call void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
1147  ret void
1148}
1149
1150; Indexed x1
1151
1152define void @multi_vector_mul_add_lane_signed_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
1153; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x1_s8:
1154; CHECK:       // %bb.0:
1155; CHECK-NEXT:    mov w8, w0
1156; CHECK-NEXT:    sumlall za.s[w8, 0:3], z1.b, z2.b[0]
1157; CHECK-NEXT:    sumlall za.s[w8, 12:15], z1.b, z2.b[15]
1158; CHECK-NEXT:    ret
1159  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
1160  %slice.12 = add i32 %slice, 12
1161  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
1162  ret void
1163}
1164
1165; Indexed x2
1166
1167define void @multi_vector_mul_add_lane_signed_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1168; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x2_s8:
1169; CHECK:       // %bb.0:
1170; CHECK-NEXT:    mov z5.d, z2.d
1171; CHECK-NEXT:    mov w8, w0
1172; CHECK-NEXT:    mov z4.d, z1.d
1173; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
1174; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
1175; CHECK-NEXT:    ret
1176  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
1177  %slice.4 = add i32 %slice, 4
1178  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
1179  ret void
1180}
1181
1182; Indexed x4
1183
1184define void @multi_vector_mul_add_lane_signed_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1185; CHECK-LABEL: multi_vector_mul_add_lane_signed_long_vg4x4_s8:
1186; CHECK:       // %bb.0:
1187; CHECK-NEXT:    mov z27.d, z4.d
1188; CHECK-NEXT:    mov w8, w0
1189; CHECK-NEXT:    mov z26.d, z3.d
1190; CHECK-NEXT:    mov z25.d, z2.d
1191; CHECK-NEXT:    mov z24.d, z1.d
1192; CHECK-NEXT:    sumlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
1193; CHECK-NEXT:    sumlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
1194; CHECK-NEXT:    ret
1195  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
1196  %slice.4 = add i32 %slice, 4
1197  call void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
1198  ret void
1199}
1200
1201; USMLALL
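; USMLALL is the unsigned-by-signed counterpart (zn treated as unsigned, zm as
; signed); as with SUMLALL, only the 8-bit to 32-bit (za32) variants are
; exercised below.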
1202
1203; Single x1
1204
1205define void @multi_vector_mul_add_single_unsigned_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
1206; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x1_s8:
1207; CHECK:       // %bb.0:
1208; CHECK-NEXT:    mov w8, w0
1209; CHECK-NEXT:    usmlall za.s[w8, 0:3], z1.b, z2.b
1210; CHECK-NEXT:    usmlall za.s[w8, 12:15], z1.b, z2.b
1211; CHECK-NEXT:    ret
1212  call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
1213  %slice.12 = add i32 %slice, 12
1214  call void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
1215  ret void
1216}
1217
1218; Single x2
1219
1220define void @multi_vector_mul_add_single_unsigned_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1221; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x2_s8:
1222; CHECK:       // %bb.0:
1223; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2 def $z1_z2
1224; CHECK-NEXT:    mov w8, w0
1225; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2 def $z1_z2
1226; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z1.b, z2.b }, z3.b
1227; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z1.b, z2.b }, z3.b
1228; CHECK-NEXT:    ret
1229  call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
1230  %slice.4 = add i32 %slice, 4
1231  call void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm)
1232  ret void
1233}
1234
1235; Single x4
1236
1237define void @multi_vector_mul_add_single_unsigned_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1238; CHECK-LABEL: multi_vector_mul_add_single_unsigned_long_vg4x4_s8:
1239; CHECK:       // %bb.0:
1240; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1241; CHECK-NEXT:    mov w8, w0
1242; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1243; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1244; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z1_z2_z3_z4 def $z1_z2_z3_z4
1245; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx4], { z1.b - z4.b }, z5.b
1246; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx4], { z1.b - z4.b }, z5.b
1247; CHECK-NEXT:    ret
1248  call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
1249  %slice.4 = add i32 %slice, 4
1250  call void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm)
1251  ret void
1252}
1253
1254; Multi x2
1255
1256define void @multi_vector_mul_add_multi_unsigned_long_vg4x2_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1) {
1257; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x2_u8:
1258; CHECK:       // %bb.0:
1259; CHECK-NEXT:    mov z5.d, z4.d
1260; CHECK-NEXT:    mov z7.d, z2.d
1261; CHECK-NEXT:    mov w8, w0
1262; CHECK-NEXT:    mov z4.d, z3.d
1263; CHECK-NEXT:    mov z6.d, z1.d
1264; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
1265; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
1266; CHECK-NEXT:    ret
1267  call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
1268  %slice.4 = add i32 %slice, 4
1269  call void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1)
1270  ret void
1271}
1272
1273; Multi x4
1274
1275define void @multi_vector_mul_add_multi_unsigned_long_vg4x4_u8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3) {
1276; CHECK-LABEL: multi_vector_mul_add_multi_unsigned_long_vg4x4_u8:
1277; CHECK:       // %bb.0:
1278; CHECK-NEXT:    mov z26.d, z7.d
1279; CHECK-NEXT:    mov z31.d, z4.d
1280; CHECK-NEXT:    mov w8, w0
1281; CHECK-NEXT:    ptrue p0.b
1282; CHECK-NEXT:    mov z25.d, z6.d
1283; CHECK-NEXT:    mov z30.d, z3.d
1284; CHECK-NEXT:    mov z24.d, z5.d
1285; CHECK-NEXT:    mov z29.d, z2.d
1286; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
1287; CHECK-NEXT:    mov z28.d, z1.d
1288; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
1289; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx4], { z28.b - z31.b }, { z24.b - z27.b }
1290; CHECK-NEXT:    ret
1291  call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
1292  %slice.4 = add i32 %slice, 4
1293  call void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm0, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2, <vscale x 16 x i8> %zm3)
1294  ret void
1295}
1296
1297; Indexed x1
1298
1299define void @multi_vector_mul_add_lane_unsigned_long_vg4x1_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
1300; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x1_s8:
1301; CHECK:       // %bb.0:
1302; CHECK-NEXT:    mov w8, w0
1303; CHECK-NEXT:    usmlall za.s[w8, 0:3], z1.b, z2.b[0]
1304; CHECK-NEXT:    usmlall za.s[w8, 12:15], z1.b, z2.b[15]
1305; CHECK-NEXT:    ret
1306  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
1307  %slice.12 = add i32 %slice, 12
1308  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32 %slice.12, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
1309  ret void
1310}
1311
1312; Indexed x2
1313
1314define void @multi_vector_mul_add_lane_unsigned_long_vg4x2_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm) {
1315; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x2_s8:
1316; CHECK:       // %bb.0:
1317; CHECK-NEXT:    mov z5.d, z2.d
1318; CHECK-NEXT:    mov w8, w0
1319; CHECK-NEXT:    mov z4.d, z1.d
1320; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx2], { z4.b, z5.b }, z3.b[0]
1321; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx2], { z4.b, z5.b }, z3.b[15]
1322; CHECK-NEXT:    ret
1323  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 0)
1324  %slice.4 = add i32 %slice, 4
1325  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zm, i32 15)
1326  ret void
1327}
1328
1329; Indexed x4
1330
1331define void @multi_vector_mul_add_lane_unsigned_long_vg4x4_s8(i32 %slice, <vscale x 16 x i8> %dummy, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm) {
1332; CHECK-LABEL: multi_vector_mul_add_lane_unsigned_long_vg4x4_s8:
1333; CHECK:       // %bb.0:
1334; CHECK-NEXT:    mov z27.d, z4.d
1335; CHECK-NEXT:    mov w8, w0
1336; CHECK-NEXT:    mov z26.d, z3.d
1337; CHECK-NEXT:    mov z25.d, z2.d
1338; CHECK-NEXT:    mov z24.d, z1.d
1339; CHECK-NEXT:    usmlall za.s[w8, 0:3, vgx4], { z24.b - z27.b }, z5.b[0]
1340; CHECK-NEXT:    usmlall za.s[w8, 4:7, vgx4], { z24.b - z27.b }, z5.b[15]
1341; CHECK-NEXT:    ret
1342  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 0)
1343  %slice.4 = add i32 %slice, 4
1344  call void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32 %slice.4, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zm, i32 15)
1345  ret void
1346}
1347
1348declare void @llvm.aarch64.sme.smla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1349declare void @llvm.aarch64.sme.smla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1350declare void @llvm.aarch64.sme.smla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1351
1352declare void @llvm.aarch64.sme.smla.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1353declare void @llvm.aarch64.sme.smla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1354declare void @llvm.aarch64.sme.smla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1355
1356declare void @llvm.aarch64.sme.smla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1357declare void @llvm.aarch64.sme.smla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1358
1359declare void @llvm.aarch64.sme.smla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1360declare void @llvm.aarch64.sme.smla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1361
1362declare void @llvm.aarch64.sme.smla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1363declare void @llvm.aarch64.sme.smla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1364declare void @llvm.aarch64.sme.smla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1365
1366declare void @llvm.aarch64.sme.smla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1367declare void @llvm.aarch64.sme.smla.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1368declare void @llvm.aarch64.sme.smla.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1369
1370declare void @llvm.aarch64.sme.umla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1371declare void @llvm.aarch64.sme.umla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1372declare void @llvm.aarch64.sme.umla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1373
1374declare void @llvm.aarch64.sme.umla.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1375declare void @llvm.aarch64.sme.umla.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1376declare void @llvm.aarch64.sme.umla.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1377
1378declare void @llvm.aarch64.sme.umla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1379declare void @llvm.aarch64.sme.umla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1380
1381declare void @llvm.aarch64.sme.umla.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1382declare void @llvm.aarch64.sme.umla.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1383
1384declare void @llvm.aarch64.sme.umla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1385declare void @llvm.aarch64.sme.umla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1386declare void @llvm.aarch64.sme.umla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1387
1388declare void @llvm.aarch64.sme.umla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1389declare void @llvm.aarch64.sme.umla.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1390declare void @llvm.aarch64.sme.umla.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1391
1392declare void @llvm.aarch64.sme.smls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1393declare void @llvm.aarch64.sme.smls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1394declare void @llvm.aarch64.sme.smls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1395
1396declare void @llvm.aarch64.sme.smls.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1397declare void @llvm.aarch64.sme.smls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1398declare void @llvm.aarch64.sme.smls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1399
1400declare void @llvm.aarch64.sme.smls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1401declare void @llvm.aarch64.sme.smls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1402
1403declare void @llvm.aarch64.sme.smls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1404declare void @llvm.aarch64.sme.smls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1405
1406declare void @llvm.aarch64.sme.smls.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1407declare void @llvm.aarch64.sme.smls.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1408declare void @llvm.aarch64.sme.smls.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1409
1410declare void @llvm.aarch64.sme.smls.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1411declare void @llvm.aarch64.sme.smls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1412declare void @llvm.aarch64.sme.smls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1413
1414declare void @llvm.aarch64.sme.umls.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1415declare void @llvm.aarch64.sme.umls.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1416declare void @llvm.aarch64.sme.umls.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1417
1418declare void @llvm.aarch64.sme.umls.za64.single.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>)
1419declare void @llvm.aarch64.sme.umls.za64.single.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1420declare void @llvm.aarch64.sme.umls.za64.single.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1421
1422declare void @llvm.aarch64.sme.umls.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1423declare void @llvm.aarch64.sme.umls.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1424
1425declare void @llvm.aarch64.sme.umls.za64.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1426declare void @llvm.aarch64.sme.umls.za64.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1427
1428declare void @llvm.aarch64.sme.umls.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1429declare void @llvm.aarch64.sme.umls.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1430declare void @llvm.aarch64.sme.umls.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1431
1432declare void @llvm.aarch64.sme.umls.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1433declare void @llvm.aarch64.sme.umls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1434declare void @llvm.aarch64.sme.umls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1435
1436declare void @llvm.aarch64.sme.sumla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1437declare void @llvm.aarch64.sme.sumla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1438
1439declare void @llvm.aarch64.sme.sumla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1440declare void @llvm.aarch64.sme.sumla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1441declare void @llvm.aarch64.sme.sumla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1442
1443declare void @llvm.aarch64.sme.sumla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1444declare void @llvm.aarch64.sme.sumls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1445declare void @llvm.aarch64.sme.sumls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1446
1447declare void @llvm.aarch64.sme.usmla.za32.single.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>)
1448declare void @llvm.aarch64.sme.usmla.za32.single.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1449declare void @llvm.aarch64.sme.usmla.za32.single.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1450
1451declare void @llvm.aarch64.sme.usmla.za32.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1452declare void @llvm.aarch64.sme.usmla.za32.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1453
1454declare void @llvm.aarch64.sme.usmla.za32.lane.vg4x1.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1455declare void @llvm.aarch64.sme.usmla.za32.lane.vg4x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1456declare void @llvm.aarch64.sme.usmla.za32.lane.vg4x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
1457
1458declare void @llvm.aarch64.sme.usmla.za64.lane.vg4x1.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1459declare void @llvm.aarch64.sme.usmls.za64.lane.vg4x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1460declare void @llvm.aarch64.sme.usmls.za64.lane.vg4x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
1461