xref: /llvm-project/llvm/test/CodeGen/AArch64/sme2-intrinsics-int-dots.ll (revision 865104a1042e824254b130c00c7f8ee0e0e0f6c5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -force-streaming -enable-subreg-liveness -verify-machineinstrs < %s | FileCheck %s
3
4target triple="aarch64-linux-gnu"
5
6
7; == Multi, multi (unsigned) ==
8
9define void @udot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 {
10; CHECK-LABEL: udot_multi_za32_u16_vg1x2:
11; CHECK:       // %bb.0:
12; CHECK-NEXT:    mov z5.d, z4.d
13; CHECK-NEXT:    mov z7.d, z2.d
14; CHECK-NEXT:    mov w8, w0
15; CHECK-NEXT:    mov z4.d, z3.d
16; CHECK-NEXT:    mov z6.d, z1.d
17; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
18; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
19; CHECK-NEXT:    ret
20  call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
21  %slice2 = add i32 %slice, 7
22  call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
23  ret void
24}
25
26define void @udot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
27; CHECK-LABEL: udot_multi_za32_u16_vg1x4:
28; CHECK:       // %bb.0:
29; CHECK-NEXT:    ptrue p0.h
30; CHECK-NEXT:    mov z26.d, z7.d
31; CHECK-NEXT:    mov z25.d, z6.d
32; CHECK-NEXT:    mov z7.d, z4.d
33; CHECK-NEXT:    mov w8, w0
34; CHECK-NEXT:    mov z24.d, z5.d
35; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
36; CHECK-NEXT:    mov z6.d, z3.d
37; CHECK-NEXT:    mov z5.d, z2.d
38; CHECK-NEXT:    mov z4.d, z1.d
39; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
40; CHECK-NEXT:    udot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
41; CHECK-NEXT:    ret
42                                        <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
43  call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
44                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
45  %slice2 = add i32 %slice, 7
46  call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
47                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
48  ret void
49}
50
51define void @udot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
52; CHECK-LABEL: udot_multi_za32_u8_vg1x2:
53; CHECK:       // %bb.0:
54; CHECK-NEXT:    mov z5.d, z4.d
55; CHECK-NEXT:    mov z7.d, z2.d
56; CHECK-NEXT:    mov w8, w0
57; CHECK-NEXT:    mov z4.d, z3.d
58; CHECK-NEXT:    mov z6.d, z1.d
59; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
60; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
61; CHECK-NEXT:    ret
62  call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
63  %slice2 = add i32 %slice, 7
64  call void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
65  ret void
66}
67
68define void @udot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
69; CHECK-LABEL: udot_multi_za32_u8_vg1x4:
70; CHECK:       // %bb.0:
71; CHECK-NEXT:    ptrue p0.b
72; CHECK-NEXT:    mov z26.d, z7.d
73; CHECK-NEXT:    mov z25.d, z6.d
74; CHECK-NEXT:    mov z7.d, z4.d
75; CHECK-NEXT:    mov w8, w0
76; CHECK-NEXT:    mov z24.d, z5.d
77; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
78; CHECK-NEXT:    mov z6.d, z3.d
79; CHECK-NEXT:    mov z5.d, z2.d
80; CHECK-NEXT:    mov z4.d, z1.d
81; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
82; CHECK-NEXT:    udot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
83; CHECK-NEXT:    ret
84                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
85  call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
86                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
87  %slice2 = add i32 %slice, 7
88  call void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
89                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
90  ret void
91}
92
93define void @udot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 {
94; CHECK-LABEL: udot_multi_za64_u16_vg1x2:
95; CHECK:       // %bb.0:
96; CHECK-NEXT:    mov z5.d, z4.d
97; CHECK-NEXT:    mov z7.d, z2.d
98; CHECK-NEXT:    mov w8, w0
99; CHECK-NEXT:    mov z4.d, z3.d
100; CHECK-NEXT:    mov z6.d, z1.d
101; CHECK-NEXT:    udot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
102; CHECK-NEXT:    udot za.d[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
103; CHECK-NEXT:    ret
104  call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
105  %slice2 = add i32 %slice, 7
106  call void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
107  ret void
108}
109
110define void @udot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
111; CHECK-LABEL: udot_multi_za64_u16_vg1x4:
112; CHECK:       // %bb.0:
113; CHECK-NEXT:    ptrue p0.h
114; CHECK-NEXT:    mov z26.d, z7.d
115; CHECK-NEXT:    mov z25.d, z6.d
116; CHECK-NEXT:    mov z7.d, z4.d
117; CHECK-NEXT:    mov w8, w0
118; CHECK-NEXT:    mov z24.d, z5.d
119; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
120; CHECK-NEXT:    mov z6.d, z3.d
121; CHECK-NEXT:    mov z5.d, z2.d
122; CHECK-NEXT:    mov z4.d, z1.d
123; CHECK-NEXT:    udot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
124; CHECK-NEXT:    udot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
125; CHECK-NEXT:    ret
126                                       <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
127  call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
128                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
129  %slice2 = add i32 %slice, 7
130  call void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
131                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
132  ret void
133}
134
135define void @usdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
136; CHECK-LABEL: usdot_multi_za32_u8_vg1x2:
137; CHECK:       // %bb.0:
138; CHECK-NEXT:    mov z5.d, z4.d
139; CHECK-NEXT:    mov z7.d, z2.d
140; CHECK-NEXT:    mov w8, w0
141; CHECK-NEXT:    mov z4.d, z3.d
142; CHECK-NEXT:    mov z6.d, z1.d
143; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
144; CHECK-NEXT:    usdot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
145; CHECK-NEXT:    ret
146  call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
147  %slice2 = add i32 %slice, 7
148  call void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
149  ret void
150}
151
152define void @usdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
153; CHECK-LABEL: usdot_multi_za32_u8_vg1x4:
154; CHECK:       // %bb.0:
155; CHECK-NEXT:    ptrue p0.b
156; CHECK-NEXT:    mov z26.d, z7.d
157; CHECK-NEXT:    mov z25.d, z6.d
158; CHECK-NEXT:    mov z7.d, z4.d
159; CHECK-NEXT:    mov w8, w0
160; CHECK-NEXT:    mov z24.d, z5.d
161; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
162; CHECK-NEXT:    mov z6.d, z3.d
163; CHECK-NEXT:    mov z5.d, z2.d
164; CHECK-NEXT:    mov z4.d, z1.d
165; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
166; CHECK-NEXT:    usdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
167; CHECK-NEXT:    ret
168                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
169  call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
170                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
171  %slice2 = add i32 %slice, 7
172  call void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
173                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
174  ret void
175}
176
177
178; == Multi, multi (signed) ==
179
180define void @sdot_multi_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #0 {
181; CHECK-LABEL: sdot_multi_za32_u16_vg1x2:
182; CHECK:       // %bb.0:
183; CHECK-NEXT:    mov z5.d, z4.d
184; CHECK-NEXT:    mov z7.d, z2.d
185; CHECK-NEXT:    mov w8, w0
186; CHECK-NEXT:    mov z4.d, z3.d
187; CHECK-NEXT:    mov z6.d, z1.d
188; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
189; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
190; CHECK-NEXT:    ret
191  call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
192  %slice2 = add i32 %slice, 7
193  call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
194  ret void
195}
196
197define void @sdot_multi_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
198; CHECK-LABEL: sdot_multi_za32_u16_vg1x4:
199; CHECK:       // %bb.0:
200; CHECK-NEXT:    ptrue p0.h
201; CHECK-NEXT:    mov z26.d, z7.d
202; CHECK-NEXT:    mov z25.d, z6.d
203; CHECK-NEXT:    mov z7.d, z4.d
204; CHECK-NEXT:    mov w8, w0
205; CHECK-NEXT:    mov z24.d, z5.d
206; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
207; CHECK-NEXT:    mov z6.d, z3.d
208; CHECK-NEXT:    mov z5.d, z2.d
209; CHECK-NEXT:    mov z4.d, z1.d
210; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
211; CHECK-NEXT:    sdot za.s[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
212; CHECK-NEXT:    ret
213                                        <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #0 {
214  call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
215                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
216  %slice2 = add i32 %slice, 7
217  call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
218                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
219  ret void
220}
221
222define void @sdot_multi_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3) #0 {
223; CHECK-LABEL: sdot_multi_za32_u8_vg1x2:
224; CHECK:       // %bb.0:
225; CHECK-NEXT:    mov z5.d, z4.d
226; CHECK-NEXT:    mov z7.d, z2.d
227; CHECK-NEXT:    mov w8, w0
228; CHECK-NEXT:    mov z4.d, z3.d
229; CHECK-NEXT:    mov z6.d, z1.d
230; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
231; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z6.b, z7.b }, { z4.b, z5.b }
232; CHECK-NEXT:    ret
233  call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
234  %slice2 = add i32 %slice, 7
235  call void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3)
236  ret void
237}
238
239define void @sdot_multi_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
240; CHECK-LABEL: sdot_multi_za32_u8_vg1x4:
241; CHECK:       // %bb.0:
242; CHECK-NEXT:    ptrue p0.b
243; CHECK-NEXT:    mov z26.d, z7.d
244; CHECK-NEXT:    mov z25.d, z6.d
245; CHECK-NEXT:    mov z7.d, z4.d
246; CHECK-NEXT:    mov w8, w0
247; CHECK-NEXT:    mov z24.d, z5.d
248; CHECK-NEXT:    ld1b { z27.b }, p0/z, [x1]
249; CHECK-NEXT:    mov z6.d, z3.d
250; CHECK-NEXT:    mov z5.d, z2.d
251; CHECK-NEXT:    mov z4.d, z1.d
252; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
253; CHECK-NEXT:    sdot za.s[w8, 7, vgx4], { z4.b - z7.b }, { z24.b - z27.b }
254; CHECK-NEXT:    ret
255                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7) #0 {
256  call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
257                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
258  %slice2 = add i32 %slice, 7
259  call void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
260                                                      <vscale x 16 x i8> %zn4, <vscale x 16 x i8> %zn5, <vscale x 16 x i8> %zn6, <vscale x 16 x i8> %zn7)
261  ret void
262}
263
264define void @sdot_multi_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3) #1 {
265; CHECK-LABEL: sdot_multi_za64_u16_vg1x2:
266; CHECK:       // %bb.0:
267; CHECK-NEXT:    mov z5.d, z4.d
268; CHECK-NEXT:    mov z7.d, z2.d
269; CHECK-NEXT:    mov w8, w0
270; CHECK-NEXT:    mov z4.d, z3.d
271; CHECK-NEXT:    mov z6.d, z1.d
272; CHECK-NEXT:    sdot za.d[w8, 0, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
273; CHECK-NEXT:    sdot za.d[w8, 7, vgx2], { z6.h, z7.h }, { z4.h, z5.h }
274; CHECK-NEXT:    ret
275  call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
276  %slice2 = add i32 %slice, 7
277  call void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3)
278  ret void
279}
280
281define void @sdot_multi_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
282; CHECK-LABEL: sdot_multi_za64_u16_vg1x4:
283; CHECK:       // %bb.0:
284; CHECK-NEXT:    ptrue p0.h
285; CHECK-NEXT:    mov z26.d, z7.d
286; CHECK-NEXT:    mov z25.d, z6.d
287; CHECK-NEXT:    mov z7.d, z4.d
288; CHECK-NEXT:    mov w8, w0
289; CHECK-NEXT:    mov z24.d, z5.d
290; CHECK-NEXT:    ld1h { z27.h }, p0/z, [x1]
291; CHECK-NEXT:    mov z6.d, z3.d
292; CHECK-NEXT:    mov z5.d, z2.d
293; CHECK-NEXT:    mov z4.d, z1.d
294; CHECK-NEXT:    sdot za.d[w8, 0, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
295; CHECK-NEXT:    sdot za.d[w8, 7, vgx4], { z4.h - z7.h }, { z24.h - z27.h }
296; CHECK-NEXT:    ret
297                                       <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7) #1 {
298  call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
299                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
300  %slice2 = add i32 %slice, 7
301  call void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
302                                                      <vscale x 8 x i16> %zn4, <vscale x 8 x i16> %zn5, <vscale x 8 x i16> %zn6, <vscale x 8 x i16> %zn7)
303  ret void
304}
305
306
307; == Multi, single (unsigned) ==
308
309define void @udot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
310; CHECK-LABEL: udot_single_za32_u16_vg1x2:
311; CHECK:       // %bb.0:
312; CHECK-NEXT:    mov w8, w0
313; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
314; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
315; CHECK-NEXT:    ret
316  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
317  %slice2 = add i32 %slice, 7
318  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
319  ret void
320}
321
322define void @udot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 {
323; CHECK-LABEL: udot_single_za32_u16_vg1x2_tuple:
324; CHECK:       // %bb.0: // %entry
325; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
326; CHECK-NEXT:    addvl sp, sp, #-3
327; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
328; CHECK-NEXT:    ptrue pn8.b
329; CHECK-NEXT:    add x9, x0, x1
330; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
331; CHECK-NEXT:    mov w8, wzr
332; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
333; CHECK-NEXT:    ld1h { z1.h, z9.h }, pn8/z, [x0]
334; CHECK-NEXT:    ld1h { z2.h, z10.h }, pn8/z, [x9]
335; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z1.h, z2.h }, z0.h
336; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z9.h, z10.h }, z0.h
337; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
338; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
339; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
340; CHECK-NEXT:    addvl sp, sp, #3
341; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
342; CHECK-NEXT:    ret
343entry:
344  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
345  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
346  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
347  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
348  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
349  %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
350  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
351  %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
352  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> %zn)
353  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> %zn)
354  ret void
355}
356
357define void @udot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
358; CHECK-LABEL: udot_single_za32_u16_vg1x4:
359; CHECK:       // %bb.0:
360; CHECK-NEXT:    mov w8, w0
361; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
362; CHECK-NEXT:    udot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
363; CHECK-NEXT:    ret
364  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
365  %slice2 = add i32 %slice, 7
366  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
367  ret void
368}
369
370define void @udot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 {
371; CHECK-LABEL: udot_single_za32_u16_vg1x4_tuple:
372; CHECK:       // %bb.0: // %entry
373; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
374; CHECK-NEXT:    addvl sp, sp, #-11
375; CHECK-NEXT:    add x9, x1, x1, lsl #1
376; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
377; CHECK-NEXT:    ptrue pn8.b
378; CHECK-NEXT:    str z20, [sp, #1, mul vl] // 16-byte Folded Spill
379; CHECK-NEXT:    add x10, x0, x1
380; CHECK-NEXT:    mov w8, wzr
381; CHECK-NEXT:    str z16, [sp, #2, mul vl] // 16-byte Folded Spill
382; CHECK-NEXT:    add x9, x0, x9
383; CHECK-NEXT:    str z15, [sp, #3, mul vl] // 16-byte Folded Spill
384; CHECK-NEXT:    str z14, [sp, #4, mul vl] // 16-byte Folded Spill
385; CHECK-NEXT:    str z13, [sp, #5, mul vl] // 16-byte Folded Spill
386; CHECK-NEXT:    str z12, [sp, #6, mul vl] // 16-byte Folded Spill
387; CHECK-NEXT:    str z11, [sp, #7, mul vl] // 16-byte Folded Spill
388; CHECK-NEXT:    str z10, [sp, #8, mul vl] // 16-byte Folded Spill
389; CHECK-NEXT:    str z9, [sp, #9, mul vl] // 16-byte Folded Spill
390; CHECK-NEXT:    str z8, [sp, #10, mul vl] // 16-byte Folded Spill
391; CHECK-NEXT:    ld1h { z1.h, z5.h, z9.h, z13.h }, pn8/z, [x0]
392; CHECK-NEXT:    ld1h { z2.h, z6.h, z10.h, z14.h }, pn8/z, [x10]
393; CHECK-NEXT:    ld1h { z3.h, z7.h, z11.h, z15.h }, pn8/z, [x0, x1, lsl #1]
394; CHECK-NEXT:    ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x9]
395; CHECK-NEXT:    mov z4.d, z16.d
396; CHECK-NEXT:    mov z8.d, z20.d
397; CHECK-NEXT:    mov z12.d, z24.d
398; CHECK-NEXT:    mov z16.d, z28.d
399; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z1.h - z4.h }, z0.h
400; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z5.h - z8.h }, z0.h
401; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z9.h - z12.h }, z0.h
402; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z13.h - z16.h }, z0.h
403; CHECK-NEXT:    ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload
404; CHECK-NEXT:    ldr z16, [sp, #2, mul vl] // 16-byte Folded Reload
405; CHECK-NEXT:    ldr z15, [sp, #3, mul vl] // 16-byte Folded Reload
406; CHECK-NEXT:    ldr z14, [sp, #4, mul vl] // 16-byte Folded Reload
407; CHECK-NEXT:    ldr z13, [sp, #5, mul vl] // 16-byte Folded Reload
408; CHECK-NEXT:    ldr z12, [sp, #6, mul vl] // 16-byte Folded Reload
409; CHECK-NEXT:    ldr z11, [sp, #7, mul vl] // 16-byte Folded Reload
410; CHECK-NEXT:    ldr z10, [sp, #8, mul vl] // 16-byte Folded Reload
411; CHECK-NEXT:    ldr z9, [sp, #9, mul vl] // 16-byte Folded Reload
412; CHECK-NEXT:    ldr z8, [sp, #10, mul vl] // 16-byte Folded Reload
413; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
414; CHECK-NEXT:    addvl sp, sp, #11
415; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
416; CHECK-NEXT:    ret
417entry:
418  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
419  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
420  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
421  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
422  %4 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 2
423  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 3
424  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
425  %6 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
426  %7 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 0
427  %8 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 1
428  %9 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 2
429  %10 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 3
430  %mul3 = shl i64 %stride, 1
431  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
432  %11 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx4)
433  %12 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 0
434  %13 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 1
435  %14 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 2
436  %15 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 3
437  %mul5 = mul i64 %stride, 3
438  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
439  %16 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx6)
440  %17 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 0
441  %18 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 1
442  %19 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 2
443  %20 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 3
444  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %7, <vscale x 8 x i16> %12, <vscale x 8 x i16> %17, <vscale x 8 x i16> %zn)
445  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %8, <vscale x 8 x i16> %13, <vscale x 8 x i16> %18, <vscale x 8 x i16> %zn)
446  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %4, <vscale x 8 x i16> %9, <vscale x 8 x i16> %14, <vscale x 8 x i16> %19, <vscale x 8 x i16> %zn)
447  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %5, <vscale x 8 x i16> %10, <vscale x 8 x i16> %15, <vscale x 8 x i16> %20, <vscale x 8 x i16> %zn)
448  ret void
449}
450
451define void @udot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
452; CHECK-LABEL: udot_single_za32_u8_vg1x2:
453; CHECK:       // %bb.0:
454; CHECK-NEXT:    mov w8, w0
455; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
456; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
457; CHECK-NEXT:    ret
458  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
459  %slice2 = add i32 %slice, 7
460  call void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
461  ret void
462}
463
464define void @udot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
465; CHECK-LABEL: udot_single_za32_u8_vg1x4:
466; CHECK:       // %bb.0:
467; CHECK-NEXT:    mov w8, w0
468; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
469; CHECK-NEXT:    udot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
470; CHECK-NEXT:    ret
471  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
472  %slice2 = add i32 %slice, 7
473  call void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
474  ret void
475}
476
477define void @udot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
478; CHECK-LABEL: udot_single_za64_u16_vg1x2:
479; CHECK:       // %bb.0:
480; CHECK-NEXT:    mov w8, w0
481; CHECK-NEXT:    udot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
482; CHECK-NEXT:    udot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
483; CHECK-NEXT:    ret
484  call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
485  %slice2 = add i32 %slice, 7
486  call void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
487  ret void
488}
489
490define void @udot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
491; CHECK-LABEL: udot_single_za64_u16_vg1x4:
492; CHECK:       // %bb.0:
493; CHECK-NEXT:    mov w8, w0
494; CHECK-NEXT:    udot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
495; CHECK-NEXT:    udot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
496; CHECK-NEXT:    ret
497  call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
498  %slice2 = add i32 %slice, 7
499  call void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
500  ret void
501}
502
503define void @usdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
504; CHECK-LABEL: usdot_single_za32_u8_vg1x2:
505; CHECK:       // %bb.0:
506; CHECK-NEXT:    mov w8, w0
507; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
508; CHECK-NEXT:    usdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
509; CHECK-NEXT:    ret
510  call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
511  %slice2 = add i32 %slice, 7
512  call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
513  ret void
514}
515
516define void @usdot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 {
517; CHECK-LABEL: usdot_single_za32_u16_vg1x2_tuple:
518; CHECK:       // %bb.0: // %entry
519; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
520; CHECK-NEXT:    addvl sp, sp, #-3
521; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
522; CHECK-NEXT:    ptrue pn8.b
523; CHECK-NEXT:    mov w8, wzr
524; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
525; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
526; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0]
527; CHECK-NEXT:    ld1b { z2.b, z10.b }, pn8/z, [x0, x1]
528; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z0.b
529; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z9.b, z10.b }, z0.b
530; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
531; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
532; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
533; CHECK-NEXT:    addvl sp, sp, #3
534; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
535; CHECK-NEXT:    ret
536entry:
537  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
538  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
539  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
540  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
541  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
542  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
543  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
544  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
545  call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %zn)
546  call void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> %zn)
547  ret void
548}
549
550define void @usdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
551; CHECK-LABEL: usdot_single_za32_u8_vg1x4:
552; CHECK:       // %bb.0:
553; CHECK-NEXT:    mov w8, w0
554; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
555; CHECK-NEXT:    usdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
556; CHECK-NEXT:    ret
557  call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
558  %slice2 = add i32 %slice, 7
559  call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
560  ret void
561}
562
563define void @usdot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 {
564; CHECK-LABEL: usdot_single_za32_u16_vg1x4_tuple:
565; CHECK:       // %bb.0: // %entry
566; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
567; CHECK-NEXT:    addvl sp, sp, #-11
568; CHECK-NEXT:    lsl x9, x1, #1
569; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
570; CHECK-NEXT:    ptrue pn8.b
571; CHECK-NEXT:    str z20, [sp, #1, mul vl] // 16-byte Folded Spill
572; CHECK-NEXT:    mov w8, wzr
573; CHECK-NEXT:    str z16, [sp, #2, mul vl] // 16-byte Folded Spill
574; CHECK-NEXT:    add x10, x9, x1
575; CHECK-NEXT:    str z15, [sp, #3, mul vl] // 16-byte Folded Spill
576; CHECK-NEXT:    str z14, [sp, #4, mul vl] // 16-byte Folded Spill
577; CHECK-NEXT:    str z13, [sp, #5, mul vl] // 16-byte Folded Spill
578; CHECK-NEXT:    str z12, [sp, #6, mul vl] // 16-byte Folded Spill
579; CHECK-NEXT:    str z11, [sp, #7, mul vl] // 16-byte Folded Spill
580; CHECK-NEXT:    str z10, [sp, #8, mul vl] // 16-byte Folded Spill
581; CHECK-NEXT:    str z9, [sp, #9, mul vl] // 16-byte Folded Spill
582; CHECK-NEXT:    str z8, [sp, #10, mul vl] // 16-byte Folded Spill
583; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0]
584; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1]
585; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9]
586; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10]
587; CHECK-NEXT:    mov z4.d, z16.d
588; CHECK-NEXT:    mov z8.d, z20.d
589; CHECK-NEXT:    mov z12.d, z24.d
590; CHECK-NEXT:    mov z16.d, z28.d
591; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z0.b
592; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z5.b - z8.b }, z0.b
593; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z9.b - z12.b }, z0.b
594; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z13.b - z16.b }, z0.b
595; CHECK-NEXT:    ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload
596; CHECK-NEXT:    ldr z16, [sp, #2, mul vl] // 16-byte Folded Reload
597; CHECK-NEXT:    ldr z15, [sp, #3, mul vl] // 16-byte Folded Reload
598; CHECK-NEXT:    ldr z14, [sp, #4, mul vl] // 16-byte Folded Reload
599; CHECK-NEXT:    ldr z13, [sp, #5, mul vl] // 16-byte Folded Reload
600; CHECK-NEXT:    ldr z12, [sp, #6, mul vl] // 16-byte Folded Reload
601; CHECK-NEXT:    ldr z11, [sp, #7, mul vl] // 16-byte Folded Reload
602; CHECK-NEXT:    ldr z10, [sp, #8, mul vl] // 16-byte Folded Reload
603; CHECK-NEXT:    ldr z9, [sp, #9, mul vl] // 16-byte Folded Reload
604; CHECK-NEXT:    ldr z8, [sp, #10, mul vl] // 16-byte Folded Reload
605; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
606; CHECK-NEXT:    addvl sp, sp, #11
607; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
608; CHECK-NEXT:    ret
609entry:
610  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
611  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
612  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
613  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
614  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
615  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
616  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
617  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
618  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
619  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
620  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
621  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
622  %mul3 = shl i64 %stride, 1
623  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
624  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
625  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
626  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
627  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
628  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
629  %mul5 = mul i64 %stride, 3
630  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
631  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
632  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
633  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
634  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
635  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
636  call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> %zn)
637  call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> %zn)
638  call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> %zn)
639  call void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> %zn)
640  ret void
641}
642
643; == Multi, single (signed) ==
644
645define void @sdot_single_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
646; CHECK-LABEL: sdot_single_za32_u16_vg1x2:
647; CHECK:       // %bb.0:
648; CHECK-NEXT:    mov w8, w0
649; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z3.h
650; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z1.h, z2.h }, z3.h
651; CHECK-NEXT:    ret
652  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
653  %slice2 = add i32 %slice, 7
654  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
655  ret void
656}
657
658define void @sdot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 {
659; CHECK-LABEL: sdot_single_za32_u16_vg1x2_tuple:
660; CHECK:       // %bb.0: // %entry
661; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
662; CHECK-NEXT:    addvl sp, sp, #-3
663; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
664; CHECK-NEXT:    ptrue pn8.b
665; CHECK-NEXT:    add x9, x0, x1
666; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
667; CHECK-NEXT:    mov w8, wzr
668; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
669; CHECK-NEXT:    ld1h { z1.h, z9.h }, pn8/z, [x0]
670; CHECK-NEXT:    ld1h { z2.h, z10.h }, pn8/z, [x9]
671; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z1.h, z2.h }, z0.h
672; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z9.h, z10.h }, z0.h
673; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
674; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
675; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
676; CHECK-NEXT:    addvl sp, sp, #3
677; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
678; CHECK-NEXT:    ret
679entry:
680  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
681  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
682  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
683  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
684  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
685  %4 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
686  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 0
687  %6 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %4, 1
688  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %5, <vscale x 8 x i16> %zn)
689  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %6, <vscale x 8 x i16> %zn)
690  ret void
691}
692
693define void @sdot_single_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
694; CHECK-LABEL: sdot_single_za32_u16_vg1x4:
695; CHECK:       // %bb.0:
696; CHECK-NEXT:    mov w8, w0
697; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z5.h
698; CHECK-NEXT:    sdot za.s[w8, 7, vgx4], { z1.h - z4.h }, z5.h
699; CHECK-NEXT:    ret
700  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
701  %slice2 = add i32 %slice, 7
702  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
703  ret void
704}
705
706define void @sdot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 8 x i16> %zn) #0 {
707; CHECK-LABEL: sdot_single_za32_u16_vg1x4_tuple:
708; CHECK:       // %bb.0: // %entry
709; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
710; CHECK-NEXT:    addvl sp, sp, #-11
711; CHECK-NEXT:    add x9, x1, x1, lsl #1
712; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
713; CHECK-NEXT:    ptrue pn8.b
714; CHECK-NEXT:    str z20, [sp, #1, mul vl] // 16-byte Folded Spill
715; CHECK-NEXT:    add x10, x0, x1
716; CHECK-NEXT:    mov w8, wzr
717; CHECK-NEXT:    str z16, [sp, #2, mul vl] // 16-byte Folded Spill
718; CHECK-NEXT:    add x9, x0, x9
719; CHECK-NEXT:    str z15, [sp, #3, mul vl] // 16-byte Folded Spill
720; CHECK-NEXT:    str z14, [sp, #4, mul vl] // 16-byte Folded Spill
721; CHECK-NEXT:    str z13, [sp, #5, mul vl] // 16-byte Folded Spill
722; CHECK-NEXT:    str z12, [sp, #6, mul vl] // 16-byte Folded Spill
723; CHECK-NEXT:    str z11, [sp, #7, mul vl] // 16-byte Folded Spill
724; CHECK-NEXT:    str z10, [sp, #8, mul vl] // 16-byte Folded Spill
725; CHECK-NEXT:    str z9, [sp, #9, mul vl] // 16-byte Folded Spill
726; CHECK-NEXT:    str z8, [sp, #10, mul vl] // 16-byte Folded Spill
727; CHECK-NEXT:    ld1h { z1.h, z5.h, z9.h, z13.h }, pn8/z, [x0]
728; CHECK-NEXT:    ld1h { z2.h, z6.h, z10.h, z14.h }, pn8/z, [x10]
729; CHECK-NEXT:    ld1h { z3.h, z7.h, z11.h, z15.h }, pn8/z, [x0, x1, lsl #1]
730; CHECK-NEXT:    ld1h { z16.h, z20.h, z24.h, z28.h }, pn8/z, [x9]
731; CHECK-NEXT:    mov z4.d, z16.d
732; CHECK-NEXT:    mov z8.d, z20.d
733; CHECK-NEXT:    mov z12.d, z24.d
734; CHECK-NEXT:    mov z16.d, z28.d
735; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z1.h - z4.h }, z0.h
736; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z5.h - z8.h }, z0.h
737; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z9.h - z12.h }, z0.h
738; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z13.h - z16.h }, z0.h
739; CHECK-NEXT:    ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload
740; CHECK-NEXT:    ldr z16, [sp, #2, mul vl] // 16-byte Folded Reload
741; CHECK-NEXT:    ldr z15, [sp, #3, mul vl] // 16-byte Folded Reload
742; CHECK-NEXT:    ldr z14, [sp, #4, mul vl] // 16-byte Folded Reload
743; CHECK-NEXT:    ldr z13, [sp, #5, mul vl] // 16-byte Folded Reload
744; CHECK-NEXT:    ldr z12, [sp, #6, mul vl] // 16-byte Folded Reload
745; CHECK-NEXT:    ldr z11, [sp, #7, mul vl] // 16-byte Folded Reload
746; CHECK-NEXT:    ldr z10, [sp, #8, mul vl] // 16-byte Folded Reload
747; CHECK-NEXT:    ldr z9, [sp, #9, mul vl] // 16-byte Folded Reload
748; CHECK-NEXT:    ldr z8, [sp, #10, mul vl] // 16-byte Folded Reload
749; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
750; CHECK-NEXT:    addvl sp, sp, #11
751; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
752; CHECK-NEXT:    ret
753entry:
754  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
755  %1 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %ptr)
756  %2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 0
757  %3 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 1
758  %4 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 2
759  %5 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %1, 3
760  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
761  %6 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx2)
762  %7 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 0
763  %8 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 1
764  %9 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 2
765  %10 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %6, 3
766  %mul3 = shl i64 %stride, 1
767  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
768  %11 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx4)
769  %12 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 0
770  %13 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 1
771  %14 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 2
772  %15 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %11, 3
773  %mul5 = mul i64 %stride, 3
774  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
775  %16 = tail call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %0, ptr %arrayidx6)
776  %17 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 0
777  %18 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 1
778  %19 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 2
779  %20 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %16, 3
780  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %2, <vscale x 8 x i16> %7, <vscale x 8 x i16> %12, <vscale x 8 x i16> %17, <vscale x 8 x i16> %zn)
781  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %3, <vscale x 8 x i16> %8, <vscale x 8 x i16> %13, <vscale x 8 x i16> %18, <vscale x 8 x i16> %zn)
782  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %4, <vscale x 8 x i16> %9, <vscale x 8 x i16> %14, <vscale x 8 x i16> %19, <vscale x 8 x i16> %zn)
783  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32 0, <vscale x 8 x i16> %5, <vscale x 8 x i16> %10, <vscale x 8 x i16> %15, <vscale x 8 x i16> %20, <vscale x 8 x i16> %zn)
784  ret void
785}
786
787define void @sdot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
788; CHECK-LABEL: sdot_single_za32_u8_vg1x2:
789; CHECK:       // %bb.0:
790; CHECK-NEXT:    mov w8, w0
791; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
792; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
793; CHECK-NEXT:    ret
794  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
795  %slice2 = add i32 %slice, 7
796  call void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
797  ret void
798}
799
800define void @sdot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
801; CHECK-LABEL: sdot_single_za32_u8_vg1x4:
802; CHECK:       // %bb.0:
803; CHECK-NEXT:    mov w8, w0
804; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
805; CHECK-NEXT:    sdot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
806; CHECK-NEXT:    ret
807  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
808  %slice2 = add i32 %slice, 7
809  call void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
810  ret void
811}
812
813define void @sdot_single_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
814; CHECK-LABEL: sdot_single_za64_u16_vg1x2:
815; CHECK:       // %bb.0:
816; CHECK-NEXT:    mov w8, w0
817; CHECK-NEXT:    sdot za.d[w8, 0, vgx2], { z1.h, z2.h }, z3.h
818; CHECK-NEXT:    sdot za.d[w8, 7, vgx2], { z1.h, z2.h }, z3.h
819; CHECK-NEXT:    ret
820  call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
821  %slice2 = add i32 %slice, 7
822  call void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2)
823  ret void
824}
825
826define void @sdot_single_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
827; CHECK-LABEL: sdot_single_za64_u16_vg1x4:
828; CHECK:       // %bb.0:
829; CHECK-NEXT:    mov w8, w0
830; CHECK-NEXT:    sdot za.d[w8, 0, vgx4], { z1.h - z4.h }, z5.h
831; CHECK-NEXT:    sdot za.d[w8, 7, vgx4], { z1.h - z4.h }, z5.h
832; CHECK-NEXT:    ret
833  call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
834  %slice2 = add i32 %slice, 7
835  call void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4)
836  ret void
837}
838
839define void @sudot_single_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
840; CHECK-LABEL: sudot_single_za32_u8_vg1x2:
841; CHECK:       // %bb.0:
842; CHECK-NEXT:    mov w8, w0
843; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z3.b
844; CHECK-NEXT:    sudot za.s[w8, 7, vgx2], { z1.b, z2.b }, z3.b
845; CHECK-NEXT:    ret
846  call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
847  %slice2 = add i32 %slice, 7
848  call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2)
849  ret void
850}
851
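; The *_tuple tests feed the dot intrinsics from the multi-vector loads
; (llvm.aarch64.sve.ld1.pn.x2/x4). With -enable-subreg-liveness the loaded
; values are expected to stay in the strided registers written by ld1b so that
; each dot consumes a consecutive pair or quad directly; callee-saved
; z registers clobbered in the process are spilled and reloaded.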
852define void @sudot_single_za32_u16_vg1x2_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 {
853; CHECK-LABEL: sudot_single_za32_u16_vg1x2_tuple:
854; CHECK:       // %bb.0: // %entry
855; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
856; CHECK-NEXT:    addvl sp, sp, #-3
857; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
858; CHECK-NEXT:    ptrue pn8.b
859; CHECK-NEXT:    mov w8, wzr
860; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
861; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
862; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0]
863; CHECK-NEXT:    ld1b { z2.b, z10.b }, pn8/z, [x0, x1]
864; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z1.b, z2.b }, z0.b
865; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z9.b, z10.b }, z0.b
866; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
867; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
868; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
869; CHECK-NEXT:    addvl sp, sp, #3
870; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
871; CHECK-NEXT:    ret
872entry:
873  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
874  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
875  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
876  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
877  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
878  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
879  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
880  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
881  call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> %zn)
882  call void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> %zn)
883  ret void
884}
885
886define void @sudot_single_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
887; CHECK-LABEL: sudot_single_za32_u8_vg1x4:
888; CHECK:       // %bb.0:
889; CHECK-NEXT:    mov w8, w0
890; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z5.b
891; CHECK-NEXT:    sudot za.s[w8, 7, vgx4], { z1.b - z4.b }, z5.b
892; CHECK-NEXT:    ret
893  call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
894  %slice2 = add i32 %slice, 7
895  call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4)
896  ret void
897}
898
899define void @sudot_single_za32_u16_vg1x4_tuple(ptr %ptr, i64 %stride, <vscale x 16 x i8> %zn) #0 {
900; CHECK-LABEL: sudot_single_za32_u16_vg1x4_tuple:
901; CHECK:       // %bb.0: // %entry
902; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
903; CHECK-NEXT:    addvl sp, sp, #-11
904; CHECK-NEXT:    lsl x9, x1, #1
905; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
906; CHECK-NEXT:    ptrue pn8.b
907; CHECK-NEXT:    str z20, [sp, #1, mul vl] // 16-byte Folded Spill
908; CHECK-NEXT:    mov w8, wzr
909; CHECK-NEXT:    str z16, [sp, #2, mul vl] // 16-byte Folded Spill
910; CHECK-NEXT:    add x10, x9, x1
911; CHECK-NEXT:    str z15, [sp, #3, mul vl] // 16-byte Folded Spill
912; CHECK-NEXT:    str z14, [sp, #4, mul vl] // 16-byte Folded Spill
913; CHECK-NEXT:    str z13, [sp, #5, mul vl] // 16-byte Folded Spill
914; CHECK-NEXT:    str z12, [sp, #6, mul vl] // 16-byte Folded Spill
915; CHECK-NEXT:    str z11, [sp, #7, mul vl] // 16-byte Folded Spill
916; CHECK-NEXT:    str z10, [sp, #8, mul vl] // 16-byte Folded Spill
917; CHECK-NEXT:    str z9, [sp, #9, mul vl] // 16-byte Folded Spill
918; CHECK-NEXT:    str z8, [sp, #10, mul vl] // 16-byte Folded Spill
919; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0]
920; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x1]
921; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x9]
922; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0, x10]
923; CHECK-NEXT:    mov z4.d, z16.d
924; CHECK-NEXT:    mov z8.d, z20.d
925; CHECK-NEXT:    mov z12.d, z24.d
926; CHECK-NEXT:    mov z16.d, z28.d
927; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z1.b - z4.b }, z0.b
928; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z5.b - z8.b }, z0.b
929; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z9.b - z12.b }, z0.b
930; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z13.b - z16.b }, z0.b
931; CHECK-NEXT:    ldr z20, [sp, #1, mul vl] // 16-byte Folded Reload
932; CHECK-NEXT:    ldr z16, [sp, #2, mul vl] // 16-byte Folded Reload
933; CHECK-NEXT:    ldr z15, [sp, #3, mul vl] // 16-byte Folded Reload
934; CHECK-NEXT:    ldr z14, [sp, #4, mul vl] // 16-byte Folded Reload
935; CHECK-NEXT:    ldr z13, [sp, #5, mul vl] // 16-byte Folded Reload
936; CHECK-NEXT:    ldr z12, [sp, #6, mul vl] // 16-byte Folded Reload
937; CHECK-NEXT:    ldr z11, [sp, #7, mul vl] // 16-byte Folded Reload
938; CHECK-NEXT:    ldr z10, [sp, #8, mul vl] // 16-byte Folded Reload
939; CHECK-NEXT:    ldr z9, [sp, #9, mul vl] // 16-byte Folded Reload
940; CHECK-NEXT:    ldr z8, [sp, #10, mul vl] // 16-byte Folded Reload
941; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
942; CHECK-NEXT:    addvl sp, sp, #11
943; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
944; CHECK-NEXT:    ret
945entry:
946  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
947  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
948  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
949  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
950  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
951  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
952  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
953  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
954  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
955  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
956  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
957  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
958  %mul3 = shl i64 %stride, 1
959  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
960  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
961  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
962  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
963  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
964  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
965  %mul5 = mul i64 %stride, 3
966  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
967  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
968  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
969  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
970  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
971  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
972  call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> %zn)
973  call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> %zn)
974  call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> %zn)
975  call void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> %zn)
976  ret void
977}
978
979; == Multi, indexed (unsigned) ==
980
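; In the indexed (lane) forms the trailing i32 immediate selects the element
; of the single vector operand, e.g. a final argument of i32 3 appears as
; .h[3]/.b[3] on the indexed operand in the expected udot. As with the other
; tests, each case also adds 7 to the slice to cover the immediate-offset form.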
981define void @udot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
982; CHECK-LABEL: udot_lane_za32_u16_vg1x2:
983; CHECK:       // %bb.0:
984; CHECK-NEXT:    mov z5.d, z2.d
985; CHECK-NEXT:    mov z4.d, z1.d
986; CHECK-NEXT:    mov w8, w0
987; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
988; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
989; CHECK-NEXT:    ret
990  call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
991  %slice2 = add i32 %slice, 7
992  call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
993  ret void
994}
995
996define void @udot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
997; CHECK-LABEL: udot_lane_za32_u16_vg1x4:
998; CHECK:       // %bb.0:
999; CHECK-NEXT:    mov w8, w0
1000; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z0.h - z3.h }, z4.h[3]
1001; CHECK-NEXT:    udot za.s[w8, 7, vgx4], { z0.h - z3.h }, z4.h[3]
1002; CHECK-NEXT:    ret
1003  call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1004                                                           <vscale x 8 x i16> %zn4, i32 3)
1005  %slice2 = add i32 %slice, 7
1006  call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1007                                                           <vscale x 8 x i16> %zn4, i32 3)
1008  ret void
1009}
1010
1011define void @udot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
1012; CHECK-LABEL: udot_lane_za32_u8_vg1x2:
1013; CHECK:       // %bb.0:
1014; CHECK-NEXT:    mov z5.d, z2.d
1015; CHECK-NEXT:    mov z4.d, z1.d
1016; CHECK-NEXT:    mov w8, w0
1017; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
1018; CHECK-NEXT:    udot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
1019; CHECK-NEXT:    ret
1020  call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1021  %slice2 = add i32 %slice, 7
1022  call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1023  ret void
1024}
1025
1026define void @udot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
1027; CHECK-LABEL: udot_lane_za32_u8_vg1x4:
1028; CHECK:       // %bb.0:
1029; CHECK-NEXT:    mov z27.d, z4.d
1030; CHECK-NEXT:    mov z26.d, z3.d
1031; CHECK-NEXT:    mov w8, w0
1032; CHECK-NEXT:    mov z25.d, z2.d
1033; CHECK-NEXT:    mov z24.d, z1.d
1034; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
1035; CHECK-NEXT:    udot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
1036; CHECK-NEXT:    ret
1037  call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1038                                                           <vscale x 16 x i8> %zn4, i32 3)
1039  %slice2 = add i32 %slice, 7
1040  call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1041                                                           <vscale x 16 x i8> %zn4, i32 3)
1042  ret void
1043}
1044
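; The *_form_* tests only check tuple formation: a vgx2/vgx4 group built from
; strided ld1b results should be allocated as consecutive registers (e.g.
; { z16.b, z17.b } or { z16.b - z19.b }) with no extra moves. The poison
; vector operand and lane index 0 are just placeholders.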
1045define void @udot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
1046; CHECK-LABEL: udot_form_2x_tuple:
1047; CHECK:       // %bb.0: // %entry
1048; CHECK-NEXT:    ptrue pn8.b
1049; CHECK-NEXT:    mov w8, wzr
1050; CHECK-NEXT:    ld1b { z16.b, z24.b }, pn8/z, [x0]
1051; CHECK-NEXT:    ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
1052; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
1053; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
1054; CHECK-NEXT:    ret
1055entry:
1056  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1057  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1058  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1059  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1060  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1061  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1062  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1063  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1064  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1065  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1066  ret void
1067}
1068
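; The _svecc variants differ only in taking a scalable vector argument; the
; resulting allocation uses callee-saved z registers, so the expected code
; additionally spills and reloads them around the dot sequence.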
1069define void @udot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1070; CHECK-LABEL: udot_form_2x_tuple_svecc:
1071; CHECK:       // %bb.0: // %entry
1072; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1073; CHECK-NEXT:    addvl sp, sp, #-3
1074; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1075; CHECK-NEXT:    ptrue pn8.b
1076; CHECK-NEXT:    mov w8, wzr
1077; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
1078; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
1079; CHECK-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
1080; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
1081; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
1082; CHECK-NEXT:    udot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
1083; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
1084; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
1085; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1086; CHECK-NEXT:    addvl sp, sp, #3
1087; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1088; CHECK-NEXT:    ret
1089entry:
1090  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1091  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1092  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1093  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1094  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1095  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1096  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1097  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1098  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1099  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1100  ret void
1101}
1102
1103define void @udot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
1104; CHECK-LABEL: udot_form_4x_tuple:
1105; CHECK:       // %bb.0: // %entry
1106; CHECK-NEXT:    lsl x9, x1, #1
1107; CHECK-NEXT:    ptrue pn8.b
1108; CHECK-NEXT:    mov w8, wzr
1109; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1110; CHECK-NEXT:    ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1111; CHECK-NEXT:    add x10, x9, x1
1112; CHECK-NEXT:    ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1113; CHECK-NEXT:    ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1114; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1115; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1116; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1117; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1118; CHECK-NEXT:    ret
1119entry:
1120  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1121  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1122  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1123  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1124  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1125  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1126  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1127  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1128  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1129  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1130  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1131  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1132  %mul3 = shl i64 %stride, 1
1133  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1134  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1135  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1136  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1137  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1138  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1139  %mul5 = mul i64 %stride, 3
1140  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1141  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1142  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1143  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1144  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1145  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1146  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1147  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1148  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1149  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1150  ret void
1151}
1152
1153define void @udot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1154; CHECK-LABEL: udot_form_4x_tuple_svecc:
1155; CHECK:       // %bb.0: // %entry
1156; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1157; CHECK-NEXT:    addvl sp, sp, #-9
1158; CHECK-NEXT:    lsl x9, x1, #1
1159; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1160; CHECK-NEXT:    ptrue pn8.b
1161; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
1162; CHECK-NEXT:    mov w8, wzr
1163; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
1164; CHECK-NEXT:    add x10, x9, x1
1165; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
1166; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
1167; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
1168; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
1169; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
1170; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
1171; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
1172; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
1173; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
1174; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
1175; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
1176; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
1177; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
1178; CHECK-NEXT:    udot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
1179; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
1180; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
1181; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
1182; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
1183; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
1184; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
1185; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
1186; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
1187; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1188; CHECK-NEXT:    addvl sp, sp, #9
1189; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1190; CHECK-NEXT:    ret
1191entry:
1192  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1193  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1194  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1195  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1196  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1197  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1198  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1199  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1200  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1201  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1202  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1203  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1204  %mul3 = shl i64 %stride, 1
1205  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1206  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1207  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1208  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1209  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1210  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1211  %mul5 = mul i64 %stride, 3
1212  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1213  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1214  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1215  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1216  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1217  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1218  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1219  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1220  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1221  tail call void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1222  ret void
1223}
1224
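; The za64 variants below use attribute #1 (presumably adding the sme-i16i64
; feature) and accumulate 16-bit elements into 64-bit ZA slices; note the
; smaller lane-index range, exercised here with index 1.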
1225define void @udot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
1226; CHECK-LABEL: udot_lane_za64_u16_vg1x2:
1227; CHECK:       // %bb.0:
1228; CHECK-NEXT:    mov z5.d, z2.d
1229; CHECK-NEXT:    mov z4.d, z1.d
1230; CHECK-NEXT:    mov w8, w0
1231; CHECK-NEXT:    udot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
1232; CHECK-NEXT:    udot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
1233; CHECK-NEXT:    ret
1234  call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
1235  %slice2 = add i32 %slice, 7
1236  call void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
1237  ret void
1238}
1239
1240define void @udot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
1241; CHECK-LABEL: udot_lane_za64_u16_vg1x4:
1242; CHECK:       // %bb.0:
1243; CHECK-NEXT:    mov z27.d, z4.d
1244; CHECK-NEXT:    mov z26.d, z3.d
1245; CHECK-NEXT:    mov w8, w0
1246; CHECK-NEXT:    mov z25.d, z2.d
1247; CHECK-NEXT:    mov z24.d, z1.d
1248; CHECK-NEXT:    udot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
1249; CHECK-NEXT:    udot za.d[w8, 7, vgx4], { z24.h - z27.h }, z5.h[1]
1250; CHECK-NEXT:    ret
1251  call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1252                                                           <vscale x 8 x i16> %zn4, i32 1)
1253  %slice2 = add i32 %slice, 7
1254  call void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1255                                                           <vscale x 8 x i16> %zn4, i32 1)
1256  ret void
1257}
1258
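; usdot takes unsigned multi-vector data and a signed indexed vector, the
; reverse pairing of the sudot tests above.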
1259define void @usdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
1260; CHECK-LABEL: usdot_lane_za32_u8_vg1x2:
1261; CHECK:       // %bb.0:
1262; CHECK-NEXT:    mov z5.d, z2.d
1263; CHECK-NEXT:    mov z4.d, z1.d
1264; CHECK-NEXT:    mov w8, w0
1265; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
1266; CHECK-NEXT:    usdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
1267; CHECK-NEXT:    ret
1268  call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1269  %slice2 = add i32 %slice, 7
1270  call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1271  ret void
1272}
1273
1274define void @usdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
1275; CHECK-LABEL: usdot_lane_za32_u8_vg1x4:
1276; CHECK:       // %bb.0:
1277; CHECK-NEXT:    mov z27.d, z4.d
1278; CHECK-NEXT:    mov z26.d, z3.d
1279; CHECK-NEXT:    mov w8, w0
1280; CHECK-NEXT:    mov z25.d, z2.d
1281; CHECK-NEXT:    mov z24.d, z1.d
1282; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
1283; CHECK-NEXT:    usdot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
1284; CHECK-NEXT:    ret
1285  call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1286                                                            <vscale x 16 x i8> %zn4, i32 3)
1287  %slice2 = add i32 %slice, 7
1288  call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1289                                                            <vscale x 16 x i8> %zn4, i32 3)
1290  ret void
1291}
1292
1293define void @usdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
1294; CHECK-LABEL: usdot_form_2x_tuple:
1295; CHECK:       // %bb.0: // %entry
1296; CHECK-NEXT:    ptrue pn8.b
1297; CHECK-NEXT:    mov w8, wzr
1298; CHECK-NEXT:    ld1b { z16.b, z24.b }, pn8/z, [x0]
1299; CHECK-NEXT:    ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
1300; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
1301; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
1302; CHECK-NEXT:    ret
1303entry:
1304  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1305  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1306  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1307  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1308  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1309  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1310  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1311  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1312  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1313  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1314  ret void
1315}
1316
1317define void @usdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1318; CHECK-LABEL: usdot_form_2x_tuple_svecc:
1319; CHECK:       // %bb.0: // %entry
1320; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1321; CHECK-NEXT:    addvl sp, sp, #-3
1322; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1323; CHECK-NEXT:    ptrue pn8.b
1324; CHECK-NEXT:    mov w8, wzr
1325; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
1326; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
1327; CHECK-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
1328; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
1329; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
1330; CHECK-NEXT:    usdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
1331; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
1332; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
1333; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1334; CHECK-NEXT:    addvl sp, sp, #3
1335; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1336; CHECK-NEXT:    ret
1337entry:
1338  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1339  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1340  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1341  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1342  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1343  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1344  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1345  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1346  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1347  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1348  ret void
1349}
1350
1351define void @usdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
1352; CHECK-LABEL: usdot_form_4x_tuple:
1353; CHECK:       // %bb.0: // %entry
1354; CHECK-NEXT:    lsl x9, x1, #1
1355; CHECK-NEXT:    ptrue pn8.b
1356; CHECK-NEXT:    mov w8, wzr
1357; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1358; CHECK-NEXT:    ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1359; CHECK-NEXT:    add x10, x9, x1
1360; CHECK-NEXT:    ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1361; CHECK-NEXT:    ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1362; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1363; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1364; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1365; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1366; CHECK-NEXT:    ret
1367entry:
1368  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1369  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1370  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1371  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1372  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1373  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1374  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1375  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1376  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1377  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1378  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1379  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1380  %mul3 = shl i64 %stride, 1
1381  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1382  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1383  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1384  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1385  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1386  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1387  %mul5 = mul i64 %stride, 3
1388  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1389  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1390  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1391  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1392  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1393  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1394  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1395  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1396  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1397  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1398  ret void
1399}
1400
1401define void @usdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1402; CHECK-LABEL: usdot_form_4x_tuple_svecc:
1403; CHECK:       // %bb.0: // %entry
1404; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1405; CHECK-NEXT:    addvl sp, sp, #-9
1406; CHECK-NEXT:    lsl x9, x1, #1
1407; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1408; CHECK-NEXT:    ptrue pn8.b
1409; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
1410; CHECK-NEXT:    mov w8, wzr
1411; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
1412; CHECK-NEXT:    add x10, x9, x1
1413; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
1414; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
1415; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
1416; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
1417; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
1418; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
1419; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
1420; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
1421; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
1422; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
1423; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
1424; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
1425; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
1426; CHECK-NEXT:    usdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
1427; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
1428; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
1429; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
1430; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
1431; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
1432; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
1433; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
1434; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
1435; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1436; CHECK-NEXT:    addvl sp, sp, #9
1437; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1438; CHECK-NEXT:    ret
1439entry:
1440  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1441  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1442  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1443  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1444  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1445  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1446  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1447  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1448  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1449  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1450  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1451  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1452  %mul3 = shl i64 %stride, 1
1453  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1454  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1455  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1456  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1457  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1458  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1459  %mul5 = mul i64 %stride, 3
1460  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1461  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1462  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1463  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1464  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1465  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1466  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1467  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1468  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1469  tail call void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1470  ret void
1471}
1472
1473; == Multi, indexed (signed) ==
1474
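; Signed counterparts of the indexed tests above: the lane immediate and slice
; handling are expected to be identical, with only the opcode changing to sdot.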
1475define void @sdot_lane_za32_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #0 {
1476; CHECK-LABEL: sdot_lane_za32_u16_vg1x2:
1477; CHECK:       // %bb.0:
1478; CHECK-NEXT:    mov z5.d, z2.d
1479; CHECK-NEXT:    mov z4.d, z1.d
1480; CHECK-NEXT:    mov w8, w0
1481; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z4.h, z5.h }, z3.h[3]
1482; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z4.h, z5.h }, z3.h[3]
1483; CHECK-NEXT:    ret
1484  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
1485  %slice2 = add i32 %slice, 7
1486  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 3)
1487  ret void
1488}
1489
1490define void @sdot_lane_za32_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #0 {
1491; CHECK-LABEL: sdot_lane_za32_u16_vg1x4:
1492; CHECK:       // %bb.0:
1493; CHECK-NEXT:    mov z27.d, z4.d
1494; CHECK-NEXT:    mov z26.d, z3.d
1495; CHECK-NEXT:    mov w8, w0
1496; CHECK-NEXT:    mov z25.d, z2.d
1497; CHECK-NEXT:    mov z24.d, z1.d
1498; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z24.h - z27.h }, z5.h[3]
1499; CHECK-NEXT:    sdot za.s[w8, 7, vgx4], { z24.h - z27.h }, z5.h[3]
1500; CHECK-NEXT:    ret
1501  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1502                                                           <vscale x 8 x i16> %zn4, i32 3)
1503  %slice2 = add i32 %slice, 7
1504  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1505                                                           <vscale x 8 x i16> %zn4, i32 3)
1506  ret void
1507}
1508
1509define void @sdot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
1510; CHECK-LABEL: sdot_lane_za32_u8_vg1x2:
1511; CHECK:       // %bb.0:
1512; CHECK-NEXT:    mov z5.d, z2.d
1513; CHECK-NEXT:    mov z4.d, z1.d
1514; CHECK-NEXT:    mov w8, w0
1515; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
1516; CHECK-NEXT:    sdot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
1517; CHECK-NEXT:    ret
1518  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1519  %slice2 = add i32 %slice, 7
1520  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1521  ret void
1522}
1523
1524define void @sdot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
1525; CHECK-LABEL: sdot_lane_za32_u8_vg1x4:
1526; CHECK:       // %bb.0:
1527; CHECK-NEXT:    mov z27.d, z4.d
1528; CHECK-NEXT:    mov z26.d, z3.d
1529; CHECK-NEXT:    mov w8, w0
1530; CHECK-NEXT:    mov z25.d, z2.d
1531; CHECK-NEXT:    mov z24.d, z1.d
1532; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
1533; CHECK-NEXT:    sdot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
1534; CHECK-NEXT:    ret
1535  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1536                                                           <vscale x 16 x i8> %zn4, i32 3)
1537  %slice2 = add i32 %slice, 7
1538  call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1539                                                           <vscale x 16 x i8> %zn4, i32 3)
1540  ret void
1541}
1542
1543define void @sdot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
1544; CHECK-LABEL: sdot_form_2x_tuple:
1545; CHECK:       // %bb.0: // %entry
1546; CHECK-NEXT:    ptrue pn8.b
1547; CHECK-NEXT:    mov w8, wzr
1548; CHECK-NEXT:    ld1b { z16.b, z24.b }, pn8/z, [x0]
1549; CHECK-NEXT:    ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
1550; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
1551; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
1552; CHECK-NEXT:    ret
1553entry:
1554  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1555  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1556  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1557  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1558  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1559  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1560  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1561  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1562  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1563  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1564  ret void
1565}
1566
1567define void @sdot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1568; CHECK-LABEL: sdot_form_2x_tuple_svecc:
1569; CHECK:       // %bb.0: // %entry
1570; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1571; CHECK-NEXT:    addvl sp, sp, #-3
1572; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1573; CHECK-NEXT:    ptrue pn8.b
1574; CHECK-NEXT:    mov w8, wzr
1575; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
1576; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
1577; CHECK-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
1578; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
1579; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
1580; CHECK-NEXT:    sdot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
1581; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
1582; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
1583; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1584; CHECK-NEXT:    addvl sp, sp, #3
1585; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1586; CHECK-NEXT:    ret
1587entry:
1588  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1589  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1590  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1591  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1592  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1593  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1594  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1595  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1596  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1597  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1598  ret void
1599}
1600
1601define void @sdot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
1602; CHECK-LABEL: sdot_form_4x_tuple:
1603; CHECK:       // %bb.0: // %entry
1604; CHECK-NEXT:    lsl x9, x1, #1
1605; CHECK-NEXT:    ptrue pn8.b
1606; CHECK-NEXT:    mov w8, wzr
1607; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1608; CHECK-NEXT:    ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1609; CHECK-NEXT:    add x10, x9, x1
1610; CHECK-NEXT:    ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1611; CHECK-NEXT:    ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1612; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1613; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1614; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1615; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1616; CHECK-NEXT:    ret
1617entry:
1618  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1619  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1620  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1621  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1622  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1623  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1624  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1625  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1626  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1627  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1628  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1629  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1630  %mul3 = shl i64 %stride, 1
1631  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1632  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1633  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1634  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1635  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1636  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1637  %mul5 = mul i64 %stride, 3
1638  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1639  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1640  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1641  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1642  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1643  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1644  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1645  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1646  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1647  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1648  ret void
1649}
1650
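; Same as sdot_form_4x_tuple, but the extra SVE argument makes z8-z15
; callee-saved, so they are spilled and reloaded; the loads are expected to
; land in z0-z15 and feed the four vgx4 sdot instructions directly.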
1651define void @sdot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1652; CHECK-LABEL: sdot_form_4x_tuple_svecc:
1653; CHECK:       // %bb.0: // %entry
1654; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1655; CHECK-NEXT:    addvl sp, sp, #-9
1656; CHECK-NEXT:    lsl x9, x1, #1
1657; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1658; CHECK-NEXT:    ptrue pn8.b
1659; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
1660; CHECK-NEXT:    mov w8, wzr
1661; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
1662; CHECK-NEXT:    add x10, x9, x1
1663; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
1664; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
1665; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
1666; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
1667; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
1668; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
1669; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
1670; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
1671; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
1672; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
1673; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
1674; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
1675; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
1676; CHECK-NEXT:    sdot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
1677; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
1678; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
1679; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
1680; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
1681; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
1682; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
1683; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
1684; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
1685; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1686; CHECK-NEXT:    addvl sp, sp, #9
1687; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1688; CHECK-NEXT:    ret
1689entry:
1690  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1691  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1692  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1693  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1694  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1695  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1696  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1697  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1698  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1699  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1700  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1701  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1702  %mul3 = shl i64 %stride, 1
1703  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1704  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1705  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1706  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1707  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1708  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1709  %mul5 = mul i64 %stride, 3
1710  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1711  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1712  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1713  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1714  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1715  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1716  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1717  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1718  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1719  tail call void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1720  ret void
1721}
1722
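; The za.d (64-bit accumulator) forms below require the +sme-i16i64 feature,
; hence attribute #1 rather than #0.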
1723define void @sdot_lane_za64_u16_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2) #1 {
1724; CHECK-LABEL: sdot_lane_za64_u16_vg1x2:
1725; CHECK:       // %bb.0:
1726; CHECK-NEXT:    mov z5.d, z2.d
1727; CHECK-NEXT:    mov z4.d, z1.d
1728; CHECK-NEXT:    mov w8, w0
1729; CHECK-NEXT:    sdot za.d[w8, 0, vgx2], { z4.h, z5.h }, z3.h[1]
1730; CHECK-NEXT:    sdot za.d[w8, 7, vgx2], { z4.h, z5.h }, z3.h[1]
1731; CHECK-NEXT:    ret
1732  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
1733  %slice2 = add i32 %slice, 7
1734  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, i32 1)
1735  ret void
1736}
1737
1738define void @sdot_lane_za64_u16_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, <vscale x 8 x i16> %zn4) #1 {
1739; CHECK-LABEL: sdot_lane_za64_u16_vg1x4:
1740; CHECK:       // %bb.0:
1741; CHECK-NEXT:    mov z27.d, z4.d
1742; CHECK-NEXT:    mov z26.d, z3.d
1743; CHECK-NEXT:    mov w8, w0
1744; CHECK-NEXT:    mov z25.d, z2.d
1745; CHECK-NEXT:    mov z24.d, z1.d
1746; CHECK-NEXT:    sdot za.d[w8, 0, vgx4], { z24.h - z27.h }, z5.h[1]
1747; CHECK-NEXT:    sdot za.d[w8, 7, vgx4], { z24.h - z27.h }, z5.h[1]
1748; CHECK-NEXT:    ret
1749  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 %slice, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1750                                                           <vscale x 8 x i16> %zn4, i32 1)
1751  %slice2 = add i32 %slice, 7
1752  call void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32 %slice2, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3,
1753                                                           <vscale x 8 x i16> %zn4, i32 1)
1754  ret void
1755}
1756
1757
1758
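; SUDOT variants: the multi-vector operand is treated as signed and the
; indexed vector operand as unsigned.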
1759define void @sudot_lane_za32_u8_vg1x2(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2) #0 {
1760; CHECK-LABEL: sudot_lane_za32_u8_vg1x2:
1761; CHECK:       // %bb.0:
1762; CHECK-NEXT:    mov z5.d, z2.d
1763; CHECK-NEXT:    mov z4.d, z1.d
1764; CHECK-NEXT:    mov w8, w0
1765; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z4.b, z5.b }, z3.b[3]
1766; CHECK-NEXT:    sudot za.s[w8, 7, vgx2], { z4.b, z5.b }, z3.b[3]
1767; CHECK-NEXT:    ret
1768  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1769  %slice2 = add i32 %slice, 7
1770  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, i32 3)
1771  ret void
1772}
1773
1774define void @sudot_lane_za32_u8_vg1x4(i32 %slice, <vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4) #0 {
1775; CHECK-LABEL: sudot_lane_za32_u8_vg1x4:
1776; CHECK:       // %bb.0:
1777; CHECK-NEXT:    mov z27.d, z4.d
1778; CHECK-NEXT:    mov z26.d, z3.d
1779; CHECK-NEXT:    mov w8, w0
1780; CHECK-NEXT:    mov z25.d, z2.d
1781; CHECK-NEXT:    mov z24.d, z1.d
1782; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z5.b[3]
1783; CHECK-NEXT:    sudot za.s[w8, 7, vgx4], { z24.b - z27.b }, z5.b[3]
1784; CHECK-NEXT:    ret
1785  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 %slice, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1786                                                            <vscale x 16 x i8> %zn4, i32 3)
1787  %slice2 = add i32 %slice, 7
1788  call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 %slice2, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3,
1789                                                            <vscale x 16 x i8> %zn4, i32 3)
1790  ret void
1791}
1792
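; The sudot_form_* tests mirror the sdot_form_* tests above, checking the
; same strided-load tuple formation for the signed-by-unsigned indexed form.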
1793define void @sudot_form_2x_tuple(ptr %ptr, i64 %stride) #0 {
1794; CHECK-LABEL: sudot_form_2x_tuple:
1795; CHECK:       // %bb.0: // %entry
1796; CHECK-NEXT:    ptrue pn8.b
1797; CHECK-NEXT:    mov w8, wzr
1798; CHECK-NEXT:    ld1b { z16.b, z24.b }, pn8/z, [x0]
1799; CHECK-NEXT:    ld1b { z17.b, z25.b }, pn8/z, [x0, x1]
1800; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z16.b, z17.b }, z0.b[0]
1801; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z24.b, z25.b }, z0.b[0]
1802; CHECK-NEXT:    ret
1803entry:
1804  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1805  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1806  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1807  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1808  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1809  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1810  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1811  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1812  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1813  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1814  ret void
1815}
1816
1817define void @sudot_form_2x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1818; CHECK-LABEL: sudot_form_2x_tuple_svecc:
1819; CHECK:       // %bb.0: // %entry
1820; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1821; CHECK-NEXT:    addvl sp, sp, #-3
1822; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1823; CHECK-NEXT:    ptrue pn8.b
1824; CHECK-NEXT:    mov w8, wzr
1825; CHECK-NEXT:    str z9, [sp, #1, mul vl] // 16-byte Folded Spill
1826; CHECK-NEXT:    str z8, [sp, #2, mul vl] // 16-byte Folded Spill
1827; CHECK-NEXT:    ld1b { z0.b, z8.b }, pn8/z, [x0]
1828; CHECK-NEXT:    ld1b { z1.b, z9.b }, pn8/z, [x0, x1]
1829; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z0.b, z1.b }, z0.b[0]
1830; CHECK-NEXT:    sudot za.s[w8, 0, vgx2], { z8.b, z9.b }, z0.b[0]
1831; CHECK-NEXT:    ldr z9, [sp, #1, mul vl] // 16-byte Folded Reload
1832; CHECK-NEXT:    ldr z8, [sp, #2, mul vl] // 16-byte Folded Reload
1833; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1834; CHECK-NEXT:    addvl sp, sp, #3
1835; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1836; CHECK-NEXT:    ret
1837entry:
1838  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1839  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1840  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1841  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1842  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1843  %4 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1844  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 0
1845  %6 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %4, 1
1846  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %5, <vscale x 16 x i8> poison, i32 0)
1847  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %6, <vscale x 16 x i8> poison, i32 0)
1848  ret void
1849}
1850
1851define void @sudot_form_4x_tuple(ptr %ptr, i64 %stride) #0 {
1852; CHECK-LABEL: sudot_form_4x_tuple:
1853; CHECK:       // %bb.0: // %entry
1854; CHECK-NEXT:    lsl x9, x1, #1
1855; CHECK-NEXT:    ptrue pn8.b
1856; CHECK-NEXT:    mov w8, wzr
1857; CHECK-NEXT:    ld1b { z16.b, z20.b, z24.b, z28.b }, pn8/z, [x0]
1858; CHECK-NEXT:    ld1b { z17.b, z21.b, z25.b, z29.b }, pn8/z, [x0, x1]
1859; CHECK-NEXT:    add x10, x9, x1
1860; CHECK-NEXT:    ld1b { z18.b, z22.b, z26.b, z30.b }, pn8/z, [x0, x9]
1861; CHECK-NEXT:    ld1b { z19.b, z23.b, z27.b, z31.b }, pn8/z, [x0, x10]
1862; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z16.b - z19.b }, z0.b[0]
1863; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z20.b - z23.b }, z0.b[0]
1864; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z24.b - z27.b }, z0.b[0]
1865; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z28.b - z31.b }, z0.b[0]
1866; CHECK-NEXT:    ret
1867entry:
1868  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1869  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1870  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1871  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1872  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1873  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1874  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1875  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1876  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1877  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1878  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1879  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1880  %mul3 = shl i64 %stride, 1
1881  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1882  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1883  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1884  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1885  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1886  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1887  %mul5 = mul i64 %stride, 3
1888  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1889  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1890  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1891  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1892  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1893  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1894  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1895  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1896  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1897  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1898  ret void
1899}
1900
1901define void @sudot_form_4x_tuple_svecc(ptr %ptr, i64 %stride, <vscale x 16 x i8> %scalable_arg) #0 {
1902; CHECK-LABEL: sudot_form_4x_tuple_svecc:
1903; CHECK:       // %bb.0: // %entry
1904; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1905; CHECK-NEXT:    addvl sp, sp, #-9
1906; CHECK-NEXT:    lsl x9, x1, #1
1907; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
1908; CHECK-NEXT:    ptrue pn8.b
1909; CHECK-NEXT:    str z15, [sp, #1, mul vl] // 16-byte Folded Spill
1910; CHECK-NEXT:    mov w8, wzr
1911; CHECK-NEXT:    str z14, [sp, #2, mul vl] // 16-byte Folded Spill
1912; CHECK-NEXT:    add x10, x9, x1
1913; CHECK-NEXT:    str z13, [sp, #3, mul vl] // 16-byte Folded Spill
1914; CHECK-NEXT:    str z12, [sp, #4, mul vl] // 16-byte Folded Spill
1915; CHECK-NEXT:    str z11, [sp, #5, mul vl] // 16-byte Folded Spill
1916; CHECK-NEXT:    str z10, [sp, #6, mul vl] // 16-byte Folded Spill
1917; CHECK-NEXT:    str z9, [sp, #7, mul vl] // 16-byte Folded Spill
1918; CHECK-NEXT:    str z8, [sp, #8, mul vl] // 16-byte Folded Spill
1919; CHECK-NEXT:    ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0]
1920; CHECK-NEXT:    ld1b { z1.b, z5.b, z9.b, z13.b }, pn8/z, [x0, x1]
1921; CHECK-NEXT:    ld1b { z2.b, z6.b, z10.b, z14.b }, pn8/z, [x0, x9]
1922; CHECK-NEXT:    ld1b { z3.b, z7.b, z11.b, z15.b }, pn8/z, [x0, x10]
1923; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z0.b - z3.b }, z0.b[0]
1924; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z4.b - z7.b }, z0.b[0]
1925; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z8.b - z11.b }, z0.b[0]
1926; CHECK-NEXT:    sudot za.s[w8, 0, vgx4], { z12.b - z15.b }, z0.b[0]
1927; CHECK-NEXT:    ldr z15, [sp, #1, mul vl] // 16-byte Folded Reload
1928; CHECK-NEXT:    ldr z14, [sp, #2, mul vl] // 16-byte Folded Reload
1929; CHECK-NEXT:    ldr z13, [sp, #3, mul vl] // 16-byte Folded Reload
1930; CHECK-NEXT:    ldr z12, [sp, #4, mul vl] // 16-byte Folded Reload
1931; CHECK-NEXT:    ldr z11, [sp, #5, mul vl] // 16-byte Folded Reload
1932; CHECK-NEXT:    ldr z10, [sp, #6, mul vl] // 16-byte Folded Reload
1933; CHECK-NEXT:    ldr z9, [sp, #7, mul vl] // 16-byte Folded Reload
1934; CHECK-NEXT:    ldr z8, [sp, #8, mul vl] // 16-byte Folded Reload
1935; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
1936; CHECK-NEXT:    addvl sp, sp, #9
1937; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1938; CHECK-NEXT:    ret
1939entry:
1940  %0 = tail call target("aarch64.svcount") @llvm.aarch64.sve.ptrue.c8()
1941  %1 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %ptr)
1942  %2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 0
1943  %3 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 1
1944  %4 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 2
1945  %5 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %1, 3
1946  %arrayidx2 = getelementptr inbounds i8, ptr %ptr, i64 %stride
1947  %6 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx2)
1948  %7 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 0
1949  %8 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 1
1950  %9 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 2
1951  %10 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %6, 3
1952  %mul3 = shl i64 %stride, 1
1953  %arrayidx4 = getelementptr inbounds i8, ptr %ptr, i64 %mul3
1954  %11 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx4)
1955  %12 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 0
1956  %13 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 1
1957  %14 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 2
1958  %15 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %11, 3
1959  %mul5 = mul i64 %stride, 3
1960  %arrayidx6 = getelementptr inbounds i8, ptr %ptr, i64 %mul5
1961  %16 = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %0, ptr %arrayidx6)
1962  %17 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 0
1963  %18 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 1
1964  %19 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 2
1965  %20 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %16, 3
1966  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %2, <vscale x 16 x i8> %7, <vscale x 16 x i8> %12, <vscale x 16 x i8> %17, <vscale x 16 x i8> poison, i32 0)
1967  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %3, <vscale x 16 x i8> %8, <vscale x 16 x i8> %13, <vscale x 16 x i8> %18, <vscale x 16 x i8> poison, i32 0)
1968  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %4, <vscale x 16 x i8> %9, <vscale x 16 x i8> %14, <vscale x 16 x i8> %19, <vscale x 16 x i8> poison, i32 0)
1969  tail call void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32 0, <vscale x 16 x i8> %5, <vscale x 16 x i8> %10, <vscale x 16 x i8> %15, <vscale x 16 x i8> %20, <vscale x 16 x i8> poison, i32 0)
1970  ret void
1971}
1972
1973
1974attributes #0 = { nounwind "target-features"="+sme2" }
1975attributes #1 = { nounwind "target-features"="+sme2,+sme-i16i64" }
1976
1977; == Multi, multi (unsigned) ==
1978
1979declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1980declare void @llvm.aarch64.sme.udot.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1981                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1982declare void @llvm.aarch64.sme.udot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1983declare void @llvm.aarch64.sme.udot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
1984                                                       <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1985declare void @llvm.aarch64.sme.udot.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1986declare void @llvm.aarch64.sme.udot.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1987                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1988declare void @llvm.aarch64.sme.usdot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1989declare void @llvm.aarch64.sme.usdot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
1990                                                        <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1991
1992; == Multi, multi (signed) ==
1993
1994declare void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1995declare void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
1996                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
1997declare void @llvm.aarch64.sme.sdot.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
1998declare void @llvm.aarch64.sme.sdot.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>,
1999                                                       <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2000declare void @llvm.aarch64.sme.sdot.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2001declare void @llvm.aarch64.sme.sdot.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>,
2002                                                       <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2003
2004; == Multi, single (unsigned) ==
2005
2006declare void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2007declare void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2008declare void @llvm.aarch64.sme.udot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2009declare void @llvm.aarch64.sme.udot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2010declare void @llvm.aarch64.sme.udot.single.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2011declare void @llvm.aarch64.sme.udot.single.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2012declare void @llvm.aarch64.sme.usdot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2013declare void @llvm.aarch64.sme.usdot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2014
2015; == Multi, single (signed) ==
2016
2017declare void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2018declare void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2019declare void @llvm.aarch64.sme.sdot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2020declare void @llvm.aarch64.sme.sdot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2021declare void @llvm.aarch64.sme.sdot.single.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2022declare void @llvm.aarch64.sme.sdot.single.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
2023declare void @llvm.aarch64.sme.sudot.single.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2024declare void @llvm.aarch64.sme.sudot.single.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
2025
2026; == Multi, indexed (unsigned) ==
2027
2028declare void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2029declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2030declare void @llvm.aarch64.sme.udot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2031declare void @llvm.aarch64.sme.udot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2032declare void @llvm.aarch64.sme.udot.lane.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2033declare void @llvm.aarch64.sme.udot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2034declare void @llvm.aarch64.sme.usdot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2035declare void @llvm.aarch64.sme.usdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2036
2037; == Multi, indexed (signed) ==
2038
2039declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2040declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2041declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2042declare void @llvm.aarch64.sme.sdot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2043declare void @llvm.aarch64.sme.sdot.lane.za64.vg1x2.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2044declare void @llvm.aarch64.sme.sdot.lane.za64.vg1x4.nxv8i16(i32, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
2045declare void @llvm.aarch64.sme.sudot.lane.za32.vg1x2.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2046declare void @llvm.aarch64.sme.sudot.lane.za32.vg1x4.nxv16i8(i32, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, i32)
2047