; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -mattr=+sme-i16i64 -mattr=+sme-f64f64 -force-streaming -verify-machineinstrs < %s | FileCheck %s

;
; ADD Multi-Single x2
;

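; These tests check that @llvm.aarch64.sme.add.write.single.za.vg1x2 lowers to the
; multi-vector ADD-to-ZA instruction with a two-register (vgx2) group and a single
; vector operand. Slice indices of %slice and %slice + 7 use offsets 0 and 7, the
; apparent bounds of the instruction's immediate slice offset.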
define void @multi_vector_add_write_single_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }, z2.s
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice,
                                                       <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                                                       <vscale x 4 x i32> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32 %slice.7,
                                                       <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                                                       <vscale x 4 x i32> %zm)
  ret void
}

define void @multi_vector_add_write_single_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }, z2.d
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice,
                                                       <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                                                       <vscale x 2 x i64> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32 %slice.7,
                                                       <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                                                       <vscale x 2 x i64> %zm)
  ret void
}

;
; ADD Multi-Single x4
;

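; As above, but with a four-register (vgx4) group: the vg1x4 intrinsics should select
; the ADD-to-ZA form that takes { z0 - z3 } plus a single vector operand.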
define void @multi_vector_add_write_single_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }, z4.s
; CHECK-NEXT:    ret
                                               <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
                                               <vscale x 4 x i32> %zm) {
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice,
                                                       <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                                                       <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
                                                       <vscale x 4 x i32> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32 %slice.7,
                                                       <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                                                       <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
                                                       <vscale x 4 x i32> %zm)
  ret void
}

define void @multi_vector_add_write_single_za_vg1x4_i64(i32 %slice,
; CHECK-LABEL: multi_vector_add_write_single_za_vg1x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }, z4.d
; CHECK-NEXT:    ret
                                               <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                                               <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
                                               <vscale x 2 x i64> %zm) {
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice,
                                                       <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                                                       <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
                                                       <vscale x 2 x i64> %zm)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32 %slice.7,
                                                       <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                                                       <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
                                                       <vscale x 2 x i64> %zm)
  ret void
}

;
; ADD Multi-Multi x2
;

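; The "multi-multi" variants take two vector groups: @llvm.aarch64.sme.add.write.za.vg1x2
; should lower to an ADD of a { z0, z1 } group and a { z2, z3 } group, with the result
; written to ZA.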
define void @multi_vector_add_write_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }, { z2.s, z3.s }
; CHECK-NEXT:    ret
                                        <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2) {
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice,
                                                       <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                                                       <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32 %slice.7,
                                                       <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                                                       <vscale x 4 x i32> %zm1, <vscale x 4 x i32> %zm2)
  ret void
}

define void @multi_vector_add_write_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z2_z3 def $z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }, { z2.d, z3.d }
; CHECK-NEXT:    ret
                                        <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2) {
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice,
                                                       <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                                                       <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32 %slice.7,
                                                       <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                                                       <vscale x 2 x i64> %zm1, <vscale x 2 x i64> %zm2)
  ret void
}

;
; ADD Multi-Multi x4
;

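; Four-register form of the same: { z0 - z3 } plus { z4 - z7 }, accumulated into ZA
; with vgx4 grouping.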
define void @multi_vector_add_write_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }, { z4.s - z7.s }
; CHECK-NEXT:    ret
                                        <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
                                        <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
                                        <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3) {
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice,
                                                      <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                                                      <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
                                                      <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
                                                      <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32 %slice.7,
                                                      <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                                                      <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3,
                                                      <vscale x 4 x i32> %zm0, <vscale x 4 x i32> %zm1,
                                                      <vscale x 4 x i32> %zm2, <vscale x 4 x i32> %zm3)
  ret void
}

define void @multi_vector_add_write_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
; CHECK-LABEL: multi_vector_add_write_za_vg1x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z7 killed $z7 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z6 killed $z6 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z5 killed $z5 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z4 killed $z4 killed $z4_z5_z6_z7 def $z4_z5_z6_z7
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }, { z4.d - z7.d }
; CHECK-NEXT:    ret
                                        <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
                                        <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
                                        <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3) {
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice,
                                                       <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                                                       <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
                                                       <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
                                                       <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32 %slice.7,
                                                       <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                                                       <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3,
                                                       <vscale x 2 x i64> %zm0, <vscale x 2 x i64> %zm1,
                                                       <vscale x 2 x i64> %zm2, <vscale x 2 x i64> %zm3)
  ret void
}

;
; ADD and accumulate into ZA
;
; x2
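; The plain accumulate intrinsics (@llvm.aarch64.sme.add.za32/za64.vg1x*) take no
; second source operand; they add a vector group directly into ZA. Integer element
; types should select ADD and floating-point element types FADD; the 64-bit element
; forms rely on the +sme-i16i64 and +sme-f64f64 features enabled on the RUN line.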
define void @multi_vector_add_za_vg1x2_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32 %slice.7, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x2_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    add za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32 %slice.7, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x2_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fadd za.s[w8, 0, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    fadd za.s[w8, 7, vgx2], { z0.s, z1.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice,
                                                       <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32 %slice.7,
                                                       <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
  ret void
}

define void @multi_vector_add_za_vg1x2_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1) {
; CHECK-LABEL: multi_vector_add_za_vg1x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT:    fadd za.d[w8, 0, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    fadd za.d[w8, 7, vgx2], { z0.d, z1.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice,
                                                       <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32 %slice.7,
                                                       <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1)
  ret void
}

; x4

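; Same accumulate-into-ZA tests, but with four-register (vgx4) groups.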
define void @multi_vector_add_za_vg1x4_i32(i32 %slice, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    add za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice,
                  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32 %slice.7,
                  <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1,
                  <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
  ret void
}

define void @multi_vector_add_za_vg1x4_i64(i32 %slice, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    add za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    add za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice,
                  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32 %slice.7,
                  <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1,
                  <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3)
  ret void
}

define void @multi_vector_add_za_vg1x4_f32(i32 %slice, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fadd za.s[w8, 0, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    fadd za.s[w8, 7, vgx4], { z0.s - z3.s }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice,
                  <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                  <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32 %slice.7,
                  <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1,
                  <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
  ret void
}

define void @multi_vector_add_za_vg1x4_f64(i32 %slice, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3) {
; CHECK-LABEL: multi_vector_add_za_vg1x4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3
; CHECK-NEXT:    fadd za.d[w8, 0, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    fadd za.d[w8, 7, vgx4], { z0.d - z3.d }
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice,
                  <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                  <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
  %slice.7 = add i32 %slice, 7
  call void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32 %slice.7,
                  <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1,
                  <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3)
  ret void
}

;
; ADD Vectors Multi-Single x2
;

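; These @llvm.aarch64.sve.add.single.x2 tests cover the multi-vector ADD that returns
; its results in registers rather than writing ZA. The unused leading argument shifts
; %zdn1/%zdn2 into z1/z2, so the checks also cover the register copies needed to form
; a consecutive { z4, z5 } tuple for the instruction.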
define { <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x2_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.b, z5.b }, { z4.b, z5.b }, z3.b
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> }
               @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
                                                         <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_single_x2_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.h, z5.h }, { z4.h, z5.h }, z3.h
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> }
               @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
                                                         <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_single_x2_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.s, z5.s }, { z4.s, z5.s }, z3.s
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
               @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
                                                         <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_single_x2_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_add_single_x2_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z5.d, z2.d
; CHECK-NEXT:    mov z4.d, z1.d
; CHECK-NEXT:    add { z4.d, z5.d }, { z4.d, z5.d }, z3.d
; CHECK-NEXT:    mov z0.d, z4.d
; CHECK-NEXT:    mov z1.d, z5.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> }
               @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
                                                         <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

;
; ADD Vectors Multi-Single x4
;

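; Four-register version of the above: the operation is performed on a { z24 - z27 }
; tuple before the results are copied back into the z0 - z3 return registers.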
define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @multi_vec_add_single_x4_s8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2, <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4, <vscale x 16 x i8> %zm) {
; CHECK-LABEL: multi_vec_add_single_x4_s8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.b - z27.b }, { z24.b - z27.b }, z5.b
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> }
              @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8> %zdn1, <vscale x 16 x i8> %zdn2,
                                                        <vscale x 16 x i8> %zdn3, <vscale x 16 x i8> %zdn4,
                                                        <vscale x 16 x i8> %zm)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @multi_vec_add_x4_single_s16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2, <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4, <vscale x 8 x i16> %zm) {
; CHECK-LABEL: multi_vec_add_x4_single_s16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.h - z27.h }, { z24.h - z27.h }, z5.h
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> }
              @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16> %zdn1, <vscale x 8 x i16> %zdn2,
                                                        <vscale x 8 x i16> %zdn3, <vscale x 8 x i16> %zdn4,
                                                        <vscale x 8 x i16> %zm)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @multi_vec_add_x4_single_s32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2, <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4, <vscale x 4 x i32> %zm) {
; CHECK-LABEL: multi_vec_add_x4_single_s32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.s - z27.s }, { z24.s - z27.s }, z5.s
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> }
              @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32> %zdn1, <vscale x 4 x i32> %zdn2,
                                                        <vscale x 4 x i32> %zdn3, <vscale x 4 x i32> %zdn4,
                                                        <vscale x 4 x i32> %zm)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @multi_vec_add_x4_single_s64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2, <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4, <vscale x 2 x i64> %zm) {
; CHECK-LABEL: multi_vec_add_x4_single_s64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z27.d, z4.d
; CHECK-NEXT:    mov z26.d, z3.d
; CHECK-NEXT:    mov z25.d, z2.d
; CHECK-NEXT:    mov z24.d, z1.d
; CHECK-NEXT:    add { z24.d - z27.d }, { z24.d - z27.d }, z5.d
; CHECK-NEXT:    mov z0.d, z24.d
; CHECK-NEXT:    mov z1.d, z25.d
; CHECK-NEXT:    mov z2.d, z26.d
; CHECK-NEXT:    mov z3.d, z27.d
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> }
              @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64> %zdn1, <vscale x 2 x i64> %zdn2,
                                                        <vscale x 2 x i64> %zdn3, <vscale x 2 x i64> %zdn4,
                                                        <vscale x 2 x i64> %zm)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.single.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.single.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.za.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.write.za.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4i32(i32, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare void @llvm.aarch64.sme.add.za32.vg1x2.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.add.za64.vg1x2.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>)
declare void @llvm.aarch64.sme.add.za32.vg1x4.nxv4f32(i32, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
declare void @llvm.aarch64.sme.add.za64.vg1x4.nxv2f64(i32, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>)
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.add.single.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.add.single.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.add.single.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.add.single.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)