; Source: llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll (revision 62baf21daa377c4ec1a641b26931063c1117d262)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs -force-streaming < %s | FileCheck %s

;
; Move Multi-Vector From Tile (Read) x2
;

; Horizontal

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 0:1]
; CHECK-NEXT:    mov { z0.b, z1.b }, za0h.b[w12, 14:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice)
  %slice.14 = add i32 %slice, 14
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.14)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0h.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1h.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0h.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3h.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg2_d(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg2_f64(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0h.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

; Vertical

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg2_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 0:1]
; CHECK-NEXT:    mov { z0.b, z1.b }, za0v.b[w12, 14:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice)
  %slice.14 = add i32 %slice, 14
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice.14)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg2_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h, z1.h }, za0v.h[w12, 0:1]
; CHECK-NEXT:    mov { z0.h, z1.h }, za1v.h[w12, 6:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 0, i32 %slice)
  %slice.6 = add i32 %slice, 6
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 %slice.6)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg2_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s, z1.s }, za0v.s[w12, 0:1]
; CHECK-NEXT:    mov { z0.s, z1.s }, za3v.s[w12, 2:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 0, i32 %slice)
  %slice.2 = add i32 %slice, 2
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 %slice.2)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg2_d(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za0v.d[w12, 0:1]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res
}

;
; Move Multi-Vector From Tile (Read) x4
;

; Horizontal

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_horiz_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 0:3]
; CHECK-NEXT:    mov { z0.b - z3.b }, za0h.b[w12, 12:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice)
  %slice.12 = add i32 %slice, 12
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice.12)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_horiz_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_horiz_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_horiz_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0h.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1h.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_horiz_vg4_s(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 0, i32 %slice)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_horiz_vg4_f32(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0h.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 0, i32 %slice)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_horiz_vg4_d(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_horiz_vg4_f64(i32 %slice) {
; CHECK-LABEL: za_read_horiz_vg4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0h.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

; Vertical

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vert_vg4_b(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 0:3]
; CHECK-NEXT:    mov { z0.b - z3.b }, za0v.b[w12, 12:15]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice)
  %slice.12 = add i32 %slice, 12
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice.12)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vert_vg4_h(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vert_vg4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vert_vg4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.h - z3.h }, za0v.h[w12, 0:3]
; CHECK-NEXT:    mov { z0.h - z3.h }, za1v.h[w12, 4:7]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 0, i32 %slice)
  %slice.4 = add i32 %slice, 4
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 %slice.4)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vert_vg4_s(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 0, i32 %slice)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vert_vg4_f32(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.s - z3.s }, za0v.s[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 0, i32 %slice)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res
}

define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vert_vg4_d(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 0, i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}

define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vert_vg4_f64(i32 %slice) {
; CHECK-LABEL: za_read_vert_vg4_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za0v.d[w12, 0:3]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 0, i32 %slice)
  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res
}

; Move Multi-Vector From ZA (Read) x2

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x2_b(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32 %slice.7)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x2_h(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32 %slice.7)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x2_f16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32 %slice.7)
  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x2_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32 %slice.7)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x2_s(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32 %slice.7)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}

define { <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x2_f32(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32 %slice.7)
  ret { <vscale x 4 x float>, <vscale x 4 x float> } %res2
}

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x2_d(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice.7)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
}

define { <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x2_f64(i32 %slice) {
; CHECK-LABEL: za_read_vg1x2_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 0, vgx2]
; CHECK-NEXT:    mov { z0.d, z1.d }, za.d[w8, 7, vgx2]
; CHECK-NEXT:    ret
  %res = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice.7)
  ret { <vscale x 2 x double>, <vscale x 2 x double> } %res2
}

; Move Multi-Vector From ZA (Read) x4

define { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @za_read_vg1x4_b(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32 %slice.7)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } %res2
}

define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @za_read_vg1x4_h(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32 %slice.7)
  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res2
}

define { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @za_read_vg1x4_f16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32 %slice.7)
  ret { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } %res2
}

define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @za_read_vg1x4_bf16(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32 %slice.7)
  ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res2
}

define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @za_read_vg1x4_s(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32 %slice.7)
  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res2
}

define { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @za_read_vg1x4_f32(i32 %slice) {
; CHECK-LABEL: za_read_vg1x4_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, w0
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
; CHECK-NEXT:    ret
  %res = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice)
  %slice.7 = add i32 %slice, 7
  %res2 = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32 %slice.7)
  ret { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } %res2
}

; vgx4 read test with 64-bit integer (nxv2i64) element type; the +7 offset
; must again fold into the MOV slice immediate rather than emit an add.
586define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @za_read_vg1x4_d(i32 %slice) {
587; CHECK-LABEL: za_read_vg1x4_d:
588; CHECK:       // %bb.0:
589; CHECK-NEXT:    mov w8, w0
590; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
591; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
592; CHECK-NEXT:    ret
593  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice)
594  %slice.7 = add i32 %slice, 7 ; expected to fold into the instruction's slice offset
595  %res2 = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice.7)
596  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res2
597}
598
; vgx4 read test with 64-bit floating-point (nxv2f64) element type; as with
; the integer variants, the +7 offset folds into the MOV slice immediate.
599define { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @za_read_vg1x4_f64(i32 %slice) {
600; CHECK-LABEL: za_read_vg1x4_f64:
601; CHECK:       // %bb.0:
602; CHECK-NEXT:    mov w8, w0
603; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 0, vgx4]
604; CHECK-NEXT:    mov { z0.d - z3.d }, za.d[w8, 7, vgx4]
605; CHECK-NEXT:    ret
606  %res = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice)
607  %slice.7 = add i32 %slice, 7 ; expected to fold into the instruction's slice offset
608  %res2 = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice.7)
609  ret { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %res2
610}
611
612declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32)
613declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32, i32)
614declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32, i32)
615declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32, i32)
616declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32, i32)
617declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32, i32)
618declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32, i32)
619declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32, i32)
620
621declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32, i32)
622declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32, i32)
623declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32, i32)
624declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32, i32)
625declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32, i32)
626declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32, i32)
627declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32, i32)
628declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32, i32)
629
630declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32, i32)
631declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32, i32)
632declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32, i32)
633declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32, i32)
634declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32, i32)
635declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32, i32)
636declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32, i32)
637declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32, i32)
638
639declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32, i32)
640declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32, i32)
641declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32, i32)
642declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32, i32)
643declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32, i32)
644declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32, i32)
645declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32, i32)
646declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32, i32)
647
648declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x2.nxv16i8(i32)
649declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x2.nxv8i16(i32)
650declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x2.nxv4i32(i32)
651declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32)
652declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x2.nxv8f16(i32)
653declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x2.nxv8bf16(i32)
654declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x2.nxv4f32(i32)
655declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32)
656
657declare { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.read.vg1x4.nxv16i8(i32)
658declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sme.read.vg1x4.nxv8i16(i32)
659declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sme.read.vg1x4.nxv4i32(i32)
660declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32)
661declare { <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.read.vg1x4.nxv8f16(i32)
662declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.read.vg1x4.nxv8bf16(i32)
663declare { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sme.read.vg1x4.nxv4f32(i32)
664declare { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32)
665