; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -force-streaming -verify-machineinstrs < %s | FileCheck %s

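; Check that the llvm.aarch64.sme.ld1[bhwdq].{horiz,vert} intrinsics select the
; corresponding LD1B/LD1H/LD1W/LD1D/LD1Q tile-slice loads, and that a constant
; added to the slice index is folded into the instruction's slice offset. The
; *_with_addr_offset variants additionally check that the pointer offset is
; folded into the register-offset addressing mode.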
define void @ld1b(<vscale x 16 x i1> %pg, ptr %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    mov w13, wzr
; CHECK-NEXT:    ld1b {za0h.b[w12, 15]}, p0/z, [x0]
; CHECK-NEXT:    ld1b {za0v.b[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ret
  %tileslice = add i32 %sliceidx, 15
  call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, ptr %ptr, i32 0, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, ptr %ptr, i32 0, i32 0)
  ret void;
}

define void @ld1b_with_addr_offset(<vscale x 16 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1b_with_addr_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w13, wzr
; CHECK-NEXT:    mov w12, w2
; CHECK-NEXT:    ld1b {za0h.b[w13, 0]}, p0/z, [x0, x1]
; CHECK-NEXT:    ld1b {za0v.b[w12, 15]}, p0/z, [x0, x1]
; CHECK-NEXT:    ret
  %base = getelementptr i8, ptr %ptr, i64 %index
  %tileslice = add i32 %sliceidx, 15
  call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, ptr %base, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, ptr %base, i32 0, i32 %tileslice)
  ret void;
}

define void @ld1h(<vscale x 8 x i1> %pg, ptr %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    mov w13, wzr
; CHECK-NEXT:    ld1h {za0h.h[w12, 7]}, p0/z, [x0]
; CHECK-NEXT:    ld1h {za1h.h[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1h {za0v.h[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1h {za1v.h[w12, 7]}, p0/z, [x0]
; CHECK-NEXT:    ret
  %tileslice = add i32 %sliceidx, 7
  call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> %pg, ptr %ptr, i32 0, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> %pg, ptr %ptr, i32 1, i32 %tileslice)
  ret void;
}

define void @ld1h_with_addr_offset(<vscale x 8 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1h_with_addr_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w2
; CHECK-NEXT:    mov w13, wzr
; CHECK-NEXT:    ld1h {za0h.h[w12, 7]}, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ld1h {za1v.h[w13, 0]}, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ret
  %base = getelementptr i16, ptr %ptr, i64 %index
  %tileslice = add i32 %sliceidx, 7
  call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1> %pg, ptr %base, i32 0, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1> %pg, ptr %base, i32 1, i32 0)
  ret void;
}

define void @ld1w(<vscale x 4 x i1> %pg, ptr %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1w:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    mov w13, wzr
; CHECK-NEXT:    ld1w {za0h.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1w {za1h.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1w {za2h.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1w {za3h.s[w12, 3]}, p0/z, [x0]
; CHECK-NEXT:    ld1w {za0v.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1w {za1v.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1w {za2v.s[w12, 3]}, p0/z, [x0]
; CHECK-NEXT:    ld1w {za3v.s[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ret
  %tileslice = add i32 %sliceidx, 3
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %ptr, i32 3, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 2, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %ptr, i32 3, i32 0)
  ret void;
}

define void @ld1w_with_addr_offset(<vscale x 4 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1w_with_addr_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w2
; CHECK-NEXT:    mov w13, wzr
; CHECK-NEXT:    ld1w {za0h.s[w13, 0]}, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ld1w {za3v.s[w12, 3]}, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ret
  %base = getelementptr i32, ptr %ptr, i64 %index
  %tileslice = add i32 %sliceidx, 3
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %base, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1> %pg, ptr %base, i32 3, i32 %tileslice)
  ret void;
}

define void @ld1d(<vscale x 2 x i1> %pg, ptr %ptr, i32 %sliceidx) {
; CHECK-LABEL: ld1d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:    mov w13, wzr
; CHECK-NEXT:    ld1d {za0h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za1h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za2h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za3h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za4h.d[w12, 1]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za5h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za6h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za7h.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za0v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za1v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za2v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za3v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za4v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za5v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za6v.d[w13, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1d {za7v.d[w12, 1]}, p0/z, [x0]
; CHECK-NEXT:    ret
  %tileslice = add i32 %sliceidx, 1
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 3, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 4, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 5, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 6, i32 0)
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %ptr, i32 7, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 3, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 4, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 5, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 6, i32 0)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %ptr, i32 7, i32 %tileslice)
  ret void;
}

define void @ld1d_with_addr_offset(<vscale x 2 x i1> %pg, ptr %ptr, i64 %index, i32 %sliceidx) {
; CHECK-LABEL: ld1d_with_addr_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, w2
; CHECK-NEXT:    mov w13, wzr
; CHECK-NEXT:    ld1d {za0h.d[w12, 1]}, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ld1d {za7v.d[w13, 0]}, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ret
  %base = getelementptr i64, ptr %ptr, i64 %index
  %tileslice = add i32 %sliceidx, 1
  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1> %pg, ptr %base, i32 0, i32 %tileslice)
  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1> %pg, ptr %base, i32 7, i32 0)
  ret void;
}

define void @ld1q(<vscale x 1 x i1> %pg, ptr %ptr) {
; CHECK-LABEL: ld1q:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    ld1q {za0h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za1h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za2h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za3h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za4h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za5h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za6h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za7h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za8h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za9h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za10h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za11h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za12h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za13h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za14h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za15h.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za0v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za1v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za2v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za3v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za4v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za5v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za6v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za7v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za8v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za9v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za10v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za11v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za12v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za13v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za14v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ld1q {za15v.q[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 3, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 4, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 5, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 6, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 7, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 8, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 9, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 10, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 11, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 12, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 13, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 14, i32 0)
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %ptr, i32 15, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 1, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 2, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 3, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 4, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 5, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 6, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 7, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 8, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 9, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 10, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 11, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 12, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 13, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 14, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %ptr, i32 15, i32 0)
  ret void;
}

define void @ld1q_with_addr_offset(<vscale x 1 x i1> %pg, ptr %ptr, i64 %index) {
; CHECK-LABEL: ld1q_with_addr_offset:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ld1q {za15v.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
; CHECK-NEXT:    ret
  %base = getelementptr i128, ptr %ptr, i64 %index
  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1> %pg, ptr %base, i32 0, i32 0)
  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1> %pg, ptr %base, i32 15, i32 0)
  ret void;
}

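; Check that the llvm.aarch64.sme.ldr intrinsic selects LDR (load vector to ZA
; array): vector-select offsets in the range 0-15 are encoded directly in the
; instruction, while larger or variable offsets require the base address and
; slice index to be adjusted first.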
define void @ldr(ptr %ptr) {
; CHECK-LABEL: ldr:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, wzr
; CHECK-NEXT:    ldr za[w12, 0], [x0]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.ldr(i32 0, ptr %ptr, i32 0)
  ret void;
}

define void @ldr_with_off_15(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_15:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, #15 // =0xf
; CHECK-NEXT:    add x8, x0, #15
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ret
  %base = getelementptr i8, ptr %ptr, i64 15
  call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0)
  ret void;
}

define void @ldr_with_off_15mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_15mulvl:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, #15 // =0xf
; CHECK-NEXT:    addvl x8, x0, #15
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ret
  %vscale = call i64 @llvm.vscale.i64()
  %mulvl = mul i64 %vscale, 240
  %base = getelementptr i8, ptr %ptr, i64 %mulvl
  call void @llvm.aarch64.sme.ldr(i32 15, ptr %base, i32 0)
  ret void;
}

define void @ldr_with_off_16mulvl(ptr %ptr) {
; CHECK-LABEL: ldr_with_off_16mulvl:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, #16 // =0x10
; CHECK-NEXT:    addvl x8, x0, #16
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ret
  %vscale = call i64 @llvm.vscale.i64()
  %mulvl = mul i64 %vscale, 256
  %base = getelementptr i8, ptr %ptr, i64 %mulvl
  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 0)
  ret void;
}

define void @ldr_with_off_var(ptr %base, i32 %off) {
; CHECK-LABEL: ldr_with_off_var:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    sxtw x8, w1
; CHECK-NEXT:    rdsvl x9, #1
; CHECK-NEXT:    add w12, w1, #16
; CHECK-NEXT:    madd x8, x9, x8, x0
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 %off)
  ret void;
}

define void @ldr_with_off_15imm(ptr %base) {
; CHECK-LABEL: ldr_with_off_15imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w12, #16 // =0x10
; CHECK-NEXT:    ldr za[w12, 15], [x0, #15, mul vl]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 15)
  ret void;
}

define void @ldr_with_off_16imm(ptr %base) {
; CHECK-LABEL: ldr_with_off_16imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdsvl x8, #1
; CHECK-NEXT:    mov w12, #32 // =0x20
; CHECK-NEXT:    add x8, x0, x8, lsl #4
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ret
  call void @llvm.aarch64.sme.ldr(i32 16, ptr %base, i32 16)
  ret void;
}

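; For runs of consecutive vector-select offsets, the adjusted base address and
; slice index should be computed once and reused, with the remaining part of
; each offset folded as an immediate.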
define void @ldr_with_off_many_imm(i32 %tile_slice, ptr %ptr) {
; CHECK-LABEL: ldr_with_off_many_imm:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    ldr za[w12, 1], [x1, #1, mul vl]
; CHECK-NEXT:    ldr za[w12, 2], [x1, #2, mul vl]
; CHECK-NEXT:    ldr za[w12, 3], [x1, #3, mul vl]
; CHECK-NEXT:    ldr za[w12, 4], [x1, #4, mul vl]
; CHECK-NEXT:    ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 1)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 2)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 3)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 4)
  ret void
}

define void @ldr_with_off_many_imm_15_18(i32 %tile_slice, ptr %ptr) {
; CHECK-LABEL: ldr_with_off_many_imm_15_18:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rdsvl x8, #1
; CHECK-NEXT:    mov w12, w0
; CHECK-NEXT:    add x8, x1, x8, lsl #4
; CHECK-NEXT:    ldr za[w12, 15], [x1, #15, mul vl]
; CHECK-NEXT:    add w12, w0, #16
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT:    ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT:    ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 15)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18)
  ret void
}

define void @ldr_with_off_many_imm_16_19(i32 %tile_slice, ptr %ptr) {
; CHECK-LABEL: ldr_with_off_many_imm_16_19:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rdsvl x8, #1
; CHECK-NEXT:    add w12, w0, #16
; CHECK-NEXT:    add x8, x1, x8, lsl #4
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT:    ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT:    ldr za[w12, 3], [x8, #3, mul vl]
; CHECK-NEXT:    ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 16)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 17)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 18)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 19)
  ret void
}

define void @ldr_with_off_many_imm_31_34(i32 %tile_slice, ptr %ptr) {
; CHECK-LABEL: ldr_with_off_many_imm_31_34:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rdsvl x8, #1
; CHECK-NEXT:    add w12, w0, #16
; CHECK-NEXT:    add x9, x1, x8, lsl #4
; CHECK-NEXT:    add x8, x1, x8, lsl #5
; CHECK-NEXT:    ldr za[w12, 15], [x9, #15, mul vl]
; CHECK-NEXT:    add w12, w0, #32
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT:    ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT:    ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 31)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34)
  ret void
}

define void @ldr_with_off_many_imm_32_35(i32 %tile_slice, ptr %ptr, i64 %vnum) {
; CHECK-LABEL: ldr_with_off_many_imm_32_35:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rdsvl x8, #1
; CHECK-NEXT:    add w12, w0, #32
; CHECK-NEXT:    add x8, x1, x8, lsl #5
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT:    ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT:    ldr za[w12, 3], [x8, #3, mul vl]
; CHECK-NEXT:    ret
entry:
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 32)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 33)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 34)
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 35)
  ret void
}

define void @ldr_with_off_many_var(i32 %tile_slice, ptr %ptr, i64 %vnum) {
; CHECK-LABEL: ldr_with_off_many_var:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sxtw x8, w2
; CHECK-NEXT:    rdsvl x9, #1
; CHECK-NEXT:    add w12, w0, w2
; CHECK-NEXT:    madd x8, x9, x8, x1
; CHECK-NEXT:    ldr za[w12, 0], [x8]
; CHECK-NEXT:    ldr za[w12, 1], [x8, #1, mul vl]
; CHECK-NEXT:    ldr za[w12, 2], [x8, #2, mul vl]
; CHECK-NEXT:    ldr za[w12, 3], [x8, #3, mul vl]
; CHECK-NEXT:    ret
entry:
  %0 = trunc i64 %vnum to i32
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %0)
  %1 = add i32 %0, 1
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1)
  %2 = add i32 %0, 2
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2)
  %3 = add i32 %0, 3
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3)
  ret void
}

define void @ldr_with_off_many_var_high(i32 %tile_slice, ptr %ptr, i64 %vnum) {
; CHECK-LABEL: ldr_with_off_many_var_high:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    add w8, w2, #32
; CHECK-NEXT:    rdsvl x10, #1
; CHECK-NEXT:    sxtw x9, w8
; CHECK-NEXT:    add w12, w0, w8
; CHECK-NEXT:    madd x9, x10, x9, x1
; CHECK-NEXT:    ldr za[w12, 1], [x9, #1, mul vl]
; CHECK-NEXT:    ldr za[w12, 2], [x9, #2, mul vl]
; CHECK-NEXT:    ldr za[w12, 3], [x9, #3, mul vl]
; CHECK-NEXT:    ldr za[w12, 4], [x9, #4, mul vl]
; CHECK-NEXT:    ret
entry:
  %0 = trunc i64 %vnum to i32
  %1 = add i32 %0, 33
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %1)
  %2 = add i32 %0, 34
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %2)
  %3 = add i32 %0, 35
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %3)
  %4 = add i32 %0, 36
  tail call void @llvm.aarch64.sme.ldr(i32 %tile_slice, ptr %ptr, i32 %4)
  ret void
}

; Ensure that the tile offset is sunk, given that this is likely to be an 'add'
; that's decomposed into a base + offset in ISel.
define void @test_ld1_sink_tile0_offset_operand(<vscale x 4 x i1> %pg, ptr %src, i32 %base, i32 %N) {
; CHECK-LABEL: test_ld1_sink_tile0_offset_operand:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov w12, w1
; CHECK-NEXT:  .LBB24_1: // %for.body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ld1w {za0h.s[w12, 0]}, p0/z, [x0]
; CHECK-NEXT:    subs w2, w2, #1
; CHECK-NEXT:    ld1w {za0h.s[w12, 1]}, p0/z, [x0]
; CHECK-NEXT:    ld1w {za0h.s[w12, 2]}, p0/z, [x0]
; CHECK-NEXT:    b.ne .LBB24_1
; CHECK-NEXT:  // %bb.2: // %exit
; CHECK-NEXT:    ret
entry:
  %add1 = add i32 %base, 1
  %add2 = add i32 %base, 2
  br label %for.body

for.body:
  %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %base)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %add1)
  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1> %pg, ptr %src, i32 0, i32 %add2)
  %inc = add nuw nsw i32 %i, 1
  %exitcond.not = icmp eq i32 %inc, %N
  br i1 %exitcond.not, label %exit, label %for.body

exit:
  ret void
}


declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1d.horiz(<vscale x 2 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1q.horiz(<vscale x 1 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1h.vert(<vscale x 8 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1>, ptr, i32, i32)
declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1>, ptr, i32, i32)

declare void @llvm.aarch64.sme.ldr(i32, ptr, i32)
declare i64 @llvm.vscale.i64()