; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

; Range checks: for all the instructions tested in this file, the
; immediate must be within the range [-8, 7] (4-bit signed immediate).
; Out-of-range values are tested in only one case (@imm_out_of_range below);
; valid values are exercised throughout the rest of the file.
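; When the offset is outside that range, the reg+imm addressing form cannot
; be used; as the CHECK lines of @imm_out_of_range show, the byte offset is
; instead materialised with rdvl (a multiple of the vector length in bytes)
; and added to the base register.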

define void @imm_out_of_range(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: imm_out_of_range:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rdvl x8, #8
; CHECK-NEXT:    add x8, x0, x8
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8]
; CHECK-NEXT:    rdvl x8, #-9
; CHECK-NEXT:    add x8, x0, x8
; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i64>, ptr %base, i64 8
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i64> undef)
  %base_store = getelementptr <vscale x 2 x i64>, ptr %base, i64 -9
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane contiguous load/stores.

define void @test_masked_ldst_sv2i8(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1b { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, ptr %base, i64 -8
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_load,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %base_store = getelementptr <vscale x 2 x i8>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
                                      ptr %base_store,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1h { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, ptr %base, i64 -8
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %base_store = getelementptr <vscale x 2 x i16>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}


define void @test_masked_ldst_sv2i32(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, ptr %base, i64 -8
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %base_store = getelementptr <vscale x 2 x i32>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i64>, ptr %base, i64 -8
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i64> undef)
  %base_store = getelementptr <vscale x 2 x i64>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1h { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x half>, ptr %base, i64 -8
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr %base_load,
                                                             i32 1,
                                                             <vscale x 2 x i1> %mask,
                                                             <vscale x 2 x half> undef)
  %base_store = getelementptr <vscale x 2 x half>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}


define void @test_masked_ldst_sv2f32(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #-8, mul vl]
; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #-7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x float>, ptr %base, i64 -8
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr %base_load,
                                                              i32 1,
                                                              <vscale x 2 x i1> %mask,
                                                              <vscale x 2 x float> undef)
  %base_store = getelementptr <vscale x 2 x float>, ptr %base, i64 -7
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, #-6, mul vl]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0, #-5, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x double>, ptr %base, i64 -6
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %base_load,
                                                               i32 1,
                                                               <vscale x 2 x i1> %mask,
                                                               <vscale x 2 x double> undef)
  %base_store = getelementptr <vscale x 2 x double>, ptr %base, i64 -5
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, ptr %base, i64 -4
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_load,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, ptr %base, i64 -3
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_load,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, ptr %base, i64 1
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, ptr %base, i64 2
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, #-2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, ptr %base, i64 -2
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, ptr %base, i64 -1
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1b { z0.d }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i8>, ptr %base, i64 3
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc,
                                      ptr %base_load,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}


define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1h { z0.d }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i16>, ptr %base, i64 4
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc,
                                       ptr %base_load,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1w { z0.d }, p0, [x0, #5, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 2 x i32>, ptr %base, i64 5
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc,
                                       ptr %base_load,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1b { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, ptr %base, i64 -1
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_load,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %base_store = getelementptr <vscale x 4 x i8>, ptr %base, i64 2
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data,
                                      ptr %base_store,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, ptr %base, i64 -1
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %base_store = getelementptr <vscale x 4 x i16>, ptr %base, i64 2
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1w { z0.s }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i32>, ptr %base, i64 6
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i32> undef)
  %base_store = getelementptr <vscale x 4 x i32>, ptr %base, i64 7
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x half>, ptr %base, i64 -1
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr %base_load,
                                                             i32 1,
                                                             <vscale x 4 x i1> %mask,
                                                             <vscale x 4 x half> undef)
  %base_store = getelementptr <vscale x 4 x half>, ptr %base, i64 2
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1w { z0.s }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x float>, ptr %base, i64 -1
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr %base_load,
                                                              i32 1,
                                                              <vscale x 4 x i1> %mask,
                                                              <vscale x 4 x float> undef)
  %base_store = getelementptr <vscale x 4 x float>, ptr %base, i64 2
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, ptr %base, i64 -4
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_load,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, ptr %base, i64 -3
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_load,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, #1, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, ptr %base, i64 1
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, ptr %base, i64 2
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1b { z0.s }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i8>, ptr %base, i64 3
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc,
                                      ptr %base_load,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}


define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, #4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 4 x i16>, ptr %base, i64 4
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc,
                                       ptr %base_load,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1b { z0.h }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, ptr %base, i64 6
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_load,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %base_store = getelementptr <vscale x 8 x i8>, ptr %base, i64 7
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data,
                                      ptr %base_store,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i16>, ptr %base, i64 6
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 8 x i1> %mask,
                                                            <vscale x 8 x i16> undef)
  %base_store = getelementptr <vscale x 8 x i16>, ptr %base, i64 7
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x half>, ptr %base, i64 -1
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr %base_load,
                                                             i32 1,
                                                             <vscale x 8 x i1> %mask,
                                                             <vscale x 8 x half> undef)
  %base_store = getelementptr <vscale x 8 x half>, ptr %base, i64 2
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(ptr %base, <vscale x 8 x i1> %mask) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0, #-1, mul vl]
; CHECK-NEXT:    st1h { z0.h }, p0, [x0, #2, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x bfloat>, ptr %base, i64 -1
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr %base_load,
                                                                i32 1,
                                                                <vscale x 8 x i1> %mask,
                                                                <vscale x 8 x bfloat> undef)
  %base_store = getelementptr <vscale x 8 x bfloat>, ptr %base, i64 2
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
                                        ptr %base_store,
                                        i32 1,
                                        <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0, #-4, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, ptr %base, i64 -4
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_load,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.h }, p0/z, [x0, #-3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, ptr %base, i64 -3
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_load,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, ptr %base, <vscale x 8 x i1> %mask) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    st1b { z0.h }, p0, [x0, #3, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 8 x i8>, ptr %base, i64 3
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc,
                                      ptr %base_load,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(ptr %base, <vscale x 16 x i1> %mask) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0, #6, mul vl]
; CHECK-NEXT:    st1b { z0.b }, p0, [x0, #7, mul vl]
; CHECK-NEXT:    ret
  %base_load = getelementptr <vscale x 16 x i8>, ptr %base, i64 6
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %base_load,
                                                            i32 1,
                                                            <vscale x 16 x i1> %mask,
                                                            <vscale x 16 x i8> undef)
  %base_store = getelementptr <vscale x 16 x i8>, ptr %base, i64 7
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data,
                                       ptr %base_store,
                                       i32 1,
                                       <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8>  @llvm.masked.load.nxv2i8 (ptr , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8>  @llvm.masked.load.nxv4i8 (ptr , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8>  @llvm.masked.load.nxv8i8 (ptr , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , ptr , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, ptr, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , ptr , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, ptr, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , ptr , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, ptr, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }