; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
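; This file checks that masked loads/stores of scalable vectors, addressed as
; base + index via getelementptr, are lowered to SVE contiguous ld1*/st1*
; instructions with the register + register addressing mode
; [x<base>, x<index>{, lsl #shift}], where the shift follows the in-memory
; element size (none for i8, #1 for 16-bit, #2 for 32-bit, #3 for 64-bit
; elements). For element types narrower than the lane container (e.g. nxv2i16
; held in z.d, or nxv4i8 held in z.s) the vector is unpacked, so the container
; suffix and the shift differ: the shift still tracks the memory element size,
; not the container size.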
; 2-lane contiguous load/stores.

define void @test_masked_ldst_sv2i8(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data,
                                      ptr %base_i8,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data,
                                       ptr %base_i16,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data,
                                       ptr %base_i32,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_i64 = getelementptr i64, ptr %base, i64 %offset
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %base_i64,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i64> undef)
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data,
                                       ptr %base_i64,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_half = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr %base_half,
                                                             i32 1,
                                                             <vscale x 2 x i1> %mask,
                                                             <vscale x 2 x half> undef)
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data,
                                       ptr %base_half,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_float = getelementptr float, ptr %base, i64 %offset
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr %base_float,
                                                              i32 1,
                                                              <vscale x 2 x i1> %mask,
                                                              <vscale x 2 x float> undef)
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data,
                                       ptr %base_float,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_double = getelementptr double, ptr %base, i64 %offset
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %base_double,
                                                               i32 1,
                                                               <vscale x 2 x i1> %mask,
                                                               <vscale x 2 x double> undef)
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data,
                                       ptr %base_double,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.
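; Here and in the 4- and 8-lane variants further down, the extension is
; expected to fold into the load itself: ld1b/ld1h/ld1w for zero-extension and
; ld1sb/ld1sh/ld1sw for sign-extension, so no separate extend instruction
; should appear.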
define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8,
                                                          i32 1,
                                                          <vscale x 2 x i1> %mask,
                                                          <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32,
                                                            i32 1,
                                                            <vscale x 2 x i1> %mask,
                                                            <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.
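; Likewise, the truncation is expected to fold into the store: st1b/st1h/st1w
; of a .d (or, later, .s/.h) vector writes only the low part of each lane, so
; no separate truncate instruction should appear.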
define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc,
                                      ptr %base_i8,
                                      i32 1,
                                      <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc,
                                       ptr %base_i16,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc,
                                       ptr %base_i32,
                                       i32 1,
                                       <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data,
                                      ptr %base_i8,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data,
                                       ptr %base_i16,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %base_i32,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i32> undef)
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data,
                                       ptr %base_i32,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr %base_f16,
                                                             i32 1,
                                                             <vscale x 4 x i1> %mask,
                                                             <vscale x 4 x half> undef)
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data,
                                       ptr %base_f16,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_f32 = getelementptr float, ptr %base, i64 %offset
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr %base_f32,
                                                              i32 1,
                                                              <vscale x 4 x i1> %mask,
                                                              <vscale x 4 x float> undef)
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data,
                                       ptr %base_f32,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8,
                                                          i32 1,
                                                          <vscale x 4 x i1> %mask,
                                                          <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16,
                                                            i32 1,
                                                            <vscale x 4 x i1> %mask,
                                                            <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc,
                                      ptr %base_i8,
                                      i32 1,
                                      <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc,
                                       ptr %base_i16,
                                       i32 1,
                                       <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.
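; The bfloat test below uses attributes #0, which adds +bf16 to the target
; features (see the end of the file).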
define void @test_masked_ldst_sv8i8(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data,
                                      ptr %base_i8,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr %base_i16,
                                                            i32 1,
                                                            <vscale x 8 x i1> %mask,
                                                            <vscale x 8 x i16> undef)
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data,
                                       ptr %base_i16,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr %base_f16,
                                                             i32 1,
                                                             <vscale x 8 x i1> %mask,
                                                             <vscale x 8 x half> undef)
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data,
                                       ptr %base_f16,
                                       i32 1,
                                       <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr bfloat, ptr %base, i64 %offset
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr %base_f16,
                                                                i32 1,
                                                                <vscale x 8 x i1> %mask,
                                                                <vscale x 8 x bfloat> undef)
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data,
                                        ptr %base_f16,
                                        i32 1,
                                        <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8,
                                                          i32 1,
                                                          <vscale x 8 x i1> %mask,
                                                          <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc,
                                      ptr %base_i8,
                                      i32 1,
                                      <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.
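; With byte elements the full z.b vector is used and the index is not shifted.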
define void @test_masked_ldst_sv16i8(ptr %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %base_i8,
                                                            i32 1,
                                                            <vscale x 16 x i1> %mask,
                                                            <vscale x 16 x i8> undef)
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data,
                                       ptr %base_i8,
                                       i32 1,
                                       <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8>  @llvm.masked.load.nxv2i8 (ptr , i32, <vscale x 2 x i1>, <vscale x 2 x i8> )
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8>  @llvm.masked.load.nxv4i8 (ptr , i32, <vscale x 4 x i1>, <vscale x 4 x i8> )
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8>  @llvm.masked.load.nxv8i8 (ptr , i32, <vscale x 8 x i1>, <vscale x 8 x i8> )
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8 (<vscale x 2 x i8> , ptr , i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, ptr, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8 (<vscale x 4 x i8> , ptr , i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, ptr, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8 (<vscale x 8 x i8> , ptr , i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, ptr, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }