; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s | FileCheck %s

;
; LD1RQB
;

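; LD1RQ loads a single 16-byte quadword and replicates it to each 128-bit
; segment of the scalable destination vector.
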
define <vscale x 16 x i8> @ld1rqb_i8(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %addr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, #16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %addr, i8 16
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_scalar(<vscale x 16 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqb_i8_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %addr, i64 %idx
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_lower_bound(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, #-128]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %addr, i8 -128
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_upper_bound(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, #112]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %addr, i8 112
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_lower_bound(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_out_of_lower_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov x8, #-129
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %addr, i64 -129
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_upper_bound(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_out_of_upper_bound:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov w8, #113
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %addr, i64 113
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

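; NOTE: The immediate form encodes a signed multiple of 16 bytes in the range
; [-128, 112], which the bound tests above exercise; the out-of-range offsets
; -129 and 113 are instead materialized into a register (mov) and lowered via
; the register-offset form.
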
define <vscale x 16 x i8> @ld1rqb_i8_imm_dupqlane(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, #-16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds <16 x i8>, ptr %addr, i16 -1
  %load = load <16 x i8>, ptr %ptr
  %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
  %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
  ret <vscale x 16 x i8> %2
}

define <vscale x 16 x i8> @ld1rqb_i8_scalar_dupqlane(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqb_i8_scalar_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0, x1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %addr, i64 %idx
  %load = load <16 x i8>, ptr %ptr
  %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
  %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
  ret <vscale x 16 x i8> %2
}
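
; NOTE: The dupqlane tests check that a fixed 16-byte load inserted at element
; 0 and splat with dupq.lane index 0 folds into LD1RQ: both replicate the low
; quadword to every segment, so isel emits an all-active ptrue and the
; incoming %pred is unused.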

;
; LD1RQH
;

define <vscale x 8 x i16> @ld1rqh_i16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, ptr %addr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ld1rqh_f16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, ptr %addr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x i16> @ld1rqh_i16_imm(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_i16_imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-64]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i16, ptr %addr, i16 -32
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ld1rqh_f16_imm(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_f16_imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds half, ptr %addr, i16 -8
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x i16> @ld1rqh_i16_scalar(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_i16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i16, ptr %addr, i64 %idx
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ld1rqh_f16_scalar(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_f16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds half, ptr %addr, i64 %idx
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x bfloat> @ld1rqh_bf16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, ptr %addr)
  ret <vscale x 8 x bfloat> %res
}

define <vscale x 8 x bfloat> @ld1rqh_bf16_imm(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_bf16_imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds bfloat, ptr %addr, i16 -8
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x bfloat> %res
}

define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_bf16_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds bfloat, ptr %addr, i64 %idx
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x bfloat> %res
}

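; NOTE: In the register-offset form the index register is scaled by the
; element size (lsl #1 for .h here, lsl #2 for .s and lsl #3 for .d below),
; matching the getelementptr element type; the .b form takes x1 unscaled.
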
define <vscale x 8 x i16> @ld1rqh_i16_imm_dupqlane(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_i16_imm_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds <8 x i16>, ptr %addr, i16 -1
  %load = load <8 x i16>, ptr %ptr
  %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
  ret <vscale x 8 x i16> %2
}

define <vscale x 8 x i16> @ld1rqh_i16_scalar_dupqlane(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_i16_scalar_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i16, ptr %addr, i64 %idx
  %load = load <8 x i16>, ptr %ptr
  %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
  ret <vscale x 8 x i16> %2
}

define <vscale x 8 x half> @ld1rqh_f16_imm_dupqlane(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_f16_imm_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds <8 x half>, ptr %addr, i16 -1
  %load = load <8 x half>, ptr %ptr
  %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
  %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
  ret <vscale x 8 x half> %2
}

define <vscale x 8 x half> @ld1rqh_f16_scalar_dupqlane(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_f16_scalar_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds half, ptr %addr, i64 %idx
  %load = load <8 x half>, ptr %ptr
  %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
  %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
  ret <vscale x 8 x half> %2
}

define <vscale x 8 x bfloat> @ld1rqh_bf16_imm_dupqlane(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_bf16_imm_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds <8 x bfloat>, ptr %addr, i16 -1
  %load = load <8 x bfloat>, ptr %ptr
  %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
  %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
  ret <vscale x 8 x bfloat> %2
}

define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar_dupqlane(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_bf16_scalar_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds bfloat, ptr %addr, i64 %idx
  %load = load <8 x bfloat>, ptr %ptr
  %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
  %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
  ret <vscale x 8 x bfloat> %2
}

;
; LD1RQW
;

define <vscale x 4 x i32> @ld1rqw_i32(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, ptr %addr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ld1rqw_f32(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, ptr %addr)
  ret <vscale x 4 x float> %res
}

define <vscale x 4 x i32> @ld1rqw_i32_imm(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_i32_imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, #112]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i32, ptr %addr, i32 28
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, ptr %ptr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ld1rqw_f32_imm(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_f32_imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, #32]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds float, ptr %addr, i32 8
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, ptr %ptr)
  ret <vscale x 4 x float> %res
}

define <vscale x 4 x i32> @ld1rqw_i32_scalar(<vscale x 4 x i1> %pred, ptr %base, i64 %idx) {
; CHECK-LABEL: ld1rqw_i32_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i32, ptr %base, i64 %idx
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, ptr %ptr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ld1rqw_f32_scalar(<vscale x 4 x i1> %pred, ptr %base, i64 %idx) {
; CHECK-LABEL: ld1rqw_f32_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds float, ptr %base, i64 %idx
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, ptr %ptr)
  ret <vscale x 4 x float> %res
}

define <vscale x 4 x i32> @ld1rqw_i32_imm_dupqlane(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_i32_imm_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, #16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds <4 x i32>, ptr %addr, i32 1
  %load = load <4 x i32>, ptr %ptr
  %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
  ret <vscale x 4 x i32> %2
}

define <vscale x 4 x i32> @ld1rqw_i32_scalar_dupqlane(<vscale x 4 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqw_i32_scalar_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i32, ptr %addr, i64 %idx
  %load = load <4 x i32>, ptr %ptr
  %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
  ret <vscale x 4 x i32> %2
}

define <vscale x 4 x float> @ld1rqw_f32_imm_dupqlane(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_f32_imm_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, #16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds <4 x float>, ptr %addr, i32 1
  %load = load <4 x float>, ptr %ptr
  %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
  %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
  ret <vscale x 4 x float> %2
}

define <vscale x 4 x float> @ld1rqw_f32_scalar_dupqlane(<vscale x 4 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqw_f32_scalar_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds float, ptr %addr, i64 %idx
  %load = load <4 x float>, ptr %ptr
  %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
  %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
  ret <vscale x 4 x float> %2
}

;
; LD1RQD
;

define <vscale x 2 x i64> @ld1rqd_i64(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, ptr %addr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1rqd_f64(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, ptr %addr)
  ret <vscale x 2 x double> %res
}

define <vscale x 2 x i64> @ld1rqd_i64_imm(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_i64_imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, #64]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i64, ptr %addr, i64 8
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, ptr %ptr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1rqd_f64_imm(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_f64_imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, #-128]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds double, ptr %addr, i64 -16
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, ptr %ptr)
  ret <vscale x 2 x double> %res
}

define <vscale x 2 x i64> @ld1rqd_i64_scalar(<vscale x 2 x i1> %pred, ptr %base, i64 %idx) {
; CHECK-LABEL: ld1rqd_i64_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i64, ptr %base, i64 %idx
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, ptr %ptr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1rqd_f64_scalar(<vscale x 2 x i1> %pred, ptr %base, i64 %idx) {
; CHECK-LABEL: ld1rqd_f64_scalar:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds double, ptr %base, i64 %idx
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, ptr %ptr)
  ret <vscale x 2 x double> %res
}

define <vscale x 2 x i64> @ld1rqd_i64_imm_dupqlane(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_i64_imm_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, #16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds <2 x i64>, ptr %addr, i64 1
  %load = load <2 x i64>, ptr %ptr
  %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
  ret <vscale x 2 x i64> %2
}

define <vscale x 2 x i64> @ld1rqd_i64_scalar_dupqlane(<vscale x 2 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqd_i64_scalar_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i64, ptr %addr, i64 %idx
  %load = load <2 x i64>, ptr %ptr
  %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
  ret <vscale x 2 x i64> %2
}

define <vscale x 2 x double> @ld1rqd_f64_imm_dupqlane(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_f64_imm_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, #16]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds <2 x double>, ptr %addr, i64 1
  %load = load <2 x double>, ptr %ptr
  %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
  ret <vscale x 2 x double> %2
}

define <vscale x 2 x double> @ld1rqd_f64_scalar_dupqlane(<vscale x 2 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqd_f64_scalar_dupqlane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds double, ptr %addr, i64 %idx
  %load = load <2 x double>, ptr %ptr
  %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
  ret <vscale x 2 x double> %2
}

;
; LDNT1B
;

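; NOTE: LDNT1* performs the same predicated contiguous load as LD1 but with a
; non-temporal hint, indicating the data is unlikely to be reused soon and
; need not displace existing cache contents. Only the base-register form is
; covered here.
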
define <vscale x 16 x i8> @ldnt1b_i8(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1b_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldnt1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1> %pred,
                                                                 ptr %addr)
  ret <vscale x 16 x i8> %res
}

;
; LDNT1H
;

define <vscale x 8 x i16> @ldnt1h_i16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1h_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> %pred,
                                                                 ptr %addr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ldnt1h_f16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1h_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1> %pred,
                                                                  ptr %addr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x bfloat> @ldnt1h_bf16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1h_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %pred,
                                                                     ptr %addr)
  ret <vscale x 8 x bfloat> %res
}

;
; LDNT1W
;

define <vscale x 4 x i32> @ldnt1w_i32(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1w_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> %pred,
                                                                 ptr %addr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ldnt1w_f32(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1w_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1> %pred,
                                                                   ptr %addr)
  ret <vscale x 4 x float> %res
}

;
; LDNT1D
;

define <vscale x 2 x i64> @ldnt1d_i64(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1d_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldnt1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %pred,
                                                                 ptr %addr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1d_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldnt1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1> %pred,
                                                                    ptr %addr)
  ret <vscale x 2 x double> %res
}


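; NOTE: These intrinsics back the ACLE builtins in arm_sve.h; e.g. a call to
; svld1rq_s8(pg, base) is expected to lower to @llvm.aarch64.sve.ld1rq.nxv16i8,
; and svldnt1_s8(pg, base) to @llvm.aarch64.sve.ldnt1.nxv16i8.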
declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, ptr)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, ptr)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, ptr)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1>, ptr)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1>, ptr)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1>, ptr)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1>, ptr)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1>, ptr)

declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, ptr)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, ptr)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, ptr)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, ptr)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, ptr)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, ptr)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, ptr)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, ptr)

declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)

declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)