; xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-ld1r.ll (revision cc82f1290a1e2157a6c0530d78d8cc84d2b8553d)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,CHECK-LD1R
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+no-sve-fp-ld1r < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NO-LD1R
;
; Check that ldr1* instruction is generated to splat scalar during load,
; rather than mov from scalar to vector register (which would require the vector unit).
;
; one-off: ld1r_stack checks that ldr1b works with stack objects.
;
; Test axes:
;   types = [i8, i16, i32, i64, half, float, double]
;   methods = [direct load, gep upper bound - 1, gep out of range x {neg,pos}, sext..., zext..., unpacked_floats...]
;

;; i8 splat tests: direct load, gep at/past the ld1rb immediate range, and
;; zext/sext widening loads into h/s/d element vectors.
@g8 = external global i8

; One-off test for splatted value coming from stack load.
define <vscale x 16 x i8> @ld1r_stack() {
; CHECK-LABEL: ld1r_stack:
; CHECK:       // %bb.0:
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    adrp x8, :got:g8
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ldr x8, [x8, :got_lo12:g8]
; CHECK-NEXT:    ldrb w8, [x8]
; CHECK-NEXT:    strb w8, [sp, #12]
; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [sp, #14]
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    ret
  %valp = alloca i8
  %valp2  = load volatile i8, ptr @g8
  store volatile i8 %valp2, ptr %valp
  %valp3 = getelementptr i8, ptr %valp, i32 2
  %val = load i8, ptr %valp3
  %1 = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
  %2 = shufflevector <vscale x 16 x i8> %1, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
  ret <vscale x 16 x i8> %2
}

define <vscale x 16 x i8> @ld1rb(ptr %valp) {
; CHECK-LABEL: ld1rb:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i8, ptr %valp
  %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
  %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
  ret <vscale x 16 x i8> %shf
}

define <vscale x 16 x i8> @ld1rb_gep(ptr %valp) {
; CHECK-LABEL: ld1rb_gep:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x0, #63]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i8, ptr %valp, i32 63
  %val = load i8, ptr %valp2
  %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
  %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
  ret <vscale x 16 x i8> %shf
}

define <vscale x 16 x i8> @ld1rb_gep_out_of_range_up(ptr %valp) {
; CHECK-LABEL: ld1rb_gep_out_of_range_up:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    add x8, x0, #64
; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x8]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i8, ptr %valp, i32 64
  %val = load i8, ptr %valp2
  %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
  %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
  ret <vscale x 16 x i8> %shf
}

define <vscale x 16 x i8> @ld1rb_gep_out_of_range_down(ptr %valp) {
; CHECK-LABEL: ld1rb_gep_out_of_range_down:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    sub x8, x0, #1
; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x8]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i8, ptr %valp, i32 -1
  %val = load i8, ptr %valp2
  %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
  %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
  ret <vscale x 16 x i8> %shf
}

define <vscale x 8 x i16> @ld1rb_i8_i16_zext(ptr %valp) {
; CHECK-LABEL: ld1rb_i8_i16_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rb { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i8, ptr %valp
  %ext = zext i8 %val to i16
  %ins = insertelement <vscale x 8 x i16> undef, i16 %ext, i32 0
  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i16> %shf
}

define <vscale x 8 x i16> @ld1rb_i8_i16_sext(ptr %valp) {
; CHECK-LABEL: ld1rb_i8_i16_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rsb { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i8, ptr %valp
  %ext = sext i8 %val to i16
  %ins = insertelement <vscale x 8 x i16> undef, i16 %ext, i32 0
  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i16> %shf
}

define <vscale x 4 x i32> @ld1rb_i8_i32_zext(ptr %valp) {
; CHECK-LABEL: ld1rb_i8_i32_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rb { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i8, ptr %valp
  %ext = zext i8 %val to i32
  %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %shf
}

define <vscale x 4 x i32> @ld1rb_i8_i32_sext(ptr %valp) {
; CHECK-LABEL: ld1rb_i8_i32_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rsb { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i8, ptr %valp
  %ext = sext i8 %val to i32
  %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %shf
}

define <vscale x 2 x i64> @ld1rb_i8_i64_zext(ptr %valp) {
; CHECK-LABEL: ld1rb_i8_i64_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i8, ptr %valp
  %ext = zext i8 %val to i64
  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

define <vscale x 2 x i64> @ld1rb_i8_i64_sext(ptr %valp) {
; CHECK-LABEL: ld1rb_i8_i64_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rsb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i8, ptr %valp
  %ext = sext i8 %val to i64
  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

;; i16 splat tests: direct load, gep at/past the ld1rh immediate range (#126),
;; and zext/sext widening loads into s/d element vectors.
define <vscale x 8 x i16> @ld1rh(ptr %valp) {
; CHECK-LABEL: ld1rh:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i16, ptr %valp
  %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i16> %shf
}

define <vscale x 8 x i16> @ld1rh_gep(ptr %valp) {
; CHECK-LABEL: ld1rh_gep:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0, #126]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i16, ptr %valp, i32 63
  %val = load i16, ptr %valp2
  %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i16> %shf
}

define <vscale x 8 x i16> @ld1rh_gep_out_of_range_up(ptr %valp) {
; CHECK-LABEL: ld1rh_gep_out_of_range_up:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    add x8, x0, #128
; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i16, ptr %valp, i32 64
  %val = load i16, ptr %valp2
  %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i16> %shf
}

define <vscale x 8 x i16> @ld1rh_gep_out_of_range_down(ptr %valp) {
; CHECK-LABEL: ld1rh_gep_out_of_range_down:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    sub x8, x0, #2
; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i16, ptr %valp, i32 -1
  %val = load i16, ptr %valp2
  %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i16> %shf
}

define <vscale x 4 x i32> @ld1rh_i16_i32_zext(ptr %valp) {
; CHECK-LABEL: ld1rh_i16_i32_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i16, ptr %valp
  %ext = zext i16 %val to i32
  %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %shf
}

define <vscale x 4 x i32> @ld1rh_i16_i32_sext(ptr %valp) {
; CHECK-LABEL: ld1rh_i16_i32_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rsh { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i16, ptr %valp
  %ext = sext i16 %val to i32
  %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %shf
}

define <vscale x 2 x i64> @ld1rh_i16_i64_zext(ptr %valp) {
; CHECK-LABEL: ld1rh_i16_i64_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i16, ptr %valp
  %ext = zext i16 %val to i64
  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

define <vscale x 2 x i64> @ld1rh_i16_i64_sext(ptr %valp) {
; CHECK-LABEL: ld1rh_i16_i64_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rsh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i16, ptr %valp
  %ext = sext i16 %val to i64
  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

;; i32 splat tests: direct load, gep at/past the ld1rw immediate range (#252),
;; and zext/sext widening loads into d element vectors.
define <vscale x 4 x i32> @ld1rw(ptr %valp) {
; CHECK-LABEL: ld1rw:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i32, ptr %valp
  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %shf
}

define <vscale x 4 x i32> @ld1rw_gep(ptr %valp) {
; CHECK-LABEL: ld1rw_gep:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0, #252]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i32, ptr %valp, i32 63
  %val = load i32, ptr %valp2
  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %shf
}

define <vscale x 4 x i32> @ld1rw_gep_out_of_range_up(ptr %valp) {
; CHECK-LABEL: ld1rw_gep_out_of_range_up:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    add x8, x0, #256
; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i32, ptr %valp, i32 64
  %val = load i32, ptr %valp2
  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %shf
}

define <vscale x 4 x i32> @ld1rw_gep_out_of_range_down(ptr %valp) {
; CHECK-LABEL: ld1rw_gep_out_of_range_down:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    sub x8, x0, #4
; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i32, ptr %valp, i32 -1
  %val = load i32, ptr %valp2
  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %shf
}

define <vscale x 2 x i64> @ld1rw_i32_i64_zext(ptr %valp) {
; CHECK-LABEL: ld1rw_i32_i64_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i32, ptr %valp
  %ext = zext i32 %val to i64
  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

define <vscale x 2 x i64> @ld1rw_i32_i64_sext(ptr %valp) {
; CHECK-LABEL: ld1rw_i32_i64_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rsw { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i32, ptr %valp
  %ext = sext i32 %val to i64
  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

;; i64 splat tests: direct load and gep at/past the ld1rd immediate range (#504).
define <vscale x 2 x i64> @ld1rd(ptr %valp) {
; CHECK-LABEL: ld1rd:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %val = load i64, ptr %valp
  %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

define <vscale x 2 x i64> @ld1rd_gep(ptr %valp) {
; CHECK-LABEL: ld1rd_gep:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0, #504]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i64, ptr %valp, i32 63
  %val = load i64, ptr %valp2
  %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

define <vscale x 2 x i64> @ld1rd_gep_out_of_range_up(ptr %valp) {
; CHECK-LABEL: ld1rd_gep_out_of_range_up:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    add x8, x0, #512
; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i64, ptr %valp, i32 64
  %val = load i64, ptr %valp2
  %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

define <vscale x 2 x i64> @ld1rd_gep_out_of_range_down(ptr %valp) {
; CHECK-LABEL: ld1rd_gep_out_of_range_down:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    sub x8, x0, #8
; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
; CHECK-NEXT:    ret
  %valp2 = getelementptr i64, ptr %valp, i32 -1
  %val = load i64, ptr %valp2
  %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %shf
}

;; f16 (packed nxv8f16) splat tests; these have separate CHECK-LD1R /
;; CHECK-NO-LD1R expectations because +no-sve-fp-ld1r selects ldr+mov instead.
define <vscale x 8 x half> @ld1rh_half(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.h
; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %val = load half, ptr %valp
  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x half> %shf
}

define <vscale x 8 x half> @ld1rh_half_neoverse(ptr %valp) #1 {
; CHECK-LABEL: ld1rh_half_neoverse:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    mov z0.h, h0
; CHECK-NEXT:    ret
  %val = load half, ptr %valp
  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x half> %shf
}

define <vscale x 8 x half> @ld1rh_half_gep(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_gep:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.h
; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x0, #126]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_gep:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #126]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr half, ptr %valp, i32 63
  %val = load half, ptr %valp2
  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x half> %shf
}

define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_up(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_up:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.h
; CHECK-LD1R-NEXT:    add x8, x0, #128
; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_gep_out_of_range_up:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #128]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr half, ptr %valp, i32 64
  %val = load half, ptr %valp2
  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x half> %shf
}

define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_down(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_gep_out_of_range_down:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.h
; CHECK-LD1R-NEXT:    sub x8, x0, #2
; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_gep_out_of_range_down:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldur h0, [x0, #-2]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr half, ptr %valp, i32 -1
  %val = load half, ptr %valp2
  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x half> %shf
}

;; f16 unpacked (nxv4f16, one half per 32-bit lane) splat tests.
define <vscale x 4 x half> @ld1rh_half_unpacked4(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_unpacked4:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.s
; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %val = load half, ptr %valp
  %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x half> %shf
}

define <vscale x 4 x half> @ld1rh_half_unpacked4_gep(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.s
; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x0, #126]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #126]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr half, ptr %valp, i32 63
  %val = load half, ptr %valp2
  %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x half> %shf
}

define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_up(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.s
; CHECK-LD1R-NEXT:    add x8, x0, #128
; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #128]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr half, ptr %valp, i32 64
  %val = load half, ptr %valp2
  %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x half> %shf
}

define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_down(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.s
; CHECK-LD1R-NEXT:    sub x8, x0, #2
; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldur h0, [x0, #-2]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr half, ptr %valp, i32 -1
  %val = load half, ptr %valp2
  %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x half> %shf
}

;; f16 unpacked (nxv2f16, one half per 64-bit lane) splat tests.
define <vscale x 2 x half> @ld1rh_half_unpacked2(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_unpacked2:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %val = load half, ptr %valp
  %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x half> %shf
}

define <vscale x 2 x half> @ld1rh_half_unpacked2_gep(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x0, #126]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #126]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr half, ptr %valp, i32 63
  %val = load half, ptr %valp2
  %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x half> %shf
}

define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_up(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    add x8, x0, #128
; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0, #128]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr half, ptr %valp, i32 64
  %val = load half, ptr %valp2
  %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x half> %shf
}

define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_down(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    sub x8, x0, #2
; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldur h0, [x0, #-2]
; CHECK-NO-LD1R-NEXT:    mov z0.h, h0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr half, ptr %valp, i32 -1
  %val = load half, ptr %valp2
  %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
  %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x half> %shf
}

;; f32 (packed nxv4f32) splat tests.
define <vscale x 4 x float> @ld1rw_float(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rw_float:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.s
; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rw_float:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr s0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
; CHECK-NO-LD1R-NEXT:    ret
  %val = load float, ptr %valp
  %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
  %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x float> %shf
}

define <vscale x 4 x float> @ld1rw_float_gep(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rw_float_gep:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.s
; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x0, #252]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rw_float_gep:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr s0, [x0, #252]
; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr float, ptr %valp, i32 63
  %val = load float, ptr %valp2
  %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
  %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x float> %shf
}

define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_up(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_up:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.s
; CHECK-LD1R-NEXT:    add x8, x0, #256
; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rw_float_gep_out_of_range_up:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr s0, [x0, #256]
; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr float, ptr %valp, i32 64
  %val = load float, ptr %valp2
  %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
  %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x float> %shf
}

define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_down(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rw_float_gep_out_of_range_down:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.s
; CHECK-LD1R-NEXT:    sub x8, x0, #4
; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rw_float_gep_out_of_range_down:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldur s0, [x0, #-4]
; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr float, ptr %valp, i32 -1
  %val = load float, ptr %valp2
  %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
  %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x float> %shf
}

;; f32 unpacked (nxv2f32, one float per 64-bit lane) splat tests.
define <vscale x 2 x float> @ld1rw_float_unpacked2(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rw_float_unpacked2:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr s0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
; CHECK-NO-LD1R-NEXT:    ret
  %val = load float, ptr %valp
  %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
  %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x float> %shf
}

define <vscale x 2 x float> @ld1rw_float_unpacked2_gep(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x0, #252]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr s0, [x0, #252]
; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr float, ptr %valp, i32 63
  %val = load float, ptr %valp2
  %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
  %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x float> %shf
}

define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_up(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    add x8, x0, #256
; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr s0, [x0, #256]
; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr float, ptr %valp, i32 64
  %val = load float, ptr %valp2
  %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
  %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x float> %shf
}

define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_down(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    sub x8, x0, #4
; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldur s0, [x0, #-4]
; CHECK-NO-LD1R-NEXT:    mov z0.s, s0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr float, ptr %valp, i32 -1
  %val = load float, ptr %valp2
  %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
  %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x float> %shf
}

; Double splat via LD1RD (or ldr d + mov z.d, d when FP LD1R is disabled).
define <vscale x 2 x double> @ld1rd_double(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rd_double:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rd_double:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr d0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.d, d0
; CHECK-NO-LD1R-NEXT:    ret
  %val = load double, ptr %valp
  %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
  %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x double> %shf
}

; Offset 63*8 = 504 bytes is the top of LD1RD's scaled immediate range and
; folds into the load.
define <vscale x 2 x double> @ld1rd_double_gep(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rd_double_gep:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x0, #504]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rd_double_gep:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr d0, [x0, #504]
; CHECK-NO-LD1R-NEXT:    mov z0.d, d0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr double, ptr %valp, i32 63
  %val = load double, ptr %valp2
  %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
  %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x double> %shf
}

; Offset 64*8 = 512 bytes is out of range, so the address needs an add.
define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_up(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_up:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    add x8, x0, #512
; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rd_double_gep_out_of_range_up:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr d0, [x0, #512]
; CHECK-NO-LD1R-NEXT:    mov z0.d, d0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr double, ptr %valp, i32 64
  %val = load double, ptr %valp2
  %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
  %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x double> %shf
}

; Negative offsets are not encodable; the address is materialised with a sub
; (the scalar fallback uses ldur with #-8).
define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_down(ptr %valp) {
; CHECK-LD1R-LABEL: ld1rd_double_gep_out_of_range_down:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ptrue p0.d
; CHECK-LD1R-NEXT:    sub x8, x0, #8
; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x8]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: ld1rd_double_gep_out_of_range_down:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldur d0, [x0, #-8]
; CHECK-NO-LD1R-NEXT:    mov z0.d, d0
; CHECK-NO-LD1R-NEXT:    ret
  %valp2 = getelementptr double, ptr %valp, i32 -1
  %val = load double, ptr %valp2
  %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
  %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x double> %shf
}
882
; sve.dupq.lane with lane index 0 applied to a fixed-length vector loaded from
; memory folds into a single replicating quadword load (ld1rq*). One test per
; element type; FP, bfloat and integer variants.
define <vscale x 2 x double> @dupq_ld1rqd_f64(ptr %a) {
; CHECK-LABEL: dupq_ld1rqd_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %1 = load <2 x double>, ptr %a
  %2 = tail call fast <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %1, i64 0)
  %3 = tail call fast <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %2, i64 0)
  ret <vscale x 2 x double> %3
}

define <vscale x 4 x float> @dupq_ld1rqw_f32(ptr %a) {
; CHECK-LABEL: dupq_ld1rqw_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %1 = load <4 x float>, ptr %a
  %2 = tail call fast <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %1, i64 0)
  %3 = tail call fast <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %2, i64 0)
  ret <vscale x 4 x float> %3
}

define <vscale x 8 x half> @dupq_ld1rqh_f16(ptr %a) {
; CHECK-LABEL: dupq_ld1rqh_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %1 = load <8 x half>, ptr %a
  %2 = tail call fast <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %1, i64 0)
  %3 = tail call fast <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %2, i64 0)
  ret <vscale x 8 x half> %3
}

; bfloat variant carries #0 (extra target features declared elsewhere in the
; file).
define <vscale x 8 x bfloat> @dupq_ld1rqh_bf16(ptr %a) #0 {
; CHECK-LABEL: dupq_ld1rqh_bf16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %1 = load <8 x bfloat>, ptr %a
  %2 = tail call fast <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %1, i64 0)
  %3 = tail call fast <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %2, i64 0)
  ret <vscale x 8 x bfloat> %3
}

define <vscale x 2 x i64> @dupq_ld1rqd_i64(ptr %a) #0 {
; CHECK-LABEL: dupq_ld1rqd_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rqd { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %1 = load <2 x i64>, ptr %a
  %2 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %1, i64 0)
  %3 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %2, i64 0)
  ret <vscale x 2 x i64> %3
}

define <vscale x 4 x i32> @dupq_ld1rqw_i32(ptr %a) #0 {
; CHECK-LABEL: dupq_ld1rqw_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    ld1rqw { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %1 = load <4 x i32>, ptr %a
  %2 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %1, i64 0)
  %3 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %2, i64 0)
  ret <vscale x 4 x i32> %3
}

define <vscale x 8 x i16> @dupq_ld1rqw_i16(ptr %a) #0 {
; CHECK-LABEL: dupq_ld1rqw_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %1 = load <8 x i16>, ptr %a
  %2 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %1, i64 0)
  %3 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %2, i64 0)
  ret <vscale x 8 x i16> %3
}

define <vscale x 16 x i8> @dupq_ld1rqw_i8(ptr %a) #0 {
; CHECK-LABEL: dupq_ld1rqw_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b
; CHECK-NEXT:    ld1rqb { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ret
  %1 = load <16 x i8>, ptr %a
  %2 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %1, i64 0)
  %3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %2, i64 0)
  ret <vscale x 16 x i8> %3
}
978
979;
980;
981; Tests for dup:
982;
983; Positive tests:
; * dup with passthru=undef or passthru=zero.
985; * sign/zero extending.
986; * unpacked types.
987;
988; Negative tests:
989; * dup with passthru as a parameter.
990;
991;
992
; Integer sve.dup with an undef passthru: the inactive-lane zeroing of a
; predicated LD1R is acceptable, so the load + dup fold into one instruction.
; sext/zext of a narrower load selects the sub-element forms (ld1rsb/ld1rb,
; ld1rsh/ld1rh, ld1rsw/ld1rw) with the wider element size in the destination.
define <vscale x 16 x i8> @dup_ld1rb_i8_passthruundef_nxv16i8(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rb_i8_passthruundef_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i8, ptr %addr
    %res = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> %pg, i8 %ld)
    ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @dup_ld1rh_i16_passthruundef_nxv8i16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rh_i16_passthruundef_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i16, ptr %addr
    %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 %ld)
    ret <vscale x 8 x i16> %res
}
; i8 -> i16 extending variants.
define <vscale x 8 x i16> @dup_ld1rh_i8_passthruundef_nxv8i16_sext(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rsb { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i8, ptr %addr
    %ext = sext i8 %ld to i16
    %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 %ext)
    ret <vscale x 8 x i16> %res
}
define <vscale x 8 x i16> @dup_ld1rh_i8_passthruundef_nxv8i16_zext(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rh_i8_passthruundef_nxv8i16_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rb { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i8, ptr %addr
    %ext = zext i8 %ld to i16
    %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> %pg, i16 %ext)
    ret <vscale x 8 x i16> %res
}
define <vscale x 4 x i32> @dup_ld1rs_i32_passthruundef_nxv4i32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i32, ptr %addr
    %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ld)
    ret <vscale x 4 x i32> %res
}
; i8/i16 -> i32 extending variants.
define <vscale x 4 x i32> @dup_ld1rs_i8_passthruundef_nxv4i32_sext(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rsb { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i8, ptr %addr
    %ext = sext i8 %ld to i32
    %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
    ret <vscale x 4 x i32> %res
}
define <vscale x 4 x i32> @dup_ld1rs_i8_passthruundef_nxv4i32_zext(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv4i32_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rb { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i8, ptr %addr
    %ext = zext i8 %ld to i32
    %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
    ret <vscale x 4 x i32> %res
}
define <vscale x 4 x i32> @dup_ld1rs_i16_passthruundef_nxv4i32_sext(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rsh { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i16, ptr %addr
    %ext = sext i16 %ld to i32
    %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
    ret <vscale x 4 x i32> %res
}
define <vscale x 4 x i32> @dup_ld1rs_i16_passthruundef_nxv4i32_zext(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv4i32_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i16, ptr %addr
    %ext = zext i16 %ld to i32
    %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> %pg, i32 %ext)
    ret <vscale x 4 x i32> %res
}
define <vscale x 2 x i64> @dup_ld1rd_i64_passthruundef_nxv2i64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rd_i64_passthruundef_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i64, ptr %addr
    %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ld)
    ret <vscale x 2 x i64> %res
}
; i8/i16/i32 -> i64 extending variants.
define <vscale x 2 x i64> @dup_ld1rs_i8_passthruundef_nxv2i64_sext(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rsb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i8, ptr %addr
    %ext = sext i8 %ld to i64
    %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
    ret <vscale x 2 x i64> %res
}
define <vscale x 2 x i64> @dup_ld1rs_i8_passthruundef_nxv2i64_zext(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i8_passthruundef_nxv2i64_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i8, ptr %addr
    %ext = zext i8 %ld to i64
    %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
    ret <vscale x 2 x i64> %res
}
define <vscale x 2 x i64> @dup_ld1rs_i16_passthruundef_nxv2i64_sext(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rsh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i16, ptr %addr
    %ext = sext i16 %ld to i64
    %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
    ret <vscale x 2 x i64> %res
}
define <vscale x 2 x i64> @dup_ld1rs_i16_passthruundef_nxv2i64_zext(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i16_passthruundef_nxv2i64_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i16, ptr %addr
    %ext = zext i16 %ld to i64
    %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
    ret <vscale x 2 x i64> %res
}
define <vscale x 2 x i64> @dup_ld1rs_i32_passthruundef_nxv2i64_sext(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rsw { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i32, ptr %addr
    %ext = sext i32 %ld to i64
    %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
    ret <vscale x 2 x i64> %res
}
define <vscale x 2 x i64> @dup_ld1rs_i32_passthruundef_nxv2i64_zext(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i32_passthruundef_nxv2i64_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i32, ptr %addr
    %ext = zext i32 %ld to i64
    %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
    ret <vscale x 2 x i64> %res
}
; FP sve.dup with undef passthru: the LD1R run folds load + dup into one
; replicating load; with +no-sve-fp-ld1r the scalar FP load feeds a predicated
; mov instead (undef passthru lets the merging mov reuse z0).
define <vscale x 8 x half> @dup_ld1rh_half_passthruundef_nxv8f16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv8f16:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv8f16:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h0
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load half, ptr %addr
    %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> %pg, half %ld)
    ret <vscale x 8 x half> %res
}
define <vscale x 4 x float> @dup_ld1rs_float_passthruundef_nxv4f32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruundef_nxv4f32:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruundef_nxv4f32:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr s0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.s, p0/m, s0
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load float, ptr %addr
    %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> %pg, float %ld)
    ret <vscale x 4 x float> %res
}
define <vscale x 2 x double> @dup_ld1rd_double_passthruundef_nxv2f64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rd_double_passthruundef_nxv2f64:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rd_double_passthruundef_nxv2f64:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr d0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.d, p0/m, d0
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load double, ptr %addr
    %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> %pg, double %ld)
    ret <vscale x 2 x double> %res
}
; Unpacked half (f16 data in 32-bit containers): the LD1R run uses the .s form.
define <vscale x 4 x half> @dup_ld1rh_half_passthruundef_nxv4f16(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv4f16:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruundef_nxv4f16:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    ldr h0, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h0
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load half, ptr %addr
    %res = call <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> %pg, half %ld)
    ret <vscale x 4 x half> %res
}
; Integer sve.dup with a zero passthru: LD1R's zeroing predication supplies
; exactly the required inactive-lane value, so the fold still applies.
define <vscale x 16 x i8> @dup_ld1rb_i8_passthruzero_nxv16i8(<vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rb_i8_passthruzero_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i8, ptr %addr
    %res = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> zeroinitializer, <vscale x 16 x i1> %pg, i8 %ld)
    ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @dup_ld1rh_i16_passthruzero_nxv8i16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rh_i16_passthruzero_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i16, ptr %addr
    %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> zeroinitializer, <vscale x 8 x i1> %pg, i16 %ld)
    ret <vscale x 8 x i16> %res
}
define <vscale x 4 x i32> @dup_ld1rs_i32_passthruzero_nxv4i32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rs_i32_passthruzero_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i32, ptr %addr
    %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 4 x i1> %pg, i32 %ld)
    ret <vscale x 4 x i32> %res
}
define <vscale x 2 x i64> @dup_ld1rd_i64_passthruzero_nxv2i64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: dup_ld1rd_i64_passthruzero_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
    %ld = load i64, ptr %addr
    %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, i64 %ld)
    ret <vscale x 2 x i64> %res
}
; FP sve.dup with a zero passthru: LD1R still folds; the no-LD1R run must
; materialise the zero vector explicitly before the merging mov.
define <vscale x 8 x half> @dup_ld1rh_half_passthruzero_nxv8f16(<vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rh { z0.h }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv8f16:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    mov z0.h, #0 // =0x0
; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h1
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load half, ptr %addr
    %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> zeroinitializer, <vscale x 8 x i1> %pg, half %ld)
    ret <vscale x 8 x half> %res
}
define <vscale x 4 x float> @dup_ld1rs_float_passthruzero_nxv4f32(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rw { z0.s }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv4f32:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    mov z0.s, #0 // =0x0
; CHECK-NO-LD1R-NEXT:    ldr s1, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.s, p0/m, s1
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load float, ptr %addr
    %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> zeroinitializer, <vscale x 4 x i1> %pg, float %ld)
    ret <vscale x 4 x float> %res
}
define <vscale x 2 x double> @dup_ld1rd_double_passthruzero_nxv2f64(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rd { z0.d }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rd_double_passthruzero_nxv2f64:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    mov z0.d, #0 // =0x0
; CHECK-NO-LD1R-NEXT:    ldr d1, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.d, p0/m, d1
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load double, ptr %addr
    %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> zeroinitializer, <vscale x 2 x i1> %pg, double %ld)
    ret <vscale x 2 x double> %res
}
; Unpacked FP variants: f16 in 32-bit containers (.s), f16 in 64-bit
; containers (.d), f32 in 64-bit containers (.d).
define <vscale x 4 x half> @dup_ld1rh_half_passthruzero_nxv4f16(<vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rh { z0.s }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv4f16:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    mov z0.h, #0 // =0x0
; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h1
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load half, ptr %addr
    %res = call <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half> zeroinitializer, <vscale x 4 x i1> %pg, half %ld)
    ret <vscale x 4 x half> %res
}
define <vscale x 2 x half> @dup_ld1rh_half_passthruzero_nxv2f16(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rh { z0.d }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rh_half_passthruzero_nxv2f16:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    mov z0.h, #0 // =0x0
; CHECK-NO-LD1R-NEXT:    ldr h1, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.h, p0/m, h1
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load half, ptr %addr
    %res = call <vscale x 2 x half> @llvm.aarch64.sve.dup.nxv2f16(<vscale x 2 x half> zeroinitializer, <vscale x 2 x i1> %pg, half %ld)
    ret <vscale x 2 x half> %res
}
define <vscale x 2 x float> @dup_ld1rs_float_passthruzero_nxv2f32(<vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32:
; CHECK-LD1R:       // %bb.0:
; CHECK-LD1R-NEXT:    ld1rw { z0.d }, p0/z, [x0]
; CHECK-LD1R-NEXT:    ret
;
; CHECK-NO-LD1R-LABEL: dup_ld1rs_float_passthruzero_nxv2f32:
; CHECK-NO-LD1R:       // %bb.0:
; CHECK-NO-LD1R-NEXT:    mov z0.s, #0 // =0x0
; CHECK-NO-LD1R-NEXT:    ldr s1, [x0]
; CHECK-NO-LD1R-NEXT:    mov z0.s, p0/m, s1
; CHECK-NO-LD1R-NEXT:    ret
    %ld = load float, ptr %addr
    %res = call <vscale x 2 x float> @llvm.aarch64.sve.dup.nxv2f32(<vscale x 2 x float> zeroinitializer, <vscale x 2 x i1> %pg, float %ld)
    ret <vscale x 2 x float> %res
}
; Negative tests: with an arbitrary (parameter) passthru, LD1R cannot be used
; because its zeroing predication would clobber the inactive lanes; a scalar
; load feeding a merging predicated mov is expected instead.
define <vscale x 16 x i8> @negtest_dup_ld1rb_i8_passthru_nxv16i8(<vscale x 16 x i8> %pt, <vscale x 16 x i1> %pg, ptr %addr) {
; CHECK-LABEL: negtest_dup_ld1rb_i8_passthru_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrb w8, [x0]
; CHECK-NEXT:    mov z0.b, p0/m, w8
; CHECK-NEXT:    ret
    %ld = load i8, ptr %addr
    %res = call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %pt, <vscale x 16 x i1> %pg, i8 %ld)
    ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @negtest_dup_ld1rh_i16_passthru_nxv8i16(<vscale x 8 x i16> %pt, <vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: negtest_dup_ld1rh_i16_passthru_nxv8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrh w8, [x0]
; CHECK-NEXT:    mov z0.h, p0/m, w8
; CHECK-NEXT:    ret
    %ld = load i16, ptr %addr
    %res = call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> %pt, <vscale x 8 x i1> %pg, i16 %ld)
    ret <vscale x 8 x i16> %res
}
define <vscale x 4 x i32> @negtest_dup_ld1rs_i32_passthru_nxv4i32(<vscale x 4 x i32> %pt, <vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: negtest_dup_ld1rs_i32_passthru_nxv4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    mov z0.s, p0/m, w8
; CHECK-NEXT:    ret
    %ld = load i32, ptr %addr
    %res = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %pt, <vscale x 4 x i1> %pg, i32 %ld)
    ret <vscale x 4 x i32> %res
}
define <vscale x 2 x i64> @negtest_dup_ld1rd_i64_passthru_nxv2i64(<vscale x 2 x i64> %pt, <vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: negtest_dup_ld1rd_i64_passthru_nxv2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr x8, [x0]
; CHECK-NEXT:    mov z0.d, p0/m, x8
; CHECK-NEXT:    ret
    %ld = load i64, ptr %addr
    %res = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> %pt, <vscale x 2 x i1> %pg, i64 %ld)
    ret <vscale x 2 x i64> %res
}
define <vscale x 8 x half> @negtest_dup_ld1rh_half_passthru_nxv8f16(<vscale x 8 x half> %pt, <vscale x 8 x i1> %pg, ptr %addr) {
; CHECK-LABEL: negtest_dup_ld1rh_half_passthru_nxv8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h1, [x0]
; CHECK-NEXT:    mov z0.h, p0/m, h1
; CHECK-NEXT:    ret
    %ld = load half, ptr %addr
    %res = call <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half> %pt, <vscale x 8 x i1> %pg, half %ld)
    ret <vscale x 8 x half> %res
}
define <vscale x 4 x float> @negtest_dup_ld1rs_float_passthru_nxv4f32(<vscale x 4 x float> %pt, <vscale x 4 x i1> %pg, ptr %addr) {
; CHECK-LABEL: negtest_dup_ld1rs_float_passthru_nxv4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s1, [x0]
; CHECK-NEXT:    mov z0.s, p0/m, s1
; CHECK-NEXT:    ret
    %ld = load float, ptr %addr
    %res = call <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float> %pt, <vscale x 4 x i1> %pg, float %ld)
    ret <vscale x 4 x float> %res
}
define <vscale x 2 x double> @negtest_dup_ld1rd_double_passthru_nxv2f64(<vscale x 2 x double> %pt, <vscale x 2 x i1> %pg, ptr %addr) {
; CHECK-LABEL: negtest_dup_ld1rd_double_passthru_nxv2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    mov z0.d, p0/m, d1
; CHECK-NEXT:    ret
    %ld = load double, ptr %addr
    %res = call <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double> %pt, <vscale x 2 x i1> %pg, double %ld)
    ret <vscale x 2 x double> %res
}
1411
1412
1413; Check that a load consumed by a scalable splat prefers a replicating load.
; The ld1rsb folds the #1 offset directly, and the pointer increment is kept
; as a separate add instead of forming a pre-indexed ldrsb.
define ptr @avoid_preindex_load(ptr %src, ptr %out) {
; CHECK-LABEL: avoid_preindex_load:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    ld1rsb { z0.d }, p0/z, [x0, #1]
; CHECK-NEXT:    add x0, x0, #1
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %src, i64 1
  %tmp = load i8, ptr %ptr, align 4
  %ext = sext i8 %tmp to i64
  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
  %dup = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  store <vscale x 2 x i64> %dup, ptr %out
  ret ptr %ptr
}
1430
1431; Check that a load consumed by a scalable splat prefers a replicating
1432; load over a pre-indexed load.
; Same as avoid_preindex_load, but the splat comes from the sve.dup intrinsic
; (undef passthru) rather than an insertelement/shufflevector pattern.
define ptr @avoid_preindex_load_dup(ptr %src, <vscale x 2 x i1> %pg, ptr %out) {
; CHECK-LABEL: avoid_preindex_load_dup:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rsb { z0.d }, p0/z, [x0, #1]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    add x0, x0, #1
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %src, i64 1
  %tmp = load i8, ptr %ptr, align 4
  %ext = sext i8 %tmp to i64
  %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, i64 %ext)
  store <vscale x 2 x i64> %dup, ptr %out
  ret ptr %ptr
}
1448
1449; Same as avoid_preindex_load_dup, but with zero passthru.
; Same as avoid_preindex_load_dup, but with zero passthru, which LD1R's
; zeroing predication can still satisfy.
define ptr @avoid_preindex_load_dup_passthru_zero(ptr %src, <vscale x 2 x i1> %pg, ptr %out) {
; CHECK-LABEL: avoid_preindex_load_dup_passthru_zero:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1rsb { z0.d }, p0/z, [x0, #1]
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    add x0, x0, #1
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %src, i64 1
  %tmp = load i8, ptr %ptr, align 4
  %ext = sext i8 %tmp to i64
  %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> %pg, i64 %ext)
  store <vscale x 2 x i64> %dup, ptr %out
  ret ptr %ptr
}
1465
; If a dup has a non-undef passthru, stick with the pre-indexed load.
; Inactive lanes must keep the values from %passthru, which the zeroing
; replicating load cannot provide, so a pre-indexed scalar load (ldrsb with
; writeback) plus a merging "mov z0.d, p0/m, x8" is selected instead.
define ptr @preindex_load_dup_passthru(<vscale x 2 x i64> %passthru, ptr %src, <vscale x 2 x i1> %pg, ptr %out) {
; CHECK-LABEL: preindex_load_dup_passthru:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrsb x8, [x0, #1]!
; CHECK-NEXT:    mov z0.d, p0/m, x8
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %src, i64 1
  %tmp = load i8, ptr %ptr, align 4
  %ext = sext i8 %tmp to i64
  %dup = call <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64> %passthru, <vscale x 2 x i1> %pg, i64 %ext)
  store <vscale x 2 x i64> %dup, ptr %out
  ret ptr %ptr
}
1482
; Show that a second (scalar) user of the extended load value disables the
; preference for a replicating load, so the pre-indexed load is free to
; fire: the value is needed in a GPR anyway (for the "str x8, [x2]"), and
; the splat is done with "mov z0.d, x8" from that register.
define ptr @preidx8sext64_instead_of_ld1r(ptr %src, ptr %out, ptr %dst) {
; CHECK-LABEL: preidx8sext64_instead_of_ld1r:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrsb x8, [x0, #1]!
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z0.d, x8
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    str x8, [x2]
; CHECK-NEXT:    ret
  %ptr = getelementptr inbounds i8, ptr %src, i64 1
  %tmp = load i8, ptr %ptr, align 4
  %ext = sext i8 %tmp to i64
  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
  %dup = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  store <vscale x 2 x i64> %dup, ptr %out
  store i64 %ext, ptr %dst
  ret ptr %ptr
}
1503
1504
1505declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)
1506declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
1507declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
1508declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
1509declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
1510declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
1511declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
1512declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
1513
1514declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
1515declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
1516declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
1517declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
1518declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
1519declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
1520declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
1521declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
1522
1523declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
1524declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
1525declare <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
1526declare <vscale x 2 x i64> @llvm.aarch64.sve.dup.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
1527declare <vscale x 8 x half> @llvm.aarch64.sve.dup.nxv8f16(<vscale x 8 x half>, <vscale x 8 x i1>, half)
1528declare <vscale x 4 x float> @llvm.aarch64.sve.dup.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
1529declare <vscale x 2 x double> @llvm.aarch64.sve.dup.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)
1530declare <vscale x 4 x half> @llvm.aarch64.sve.dup.nxv4f16(<vscale x 4 x half>, <vscale x 4 x i1>, half)
1531declare <vscale x 2 x half> @llvm.aarch64.sve.dup.nxv2f16(<vscale x 2 x half>, <vscale x 2 x i1>, half)
1532declare <vscale x 2 x float> @llvm.aarch64.sve.dup.nxv2f32(<vscale x 2 x float>, <vscale x 2 x i1>, float)
1533
1534
1535attributes #0 = { "target-features"="+sve,+bf16" }
1536attributes #1 = { "target-cpu"="neoverse-v1" }
1537