xref: /llvm-project/llvm/test/CodeGen/AArch64/load-insert-zero.ll (revision 02a1d311bde4a90cffa661215c81f9fef1bc7967)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+bf16,+sve | FileCheck %s
3
; i8 load inserted into lane 0 of a zeroed <8 x i8>; expected to become a single ldr into b0.
4define <8 x i8> @loadv8i8(ptr %p) {
5; CHECK-LABEL: loadv8i8:
6; CHECK:       // %bb.0:
7; CHECK-NEXT:    ldr b0, [x0]
8; CHECK-NEXT:    ret
9  %l = load i8, ptr %p
10  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
11  ret <8 x i8> %v
12}
13
; i8 load inserted into lane 0 of a zeroed <16 x i8>; expected to become a single ldr into b0.
14define <16 x i8> @loadv16i8(ptr %p) {
15; CHECK-LABEL: loadv16i8:
16; CHECK:       // %bb.0:
17; CHECK-NEXT:    ldr b0, [x0]
18; CHECK-NEXT:    ret
19  %l = load i8, ptr %p
20  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
21  ret <16 x i8> %v
22}
23
; i16 load inserted into lane 0 of a zeroed <4 x i16>; expected to become a single ldr into h0.
24define <4 x i16> @loadv4i16(ptr %p) {
25; CHECK-LABEL: loadv4i16:
26; CHECK:       // %bb.0:
27; CHECK-NEXT:    ldr h0, [x0]
28; CHECK-NEXT:    ret
29  %l = load i16, ptr %p
30  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
31  ret <4 x i16> %v
32}
33
; i16 load inserted into lane 0 of a zeroed <8 x i16>; expected to become a single ldr into h0.
34define <8 x i16> @loadv8i16(ptr %p) {
35; CHECK-LABEL: loadv8i16:
36; CHECK:       // %bb.0:
37; CHECK-NEXT:    ldr h0, [x0]
38; CHECK-NEXT:    ret
39  %l = load i16, ptr %p
40  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
41  ret <8 x i16> %v
42}
43
; i32 load inserted into lane 0 of a zeroed <2 x i32>; expected to become a single ldr into s0.
44define <2 x i32> @loadv2i32(ptr %p) {
45; CHECK-LABEL: loadv2i32:
46; CHECK:       // %bb.0:
47; CHECK-NEXT:    ldr s0, [x0]
48; CHECK-NEXT:    ret
49  %l = load i32, ptr %p
50  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
51  ret <2 x i32> %v
52}
53
; i32 load inserted into lane 0 of a zeroed <4 x i32>; expected to become a single ldr into s0.
54define <4 x i32> @loadv4i32(ptr %p) {
55; CHECK-LABEL: loadv4i32:
56; CHECK:       // %bb.0:
57; CHECK-NEXT:    ldr s0, [x0]
58; CHECK-NEXT:    ret
59  %l = load i32, ptr %p
60  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
61  ret <4 x i32> %v
62}
63
; i64 load inserted into lane 0 of a zeroed <2 x i64>; expected to become a single ldr into d0.
64define <2 x i64> @loadv2i64(ptr %p) {
65; CHECK-LABEL: loadv2i64:
66; CHECK:       // %bb.0:
67; CHECK-NEXT:    ldr d0, [x0]
68; CHECK-NEXT:    ret
69  %l = load i64, ptr %p
70  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
71  ret <2 x i64> %v
72}
73
74
; half load inserted into lane 0 of a zeroed <4 x half>; expected to become a single ldr into h0.
75define <4 x half> @loadv4f16(ptr %p) {
76; CHECK-LABEL: loadv4f16:
77; CHECK:       // %bb.0:
78; CHECK-NEXT:    ldr h0, [x0]
79; CHECK-NEXT:    ret
80  %l = load half, ptr %p
81  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
82  ret <4 x half> %v
83}
84
; half load inserted into lane 0 of a zeroed <8 x half>; expected to become a single ldr into h0.
85define <8 x half> @loadv8f16(ptr %p) {
86; CHECK-LABEL: loadv8f16:
87; CHECK:       // %bb.0:
88; CHECK-NEXT:    ldr h0, [x0]
89; CHECK-NEXT:    ret
90  %l = load half, ptr %p
91  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
92  ret <8 x half> %v
93}
94
; bfloat load inserted into lane 0 of a zeroed <4 x bfloat>; expected to become a single ldr into h0.
95define <4 x bfloat> @loadv4bf16(ptr %p) {
96; CHECK-LABEL: loadv4bf16:
97; CHECK:       // %bb.0:
98; CHECK-NEXT:    ldr h0, [x0]
99; CHECK-NEXT:    ret
100  %l = load bfloat, ptr %p
101  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
102  ret <4 x bfloat> %v
103}
104
; bfloat load inserted into lane 0 of a zeroed <8 x bfloat>; expected to become a single ldr into h0.
105define <8 x bfloat> @loadv8bf16(ptr %p) {
106; CHECK-LABEL: loadv8bf16:
107; CHECK:       // %bb.0:
108; CHECK-NEXT:    ldr h0, [x0]
109; CHECK-NEXT:    ret
110  %l = load bfloat, ptr %p
111  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
112  ret <8 x bfloat> %v
113}
114
; float load inserted into lane 0 of a zeroed <2 x float>; expected to become a single ldr into s0.
115define <2 x float> @loadv2f32(ptr %p) {
116; CHECK-LABEL: loadv2f32:
117; CHECK:       // %bb.0:
118; CHECK-NEXT:    ldr s0, [x0]
119; CHECK-NEXT:    ret
120  %l = load float, ptr %p
121  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
122  ret <2 x float> %v
123}
124
; float load inserted into lane 0 of a zeroed <4 x float>; expected to become a single ldr into s0.
125define <4 x float> @loadv4f32(ptr %p) {
126; CHECK-LABEL: loadv4f32:
127; CHECK:       // %bb.0:
128; CHECK-NEXT:    ldr s0, [x0]
129; CHECK-NEXT:    ret
130  %l = load float, ptr %p
131  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
132  ret <4 x float> %v
133}
134
; double load inserted into lane 0 of a zeroed <2 x double>; expected to become a single ldr into d0.
135define <2 x double> @loadv2f64(ptr %p) {
136; CHECK-LABEL: loadv2f64:
137; CHECK:       // %bb.0:
138; CHECK-NEXT:    ldr d0, [x0]
139; CHECK-NEXT:    ret
140  %l = load double, ptr %p
141  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
142  ret <2 x double> %v
143}
144
145
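146; Immediate byte offsets of +1 and -1: only the i8 load at +1 keeps the scaled ldr form, every other case expects the unscaled ldur form.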
147
; i8 loaded from %p + 1 byte into lane 0 of a zeroed <8 x i8>; expected: ldr b0 with immediate #1.
148define <8 x i8> @loadv8i8_offset(ptr %p) {
149; CHECK-LABEL: loadv8i8_offset:
150; CHECK:       // %bb.0:
151; CHECK-NEXT:    ldr b0, [x0, #1]
152; CHECK-NEXT:    ret
153  %g = getelementptr inbounds i8, ptr %p, i64 1
154  %l = load i8, ptr %g
155  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
156  ret <8 x i8> %v
157}
158
; i8 loaded from %p + 1 byte into lane 0 of a zeroed <16 x i8>; expected: ldr b0 with immediate #1.
159define <16 x i8> @loadv16i8_offset(ptr %p) {
160; CHECK-LABEL: loadv16i8_offset:
161; CHECK:       // %bb.0:
162; CHECK-NEXT:    ldr b0, [x0, #1]
163; CHECK-NEXT:    ret
164  %g = getelementptr inbounds i8, ptr %p, i64 1
165  %l = load i8, ptr %g
166  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
167  ret <16 x i8> %v
168}
169
; i16 loaded from %p + 1 byte into lane 0 of a zeroed <4 x i16>; expected: ldur h0 with immediate #1.
170define <4 x i16> @loadv4i16_offset(ptr %p) {
171; CHECK-LABEL: loadv4i16_offset:
172; CHECK:       // %bb.0:
173; CHECK-NEXT:    ldur h0, [x0, #1]
174; CHECK-NEXT:    ret
175  %g = getelementptr inbounds i8, ptr %p, i64 1
176  %l = load i16, ptr %g
177  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
178  ret <4 x i16> %v
179}
180
; i16 loaded from %p + 1 byte into lane 0 of a zeroed <8 x i16>; expected: ldur h0 with immediate #1.
181define <8 x i16> @loadv8i16_offset(ptr %p) {
182; CHECK-LABEL: loadv8i16_offset:
183; CHECK:       // %bb.0:
184; CHECK-NEXT:    ldur h0, [x0, #1]
185; CHECK-NEXT:    ret
186  %g = getelementptr inbounds i8, ptr %p, i64 1
187  %l = load i16, ptr %g
188  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
189  ret <8 x i16> %v
190}
191
; i32 loaded from %p + 1 byte into lane 0 of a zeroed <2 x i32>; expected: ldur s0 with immediate #1.
192define <2 x i32> @loadv2i32_offset(ptr %p) {
193; CHECK-LABEL: loadv2i32_offset:
194; CHECK:       // %bb.0:
195; CHECK-NEXT:    ldur s0, [x0, #1]
196; CHECK-NEXT:    ret
197  %g = getelementptr inbounds i8, ptr %p, i64 1
198  %l = load i32, ptr %g
199  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
200  ret <2 x i32> %v
201}
202
; i32 loaded from %p + 1 byte into lane 0 of a zeroed <4 x i32>; expected: ldur s0 with immediate #1.
203define <4 x i32> @loadv4i32_offset(ptr %p) {
204; CHECK-LABEL: loadv4i32_offset:
205; CHECK:       // %bb.0:
206; CHECK-NEXT:    ldur s0, [x0, #1]
207; CHECK-NEXT:    ret
208  %g = getelementptr inbounds i8, ptr %p, i64 1
209  %l = load i32, ptr %g
210  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
211  ret <4 x i32> %v
212}
213
; i64 loaded from %p + 1 byte into lane 0 of a zeroed <2 x i64>; expected: ldur d0 with immediate #1.
214define <2 x i64> @loadv2i64_offset(ptr %p) {
215; CHECK-LABEL: loadv2i64_offset:
216; CHECK:       // %bb.0:
217; CHECK-NEXT:    ldur d0, [x0, #1]
218; CHECK-NEXT:    ret
219  %g = getelementptr inbounds i8, ptr %p, i64 1
220  %l = load i64, ptr %g
221  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
222  ret <2 x i64> %v
223}
224
225
; half loaded from %p + 1 byte into lane 0 of a zeroed <4 x half>; expected: ldur h0 with immediate #1.
226define <4 x half> @loadv4f16_offset(ptr %p) {
227; CHECK-LABEL: loadv4f16_offset:
228; CHECK:       // %bb.0:
229; CHECK-NEXT:    ldur h0, [x0, #1]
230; CHECK-NEXT:    ret
231  %g = getelementptr inbounds i8, ptr %p, i64 1
232  %l = load half, ptr %g
233  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
234  ret <4 x half> %v
235}
236
; half loaded from %p + 1 byte into lane 0 of a zeroed <8 x half>; expected: ldur h0 with immediate #1.
237define <8 x half> @loadv8f16_offset(ptr %p) {
238; CHECK-LABEL: loadv8f16_offset:
239; CHECK:       // %bb.0:
240; CHECK-NEXT:    ldur h0, [x0, #1]
241; CHECK-NEXT:    ret
242  %g = getelementptr inbounds i8, ptr %p, i64 1
243  %l = load half, ptr %g
244  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
245  ret <8 x half> %v
246}
247
; bfloat loaded from %p + 1 byte into lane 0 of a zeroed <4 x bfloat>; expected: ldur h0 with immediate #1.
248define <4 x bfloat> @loadv4bf16_offset(ptr %p) {
249; CHECK-LABEL: loadv4bf16_offset:
250; CHECK:       // %bb.0:
251; CHECK-NEXT:    ldur h0, [x0, #1]
252; CHECK-NEXT:    ret
253  %g = getelementptr inbounds i8, ptr %p, i64 1
254  %l = load bfloat, ptr %g
255  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
256  ret <4 x bfloat> %v
257}
258
; bfloat loaded from %p + 1 byte into lane 0 of a zeroed <8 x bfloat>; expected: ldur h0 with immediate #1.
259define <8 x bfloat> @loadv8bf16_offset(ptr %p) {
260; CHECK-LABEL: loadv8bf16_offset:
261; CHECK:       // %bb.0:
262; CHECK-NEXT:    ldur h0, [x0, #1]
263; CHECK-NEXT:    ret
264  %g = getelementptr inbounds i8, ptr %p, i64 1
265  %l = load bfloat, ptr %g
266  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
267  ret <8 x bfloat> %v
268}
269
; float loaded from %p + 1 byte into lane 0 of a zeroed <2 x float>; expected: ldur s0 with immediate #1.
270define <2 x float> @loadv2f32_offset(ptr %p) {
271; CHECK-LABEL: loadv2f32_offset:
272; CHECK:       // %bb.0:
273; CHECK-NEXT:    ldur s0, [x0, #1]
274; CHECK-NEXT:    ret
275  %g = getelementptr inbounds i8, ptr %p, i64 1
276  %l = load float, ptr %g
277  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
278  ret <2 x float> %v
279}
280
; float loaded from %p + 1 byte into lane 0 of a zeroed <4 x float>; expected: ldur s0 with immediate #1.
281define <4 x float> @loadv4f32_offset(ptr %p) {
282; CHECK-LABEL: loadv4f32_offset:
283; CHECK:       // %bb.0:
284; CHECK-NEXT:    ldur s0, [x0, #1]
285; CHECK-NEXT:    ret
286  %g = getelementptr inbounds i8, ptr %p, i64 1
287  %l = load float, ptr %g
288  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
289  ret <4 x float> %v
290}
291
; double loaded from %p + 1 byte into lane 0 of a zeroed <2 x double>; expected: ldur d0 with immediate #1.
292define <2 x double> @loadv2f64_offset(ptr %p) {
293; CHECK-LABEL: loadv2f64_offset:
294; CHECK:       // %bb.0:
295; CHECK-NEXT:    ldur d0, [x0, #1]
296; CHECK-NEXT:    ret
297  %g = getelementptr inbounds i8, ptr %p, i64 1
298  %l = load double, ptr %g
299  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
300  ret <2 x double> %v
301}
302
303
; i8 loaded from %p - 1 byte into lane 0 of a zeroed <8 x i8>; expected: ldur b0 with immediate #-1.
304define <8 x i8> @loadv8i8_noffset(ptr %p) {
305; CHECK-LABEL: loadv8i8_noffset:
306; CHECK:       // %bb.0:
307; CHECK-NEXT:    ldur b0, [x0, #-1]
308; CHECK-NEXT:    ret
309  %g = getelementptr inbounds i8, ptr %p, i64 -1
310  %l = load i8, ptr %g
311  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
312  ret <8 x i8> %v
313}
314
; i8 loaded from %p - 1 byte into lane 0 of a zeroed <16 x i8>; expected: ldur b0 with immediate #-1.
315define <16 x i8> @loadv16i8_noffset(ptr %p) {
316; CHECK-LABEL: loadv16i8_noffset:
317; CHECK:       // %bb.0:
318; CHECK-NEXT:    ldur b0, [x0, #-1]
319; CHECK-NEXT:    ret
320  %g = getelementptr inbounds i8, ptr %p, i64 -1
321  %l = load i8, ptr %g
322  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
323  ret <16 x i8> %v
324}
325
; i16 loaded from %p - 1 byte into lane 0 of a zeroed <4 x i16>; expected: ldur h0 with immediate #-1.
326define <4 x i16> @loadv4i16_noffset(ptr %p) {
327; CHECK-LABEL: loadv4i16_noffset:
328; CHECK:       // %bb.0:
329; CHECK-NEXT:    ldur h0, [x0, #-1]
330; CHECK-NEXT:    ret
331  %g = getelementptr inbounds i8, ptr %p, i64 -1
332  %l = load i16, ptr %g
333  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
334  ret <4 x i16> %v
335}
336
; i16 loaded from %p - 1 byte into lane 0 of a zeroed <8 x i16>; expected: ldur h0 with immediate #-1.
337define <8 x i16> @loadv8i16_noffset(ptr %p) {
338; CHECK-LABEL: loadv8i16_noffset:
339; CHECK:       // %bb.0:
340; CHECK-NEXT:    ldur h0, [x0, #-1]
341; CHECK-NEXT:    ret
342  %g = getelementptr inbounds i8, ptr %p, i64 -1
343  %l = load i16, ptr %g
344  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
345  ret <8 x i16> %v
346}
347
; i32 loaded from %p - 1 byte into lane 0 of a zeroed <2 x i32>; expected: ldur s0 with immediate #-1.
348define <2 x i32> @loadv2i32_noffset(ptr %p) {
349; CHECK-LABEL: loadv2i32_noffset:
350; CHECK:       // %bb.0:
351; CHECK-NEXT:    ldur s0, [x0, #-1]
352; CHECK-NEXT:    ret
353  %g = getelementptr inbounds i8, ptr %p, i64 -1
354  %l = load i32, ptr %g
355  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
356  ret <2 x i32> %v
357}
358
; i32 loaded from %p - 1 byte into lane 0 of a zeroed <4 x i32>; expected: ldur s0 with immediate #-1.
359define <4 x i32> @loadv4i32_noffset(ptr %p) {
360; CHECK-LABEL: loadv4i32_noffset:
361; CHECK:       // %bb.0:
362; CHECK-NEXT:    ldur s0, [x0, #-1]
363; CHECK-NEXT:    ret
364  %g = getelementptr inbounds i8, ptr %p, i64 -1
365  %l = load i32, ptr %g
366  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
367  ret <4 x i32> %v
368}
369
; i64 loaded from %p - 1 byte into lane 0 of a zeroed <2 x i64>; expected: ldur d0 with immediate #-1.
370define <2 x i64> @loadv2i64_noffset(ptr %p) {
371; CHECK-LABEL: loadv2i64_noffset:
372; CHECK:       // %bb.0:
373; CHECK-NEXT:    ldur d0, [x0, #-1]
374; CHECK-NEXT:    ret
375  %g = getelementptr inbounds i8, ptr %p, i64 -1
376  %l = load i64, ptr %g
377  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
378  ret <2 x i64> %v
379}
380
; half loaded from %p - 1 byte into lane 0 of a zeroed <4 x half>; expected: ldur h0 with immediate #-1.
381define <4 x half> @loadv4f16_noffset(ptr %p) {
382; CHECK-LABEL: loadv4f16_noffset:
383; CHECK:       // %bb.0:
384; CHECK-NEXT:    ldur h0, [x0, #-1]
385; CHECK-NEXT:    ret
386  %g = getelementptr inbounds i8, ptr %p, i64 -1
387  %l = load half, ptr %g
388  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
389  ret <4 x half> %v
390}
391
; half loaded from %p - 1 byte into lane 0 of a zeroed <8 x half>; expected: ldur h0 with immediate #-1.
392define <8 x half> @loadv8f16_noffset(ptr %p) {
393; CHECK-LABEL: loadv8f16_noffset:
394; CHECK:       // %bb.0:
395; CHECK-NEXT:    ldur h0, [x0, #-1]
396; CHECK-NEXT:    ret
397  %g = getelementptr inbounds i8, ptr %p, i64 -1
398  %l = load half, ptr %g
399  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
400  ret <8 x half> %v
401}
402
; bfloat loaded from %p - 1 byte into lane 0 of a zeroed <4 x bfloat>; expected: ldur h0 with immediate #-1.
403define <4 x bfloat> @loadv4bf16_noffset(ptr %p) {
404; CHECK-LABEL: loadv4bf16_noffset:
405; CHECK:       // %bb.0:
406; CHECK-NEXT:    ldur h0, [x0, #-1]
407; CHECK-NEXT:    ret
408  %g = getelementptr inbounds i8, ptr %p, i64 -1
409  %l = load bfloat, ptr %g
410  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
411  ret <4 x bfloat> %v
412}
413
; bfloat loaded from %p - 1 byte into lane 0 of a zeroed <8 x bfloat>; expected: ldur h0 with immediate #-1.
414define <8 x bfloat> @loadv8bf16_noffset(ptr %p) {
415; CHECK-LABEL: loadv8bf16_noffset:
416; CHECK:       // %bb.0:
417; CHECK-NEXT:    ldur h0, [x0, #-1]
418; CHECK-NEXT:    ret
419  %g = getelementptr inbounds i8, ptr %p, i64 -1
420  %l = load bfloat, ptr %g
421  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
422  ret <8 x bfloat> %v
423}
424
; float loaded from %p - 1 byte into lane 0 of a zeroed <2 x float>; expected: ldur s0 with immediate #-1.
425define <2 x float> @loadv2f32_noffset(ptr %p) {
426; CHECK-LABEL: loadv2f32_noffset:
427; CHECK:       // %bb.0:
428; CHECK-NEXT:    ldur s0, [x0, #-1]
429; CHECK-NEXT:    ret
430  %g = getelementptr inbounds i8, ptr %p, i64 -1
431  %l = load float, ptr %g
432  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
433  ret <2 x float> %v
434}
435
; float loaded from %p - 1 byte into lane 0 of a zeroed <4 x float>; expected: ldur s0 with immediate #-1.
436define <4 x float> @loadv4f32_noffset(ptr %p) {
437; CHECK-LABEL: loadv4f32_noffset:
438; CHECK:       // %bb.0:
439; CHECK-NEXT:    ldur s0, [x0, #-1]
440; CHECK-NEXT:    ret
441  %g = getelementptr inbounds i8, ptr %p, i64 -1
442  %l = load float, ptr %g
443  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
444  ret <4 x float> %v
445}
446
; double loaded from %p - 1 byte into lane 0 of a zeroed <2 x double>; expected: ldur d0 with immediate #-1.
447define <2 x double> @loadv2f64_noffset(ptr %p) {
448; CHECK-LABEL: loadv2f64_noffset:
449; CHECK:       // %bb.0:
450; CHECK-NEXT:    ldur d0, [x0, #-1]
451; CHECK-NEXT:    ret
452  %g = getelementptr inbounds i8, ptr %p, i64 -1
453  %l = load double, ptr %g
454  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
455  ret <2 x double> %v
456}
457
458
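459; roW: register-offset addressing with a 32-bit index (w1, sign-extended with sxtw, plus a shift for elements wider than one byte)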
460
; i8 at %p indexed by i32 %o, inserted into lane 0 of a zeroed <8 x i8>; expected: ldr b0 with a sxtw register offset.
461define <8 x i8> @loadv8i8_roW(ptr %p, i32 %o) {
462; CHECK-LABEL: loadv8i8_roW:
463; CHECK:       // %bb.0:
464; CHECK-NEXT:    ldr b0, [x0, w1, sxtw]
465; CHECK-NEXT:    ret
466  %g = getelementptr inbounds i8, ptr %p, i32 %o
467  %l = load i8, ptr %g
468  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
469  ret <8 x i8> %v
470}
471
; i8 at %p indexed by i32 %o, inserted into lane 0 of a zeroed <16 x i8>; expected: ldr b0 with a sxtw register offset.
472define <16 x i8> @loadv16i8_roW(ptr %p, i32 %o) {
473; CHECK-LABEL: loadv16i8_roW:
474; CHECK:       // %bb.0:
475; CHECK-NEXT:    ldr b0, [x0, w1, sxtw]
476; CHECK-NEXT:    ret
477  %g = getelementptr inbounds i8, ptr %p, i32 %o
478  %l = load i8, ptr %g
479  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
480  ret <16 x i8> %v
481}
482
; i16 element %o (i32 index) of %p, inserted into lane 0 of a zeroed <4 x i16>; expected: ldr h0 with sxtw #1.
483define <4 x i16> @loadv4i16_roW(ptr %p, i32 %o) {
484; CHECK-LABEL: loadv4i16_roW:
485; CHECK:       // %bb.0:
486; CHECK-NEXT:    ldr h0, [x0, w1, sxtw #1]
487; CHECK-NEXT:    ret
488  %g = getelementptr inbounds i16, ptr %p, i32 %o
489  %l = load i16, ptr %g
490  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
491  ret <4 x i16> %v
492}
493
; i16 element %o (i32 index) of %p, inserted into lane 0 of a zeroed <8 x i16>; expected: ldr h0 with sxtw #1.
494define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) {
495; CHECK-LABEL: loadv8i16_roW:
496; CHECK:       // %bb.0:
497; CHECK-NEXT:    ldr h0, [x0, w1, sxtw #1]
498; CHECK-NEXT:    ret
499  %g = getelementptr inbounds i16, ptr %p, i32 %o
500  %l = load i16, ptr %g
501  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
502  ret <8 x i16> %v
503}
504
; i32 element %o (i32 index) of %p, inserted into lane 0 of a zeroed <2 x i32>; expected: ldr s0 with sxtw #2.
505define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) {
506; CHECK-LABEL: loadv2i32_roW:
507; CHECK:       // %bb.0:
508; CHECK-NEXT:    ldr s0, [x0, w1, sxtw #2]
509; CHECK-NEXT:    ret
510  %g = getelementptr inbounds i32, ptr %p, i32 %o
511  %l = load i32, ptr %g
512  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
513  ret <2 x i32> %v
514}
515
; i32 element %o (i32 index) of %p, inserted into lane 0 of a zeroed <4 x i32>; expected: ldr s0 with sxtw #2.
516define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) {
517; CHECK-LABEL: loadv4i32_roW:
518; CHECK:       // %bb.0:
519; CHECK-NEXT:    ldr s0, [x0, w1, sxtw #2]
520; CHECK-NEXT:    ret
521  %g = getelementptr inbounds i32, ptr %p, i32 %o
522  %l = load i32, ptr %g
523  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
524  ret <4 x i32> %v
525}
526
; i64 element %o (i32 index) of %p, inserted into lane 0 of a zeroed <2 x i64>; expected: ldr d0 with sxtw #3.
527define <2 x i64> @loadv2i64_roW(ptr %p, i32 %o) {
528; CHECK-LABEL: loadv2i64_roW:
529; CHECK:       // %bb.0:
530; CHECK-NEXT:    ldr d0, [x0, w1, sxtw #3]
531; CHECK-NEXT:    ret
532  %g = getelementptr inbounds i64, ptr %p, i32 %o
533  %l = load i64, ptr %g
534  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
535  ret <2 x i64> %v
536}
537
; half element %o (i32 index) of %p, inserted into lane 0 of a zeroed <4 x half>; expected: ldr h0 with sxtw #1.
538define <4 x half> @loadv4f16_roW(ptr %p, i32 %o) {
539; CHECK-LABEL: loadv4f16_roW:
540; CHECK:       // %bb.0:
541; CHECK-NEXT:    ldr h0, [x0, w1, sxtw #1]
542; CHECK-NEXT:    ret
543  %g = getelementptr inbounds half, ptr %p, i32 %o
544  %l = load half, ptr %g
545  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
546  ret <4 x half> %v
547}
548
; half element %o (i32 index) of %p, inserted into lane 0 of a zeroed <8 x half>; expected: ldr h0 with sxtw #1.
549define <8 x half> @loadv8f16_roW(ptr %p, i32 %o) {
550; CHECK-LABEL: loadv8f16_roW:
551; CHECK:       // %bb.0:
552; CHECK-NEXT:    ldr h0, [x0, w1, sxtw #1]
553; CHECK-NEXT:    ret
554  %g = getelementptr inbounds half, ptr %p, i32 %o
555  %l = load half, ptr %g
556  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
557  ret <8 x half> %v
558}
559
; bfloat element %o (i32 index) of %p, inserted into lane 0 of a zeroed <4 x bfloat>; expected: ldr h0 with sxtw #1.
560define <4 x bfloat> @loadv4bf16_roW(ptr %p, i32 %o) {
561; CHECK-LABEL: loadv4bf16_roW:
562; CHECK:       // %bb.0:
563; CHECK-NEXT:    ldr h0, [x0, w1, sxtw #1]
564; CHECK-NEXT:    ret
565  %g = getelementptr inbounds bfloat, ptr %p, i32 %o
566  %l = load bfloat, ptr %g
567  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
568  ret <4 x bfloat> %v
569}
570
; bfloat element %o (i32 index) of %p, inserted into lane 0 of a zeroed <8 x bfloat>; expected: ldr h0 with sxtw #1.
571define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) {
572; CHECK-LABEL: loadv8bf16_roW:
573; CHECK:       // %bb.0:
574; CHECK-NEXT:    ldr h0, [x0, w1, sxtw #1]
575; CHECK-NEXT:    ret
576  %g = getelementptr inbounds bfloat, ptr %p, i32 %o
577  %l = load bfloat, ptr %g
578  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
579  ret <8 x bfloat> %v
580}
581
; float element %o (i32 index) of %p, inserted into lane 0 of a zeroed <2 x float>; expected: ldr s0 with sxtw #2.
582define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) {
583; CHECK-LABEL: loadv2f32_roW:
584; CHECK:       // %bb.0:
585; CHECK-NEXT:    ldr s0, [x0, w1, sxtw #2]
586; CHECK-NEXT:    ret
587  %g = getelementptr inbounds float, ptr %p, i32 %o
588  %l = load float, ptr %g
589  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
590  ret <2 x float> %v
591}
592
; float element %o (i32 index) of %p, inserted into lane 0 of a zeroed <4 x float>; expected: ldr s0 with sxtw #2.
593define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) {
594; CHECK-LABEL: loadv4f32_roW:
595; CHECK:       // %bb.0:
596; CHECK-NEXT:    ldr s0, [x0, w1, sxtw #2]
597; CHECK-NEXT:    ret
598  %g = getelementptr inbounds float, ptr %p, i32 %o
599  %l = load float, ptr %g
600  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
601  ret <4 x float> %v
602}
603
; double element %o (i32 index) of %p, inserted into lane 0 of a zeroed <2 x double>; expected: ldr d0 with sxtw #3.
604define <2 x double> @loadv2f64_roW(ptr %p, i32 %o) {
605; CHECK-LABEL: loadv2f64_roW:
606; CHECK:       // %bb.0:
607; CHECK-NEXT:    ldr d0, [x0, w1, sxtw #3]
608; CHECK-NEXT:    ret
609  %g = getelementptr inbounds double, ptr %p, i32 %o
610  %l = load double, ptr %g
611  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
612  ret <2 x double> %v
613}
614
615
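616; roX: register-offset addressing with a 64-bit index (x1, with an lsl shift for elements wider than one byte)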
617
; i8 at %p indexed by i64 %o, inserted into lane 0 of a zeroed <8 x i8>; expected: ldr b0 with an x1 register offset.
618define <8 x i8> @loadv8i8_roX(ptr %p, i64 %o) {
619; CHECK-LABEL: loadv8i8_roX:
620; CHECK:       // %bb.0:
621; CHECK-NEXT:    ldr b0, [x0, x1]
622; CHECK-NEXT:    ret
623  %g = getelementptr inbounds i8, ptr %p, i64 %o
624  %l = load i8, ptr %g
625  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
626  ret <8 x i8> %v
627}
628
; i8 at %p indexed by i64 %o, inserted into lane 0 of a zeroed <16 x i8>; expected: ldr b0 with an x1 register offset.
629define <16 x i8> @loadv16i8_roX(ptr %p, i64 %o) {
630; CHECK-LABEL: loadv16i8_roX:
631; CHECK:       // %bb.0:
632; CHECK-NEXT:    ldr b0, [x0, x1]
633; CHECK-NEXT:    ret
634  %g = getelementptr inbounds i8, ptr %p, i64 %o
635  %l = load i8, ptr %g
636  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
637  ret <16 x i8> %v
638}
639
; i16 element %o (i64 index) of %p, inserted into lane 0 of a zeroed <4 x i16>; expected: ldr h0 with lsl #1.
640define <4 x i16> @loadv4i16_roX(ptr %p, i64 %o) {
641; CHECK-LABEL: loadv4i16_roX:
642; CHECK:       // %bb.0:
643; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
644; CHECK-NEXT:    ret
645  %g = getelementptr inbounds i16, ptr %p, i64 %o
646  %l = load i16, ptr %g
647  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
648  ret <4 x i16> %v
649}
650
; i16 element %o (i64 index) of %p, inserted into lane 0 of a zeroed <8 x i16>; expected: ldr h0 with lsl #1.
651define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) {
652; CHECK-LABEL: loadv8i16_roX:
653; CHECK:       // %bb.0:
654; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
655; CHECK-NEXT:    ret
656  %g = getelementptr inbounds i16, ptr %p, i64 %o
657  %l = load i16, ptr %g
658  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
659  ret <8 x i16> %v
660}
661
; i32 element %o (i64 index) of %p, inserted into lane 0 of a zeroed <2 x i32>; expected: ldr s0 with lsl #2.
662define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) {
663; CHECK-LABEL: loadv2i32_roX:
664; CHECK:       // %bb.0:
665; CHECK-NEXT:    ldr s0, [x0, x1, lsl #2]
666; CHECK-NEXT:    ret
667  %g = getelementptr inbounds i32, ptr %p, i64 %o
668  %l = load i32, ptr %g
669  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
670  ret <2 x i32> %v
671}
672
; i32 element %o (i64 index) of %p, inserted into lane 0 of a zeroed <4 x i32>; expected: ldr s0 with lsl #2.
673define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) {
674; CHECK-LABEL: loadv4i32_roX:
675; CHECK:       // %bb.0:
676; CHECK-NEXT:    ldr s0, [x0, x1, lsl #2]
677; CHECK-NEXT:    ret
678  %g = getelementptr inbounds i32, ptr %p, i64 %o
679  %l = load i32, ptr %g
680  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
681  ret <4 x i32> %v
682}
683
; i64 element %o (i64 index) of %p, inserted into lane 0 of a zeroed <2 x i64>; expected: ldr d0 with lsl #3.
684define <2 x i64> @loadv2i64_roX(ptr %p, i64 %o) {
685; CHECK-LABEL: loadv2i64_roX:
686; CHECK:       // %bb.0:
687; CHECK-NEXT:    ldr d0, [x0, x1, lsl #3]
688; CHECK-NEXT:    ret
689  %g = getelementptr inbounds i64, ptr %p, i64 %o
690  %l = load i64, ptr %g
691  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
692  ret <2 x i64> %v
693}
694
; half element %o (i64 index) of %p, inserted into lane 0 of a zeroed <4 x half>; expected: ldr h0 with lsl #1.
695define <4 x half> @loadv4f16_roX(ptr %p, i64 %o) {
696; CHECK-LABEL: loadv4f16_roX:
697; CHECK:       // %bb.0:
698; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
699; CHECK-NEXT:    ret
700  %g = getelementptr inbounds half, ptr %p, i64 %o
701  %l = load half, ptr %g
702  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
703  ret <4 x half> %v
704}
705
; half element %o (i64 index) of %p, inserted into lane 0 of a zeroed <8 x half>; expected: ldr h0 with lsl #1.
706define <8 x half> @loadv8f16_roX(ptr %p, i64 %o) {
707; CHECK-LABEL: loadv8f16_roX:
708; CHECK:       // %bb.0:
709; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
710; CHECK-NEXT:    ret
711  %g = getelementptr inbounds half, ptr %p, i64 %o
712  %l = load half, ptr %g
713  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
714  ret <8 x half> %v
715}
716
; bfloat element %o (i64 index) of %p, inserted into lane 0 of a zeroed <4 x bfloat>; expected: ldr h0 with lsl #1.
717define <4 x bfloat> @loadv4bf16_roX(ptr %p, i64 %o) {
718; CHECK-LABEL: loadv4bf16_roX:
719; CHECK:       // %bb.0:
720; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
721; CHECK-NEXT:    ret
722  %g = getelementptr inbounds bfloat, ptr %p, i64 %o
723  %l = load bfloat, ptr %g
724  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
725  ret <4 x bfloat> %v
726}
727
; bfloat element %o (i64 index) of %p, inserted into lane 0 of a zeroed <8 x bfloat>; expected: ldr h0 with lsl #1.
728define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) {
729; CHECK-LABEL: loadv8bf16_roX:
730; CHECK:       // %bb.0:
731; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
732; CHECK-NEXT:    ret
733  %g = getelementptr inbounds bfloat, ptr %p, i64 %o
734  %l = load bfloat, ptr %g
735  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
736  ret <8 x bfloat> %v
737}
738
; float element %o (i64 index) of %p, inserted into lane 0 of a zeroed <2 x float>; expected: ldr s0 with lsl #2.
739define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) {
740; CHECK-LABEL: loadv2f32_roX:
741; CHECK:       // %bb.0:
742; CHECK-NEXT:    ldr s0, [x0, x1, lsl #2]
743; CHECK-NEXT:    ret
744  %g = getelementptr inbounds float, ptr %p, i64 %o
745  %l = load float, ptr %g
746  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
747  ret <2 x float> %v
748}
749
; float element %o (i64 index) of %p, inserted into lane 0 of a zeroed <4 x float>; expected: ldr s0 with lsl #2.
750define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) {
751; CHECK-LABEL: loadv4f32_roX:
752; CHECK:       // %bb.0:
753; CHECK-NEXT:    ldr s0, [x0, x1, lsl #2]
754; CHECK-NEXT:    ret
755  %g = getelementptr inbounds float, ptr %p, i64 %o
756  %l = load float, ptr %g
757  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
758  ret <4 x float> %v
759}
760
; double element %o (i64 index) of %p, inserted into lane 0 of a zeroed <2 x double>; expected: ldr d0 with lsl #3.
761define <2 x double> @loadv2f64_roX(ptr %p, i64 %o) {
762; CHECK-LABEL: loadv2f64_roX:
763; CHECK:       // %bb.0:
764; CHECK-NEXT:    ldr d0, [x0, x1, lsl #3]
765; CHECK-NEXT:    ret
766  %g = getelementptr inbounds double, ptr %p, i64 %o
767  %l = load double, ptr %g
768  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
769  ret <2 x double> %v
770}
771
772
; Larger case: three overlapping i32 loads from %2, %2+1 and %2+2 are each placed in
; lane 0 of a <2 x i32> whose lane 1 is zero, then combined with urhadd / rshrn and
; stored as four i32 rows at %0, %0+%1, %0+2*%1 and %0+3*%1. %3 is not used.
; In the expected output the first two loads go straight into s1 / s2, while the third
; is loaded into w8 and inserted into a zeroed v0; %13 also has a scalar use (the lshr).
; Don't-care lanes and shuffle-mask elements use poison rather than the deprecated undef.
773define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) {
774; CHECK-LABEL: predictor_4x4_neon:
775; CHECK:       // %bb.0:
776; CHECK-NEXT:    movi v0.2d, #0000000000000000
777; CHECK-NEXT:    ldur w8, [x2, #2]
778; CHECK-NEXT:    ldr s1, [x2]
779; CHECK-NEXT:    ldur s2, [x2, #1]
780; CHECK-NEXT:    ushll v3.8h, v2.8b, #1
781; CHECK-NEXT:    mov v0.s[0], w8
782; CHECK-NEXT:    lsr w8, w8, #24
783; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
784; CHECK-NEXT:    urhadd v1.8b, v1.8b, v2.8b
785; CHECK-NEXT:    add v0.8h, v0.8h, v3.8h
786; CHECK-NEXT:    dup v3.8b, w8
787; CHECK-NEXT:    str s1, [x0]
788; CHECK-NEXT:    lsl x8, x1, #1
789; CHECK-NEXT:    rshrn v0.8b, v0.8h, #2
790; CHECK-NEXT:    zip1 v2.2s, v1.2s, v3.2s
791; CHECK-NEXT:    str s0, [x0, x1]
792; CHECK-NEXT:    zip1 v3.2s, v0.2s, v3.2s
793; CHECK-NEXT:    ext v2.8b, v2.8b, v0.8b, #1
794; CHECK-NEXT:    ext v1.8b, v3.8b, v0.8b, #1
795; CHECK-NEXT:    str s2, [x0, x8]
796; CHECK-NEXT:    add x8, x8, x1
797; CHECK-NEXT:    str s1, [x0, x8]
798; CHECK-NEXT:    ret
799  %5 = load i32, ptr %2, align 4
800  %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
801  %7 = bitcast <2 x i32> %6 to <8 x i8>
802  %8 = getelementptr inbounds i8, ptr %2, i64 1
803  %9 = load i32, ptr %8, align 4
804  %10 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %9, i64 0
805  %11 = bitcast <2 x i32> %10 to <8 x i8>
806  %12 = getelementptr inbounds i8, ptr %2, i64 2
807  %13 = load i32, ptr %12, align 4
808  %14 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %13, i64 0
809  %15 = bitcast <2 x i32> %14 to <8 x i8>
; Splat the top byte of the third load across all eight lanes.
810  %16 = lshr i32 %13, 24
811  %17 = trunc i32 %16 to i8
812  %18 = insertelement <8 x i8> poison, i8 %17, i64 0
813  %19 = shufflevector <8 x i8> %18, <8 x i8> poison, <8 x i32> zeroinitializer
814  %20 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %7, <8 x i8> %11)
; Widen to i16 and form %23 + %21 + 2 * %22 before the rounding narrowing shift by 2.
815  %21 = zext <8 x i8> %7 to <8 x i16>
816  %22 = zext <8 x i8> %11 to <8 x i16>
817  %23 = zext <8 x i8> %15 to <8 x i16>
818  %24 = shl nuw nsw <8 x i16> %22, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
819  %25 = add nuw nsw <8 x i16> %23, %21
820  %26 = add nuw nsw <8 x i16> %25, %24
821  %27 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %26, i32 2)
; Row 0: low 32 bits of the urhadd result.
822  %28 = bitcast <8 x i8> %20 to <2 x i32>
823  %29 = extractelement <2 x i32> %28, i64 0
824  store i32 %29, ptr %0, align 4
; Row 1: low 32 bits of the rshrn result.
825  %30 = bitcast <8 x i8> %27 to <2 x i32>
826  %31 = getelementptr inbounds i8, ptr %0, i64 %1
827  %32 = extractelement <2 x i32> %30, i64 0
828  store i32 %32, ptr %31, align 4
; Rows 2 and 3: rows 0 and 1 paired with the splat, then bytes 1..4 of each pair.
829  %33 = bitcast <8 x i8> %19 to <2 x i32>
830  %34 = shufflevector <2 x i32> %28, <2 x i32> %33, <2 x i32> <i32 0, i32 2>
831  %35 = bitcast <2 x i32> %34 to <8 x i8>
832  %36 = shufflevector <2 x i32> %30, <2 x i32> %33, <2 x i32> <i32 0, i32 2>
833  %37 = bitcast <2 x i32> %36 to <8 x i8>
834  %38 = shufflevector <8 x i8> %35, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
835  %39 = bitcast <8 x i8> %38 to <2 x i32>
836  %40 = shl nsw i64 %1, 1
837  %41 = getelementptr inbounds i8, ptr %0, i64 %40
838  %42 = extractelement <2 x i32> %39, i64 0
839  store i32 %42, ptr %41, align 4
840  %43 = shufflevector <8 x i8> %37, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
841  %44 = bitcast <8 x i8> %43 to <2 x i32>
842  %45 = mul nsw i64 %1, 3
843  %46 = getelementptr inbounds i8, ptr %0, i64 %45
844  %47 = extractelement <2 x i32> %44, i64 0
845  store i32 %47, ptr %46, align 4
846  ret void
847}
848
; Variant with four overlapping i32 loads from %2, %2+1, %2+2 and %2+3, each placed in
; lane 0 of a <2 x i32> whose lane 1 is zero. None of the loaded values has a scalar
; use, and the expected output loads all four directly into s0..s3 (one ldr, three ldur).
; The four i32 rows are stored at %0, %0+%1, %0+2*%1 and %0+3*%1. %3 is not used.
849define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) {
850; CHECK-LABEL: predictor_4x4_neon_new:
851; CHECK:       // %bb.0:
852; CHECK-NEXT:    ldr s0, [x2]
853; CHECK-NEXT:    ldur s1, [x2, #1]
854; CHECK-NEXT:    lsl x8, x1, #1
855; CHECK-NEXT:    ldur s2, [x2, #2]
856; CHECK-NEXT:    ldur s3, [x2, #3]
857; CHECK-NEXT:    uaddl v4.8h, v1.8b, v0.8b
858; CHECK-NEXT:    urhadd v0.8b, v0.8b, v1.8b
859; CHECK-NEXT:    add x9, x8, x1
860; CHECK-NEXT:    uaddl v5.8h, v2.8b, v1.8b
861; CHECK-NEXT:    uaddl v3.8h, v3.8b, v2.8b
862; CHECK-NEXT:    urhadd v1.8b, v1.8b, v2.8b
863; CHECK-NEXT:    str s0, [x0]
864; CHECK-NEXT:    add v4.8h, v4.8h, v5.8h
865; CHECK-NEXT:    add v3.8h, v3.8h, v5.8h
866; CHECK-NEXT:    rshrn v4.8b, v4.8h, #2
867; CHECK-NEXT:    rshrn v0.8b, v3.8h, #2
868; CHECK-NEXT:    str s4, [x0, x1]
869; CHECK-NEXT:    str s1, [x0, x8]
870; CHECK-NEXT:    str s0, [x0, x9]
871; CHECK-NEXT:    ret
872  %5 = load i32, ptr %2, align 4
873  %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
874  %7 = bitcast <2 x i32> %6 to <8 x i8>
875  %8 = getelementptr inbounds i8, ptr %2, i64 1
876  %9 = load i32, ptr %8, align 4
877  %10 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %9, i64 0
878  %11 = bitcast <2 x i32> %10 to <8 x i8>
879  %12 = getelementptr inbounds i8, ptr %2, i64 2
880  %13 = load i32, ptr %12, align 4
881  %14 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %13, i64 0
882  %15 = bitcast <2 x i32> %14 to <8 x i8>
883  %16 = getelementptr inbounds i8, ptr %2, i64 3
884  %17 = load i32, ptr %16, align 4
885  %18 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %17, i64 0
886  %19 = bitcast <2 x i32> %18 to <8 x i8>
; Rounding halving adds of neighbouring loads feed rows 0 and 2.
887  %20 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %7, <8 x i8> %11)
888  %21 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %11, <8 x i8> %15)
; Widened sums: %27 = %22 + 2 * %23 + %25 (row 1), %31 = %23 + 2 * %25 + %29 (row 3).
889  %22 = zext <8 x i8> %7 to <8 x i16>
890  %23 = zext <8 x i8> %11 to <8 x i16>
891  %24 = add nuw nsw <8 x i16> %23, %22
892  %25 = zext <8 x i8> %15 to <8 x i16>
893  %26 = add nuw nsw <8 x i16> %25, %23
894  %27 = add nuw nsw <8 x i16> %24, %26
895  %28 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %27, i32 2)
896  %29 = zext <8 x i8> %19 to <8 x i16>
897  %30 = add nuw nsw <8 x i16> %29, %25
898  %31 = add nuw nsw <8 x i16> %30, %26
899  %32 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %31, i32 2)
; Each row stores the low 32 bits (lane 0 of the <2 x i32> view) of its result.
900  %33 = bitcast <8 x i8> %20 to <2 x i32>
901  %34 = extractelement <2 x i32> %33, i64 0
902  store i32 %34, ptr %0, align 4
903  %35 = bitcast <8 x i8> %28 to <2 x i32>
904  %36 = getelementptr inbounds i8, ptr %0, i64 %1
905  %37 = extractelement <2 x i32> %35, i64 0
906  store i32 %37, ptr %36, align 4
907  %38 = bitcast <8 x i8> %21 to <2 x i32>
908  %39 = shl nsw i64 %1, 1
909  %40 = getelementptr inbounds i8, ptr %0, i64 %39
910  %41 = extractelement <2 x i32> %38, i64 0
911  store i32 %41, ptr %40, align 4
912  %42 = bitcast <8 x i8> %32 to <2 x i32>
913  %43 = mul nsw i64 %1, 3
914  %44 = getelementptr inbounds i8, ptr %0, i64 %43
915  %45 = extractelement <2 x i32> %42, i64 0
916  store i32 %45, ptr %44, align 4
917  ret void
918}
919
920
; Scalable <vscale x 8 x i8>: the expected output is not a single ldr; the byte is loaded
; into w8 and moved under a vl1 predicate into a zeroed z0.h.
921define <vscale x 8 x i8> @loadnxv8i8(ptr %p) {
922; CHECK-LABEL: loadnxv8i8:
923; CHECK:       // %bb.0:
924; CHECK-NEXT:    mov z0.h, #0 // =0x0
925; CHECK-NEXT:    ldrb w8, [x0]
926; CHECK-NEXT:    ptrue p0.h, vl1
927; CHECK-NEXT:    mov z0.h, p0/m, w8
928; CHECK-NEXT:    ret
929  %l = load i8, ptr %p
930  %v = insertelement <vscale x 8 x i8> zeroinitializer, i8 %l, i32 0
931  ret <vscale x 8 x i8> %v
932}
933
; Scalable <vscale x 16 x i8>: i8 load into lane 0 of a zeroed vector; expected to become a single ldr into b0.
934define <vscale x 16 x i8> @loadnxv16i8(ptr %p) {
935; CHECK-LABEL: loadnxv16i8:
936; CHECK:       // %bb.0:
937; CHECK-NEXT:    ldr b0, [x0]
938; CHECK-NEXT:    ret
939  %l = load i8, ptr %p
940  %v = insertelement <vscale x 16 x i8> zeroinitializer, i8 %l, i32 0
941  ret <vscale x 16 x i8> %v
942}
943
; loadnxv4i16: load one i16 from %p and insert it into lane 0 of an all-zero
; <vscale x 4 x i16>. The expected code zeroes z0 as .s elements, loads the
; halfword into w8 with ldrh, and merges it into z0.s under "ptrue p0.s, vl1".
944define <vscale x 4 x i16> @loadnxv4i16(ptr %p) {
945; CHECK-LABEL: loadnxv4i16:
946; CHECK:       // %bb.0:
947; CHECK-NEXT:    mov z0.s, #0 // =0x0
948; CHECK-NEXT:    ldrh w8, [x0]
949; CHECK-NEXT:    ptrue p0.s, vl1
950; CHECK-NEXT:    mov z0.s, p0/m, w8
951; CHECK-NEXT:    ret
952  %l = load i16, ptr %p
953  %v = insertelement <vscale x 4 x i16> zeroinitializer, i16 %l, i32 0
954  ret <vscale x 4 x i16> %v
955}
956
; loadnxv8i16: load one i16 from %p and insert it into lane 0 of an all-zero
; <vscale x 8 x i16>. Expected to select to a single "ldr h0, [x0]".
957define <vscale x 8 x i16> @loadnxv8i16(ptr %p) {
958; CHECK-LABEL: loadnxv8i16:
959; CHECK:       // %bb.0:
960; CHECK-NEXT:    ldr h0, [x0]
961; CHECK-NEXT:    ret
962  %l = load i16, ptr %p
963  %v = insertelement <vscale x 8 x i16> zeroinitializer, i16 %l, i32 0
964  ret <vscale x 8 x i16> %v
965}
966
; loadnxv2i32: load one i32 from %p and insert it into lane 0 of an all-zero
; <vscale x 2 x i32>. The expected code zeroes z0 as .d elements, loads the
; word into w8, and merges x8 into z0.d under "ptrue p0.d, vl1".
967define <vscale x 2 x i32> @loadnxv2i32(ptr %p) {
968; CHECK-LABEL: loadnxv2i32:
969; CHECK:       // %bb.0:
970; CHECK-NEXT:    mov z0.d, #0 // =0x0
971; CHECK-NEXT:    ldr w8, [x0]
972; CHECK-NEXT:    ptrue p0.d, vl1
973; CHECK-NEXT:    mov z0.d, p0/m, x8
974; CHECK-NEXT:    ret
975  %l = load i32, ptr %p
976  %v = insertelement <vscale x 2 x i32> zeroinitializer, i32 %l, i32 0
977  ret <vscale x 2 x i32> %v
978}
979
; loadnxv4i32: load one i32 from %p and insert it into lane 0 of an all-zero
; <vscale x 4 x i32>. Expected to select to a single "ldr s0, [x0]".
980define <vscale x 4 x i32> @loadnxv4i32(ptr %p) {
981; CHECK-LABEL: loadnxv4i32:
982; CHECK:       // %bb.0:
983; CHECK-NEXT:    ldr s0, [x0]
984; CHECK-NEXT:    ret
985  %l = load i32, ptr %p
986  %v = insertelement <vscale x 4 x i32> zeroinitializer, i32 %l, i32 0
987  ret <vscale x 4 x i32> %v
988}
989
; loadnxv2i64: load one i64 from %p and insert it into lane 0 of an all-zero
; <vscale x 2 x i64>. Expected to select to a single "ldr d0, [x0]".
990define <vscale x 2 x i64> @loadnxv2i64(ptr %p) {
991; CHECK-LABEL: loadnxv2i64:
992; CHECK:       // %bb.0:
993; CHECK-NEXT:    ldr d0, [x0]
994; CHECK-NEXT:    ret
995  %l = load i64, ptr %p
996  %v = insertelement <vscale x 2 x i64> zeroinitializer, i64 %l, i32 0
997  ret <vscale x 2 x i64> %v
998}
999
1000
; loadnxv4f16: load one half from %p and insert it into lane 0 of an all-zero
; <vscale x 4 x half>. The expected code builds a predicate by comparing an
; index vector (index z0.s, #0, #1) for equality with a splat of zero, zeroes
; z0, loads the half into h1, and merges h1 into z0.h under that predicate.
1001define <vscale x 4 x half> @loadnxv4f16(ptr %p) {
1002; CHECK-LABEL: loadnxv4f16:
1003; CHECK:       // %bb.0:
1004; CHECK-NEXT:    mov w8, wzr
1005; CHECK-NEXT:    index z0.s, #0, #1
1006; CHECK-NEXT:    ptrue p0.s
1007; CHECK-NEXT:    mov z1.s, w8
1008; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
1009; CHECK-NEXT:    mov z0.h, #0 // =0x0
1010; CHECK-NEXT:    ldr h1, [x0]
1011; CHECK-NEXT:    mov z0.h, p0/m, h1
1012; CHECK-NEXT:    ret
1013  %l = load half, ptr %p
1014  %v = insertelement <vscale x 4 x half> zeroinitializer, half %l, i32 0
1015  ret <vscale x 4 x half> %v
1016}
1017
; loadnxv8f16: load one half from %p and insert it into lane 0 of an all-zero
; <vscale x 8 x half>. Expected to select to a single "ldr h0, [x0]".
1018define <vscale x 8 x half> @loadnxv8f16(ptr %p) {
1019; CHECK-LABEL: loadnxv8f16:
1020; CHECK:       // %bb.0:
1021; CHECK-NEXT:    ldr h0, [x0]
1022; CHECK-NEXT:    ret
1023  %l = load half, ptr %p
1024  %v = insertelement <vscale x 8 x half> zeroinitializer, half %l, i32 0
1025  ret <vscale x 8 x half> %v
1026}
1027
; loadnxv4bf16: load one bfloat from %p and insert it into lane 0 of an
; all-zero <vscale x 4 x bfloat>. The expected code builds a predicate by
; comparing an index vector (index z0.s, #0, #1) for equality with a splat of
; zero, zeroes z0, loads the value into h1, and merges h1 into z0.h under that
; predicate.
1028define <vscale x 4 x bfloat> @loadnxv4bf16(ptr %p) {
1029; CHECK-LABEL: loadnxv4bf16:
1030; CHECK:       // %bb.0:
1031; CHECK-NEXT:    mov w8, wzr
1032; CHECK-NEXT:    index z0.s, #0, #1
1033; CHECK-NEXT:    ptrue p0.s
1034; CHECK-NEXT:    mov z1.s, w8
1035; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
1036; CHECK-NEXT:    mov z0.h, #0 // =0x0
1037; CHECK-NEXT:    ldr h1, [x0]
1038; CHECK-NEXT:    mov z0.h, p0/m, h1
1039; CHECK-NEXT:    ret
1040  %l = load bfloat, ptr %p
1041  %v = insertelement <vscale x 4 x bfloat> zeroinitializer, bfloat %l, i32 0
1042  ret <vscale x 4 x bfloat> %v
1043}
1044
; loadnxv8bf16: load one bfloat from %p and insert it into lane 0 of an
; all-zero <vscale x 8 x bfloat>. Expected to select to a single
; "ldr h0, [x0]".
1045define <vscale x 8 x bfloat> @loadnxv8bf16(ptr %p) {
1046; CHECK-LABEL: loadnxv8bf16:
1047; CHECK:       // %bb.0:
1048; CHECK-NEXT:    ldr h0, [x0]
1049; CHECK-NEXT:    ret
1050  %l = load bfloat, ptr %p
1051  %v = insertelement <vscale x 8 x bfloat> zeroinitializer, bfloat %l, i32 0
1052  ret <vscale x 8 x bfloat> %v
1053}
1054
; loadnxv2f32: load one float from %p and insert it into lane 0 of an all-zero
; <vscale x 2 x float>. The expected code builds a predicate by comparing an
; index vector (index z0.d, #0, #1) for equality with a splat of zero, zeroes
; z0, loads the float into s1, and merges s1 into z0.s under that predicate.
1055define <vscale x 2 x float> @loadnxv2f32(ptr %p) {
1056; CHECK-LABEL: loadnxv2f32:
1057; CHECK:       // %bb.0:
1058; CHECK-NEXT:    mov x8, xzr
1059; CHECK-NEXT:    index z0.d, #0, #1
1060; CHECK-NEXT:    ptrue p0.d
1061; CHECK-NEXT:    mov z1.d, x8
1062; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
1063; CHECK-NEXT:    mov z0.s, #0 // =0x0
1064; CHECK-NEXT:    ldr s1, [x0]
1065; CHECK-NEXT:    mov z0.s, p0/m, s1
1066; CHECK-NEXT:    ret
1067  %l = load float, ptr %p
1068  %v = insertelement <vscale x 2 x float> zeroinitializer, float %l, i32 0
1069  ret <vscale x 2 x float> %v
1070}
1071
; loadnxv4f32: load one float from %p and insert it into lane 0 of an all-zero
; <vscale x 4 x float>. Expected to select to a single "ldr s0, [x0]".
1072define <vscale x 4 x float> @loadnxv4f32(ptr %p) {
1073; CHECK-LABEL: loadnxv4f32:
1074; CHECK:       // %bb.0:
1075; CHECK-NEXT:    ldr s0, [x0]
1076; CHECK-NEXT:    ret
1077  %l = load float, ptr %p
1078  %v = insertelement <vscale x 4 x float> zeroinitializer, float %l, i32 0
1079  ret <vscale x 4 x float> %v
1080}
1081
; loadnxv2f64: load one double from %p and insert it into lane 0 of an
; all-zero <vscale x 2 x double>. Expected to select to a single
; "ldr d0, [x0]".
1082define <vscale x 2 x double> @loadnxv2f64(ptr %p) {
1083; CHECK-LABEL: loadnxv2f64:
1084; CHECK:       // %bb.0:
1085; CHECK-NEXT:    ldr d0, [x0]
1086; CHECK-NEXT:    ret
1087  %l = load double, ptr %p
1088  %v = insertelement <vscale x 2 x double> zeroinitializer, double %l, i32 0
1089  ret <vscale x 2 x double> %v
1090}
1091
1092
1093; Unscaled: the tests below repeat the scalable-vector cases above with the load taken from %p + 1 byte.
1094
; loadnxv8i8_offset: load one i8 from %p + 1 byte and insert it into lane 0 of
; an all-zero <vscale x 8 x i8>. The expected code zeroes z0 as .h elements,
; loads the byte with "ldrb w8, [x0, #1]", and merges it into z0.h under
; "ptrue p0.h, vl1".
1095define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) {
1096; CHECK-LABEL: loadnxv8i8_offset:
1097; CHECK:       // %bb.0:
1098; CHECK-NEXT:    mov z0.h, #0 // =0x0
1099; CHECK-NEXT:    ldrb w8, [x0, #1]
1100; CHECK-NEXT:    ptrue p0.h, vl1
1101; CHECK-NEXT:    mov z0.h, p0/m, w8
1102; CHECK-NEXT:    ret
1103  %g = getelementptr inbounds i8, ptr %p, i64 1
1104  %l = load i8, ptr %g
1105  %v = insertelement <vscale x 8 x i8> zeroinitializer, i8 %l, i32 0
1106  ret <vscale x 8 x i8> %v
1107}
1108
; loadnxv16i8_offset: load one i8 from %p + 1 byte and insert it into lane 0
; of an all-zero <vscale x 16 x i8>. Expected to select to a single
; "ldr b0, [x0, #1]", with the offset folded into the load.
1109define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) {
1110; CHECK-LABEL: loadnxv16i8_offset:
1111; CHECK:       // %bb.0:
1112; CHECK-NEXT:    ldr b0, [x0, #1]
1113; CHECK-NEXT:    ret
1114  %g = getelementptr inbounds i8, ptr %p, i64 1
1115  %l = load i8, ptr %g
1116  %v = insertelement <vscale x 16 x i8> zeroinitializer, i8 %l, i32 0
1117  ret <vscale x 16 x i8> %v
1118}
1119
; loadnxv4i16_offset: load one i16 from %p + 1 byte and insert it into lane 0
; of an all-zero <vscale x 4 x i16>. The expected code zeroes z0 as .s
; elements, loads the halfword with "ldurh w8, [x0, #1]", and merges it into
; z0.s under "ptrue p0.s, vl1".
1120define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) {
1121; CHECK-LABEL: loadnxv4i16_offset:
1122; CHECK:       // %bb.0:
1123; CHECK-NEXT:    mov z0.s, #0 // =0x0
1124; CHECK-NEXT:    ldurh w8, [x0, #1]
1125; CHECK-NEXT:    ptrue p0.s, vl1
1126; CHECK-NEXT:    mov z0.s, p0/m, w8
1127; CHECK-NEXT:    ret
1128  %g = getelementptr inbounds i8, ptr %p, i64 1
1129  %l = load i16, ptr %g
1130  %v = insertelement <vscale x 4 x i16> zeroinitializer, i16 %l, i32 0
1131  ret <vscale x 4 x i16> %v
1132}
1133
; loadnxv8i16_offset: load one i16 from %p + 1 byte and insert it into lane 0
; of an all-zero <vscale x 8 x i16>. Expected to select to a single
; "ldur h0, [x0, #1]".
1134define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) {
1135; CHECK-LABEL: loadnxv8i16_offset:
1136; CHECK:       // %bb.0:
1137; CHECK-NEXT:    ldur h0, [x0, #1]
1138; CHECK-NEXT:    ret
1139  %g = getelementptr inbounds i8, ptr %p, i64 1
1140  %l = load i16, ptr %g
1141  %v = insertelement <vscale x 8 x i16> zeroinitializer, i16 %l, i32 0
1142  ret <vscale x 8 x i16> %v
1143}
1144
; loadnxv2i32_offset: load one i32 from %p + 1 byte and insert it into lane 0
; of an all-zero <vscale x 2 x i32>. The expected code zeroes z0 as .d
; elements, loads the word with "ldur w8, [x0, #1]", and merges x8 into z0.d
; under "ptrue p0.d, vl1".
1145define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) {
1146; CHECK-LABEL: loadnxv2i32_offset:
1147; CHECK:       // %bb.0:
1148; CHECK-NEXT:    mov z0.d, #0 // =0x0
1149; CHECK-NEXT:    ldur w8, [x0, #1]
1150; CHECK-NEXT:    ptrue p0.d, vl1
1151; CHECK-NEXT:    mov z0.d, p0/m, x8
1152; CHECK-NEXT:    ret
1153  %g = getelementptr inbounds i8, ptr %p, i64 1
1154  %l = load i32, ptr %g
1155  %v = insertelement <vscale x 2 x i32> zeroinitializer, i32 %l, i32 0
1156  ret <vscale x 2 x i32> %v
1157}
1158
; loadnxv4i32_offset: load one i32 from %p + 1 byte and insert it into lane 0
; of an all-zero <vscale x 4 x i32>. Expected to select to a single
; "ldur s0, [x0, #1]".
1159define <vscale x 4 x i32> @loadnxv4i32_offset(ptr %p) {
1160; CHECK-LABEL: loadnxv4i32_offset:
1161; CHECK:       // %bb.0:
1162; CHECK-NEXT:    ldur s0, [x0, #1]
1163; CHECK-NEXT:    ret
1164  %g = getelementptr inbounds i8, ptr %p, i64 1
1165  %l = load i32, ptr %g
1166  %v = insertelement <vscale x 4 x i32> zeroinitializer, i32 %l, i32 0
1167  ret <vscale x 4 x i32> %v
1168}
1169
; loadnxv2i64_offset: load one i64 from %p + 1 byte and insert it into lane 0
; of an all-zero <vscale x 2 x i64>. Expected to select to a single
; "ldur d0, [x0, #1]".
1170define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) {
1171; CHECK-LABEL: loadnxv2i64_offset:
1172; CHECK:       // %bb.0:
1173; CHECK-NEXT:    ldur d0, [x0, #1]
1174; CHECK-NEXT:    ret
1175  %g = getelementptr inbounds i8, ptr %p, i64 1
1176  %l = load i64, ptr %g
1177  %v = insertelement <vscale x 2 x i64> zeroinitializer, i64 %l, i32 0
1178  ret <vscale x 2 x i64> %v
1179}
1180
1181
; loadnxv4f16_offset: load one half from %p + 1 byte and insert it into lane 0
; of an all-zero <vscale x 4 x half>. The expected code builds a predicate by
; comparing an index vector (index z0.s, #0, #1) for equality with a splat of
; zero, zeroes z0, loads the half with "ldur h1, [x0, #1]", and merges h1 into
; z0.h under that predicate.
1182define <vscale x 4 x half> @loadnxv4f16_offset(ptr %p) {
1183; CHECK-LABEL: loadnxv4f16_offset:
1184; CHECK:       // %bb.0:
1185; CHECK-NEXT:    mov w8, wzr
1186; CHECK-NEXT:    index z0.s, #0, #1
1187; CHECK-NEXT:    ptrue p0.s
1188; CHECK-NEXT:    mov z1.s, w8
1189; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
1190; CHECK-NEXT:    mov z0.h, #0 // =0x0
1191; CHECK-NEXT:    ldur h1, [x0, #1]
1192; CHECK-NEXT:    mov z0.h, p0/m, h1
1193; CHECK-NEXT:    ret
1194  %g = getelementptr inbounds i8, ptr %p, i64 1
1195  %l = load half, ptr %g
1196  %v = insertelement <vscale x 4 x half> zeroinitializer, half %l, i32 0
1197  ret <vscale x 4 x half> %v
1198}
1199
; loadnxv8f16_offset: load one half from %p + 1 byte and insert it into lane 0
; of an all-zero <vscale x 8 x half>. Expected to select to a single
; "ldur h0, [x0, #1]".
1200define <vscale x 8 x half> @loadnxv8f16_offset(ptr %p) {
1201; CHECK-LABEL: loadnxv8f16_offset:
1202; CHECK:       // %bb.0:
1203; CHECK-NEXT:    ldur h0, [x0, #1]
1204; CHECK-NEXT:    ret
1205  %g = getelementptr inbounds i8, ptr %p, i64 1
1206  %l = load half, ptr %g
1207  %v = insertelement <vscale x 8 x half> zeroinitializer, half %l, i32 0
1208  ret <vscale x 8 x half> %v
1209}
1210
; loadnxv4bf16_offset: load one bfloat from %p + 1 byte and insert it into
; lane 0 of an all-zero <vscale x 4 x bfloat>. The expected code builds a
; predicate by comparing an index vector (index z0.s, #0, #1) for equality
; with a splat of zero, zeroes z0, loads the value with "ldur h1, [x0, #1]",
; and merges h1 into z0.h under that predicate.
1211define <vscale x 4 x bfloat> @loadnxv4bf16_offset(ptr %p) {
1212; CHECK-LABEL: loadnxv4bf16_offset:
1213; CHECK:       // %bb.0:
1214; CHECK-NEXT:    mov w8, wzr
1215; CHECK-NEXT:    index z0.s, #0, #1
1216; CHECK-NEXT:    ptrue p0.s
1217; CHECK-NEXT:    mov z1.s, w8
1218; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, z1.s
1219; CHECK-NEXT:    mov z0.h, #0 // =0x0
1220; CHECK-NEXT:    ldur h1, [x0, #1]
1221; CHECK-NEXT:    mov z0.h, p0/m, h1
1222; CHECK-NEXT:    ret
1223  %g = getelementptr inbounds i8, ptr %p, i64 1
1224  %l = load bfloat, ptr %g
1225  %v = insertelement <vscale x 4 x bfloat> zeroinitializer, bfloat %l, i32 0
1226  ret <vscale x 4 x bfloat> %v
1227}
1228
; loadnxv8bf16_offset: load one bfloat from %p + 1 byte and insert it into
; lane 0 of an all-zero <vscale x 8 x bfloat>. Expected to select to a single
; "ldur h0, [x0, #1]".
1229define <vscale x 8 x bfloat> @loadnxv8bf16_offset(ptr %p) {
1230; CHECK-LABEL: loadnxv8bf16_offset:
1231; CHECK:       // %bb.0:
1232; CHECK-NEXT:    ldur h0, [x0, #1]
1233; CHECK-NEXT:    ret
1234  %g = getelementptr inbounds i8, ptr %p, i64 1
1235  %l = load bfloat, ptr %g
1236  %v = insertelement <vscale x 8 x bfloat> zeroinitializer, bfloat %l, i32 0
1237  ret <vscale x 8 x bfloat> %v
1238}
1239
; loadnxv2f32_offset: load one float from %p + 1 byte and insert it into lane
; 0 of an all-zero <vscale x 2 x float>. The expected code builds a predicate
; by comparing an index vector (index z0.d, #0, #1) for equality with a splat
; of zero, zeroes z0, loads the float with "ldur s1, [x0, #1]", and merges s1
; into z0.s under that predicate.
1240define <vscale x 2 x float> @loadnxv2f32_offset(ptr %p) {
1241; CHECK-LABEL: loadnxv2f32_offset:
1242; CHECK:       // %bb.0:
1243; CHECK-NEXT:    mov x8, xzr
1244; CHECK-NEXT:    index z0.d, #0, #1
1245; CHECK-NEXT:    ptrue p0.d
1246; CHECK-NEXT:    mov z1.d, x8
1247; CHECK-NEXT:    cmpeq p0.d, p0/z, z0.d, z1.d
1248; CHECK-NEXT:    mov z0.s, #0 // =0x0
1249; CHECK-NEXT:    ldur s1, [x0, #1]
1250; CHECK-NEXT:    mov z0.s, p0/m, s1
1251; CHECK-NEXT:    ret
1252  %g = getelementptr inbounds i8, ptr %p, i64 1
1253  %l = load float, ptr %g
1254  %v = insertelement <vscale x 2 x float> zeroinitializer, float %l, i32 0
1255  ret <vscale x 2 x float> %v
1256}
1257
; loadnxv4f32_offset: load one float from %p + 1 byte and insert it into lane
; 0 of an all-zero <vscale x 4 x float>. Expected to select to a single
; "ldur s0, [x0, #1]".
1258define <vscale x 4 x float> @loadnxv4f32_offset(ptr %p) {
1259; CHECK-LABEL: loadnxv4f32_offset:
1260; CHECK:       // %bb.0:
1261; CHECK-NEXT:    ldur s0, [x0, #1]
1262; CHECK-NEXT:    ret
1263  %g = getelementptr inbounds i8, ptr %p, i64 1
1264  %l = load float, ptr %g
1265  %v = insertelement <vscale x 4 x float> zeroinitializer, float %l, i32 0
1266  ret <vscale x 4 x float> %v
1267}
1268
; loadnxv2f64_offset: load one double from %p + 1 byte and insert it into lane
; 0 of an all-zero <vscale x 2 x double>. Expected to select to a single
; "ldur d0, [x0, #1]".
1269define <vscale x 2 x double> @loadnxv2f64_offset(ptr %p) {
1270; CHECK-LABEL: loadnxv2f64_offset:
1271; CHECK:       // %bb.0:
1272; CHECK-NEXT:    ldur d0, [x0, #1]
1273; CHECK-NEXT:    ret
1274  %g = getelementptr inbounds i8, ptr %p, i64 1
1275  %l = load double, ptr %g
1276  %v = insertelement <vscale x 2 x double> zeroinitializer, double %l, i32 0
1277  ret <vscale x 2 x double> %v
1278}
1279
1280
; Declarations of the NEON intrinsics (rshrn, urhadd) that are called by the
; fixed-width <8 x i8> test function ending just before @loadnxv8i8.
; NOTE(review): attribute group #1 is referenced here; its definition is not
; visible in this part of the file.
1281declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) #1
1282declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) #1
1283