xref: /llvm-project/llvm/test/CodeGen/AArch64/insert-subvector.ll (revision f6947e479e14e7904aa0b2539a95f5dfdc8f9295)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s
3
4
; i8 elements, sub-vector supplied in a register operand
6
; Result bytes 0-1 are %a[0..1]; bytes 2-15 are taken from %b.
; Expected lowering: copy %b, then a single halfword lane move into h[0].
define <16 x i8> @insert_v16i8_2_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_2_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.h[0], v2.h[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %ins
}
16
; Result bytes 2-3 are %a[0..1]; every other byte is taken from %b.
; Expected lowering: copy %b, then a single halfword lane move into h[1].
define <16 x i8> @insert_v16i8_2_2(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_2_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.h[1], v2.h[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %ins
}
26
; Result bytes 12-13 are %a[0..1]; every other byte is taken from %b.
; Expected lowering: copy %b, then a single halfword lane move into h[6].
define <16 x i8> @insert_v16i8_2_6(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_2_6:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.h[6], v2.h[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 0, i32 1, i32 30, i32 31>
  ret <16 x i8> %ins
}
36
; Result bytes 0-3 are %a[0..3]; bytes 4-15 are taken from %b.
; Expected lowering: copy %b, then a single word lane move into s[0].
define <16 x i8> @insert_v16i8_4_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.s[0], v2.s[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %ins
}
46
; Result bytes 2-5 are %a[0..3]; every other byte is taken from %b.
; The 4-byte group does not start on a 4-byte boundary, and the expected
; lowering is a two-register tbl driven by a constant-pool index vector.
define <16 x i8> @insert_v16i8_4_15(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_15:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q2_q3
; CHECK-NEXT:    adrp x8, .LCPI4_0
; CHECK-NEXT:    mov v3.16b, v1.16b
; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %ins
}
59
; Result bytes 4-7 are %a[0..3]; every other byte is taken from %b.
; Expected lowering: copy %b, then a single word lane move into s[1].
define <16 x i8> @insert_v16i8_4_2(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.s[1], v2.s[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %ins
}
69
; Result bytes 8-11 are %a[0..3]; every other byte is taken from %b.
; Expected lowering: copy %b, then a single word lane move into s[2].
define <16 x i8> @insert_v16i8_4_3(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.s[2], v2.s[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %ins
}
79
; Result bytes 12-15 are %a[0..3]; bytes 0-11 are taken from %b.
; Expected lowering: copy %b, then a single word lane move into s[3].
define <16 x i8> @insert_v16i8_4_4(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_4_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.s[3], v2.s[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i8> %ins
}
89
; 64-bit vector: result bytes 0-3 are %a[0..3], bytes 4-7 are %b[4..7].
; Expected lowering: copy %a, then move word lane 1 of %b into s[1].
define <8 x i8> @insert_v8i8_4_1(float %tmp, <8 x i8> %b, <8 x i8> %a) {
; CHECK-LABEL: insert_v8i8_4_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    mov v0.s[1], v1.s[1]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ins = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %ins
}
101
; 64-bit vector: result bytes 0-3 are %b[0..3], bytes 4-7 are %a[0..3].
; Expected lowering: copy %b, then move word lane 0 of %a into s[1].
define <8 x i8> @insert_v8i8_4_2(float %tmp, <8 x i8> %b, <8 x i8> %a) {
; CHECK-LABEL: insert_v8i8_4_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    mov v0.s[1], v2.s[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ins = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i8> %ins
}
113
; Result bytes 0-7 are %a[0..7], bytes 8-15 are %b[8..15].
; Expected lowering: copy %a, then move the high doubleword of %b into d[1].
define <16 x i8> @insert_v16i8_8_1(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_8_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    mov v0.d[1], v1.d[1]
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %ins
}
123
; Result bytes 0-7 are %b[0..7], bytes 8-15 are %a[0..7].
; Expected lowering: copy %b, then move the low doubleword of %a into d[1].
define <16 x i8> @insert_v16i8_8_2(float %tmp, <16 x i8> %b, <16 x i8> %a) {
; CHECK-LABEL: insert_v16i8_8_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.d[1], v2.d[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %ins
}
133
; i16 elements, sub-vector supplied in a register operand
135
; Result halfwords 0-1 are %a[0..1]; halfwords 2-7 are taken from %b.
; Expected lowering: copy %b, then a single word lane move into s[0].
define <8 x i16> @insert_v8i16_2_1(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.s[0], v2.s[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %ins
}
145
; Result halfwords 1-2 are %a[0..1]; every other halfword is taken from %b.
; The pair does not start on a word boundary, and the expected lowering is a
; two-register tbl driven by a constant-pool index vector.
define <8 x i16> @insert_v8i16_2_15(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_15:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q2 killed $q2 def $q2_q3
; CHECK-NEXT:    adrp x8, .LCPI13_0
; CHECK-NEXT:    mov v3.16b, v1.16b
; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI13_0]
; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
; CHECK-NEXT:    ret
  %ins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 1, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %ins
}
158
; Result halfwords 2-3 are %a[0..1]; every other halfword is taken from %b.
; Expected lowering: copy %b, then a single word lane move into s[1].
define <8 x i16> @insert_v8i16_2_2(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.s[1], v2.s[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 0, i32 1, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %ins
}
168
; Result halfwords 4-5 are %a[0..1]; every other halfword is taken from %b.
; Expected lowering: copy %b, then a single word lane move into s[2].
define <8 x i16> @insert_v8i16_2_3(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.s[2], v2.s[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 14, i32 15>
  ret <8 x i16> %ins
}
178
; Result halfwords 6-7 are %a[0..1]; halfwords 0-5 are taken from %b.
; Expected lowering: copy %b, then a single word lane move into s[3].
define <8 x i16> @insert_v8i16_2_4(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_2_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.s[3], v2.s[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 0, i32 1>
  ret <8 x i16> %ins
}
188
; 64-bit vector: result halfwords 0-1 are %a[0..1], halfwords 2-3 are %b[2..3].
; Expected lowering: copy %a, then move word lane 1 of %b into s[1].
define <4 x i16> @insert_v4i16_2_1(float %tmp, <4 x i16> %b, <4 x i16> %a) {
; CHECK-LABEL: insert_v4i16_2_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    mov v0.s[1], v1.s[1]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ins = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i16> %ins
}
200
; 64-bit vector: result halfwords 0-1 are %b[0..1], halfwords 2-3 are %a[0..1].
; Expected lowering: copy %b, then move word lane 0 of %a into s[1].
define <4 x i16> @insert_v4i16_2_2(float %tmp, <4 x i16> %b, <4 x i16> %a) {
; CHECK-LABEL: insert_v4i16_2_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
; CHECK-NEXT:    mov v0.s[1], v2.s[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ins = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i16> %ins
}
212
; Result halfwords 0-3 are %a[0..3], halfwords 4-7 are %b[4..7].
; Expected lowering: copy %a, then move the high doubleword of %b into d[1].
define <8 x i16> @insert_v8i16_4_1(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_4_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    mov v0.d[1], v1.d[1]
; CHECK-NEXT:    ret
  %ins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %ins
}
222
; Result halfwords 0-3 are %b[0..3], halfwords 4-7 are %a[0..3].
; Expected lowering: copy %b, then move the low doubleword of %a into d[1].
define <8 x i16> @insert_v8i16_4_2(float %tmp, <8 x i16> %b, <8 x i16> %a) {
; CHECK-LABEL: insert_v8i16_4_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.d[1], v2.d[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %ins
}
232
; i32 elements, sub-vector supplied in a register operand
234
; Result words 0-1 are %a[0..1], words 2-3 are %b[2..3].
; Expected lowering: copy %a, then move the high doubleword of %b into d[1].
define <4 x i32> @insert_v4i32_2_1(float %tmp, <4 x i32> %b, <4 x i32> %a) {
; CHECK-LABEL: insert_v4i32_2_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    mov v0.d[1], v1.d[1]
; CHECK-NEXT:    ret
  %ins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %ins
}
244
; Result words 0-1 are %b[0..1], words 2-3 are %a[0..1].
; Expected lowering: copy %b, then move the low doubleword of %a into d[1].
define <4 x i32> @insert_v4i32_2_2(float %tmp, <4 x i32> %b, <4 x i32> %a) {
; CHECK-LABEL: insert_v4i32_2_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    mov v0.d[1], v2.d[0]
; CHECK-NEXT:    ret
  %ins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i32> %ins
}
254
255
256
257
; i8 elements, sub-vector loaded from memory
259
; Loads 4 bytes from %a, widens them to 16 lanes (upper lanes unspecified) and
; places them in result bytes 0-3; bytes 4-15 are taken from %b.
; Expected lowering: copy %b, then a single-lane ld1 into s[0].
define <16 x i8> @load_v16i8_4_1(float %tmp, <16 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v16i8_4_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ld1 { v0.s }[0], [x0]
; CHECK-NEXT:    ret
  %l = load <4 x i8>, ptr %a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}
271
; Loads 4 bytes from %a and places them in result bytes 2-5; every other byte
; is taken from %b. The group does not start on a 4-byte boundary, and the
; expected lowering is a scalar load plus a two-register tbl.
define <16 x i8> @load_v16i8_4_15(float %tmp, <16 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v16i8_4_15:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q0_q1
; CHECK-NEXT:    adrp x8, .LCPI24_0
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI24_0]
; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT:    ret
  %l = load <4 x i8>, ptr %a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 0, i32 1, i32 2, i32 3, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}
286
; Loads 4 bytes from %a and places them in result bytes 4-7; every other byte
; is taken from %b. Expected lowering: copy %b, then a single-lane ld1 into s[1].
define <16 x i8> @load_v16i8_4_2(float %tmp, <16 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v16i8_4_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ld1 { v0.s }[1], [x0]
; CHECK-NEXT:    ret
  %l = load <4 x i8>, ptr %a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 0, i32 1, i32 2, i32 3, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}
298
; Loads 4 bytes from %a and places them in result bytes 8-11; every other byte
; is taken from %b. Expected lowering: copy %b, then a single-lane ld1 into s[2].
define <16 x i8> @load_v16i8_4_3(float %tmp, <16 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v16i8_4_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ld1 { v0.s }[2], [x0]
; CHECK-NEXT:    ret
  %l = load <4 x i8>, ptr %a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}
310
; Loads 4 bytes from %a and places them in result bytes 12-15; bytes 0-11 are
; taken from %b. Expected lowering: copy %b, then a single-lane ld1 into s[3].
define <16 x i8> @load_v16i8_4_4(float %tmp, <16 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v16i8_4_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ld1 { v0.s }[3], [x0]
; CHECK-NEXT:    ret
  %l = load <4 x i8>, ptr %a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 0, i32 1, i32 2, i32 3>
  ret <16 x i8> %s2
}
322
; 64-bit vector: result bytes 0-3 are the 4 bytes loaded from %a, bytes 4-7 are
; %b[4..7]. Expected lowering: scalar word load, then move word lane 1 of %b
; into s[1].
define <8 x i8> @load_v8i8_4_1(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_4_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    mov v0.s[1], v1.s[1]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <4 x i8>, ptr %a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %s2
}
336
; 64-bit vector: result bytes 0-3 are %b[0..3], bytes 4-7 are the 4 bytes
; loaded from %a. Expected lowering: copy %b, scalar word load, then a word
; lane move into s[1].
define <8 x i8> @load_v8i8_4_2(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_4_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    ldr s2, [x0]
; CHECK-NEXT:    mov v0.s[1], v2.s[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <4 x i8>, ptr %a
  %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i8> %s2
}
350
; Result bytes 0-7 are the 8 bytes loaded from %a, bytes 8-15 are %b[8..15].
; Expected lowering: ext brings the high half of %b down, a doubleword load
; fills the low half, and a lane move writes d[1].
define <16 x i8> @load_v16i8_8_1(float %tmp, <16 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v16i8_8_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
  %l = load <8 x i8>, ptr %a
  %s1 = shufflevector <8 x i8> %l, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  ret <16 x i8> %s2
}
363
; Result bytes 0-7 are %b[0..7], bytes 8-15 are the 8 bytes loaded from %a.
; Expected lowering: copy %b, doubleword load, then a lane move into d[1].
define <16 x i8> @load_v16i8_8_2(float %tmp, <16 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v16i8_8_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ldr d2, [x0]
; CHECK-NEXT:    mov v0.d[1], v2.d[0]
; CHECK-NEXT:    ret
  %l = load <8 x i8>, ptr %a
  %s1 = shufflevector <8 x i8> %l, <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <16 x i8> %s1, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %s2
}
376
; 64-bit vector: result bytes 0-1 are the 2 bytes loaded from %a, bytes 2-7 are
; taken from %b. Expected lowering: copy %b, halfword load, then a halfword
; lane move into h[0].
define <8 x i8> @load_v8i8_2_1(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    ldr h2, [x0]
; CHECK-NEXT:    mov v0.h[0], v2.h[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <2 x i8>, ptr %a
  %s1 = shufflevector <2 x i8> %l, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %s2
}
390
; 64-bit vector: result bytes 1-2 are the 2 bytes loaded from %a; every other
; byte is taken from %b. The pair does not start on a halfword boundary, and
; the expected lowering packs both sources into one register and uses tbl.
define <8 x i8> @load_v8i8_2_15(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_15:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    adrp x8, .LCPI33_0
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ldr d1, [x8, :lo12:.LCPI33_0]
; CHECK-NEXT:    tbl v0.8b, { v0.16b }, v1.8b
; CHECK-NEXT:    ret
  %l = load <2 x i8>, ptr %a
  %s1 = shufflevector <2 x i8> %l, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 8, i32 0, i32 1, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %s2
}
406
; 64-bit vector: result bytes 2-3 are the 2 bytes loaded from %a; every other
; byte is taken from %b. Expected lowering: copy %b, halfword load, then a
; halfword lane move into h[1].
define <8 x i8> @load_v8i8_2_2(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    ldr h2, [x0]
; CHECK-NEXT:    mov v0.h[1], v2.h[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <2 x i8>, ptr %a
  %s1 = shufflevector <2 x i8> %l, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 8, i32 9, i32 0, i32 1, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i8> %s2
}
420
; 64-bit vector: result bytes 4-5 are the 2 bytes loaded from %a; every other
; byte is taken from %b. Expected lowering: copy %b, halfword load, then a
; halfword lane move into h[2].
define <8 x i8> @load_v8i8_2_3(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    ldr h2, [x0]
; CHECK-NEXT:    mov v0.h[2], v2.h[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <2 x i8>, ptr %a
  %s1 = shufflevector <2 x i8> %l, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 14, i32 15>
  ret <8 x i8> %s2
}
434
; 64-bit vector: result bytes 6-7 are the 2 bytes loaded from %a; bytes 0-5 are
; taken from %b. Expected lowering: copy %b, halfword load, then a halfword
; lane move into h[3].
define <8 x i8> @load_v8i8_2_4(float %tmp, <8 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v8i8_2_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    ldr h2, [x0]
; CHECK-NEXT:    mov v0.h[3], v2.h[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <2 x i8>, ptr %a
  %s1 = shufflevector <2 x i8> %l, <2 x i8> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i8> %s1, <8 x i8> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 0, i32 1>
  ret <8 x i8> %s2
}
448
; <4 x i8> result: lanes 0-1 are the 2 bytes loaded from %a, lanes 2-3 are
; %b[2..3]. The expected lowering spreads the loaded bytes with zip1 before
; moving word lane 1 of %b into s[1].
define <4 x i8> @load_v4i8_2_1(float %tmp, <4 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v4i8_2_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    zip1 v0.8b, v0.8b, v0.8b
; CHECK-NEXT:    mov v0.s[1], v1.s[1]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <2 x i8>, ptr %a
  %s1 = shufflevector <2 x i8> %l, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  %s2 = shufflevector <4 x i8> %s1, <4 x i8> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i8> %s2
}
463
; <4 x i8> result: lanes 0-1 are %b[0..1], lanes 2-3 are the 2 bytes loaded
; from %a. The expected lowering spreads the loaded bytes with zip1, copies %b,
; then moves the spread word into s[1].
define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) {
; CHECK-LABEL: load_v4i8_2_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr h0, [x0]
; CHECK-NEXT:    zip1 v2.8b, v0.8b, v0.8b
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    mov v0.s[1], v2.s[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <2 x i8>, ptr %a
  %s1 = shufflevector <2 x i8> %l, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  %s2 = shufflevector <4 x i8> %s1, <4 x i8> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i8> %s2
}
478
; i16 elements, sub-vector loaded from memory
480
; Loads 2 halfwords from %a and places them in result halfwords 0-1; halfwords
; 2-7 are taken from %b. Expected lowering: copy %b, then a single-lane ld1
; into s[0].
define <8 x i16> @load_v8i16_2_1(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ld1 { v0.s }[0], [x0]
; CHECK-NEXT:    ret
  %l = load <2 x i16>, ptr %a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}
492
; Loads 2 halfwords from %a and places them in result halfwords 1-2; every
; other halfword is taken from %b. The pair does not start on a word boundary,
; and the expected lowering is a scalar load plus a two-register tbl.
define <8 x i16> @load_v8i16_2_15(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_15:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $q1 killed $q1 def $q0_q1
; CHECK-NEXT:    adrp x8, .LCPI40_0
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ldr q2, [x8, :lo12:.LCPI40_0]
; CHECK-NEXT:    tbl v0.16b, { v0.16b, v1.16b }, v2.16b
; CHECK-NEXT:    ret
  %l = load <2 x i16>, ptr %a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 0, i32 1, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}
507
; Loads 2 halfwords from %a and places them in result halfwords 2-3; every
; other halfword is taken from %b. Expected lowering: copy %b, then a
; single-lane ld1 into s[1].
define <8 x i16> @load_v8i16_2_2(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ld1 { v0.s }[1], [x0]
; CHECK-NEXT:    ret
  %l = load <2 x i16>, ptr %a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 0, i32 1, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}
519
; Loads 2 halfwords from %a and places them in result halfwords 4-5; every
; other halfword is taken from %b. Expected lowering: copy %b, then a
; single-lane ld1 into s[2].
define <8 x i16> @load_v8i16_2_3(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ld1 { v0.s }[2], [x0]
; CHECK-NEXT:    ret
  %l = load <2 x i16>, ptr %a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 14, i32 15>
  ret <8 x i16> %s2
}
531
; Loads 2 halfwords from %a and places them in result halfwords 6-7; halfwords
; 0-5 are taken from %b. Expected lowering: copy %b, then a single-lane ld1
; into s[3].
define <8 x i16> @load_v8i16_2_4(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_2_4:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ld1 { v0.s }[3], [x0]
; CHECK-NEXT:    ret
  %l = load <2 x i16>, ptr %a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 0, i32 1>
  ret <8 x i16> %s2
}
543
; 64-bit vector: result halfwords 0-1 are the 2 halfwords loaded from %a,
; halfwords 2-3 are %b[2..3]. Expected lowering: scalar word load, then move
; word lane 1 of %b into s[1].
define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v4i16_2_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT:    mov v0.s[1], v1.s[1]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <2 x i16>, ptr %a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  %s2 = shufflevector <4 x i16> %s1, <4 x i16> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i16> %s2
}
557
; 64-bit vector: result halfwords 0-1 are %b[0..1], halfwords 2-3 are the 2
; halfwords loaded from %a. Expected lowering: copy %b, scalar word load, then
; a word lane move into s[1].
define <4 x i16> @load_v4i16_2_2(float %tmp, <4 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v4i16_2_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    fmov d0, d1
; CHECK-NEXT:    ldr s2, [x0]
; CHECK-NEXT:    mov v0.s[1], v2.s[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %l = load <2 x i16>, ptr %a
  %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  %s2 = shufflevector <4 x i16> %s1, <4 x i16> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i16> %s2
}
571
; Result halfwords 0-3 are the 4 halfwords loaded from %a, halfwords 4-7 are
; %b[4..7]. Expected lowering: ext brings the high half of %b down, a
; doubleword load fills the low half, and a lane move writes d[1].
define <8 x i16> @load_v8i16_4_1(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_4_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
  %l = load <4 x i16>, ptr %a
  %s1 = shufflevector <4 x i16> %l, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x i16> %s2
}
584
; Result halfwords 0-3 are %b[0..3], halfwords 4-7 are the 4 halfwords loaded
; from %a. Expected lowering: copy %b, doubleword load, then a lane move into
; d[1].
define <8 x i16> @load_v8i16_4_2(float %tmp, <8 x i16> %b, ptr %a) {
; CHECK-LABEL: load_v8i16_4_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ldr d2, [x0]
; CHECK-NEXT:    mov v0.d[1], v2.d[0]
; CHECK-NEXT:    ret
  %l = load <4 x i16>, ptr %a
  %s1 = shufflevector <4 x i16> %l, <4 x i16> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
  %s2 = shufflevector <8 x i16> %s1, <8 x i16> %b, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 0, i32 1, i32 2, i32 3>
  ret <8 x i16> %s2
}
597
; i32 elements, sub-vector loaded from memory
599
; Result words 0-1 are the 2 words loaded from %a, words 2-3 are %b[2..3].
; Expected lowering: ext brings the high half of %b down, a doubleword load
; fills the low half, and a lane move writes d[1].
define <4 x i32> @load_v4i32_2_1(float %tmp, <4 x i32> %b, ptr %a) {
; CHECK-LABEL: load_v4i32_2_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v1.16b, v1.16b, v1.16b, #8
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
  %l = load <2 x i32>, ptr %a
  %s1 = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %s2
}
612
; Result words 0-1 are %b[0..1], words 2-3 are the 2 words loaded from %a.
; Expected lowering: copy %b, doubleword load, then a lane move into d[1].
define <4 x i32> @load_v4i32_2_2(float %tmp, <4 x i32> %b, ptr %a) {
; CHECK-LABEL: load_v4i32_2_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov v0.16b, v1.16b
; CHECK-NEXT:    ldr d2, [x0]
; CHECK-NEXT:    mov v0.d[1], v2.d[0]
; CHECK-NEXT:    ret
  %l = load <2 x i32>, ptr %a
  %s1 = shufflevector <2 x i32> %l, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  %s2 = shufflevector <4 x i32> %s1, <4 x i32> %b, <4 x i32> <i32 4, i32 5, i32 0, i32 1>
  ret <4 x i32> %s2
}
625
; Results built from more than one loaded sub-vector
627
; Concatenates two independently loaded <4 x i8> values (%a first, %b second)
; into one <8 x i8>. Expected lowering: scalar word load for the low half, then
; a single-lane ld1 into s[1] for the high half.
define <8 x i8> @load2_v4i8(float %tmp, ptr %a, ptr %b) {
; CHECK-LABEL: load2_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %va = load <4 x i8>, ptr %a
  %vb = load <4 x i8>, ptr %b
  %cat = shufflevector <4 x i8> %va, <4 x i8> %vb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i8> %cat
}
640
; Builds a <16 x i8> from three <4 x i8> loads: %a, %b, and the 4 bytes that
; follow %a (%c). Bytes 0-3 = [%a], 4-7 = [%b], 8-11 = [%c]; bytes 12-15 come
; from the undef operand of %s2 and are unspecified.
; Expected lowering: ldp for the two loads off %a, a single-lane ld1 for %b,
; and a doubleword lane move to join the halves.
; NOTE(review): the `undef` placeholder operand in %s2 is deprecated in favour
; of `poison`; it is left as-is here so the autogenerated assertions stay valid.
define <16 x i8> @load3_v4i8(float %tmp, ptr %a, ptr %b) {
; CHECK-LABEL: load3_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0]
; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
  %la = load <4 x i8>, ptr %a
  %lb = load <4 x i8>, ptr %b
  %c = getelementptr <4 x i8>, ptr %a, i64 1
  %lc = load <4 x i8>, ptr %c
  %s1 = shufflevector <4 x i8> %la, <4 x i8> %lb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s2 = shufflevector <4 x i8> %lc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %s3 = shufflevector <8 x i8> %s1, <8 x i8> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %s3
}
658
; Builds a <16 x i8> from four <4 x i8> loads: bytes 0-3 = [%a], 4-7 = [%b],
; 8-11 = the 4 bytes following %a, 12-15 = the 4 bytes following %b.
; Expected lowering: ldp for the pair off %a, a post-incremented single-lane
; ld1 followed by a second single-lane ld1 for the pair off %b, and a
; doubleword lane move to join the halves.
define <16 x i8> @load4_v4i8(float %tmp, ptr %a, ptr %b) {
; CHECK-LABEL: load4_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp s0, s1, [x0]
; CHECK-NEXT:    ld1 { v0.s }[1], [x1], #4
; CHECK-NEXT:    ld1 { v1.s }[1], [x1]
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
  %va = load <4 x i8>, ptr %a
  %vb = load <4 x i8>, ptr %b
  %a.next = getelementptr <4 x i8>, ptr %a, i64 1
  %b.next = getelementptr <4 x i8>, ptr %b, i64 1
  %vc = load <4 x i8>, ptr %a.next
  %vd = load <4 x i8>, ptr %b.next
  %lo = shufflevector <4 x i8> %va, <4 x i8> %vb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi = shufflevector <4 x i8> %vc, <4 x i8> %vd, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cat = shufflevector <8 x i8> %lo, <8 x i8> %hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %cat
}
678
; Concatenates [%a] and [%b] into an <8 x i8>, then uses that same 8-byte value
; for both halves of the <16 x i8> result.
; Expected lowering: word load + single-lane ld1, then duplicate d[0] into d[1].
define <16 x i8> @load2multi1_v4i8(float %tmp, ptr %a, ptr %b) {
; CHECK-LABEL: load2multi1_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ld1 { v0.s }[1], [x1]
; CHECK-NEXT:    mov v0.d[1], v0.d[0]
; CHECK-NEXT:    ret
  %va = load <4 x i8>, ptr %a
  %vb = load <4 x i8>, ptr %b
  %half = shufflevector <4 x i8> %va, <4 x i8> %vb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cat = shufflevector <8 x i8> %half, <8 x i8> %half, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %cat
}
692
; Each loaded <4 x i8> is concatenated with itself, and the two 8-byte results
; are concatenated: result = [%a],[%a],[%b],[%b].
; The expected lowering here goes through ushll / uzp1 rather than the
; single-lane ld1 form used by the other tests in this group.
define <16 x i8> @load2multi2_v4i8(float %tmp, ptr %a, ptr %b) {
; CHECK-LABEL: load2multi2_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ldr s1, [x1]
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NEXT:    mov v1.d[1], v1.d[0]
; CHECK-NEXT:    mov v0.d[1], v0.d[0]
; CHECK-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %va = load <4 x i8>, ptr %a
  %vb = load <4 x i8>, ptr %b
  %aa = shufflevector <4 x i8> %va, <4 x i8> %va, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bb2 = shufflevector <4 x i8> %vb, <4 x i8> %vb, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cat = shufflevector <8 x i8> %aa, <8 x i8> %bb2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %cat
}
711
; Two <4 x i8> loads (offsets 20 and 28) are concatenated and stored at offset
; 22, interleaved with byte stores that overwrite parts of the loaded ranges
; (offsets 20, 21, 30, 31). The expected output keeps both vector loads
; (ldr s0 / ld1 lane 1) ahead of every store. Statement order below is part of
; the test and must not be changed.
define void @loads_before_stores(ptr %i44) {
; CHECK-LABEL: loads_before_stores:
; CHECK:       // %bb.0: // %bb
; CHECK-NEXT:    ldr s0, [x0, #28]
; CHECK-NEXT:    add x8, x0, #20
; CHECK-NEXT:    ldrh w9, [x0, #26]
; CHECK-NEXT:    ldrh w10, [x0, #24]
; CHECK-NEXT:    ld1 { v0.s }[1], [x8]
; CHECK-NEXT:    strh w9, [x0, #20]
; CHECK-NEXT:    strh w10, [x0, #30]
; CHECK-NEXT:    stur d0, [x0, #22]
; CHECK-NEXT:    ret
bb:
  %p20 = getelementptr inbounds i8, ptr %i44, i64 20
  %p26 = getelementptr inbounds i8, ptr %i44, i64 26
  %b26 = load i8, ptr %p26, align 1
  %p21 = getelementptr inbounds i8, ptr %i44, i64 21
  %p27 = getelementptr inbounds i8, ptr %i44, i64 27
  %b27 = load i8, ptr %p27, align 1
  %p22 = getelementptr inbounds i8, ptr %i44, i64 22
  %p28 = getelementptr inbounds i8, ptr %i44, i64 28
  %p24 = getelementptr inbounds i8, ptr %i44, i64 24
  %p30 = getelementptr inbounds i8, ptr %i44, i64 30
  %b24 = load i8, ptr %p24, align 1
  %p25 = getelementptr inbounds i8, ptr %i44, i64 25
  %p31 = getelementptr inbounds i8, ptr %i44, i64 31
  %b25 = load i8, ptr %p25, align 1
  ; Vector load of bytes 20-23 precedes the byte stores to 20 and 21.
  %v20 = load <4 x i8>, ptr %p20, align 1
  store i8 %b26, ptr %p20, align 1
  store i8 %b27, ptr %p21, align 1
  ; Vector load of bytes 28-31 precedes the byte stores to 30 and 31.
  %v28 = load <4 x i8>, ptr %p28, align 1
  store i8 %b24, ptr %p30, align 1
  %cat = shufflevector <4 x i8> %v28, <4 x i8> %v20, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  store <8 x i8> %cat, ptr %p22, align 1
  store i8 %b25, ptr %p31, align 1
  ret void
}
749