xref: /llvm-project/llvm/test/CodeGen/ARM/vstlane.ll (revision bed1c7f061aa12417aa081e334afdba45767b938)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm -mattr=+neon | FileCheck %s
3
4;Check the (default) alignment.
; Stores lane 3 of the <8 x i8> loaded from %B to %A as a single byte.
; The IR store is marked align 8, yet the expected vst1.8 {d16[3]} carries no
; alignment qualifier on [r0].
5define void @vst1lanei8(ptr %A, ptr %B) nounwind {
6; CHECK-LABEL: vst1lanei8:
7; CHECK:       @ %bb.0:
8; CHECK-NEXT:    vldr d16, [r1]
9; CHECK-NEXT:    vst1.8 {d16[3]}, [r0]
10; CHECK-NEXT:    mov pc, lr
11	%tmp1 = load <8 x i8>, ptr %B
12	%tmp2 = extractelement <8 x i8> %tmp1, i32 3
13	store i8 %tmp2, ptr %A, align 8
14	ret void
15}
16
17;Check for a post-increment updating store.
; Loads the destination pointer from %ptr, stores lane 3 of the <8 x i8> from
; %B through it, then writes the pointer advanced by one i8 back to %ptr.
; The expected output folds the advance into the lane store as a writeback
; ("[r2]!") and then stores r2 back to [r0].
18define void @vst1lanei8_update(ptr %ptr, ptr %B) nounwind {
19; CHECK-LABEL: vst1lanei8_update:
20; CHECK:       @ %bb.0:
21; CHECK-NEXT:    ldr r2, [r0]
22; CHECK-NEXT:    vldr d16, [r1]
23; CHECK-NEXT:    vst1.8 {d16[3]}, [r2]!
24; CHECK-NEXT:    str r2, [r0]
25; CHECK-NEXT:    mov pc, lr
26	%A = load ptr, ptr %ptr
27	%tmp1 = load <8 x i8>, ptr %B
28	%tmp2 = extractelement <8 x i8> %tmp1, i32 3
29	store i8 %tmp2, ptr %A, align 8
30	%tmp3 = getelementptr i8, ptr %A, i32 1
31	store ptr %tmp3, ptr %ptr
32	ret void
33}
34
35;Check the alignment value.  Max for this instruction is 16 bits:
; Stores lane 2 of the <4 x i16> loaded from %B to %A.  The IR store asks for
; align 8; the expected vst1.16 {d16[2]} uses the [r0:16] qualifier.
36define void @vst1lanei16(ptr %A, ptr %B) nounwind {
37; CHECK-LABEL: vst1lanei16:
38; CHECK:       @ %bb.0:
39; CHECK-NEXT:    vldr d16, [r1]
40; CHECK-NEXT:    vst1.16 {d16[2]}, [r0:16]
41; CHECK-NEXT:    mov pc, lr
42	%tmp1 = load <4 x i16>, ptr %B
43	%tmp2 = extractelement <4 x i16> %tmp1, i32 2
44	store i16 %tmp2, ptr %A, align 8
45	ret void
46}
47
48;Check the alignment value.  Max for this instruction is 32 bits:
; Stores lane 1 of the <2 x i32> loaded from %B to %A.  The IR store asks for
; align 8; the expected vst1.32 {d16[1]} uses the [r0:32] qualifier.
49define void @vst1lanei32(ptr %A, ptr %B) nounwind {
50; CHECK-LABEL: vst1lanei32:
51; CHECK:       @ %bb.0:
52; CHECK-NEXT:    vldr d16, [r1]
53; CHECK-NEXT:    vst1.32 {d16[1]}, [r0:32]
54; CHECK-NEXT:    mov pc, lr
55	%tmp1 = load <2 x i32>, ptr %B
56	%tmp2 = extractelement <2 x i32> %tmp1, i32 1
57	store i32 %tmp2, ptr %A, align 8
58	ret void
59}
60
; Floating-point variant of the 32-bit lane store: lane 1 of the <2 x float>
; loaded from %B is stored to %A.  The IR store has no explicit alignment;
; the expected vst1.32 {d16[1]} still uses the [r0:32] qualifier.
61define void @vst1lanef(ptr %A, ptr %B) nounwind {
62; CHECK-LABEL: vst1lanef:
63; CHECK:       @ %bb.0:
64; CHECK-NEXT:    vldr d16, [r1]
65; CHECK-NEXT:    vst1.32 {d16[1]}, [r0:32]
66; CHECK-NEXT:    mov pc, lr
67	%tmp1 = load <2 x float>, ptr %B
68	%tmp2 = extractelement <2 x float> %tmp1, i32 1
69	store float %tmp2, ptr %A
70	ret void
71}
72
73; // Can use scalar load. No need to use vectors.
74; // CHE-CK: vst1.8 {d17[1]}, [r0]
; Stores lane 9 of the <16 x i8> loaded from %B to %A.  In the expected
; output the element appears as lane 1 of d17 after a full vld1.64 of
; {d16, d17}.
; NOTE(review): the "Can use scalar load" remark above does not match the
; assertions below, which still expect a vector load plus a lane store --
; confirm whether the remark is stale for this function.
75define void @vst1laneQi8(ptr %A, ptr %B) nounwind {
76; CHECK-LABEL: vst1laneQi8:
77; CHECK:       @ %bb.0:
78; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
79; CHECK-NEXT:    vst1.8 {d17[1]}, [r0]
80; CHECK-NEXT:    mov pc, lr
81	%tmp1 = load <16 x i8>, ptr %B
82	%tmp2 = extractelement <16 x i8> %tmp1, i32 9
83	store i8 %tmp2, ptr %A, align 8
84	ret void
85}
86
; Stores lane 5 of the <8 x i16> loaded from %B to %A.  In the expected
; output the element appears as lane 1 of d17, and the align 8 store becomes
; the [r0:16] qualifier on vst1.16.
87define void @vst1laneQi16(ptr %A, ptr %B) nounwind {
88; CHECK-LABEL: vst1laneQi16:
89; CHECK:       @ %bb.0:
90; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
91; CHECK-NEXT:    vst1.16 {d17[1]}, [r0:16]
92; CHECK-NEXT:    mov pc, lr
93	%tmp1 = load <8 x i16>, ptr %B
94	%tmp2 = extractelement <8 x i16> %tmp1, i32 5
95	store i16 %tmp2, ptr %A, align 8
96	ret void
97}
98
99; // Can use scalar load. No need to use vectors.
100; // CHE-CK: vst1.32 {d17[1]}, [r0:32]
; Stores lane 3 of the <4 x i32> loaded from %B to %A.  The expected output
; uses no NEON instructions: the element is read with a scalar ldr at byte
; offset 12 and written with a scalar str.  The misspelled "CHE-CK" line
; above is presumably a retired expectation kept for reference.
101define void @vst1laneQi32(ptr %A, ptr %B) nounwind {
102; CHECK-LABEL: vst1laneQi32:
103; CHECK:       @ %bb.0:
104; CHECK-NEXT:    ldr r1, [r1, #12]
105; CHECK-NEXT:    str r1, [r0]
106; CHECK-NEXT:    mov pc, lr
107	%tmp1 = load <4 x i32>, ptr %B
108	%tmp2 = extractelement <4 x i32> %tmp1, i32 3
109	store i32 %tmp2, ptr %A, align 8
110	ret void
111}
112
113;Check for a post-increment updating store.
114; // Can use scalar load. No need to use vectors.
115; // CHE-CK: vst1.32 {d17[1]}, [r1:32]!
; Loads the destination pointer from %ptr, stores lane 3 of the <4 x i32>
; from %B through it, then writes the pointer advanced by one i32 back to
; %ptr.  The expected output is fully scalar: ldr at byte offset 12 followed
; by a post-indexed "str r1, [r2], #4" and a store of r2 back to [r0].
116define void @vst1laneQi32_update(ptr %ptr, ptr %B) nounwind {
117; CHECK-LABEL: vst1laneQi32_update:
118; CHECK:       @ %bb.0:
119; CHECK-NEXT:    ldr r2, [r0]
120; CHECK-NEXT:    ldr r1, [r1, #12]
121; CHECK-NEXT:    str r1, [r2], #4
122; CHECK-NEXT:    str r2, [r0]
123; CHECK-NEXT:    mov pc, lr
124	%A = load ptr, ptr %ptr
125	%tmp1 = load <4 x i32>, ptr %B
126	%tmp2 = extractelement <4 x i32> %tmp1, i32 3
127	store i32 %tmp2, ptr %A, align 8
128	%tmp3 = getelementptr i32, ptr %A, i32 1
129	store ptr %tmp3, ptr %ptr
130	ret void
131}
132
133; // Can use scalar load. No need to use vectors.
134; // CHE-CK: vst1.32 {d17[1]}, [r0]
; Stores lane 3 of the <4 x float> loaded from %B to %A.  As with the i32
; case, the expected output is a scalar ldr at byte offset 12 followed by a
; scalar str, with no NEON instructions.
135define void @vst1laneQf(ptr %A, ptr %B) nounwind {
136; CHECK-LABEL: vst1laneQf:
137; CHECK:       @ %bb.0:
138; CHECK-NEXT:    ldr r1, [r1, #12]
139; CHECK-NEXT:    str r1, [r0]
140; CHECK-NEXT:    mov pc, lr
141	%tmp1 = load <4 x float>, ptr %B
142	%tmp2 = extractelement <4 x float> %tmp1, i32 3
143	store float %tmp2, ptr %A
144	ret void
145}
146
147;Check the alignment value.  Max for this instruction is 16 bits:
; Calls the vst2lane intrinsic with the same <8 x i8> for both vector
; operands, lane 1, and a trailing alignment operand of 4.  The expected
; output copies d16 into d17 and emits vst2.8 {d16[1], d17[1]} with the
; [r0:16] qualifier.
148define void @vst2lanei8(ptr %A, ptr %B) nounwind {
149; CHECK-LABEL: vst2lanei8:
150; CHECK:       @ %bb.0:
151; CHECK-NEXT:    vldr d16, [r1]
152; CHECK-NEXT:    vorr d17, d16, d16
153; CHECK-NEXT:    vst2.8 {d16[1], d17[1]}, [r0:16]
154; CHECK-NEXT:    mov pc, lr
155	%tmp1 = load <8 x i8>, ptr %B
156	call void @llvm.arm.neon.vst2lane.p0.v8i8(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
157	ret void
158}
159
160;Check the alignment value.  Max for this instruction is 32 bits:
; Calls the vst2lane intrinsic on a duplicated <4 x i16>, lane 1, with a
; trailing alignment operand of 8.  The expected vst2.16 {d16[1], d17[1]}
; uses the [r0:32] qualifier.
161define void @vst2lanei16(ptr %A, ptr %B) nounwind {
162; CHECK-LABEL: vst2lanei16:
163; CHECK:       @ %bb.0:
164; CHECK-NEXT:    vldr d16, [r1]
165; CHECK-NEXT:    vorr d17, d16, d16
166; CHECK-NEXT:    vst2.16 {d16[1], d17[1]}, [r0:32]
167; CHECK-NEXT:    mov pc, lr
168	%tmp1 = load <4 x i16>, ptr %B
169	call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
170	ret void
171}
172
173;Check for a post-increment updating store with register increment.
; Loads the destination pointer from %ptr, performs a vst2lane of lane 1
; (trailing alignment operand 2), then writes back the pointer advanced by
; %inc i16 elements.  The expected output scales %inc to bytes with
; "lsl r1, r2, #1" and folds it into the store as a register post-increment
; ("[r3], r1"); no alignment qualifier is expected.
174define void @vst2lanei16_update(ptr %ptr, ptr %B, i32 %inc) nounwind {
175; CHECK-LABEL: vst2lanei16_update:
176; CHECK:       @ %bb.0:
177; CHECK-NEXT:    vldr d16, [r1]
178; CHECK-NEXT:    lsl r1, r2, #1
179; CHECK-NEXT:    ldr r3, [r0]
180; CHECK-NEXT:    vorr d17, d16, d16
181; CHECK-NEXT:    vst2.16 {d16[1], d17[1]}, [r3], r1
182; CHECK-NEXT:    str r3, [r0]
183; CHECK-NEXT:    mov pc, lr
184	%A = load ptr, ptr %ptr
185	%tmp1 = load <4 x i16>, ptr %B
186	call void @llvm.arm.neon.vst2lane.p0.v4i16(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
187	%tmp2 = getelementptr i16, ptr %A, i32 %inc
188	store ptr %tmp2, ptr %ptr
189	ret void
190}
191
; Calls the vst2lane intrinsic on a duplicated <2 x i32>, lane 1, with a
; trailing alignment operand of 1.  The expected vst2.32 {d16[1], d17[1]}
; has no alignment qualifier on [r0].
192define void @vst2lanei32(ptr %A, ptr %B) nounwind {
193; CHECK-LABEL: vst2lanei32:
194; CHECK:       @ %bb.0:
195; CHECK-NEXT:    vldr d16, [r1]
196; CHECK-NEXT:    vorr d17, d16, d16
197; CHECK-NEXT:    vst2.32 {d16[1], d17[1]}, [r0]
198; CHECK-NEXT:    mov pc, lr
199	%tmp1 = load <2 x i32>, ptr %B
200	call void @llvm.arm.neon.vst2lane.p0.v2i32(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
201	ret void
202}
203
; Floating-point counterpart of vst2lanei32: duplicated <2 x float>, lane 1,
; trailing alignment operand 1.  The same vst2.32 {d16[1], d17[1]} without
; an alignment qualifier is expected.
204define void @vst2lanef(ptr %A, ptr %B) nounwind {
205; CHECK-LABEL: vst2lanef:
206; CHECK:       @ %bb.0:
207; CHECK-NEXT:    vldr d16, [r1]
208; CHECK-NEXT:    vorr d17, d16, d16
209; CHECK-NEXT:    vst2.32 {d16[1], d17[1]}, [r0]
210; CHECK-NEXT:    mov pc, lr
211	%tmp1 = load <2 x float>, ptr %B
212	call void @llvm.arm.neon.vst2lane.p0.v2f32(ptr %A, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
213	ret void
214}
215
216;Check the (default) alignment.
; Calls the vst2lane intrinsic on a duplicated <8 x i16>, lane 5, with a
; trailing alignment operand of 1.  The expected output copies q8 into q9
; and stores lane 1 of d17 and d19 with no alignment qualifier.
217define void @vst2laneQi16(ptr %A, ptr %B) nounwind {
218; CHECK-LABEL: vst2laneQi16:
219; CHECK:       @ %bb.0:
220; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
221; CHECK-NEXT:    vorr q9, q8, q8
222; CHECK-NEXT:    vst2.16 {d17[1], d19[1]}, [r0]
223; CHECK-NEXT:    mov pc, lr
224	%tmp1 = load <8 x i16>, ptr %B
225	call void @llvm.arm.neon.vst2lane.p0.v8i16(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
226	ret void
227}
228
229;Check the alignment value.  Max for this instruction is 64 bits:
; Calls the vst2lane intrinsic on a duplicated <4 x i32>, lane 2, with a
; trailing alignment operand of 16.  The expected vst2.32 stores lane 0 of
; d17 and d19 and uses the [r0:64] qualifier.
230define void @vst2laneQi32(ptr %A, ptr %B) nounwind {
231; CHECK-LABEL: vst2laneQi32:
232; CHECK:       @ %bb.0:
233; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
234; CHECK-NEXT:    vorr q9, q8, q8
235; CHECK-NEXT:    vst2.32 {d17[0], d19[0]}, [r0:64]
236; CHECK-NEXT:    mov pc, lr
237	%tmp1 = load <4 x i32>, ptr %B
238	call void @llvm.arm.neon.vst2lane.p0.v4i32(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
239	ret void
240}
241
; Calls the vst2lane intrinsic on a duplicated <4 x float>, lane 3, with a
; trailing alignment operand of 1.  The expected vst2.32 stores lane 1 of
; d17 and d19 with no alignment qualifier.
242define void @vst2laneQf(ptr %A, ptr %B) nounwind {
243; CHECK-LABEL: vst2laneQf:
244; CHECK:       @ %bb.0:
245; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
246; CHECK-NEXT:    vorr q9, q8, q8
247; CHECK-NEXT:    vst2.32 {d17[1], d19[1]}, [r0]
248; CHECK-NEXT:    mov pc, lr
249	%tmp1 = load <4 x float>, ptr %B
250	call void @llvm.arm.neon.vst2lane.p0.v4f32(ptr %A, <4 x float> %tmp1, <4 x float> %tmp1, i32 3, i32 1)
251	ret void
252}
253
; Declarations of the vst2lane intrinsics called by the tests above: a
; destination pointer, two source vectors, and two i32 operands.  As used in
; this file, the first i32 selects the lane and the second is the alignment.
; The first group covers 64-bit vectors, the second group 128-bit vectors.
254declare void @llvm.arm.neon.vst2lane.p0.v8i8(ptr, <8 x i8>, <8 x i8>, i32, i32) nounwind
255declare void @llvm.arm.neon.vst2lane.p0.v4i16(ptr, <4 x i16>, <4 x i16>, i32, i32) nounwind
256declare void @llvm.arm.neon.vst2lane.p0.v2i32(ptr, <2 x i32>, <2 x i32>, i32, i32) nounwind
257declare void @llvm.arm.neon.vst2lane.p0.v2f32(ptr, <2 x float>, <2 x float>, i32, i32) nounwind
258
259declare void @llvm.arm.neon.vst2lane.p0.v8i16(ptr, <8 x i16>, <8 x i16>, i32, i32) nounwind
260declare void @llvm.arm.neon.vst2lane.p0.v4i32(ptr, <4 x i32>, <4 x i32>, i32, i32) nounwind
261declare void @llvm.arm.neon.vst2lane.p0.v4f32(ptr, <4 x float>, <4 x float>, i32, i32) nounwind
262
; Calls the vst3lane intrinsic with the same <8 x i8> for all three vector
; operands, lane 1, trailing alignment operand 1.  The expected output makes
; two register copies (d17, d18) and emits vst3.8 on lane 1 of d16-d18 with
; no alignment qualifier.
263define void @vst3lanei8(ptr %A, ptr %B) nounwind {
264; CHECK-LABEL: vst3lanei8:
265; CHECK:       @ %bb.0:
266; CHECK-NEXT:    vldr d16, [r1]
267; CHECK-NEXT:    vorr d17, d16, d16
268; CHECK-NEXT:    vorr d18, d16, d16
269; CHECK-NEXT:    vst3.8 {d16[1], d17[1], d18[1]}, [r0]
270; CHECK-NEXT:    mov pc, lr
271	%tmp1 = load <8 x i8>, ptr %B
272	call void @llvm.arm.neon.vst3lane.p0.v8i8(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
273	ret void
274}
275
276;Check the (default) alignment value.  VST3 does not support alignment.
; Calls the vst3lane intrinsic on a triplicated <4 x i16>, lane 1, with a
; trailing alignment operand of 8.  Despite the requested alignment, the
; expected vst3.16 has no alignment qualifier on [r0].
277define void @vst3lanei16(ptr %A, ptr %B) nounwind {
278; CHECK-LABEL: vst3lanei16:
279; CHECK:       @ %bb.0:
280; CHECK-NEXT:    vldr d16, [r1]
281; CHECK-NEXT:    vorr d17, d16, d16
282; CHECK-NEXT:    vorr d18, d16, d16
283; CHECK-NEXT:    vst3.16 {d16[1], d17[1], d18[1]}, [r0]
284; CHECK-NEXT:    mov pc, lr
285	%tmp1 = load <4 x i16>, ptr %B
286	call void @llvm.arm.neon.vst3lane.p0.v4i16(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
287	ret void
288}
289
; Calls the vst3lane intrinsic on a triplicated <2 x i32>, lane 1, trailing
; alignment operand 1.  The expected vst3.32 stores lane 1 of d16-d18 with no
; alignment qualifier.
290define void @vst3lanei32(ptr %A, ptr %B) nounwind {
291; CHECK-LABEL: vst3lanei32:
292; CHECK:       @ %bb.0:
293; CHECK-NEXT:    vldr d16, [r1]
294; CHECK-NEXT:    vorr d17, d16, d16
295; CHECK-NEXT:    vorr d18, d16, d16
296; CHECK-NEXT:    vst3.32 {d16[1], d17[1], d18[1]}, [r0]
297; CHECK-NEXT:    mov pc, lr
298	%tmp1 = load <2 x i32>, ptr %B
299	call void @llvm.arm.neon.vst3lane.p0.v2i32(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
300	ret void
301}
302
; Floating-point counterpart of vst3lanei32: triplicated <2 x float>, lane 1,
; trailing alignment operand 1.  The same vst3.32 on lane 1 of d16-d18 is
; expected.
303define void @vst3lanef(ptr %A, ptr %B) nounwind {
304; CHECK-LABEL: vst3lanef:
305; CHECK:       @ %bb.0:
306; CHECK-NEXT:    vldr d16, [r1]
307; CHECK-NEXT:    vorr d17, d16, d16
308; CHECK-NEXT:    vorr d18, d16, d16
309; CHECK-NEXT:    vst3.32 {d16[1], d17[1], d18[1]}, [r0]
310; CHECK-NEXT:    mov pc, lr
311	%tmp1 = load <2 x float>, ptr %B
312	call void @llvm.arm.neon.vst3lane.p0.v2f32(ptr %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
313	ret void
314}
315
; Calls the vst3lane intrinsic on a triplicated <8 x i16>, lane 6, with a
; trailing alignment operand of 8.  The expected vst3.16 stores lane 2 of
; d17, d19 and d21 and has no alignment qualifier on [r0].
316define void @vst3laneQi16(ptr %A, ptr %B) nounwind {
317; CHECK-LABEL: vst3laneQi16:
318; CHECK:       @ %bb.0:
319; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
320; CHECK-NEXT:    vorr q9, q8, q8
321; CHECK-NEXT:    vorr q10, q8, q8
322; CHECK-NEXT:    vst3.16 {d17[2], d19[2], d21[2]}, [r0]
323; CHECK-NEXT:    mov pc, lr
324;Check the (default) alignment value.  VST3 does not support alignment.
325	%tmp1 = load <8 x i16>, ptr %B
326	call void @llvm.arm.neon.vst3lane.p0.v8i16(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 6, i32 8)
327	ret void
328}
329
; Calls the vst3lane intrinsic on a triplicated <4 x i32>, lane 0, trailing
; alignment operand 1.  The expected vst3.32 stores lane 0 of d16, d18 and
; d20 (the low halves of q8-q10) with no alignment qualifier.
330define void @vst3laneQi32(ptr %A, ptr %B) nounwind {
331; CHECK-LABEL: vst3laneQi32:
332; CHECK:       @ %bb.0:
333; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
334; CHECK-NEXT:    vorr q9, q8, q8
335; CHECK-NEXT:    vorr q10, q8, q8
336; CHECK-NEXT:    vst3.32 {d16[0], d18[0], d20[0]}, [r0]
337; CHECK-NEXT:    mov pc, lr
338	%tmp1 = load <4 x i32>, ptr %B
339	call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
340	ret void
341}
342
343;Check for a post-increment updating store.
; Loads the destination pointer from %ptr, performs a vst3lane of lane 0 on
; a triplicated <4 x i32>, then writes back the pointer advanced by three
; i32 elements.  The expected output folds the advance into the store as a
; writeback ("[r2]!") and stores r2 back to [r0].
344define void @vst3laneQi32_update(ptr %ptr, ptr %B) nounwind {
345; CHECK-LABEL: vst3laneQi32_update:
346; CHECK:       @ %bb.0:
347; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
348; CHECK-NEXT:    vorr q9, q8, q8
349; CHECK-NEXT:    ldr r2, [r0]
350; CHECK-NEXT:    vorr q10, q8, q8
351; CHECK-NEXT:    vst3.32 {d16[0], d18[0], d20[0]}, [r2]!
352; CHECK-NEXT:    str r2, [r0]
353; CHECK-NEXT:    mov pc, lr
354	%A = load ptr, ptr %ptr
355	%tmp1 = load <4 x i32>, ptr %B
356	call void @llvm.arm.neon.vst3lane.p0.v4i32(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
357	%tmp2 = getelementptr i32, ptr %A, i32 3
358	store ptr %tmp2, ptr %ptr
359	ret void
360}
361
; Calls the vst3lane intrinsic on a triplicated <4 x float>, lane 1, trailing
; alignment operand 1.  The expected vst3.32 stores lane 1 of d16, d18 and
; d20 with no alignment qualifier.
362define void @vst3laneQf(ptr %A, ptr %B) nounwind {
363; CHECK-LABEL: vst3laneQf:
364; CHECK:       @ %bb.0:
365; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
366; CHECK-NEXT:    vorr q9, q8, q8
367; CHECK-NEXT:    vorr q10, q8, q8
368; CHECK-NEXT:    vst3.32 {d16[1], d18[1], d20[1]}, [r0]
369; CHECK-NEXT:    mov pc, lr
370	%tmp1 = load <4 x float>, ptr %B
371	call void @llvm.arm.neon.vst3lane.p0.v4f32(ptr %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
372	ret void
373}
374
; Declarations of the vst3lane intrinsics called by the tests above: a
; destination pointer, three source vectors, and two i32 operands.  As used
; in this file, the first i32 selects the lane and the second is the
; alignment.  The first group covers 64-bit vectors, the second group
; 128-bit vectors.
375declare void @llvm.arm.neon.vst3lane.p0.v8i8(ptr, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
376declare void @llvm.arm.neon.vst3lane.p0.v4i16(ptr, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
377declare void @llvm.arm.neon.vst3lane.p0.v2i32(ptr, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
378declare void @llvm.arm.neon.vst3lane.p0.v2f32(ptr, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
379
380declare void @llvm.arm.neon.vst3lane.p0.v8i16(ptr, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
381declare void @llvm.arm.neon.vst3lane.p0.v4i32(ptr, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
382declare void @llvm.arm.neon.vst3lane.p0.v4f32(ptr, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
383
384
385;Check the alignment value.  Max for this instruction is 32 bits:
; Calls the vst4lane intrinsic with the same <8 x i8> for all four vector
; operands, lane 1, trailing alignment operand 8.  The expected output makes
; three register copies (d17-d19) and emits vst4.8 on lane 1 of d16-d19 with
; the [r0:32] qualifier.
386define void @vst4lanei8(ptr %A, ptr %B) nounwind {
387; CHECK-LABEL: vst4lanei8:
388; CHECK:       @ %bb.0:
389; CHECK-NEXT:    vldr d16, [r1]
390; CHECK-NEXT:    vorr d17, d16, d16
391; CHECK-NEXT:    vorr d18, d16, d16
392; CHECK-NEXT:    vorr d19, d16, d16
393; CHECK-NEXT:    vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]
394; CHECK-NEXT:    mov pc, lr
395	%tmp1 = load <8 x i8>, ptr %B
396	call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
397	ret void
398}
399
400;Check for a post-increment updating store.
; Loads the destination pointer from %ptr, performs a vst4lane of lane 1 on
; a quadruplicated <8 x i8> (trailing alignment operand 8), then writes back
; the pointer advanced by four bytes.  The expected output keeps the :32
; qualifier and folds the advance into the store as a writeback
; ("[r2:32]!").
401define void @vst4lanei8_update(ptr %ptr, ptr %B) nounwind {
402; CHECK-LABEL: vst4lanei8_update:
403; CHECK:       @ %bb.0:
404; CHECK-NEXT:    vldr d16, [r1]
405; CHECK-NEXT:    vorr d17, d16, d16
406; CHECK-NEXT:    ldr r2, [r0]
407; CHECK-NEXT:    vorr d18, d16, d16
408; CHECK-NEXT:    vorr d19, d16, d16
409; CHECK-NEXT:    vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r2:32]!
410; CHECK-NEXT:    str r2, [r0]
411; CHECK-NEXT:    mov pc, lr
412	%A = load ptr, ptr %ptr
413	%tmp1 = load <8 x i8>, ptr %B
414	call void @llvm.arm.neon.vst4lane.p0.v8i8(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
415	%tmp2 = getelementptr i8, ptr %A, i32 4
416	store ptr %tmp2, ptr %ptr
417	ret void
418}
419
; Calls the vst4lane intrinsic on a quadruplicated <4 x i16>, lane 1,
; trailing alignment operand 1.  The expected vst4.16 stores lane 1 of
; d16-d19 with no alignment qualifier.
420define void @vst4lanei16(ptr %A, ptr %B) nounwind {
421; CHECK-LABEL: vst4lanei16:
422; CHECK:       @ %bb.0:
423; CHECK-NEXT:    vldr d16, [r1]
424; CHECK-NEXT:    vorr d17, d16, d16
425; CHECK-NEXT:    vorr d18, d16, d16
426; CHECK-NEXT:    vorr d19, d16, d16
427; CHECK-NEXT:    vst4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
428; CHECK-NEXT:    mov pc, lr
429	%tmp1 = load <4 x i16>, ptr %B
430	call void @llvm.arm.neon.vst4lane.p0.v4i16(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1)
431	ret void
432}
433
434;Check the alignment value.  Max for this instruction is 128 bits:
; Calls the vst4lane intrinsic on a quadruplicated <2 x i32>, lane 1, with a
; trailing alignment operand of 16.  The expected vst4.32 stores lane 1 of
; d16-d19 and uses the [r0:128] qualifier.
435define void @vst4lanei32(ptr %A, ptr %B) nounwind {
436; CHECK-LABEL: vst4lanei32:
437; CHECK:       @ %bb.0:
438; CHECK-NEXT:    vldr d16, [r1]
439; CHECK-NEXT:    vorr d17, d16, d16
440; CHECK-NEXT:    vorr d18, d16, d16
441; CHECK-NEXT:    vorr d19, d16, d16
442; CHECK-NEXT:    vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:128]
443; CHECK-NEXT:    mov pc, lr
444	%tmp1 = load <2 x i32>, ptr %B
445	call void @llvm.arm.neon.vst4lane.p0.v2i32(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16)
446	ret void
447}
448
; Calls the vst4lane intrinsic on a quadruplicated <2 x float>, lane 1,
; trailing alignment operand 1.  The expected vst4.32 stores lane 1 of
; d16-d19 with no alignment qualifier.
449define void @vst4lanef(ptr %A, ptr %B) nounwind {
450; CHECK-LABEL: vst4lanef:
451; CHECK:       @ %bb.0:
452; CHECK-NEXT:    vldr d16, [r1]
453; CHECK-NEXT:    vorr d17, d16, d16
454; CHECK-NEXT:    vorr d18, d16, d16
455; CHECK-NEXT:    vorr d19, d16, d16
456; CHECK-NEXT:    vst4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0]
457; CHECK-NEXT:    mov pc, lr
458	%tmp1 = load <2 x float>, ptr %B
459	call void @llvm.arm.neon.vst4lane.p0.v2f32(ptr %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
460	ret void
461}
462
463;Check the alignment value.  Max for this instruction is 64 bits:
; Calls the vst4lane intrinsic on a quadruplicated <8 x i16>, lane 7, with a
; trailing alignment operand of 16.  The expected vst4.16 stores lane 3 of
; d17, d19, d21 and d23 and uses the [r0:64] qualifier.
464define void @vst4laneQi16(ptr %A, ptr %B) nounwind {
465; CHECK-LABEL: vst4laneQi16:
466; CHECK:       @ %bb.0:
467; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
468; CHECK-NEXT:    vorr q9, q8, q8
469; CHECK-NEXT:    vorr q10, q8, q8
470; CHECK-NEXT:    vorr q11, q8, q8
471; CHECK-NEXT:    vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r0:64]
472; CHECK-NEXT:    mov pc, lr
473	%tmp1 = load <8 x i16>, ptr %B
474	call void @llvm.arm.neon.vst4lane.p0.v8i16(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16)
475	ret void
476}
477
478;Check the (default) alignment.
; Calls the vst4lane intrinsic on a quadruplicated <4 x i32>, lane 2,
; trailing alignment operand 1.  The expected vst4.32 stores lane 0 of d17,
; d19, d21 and d23 with no alignment qualifier.
479define void @vst4laneQi32(ptr %A, ptr %B) nounwind {
480; CHECK-LABEL: vst4laneQi32:
481; CHECK:       @ %bb.0:
482; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
483; CHECK-NEXT:    vorr q9, q8, q8
484; CHECK-NEXT:    vorr q10, q8, q8
485; CHECK-NEXT:    vorr q11, q8, q8
486; CHECK-NEXT:    vst4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
487; CHECK-NEXT:    mov pc, lr
488	%tmp1 = load <4 x i32>, ptr %B
489	call void @llvm.arm.neon.vst4lane.p0.v4i32(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
490	ret void
491}
492
; Calls the vst4lane intrinsic on a quadruplicated <4 x float>, lane 1,
; trailing alignment operand 1.  The expected vst4.32 stores lane 1 of d16,
; d18, d20 and d22 with no alignment qualifier.
493define void @vst4laneQf(ptr %A, ptr %B) nounwind {
494; CHECK-LABEL: vst4laneQf:
495; CHECK:       @ %bb.0:
496; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
497; CHECK-NEXT:    vorr q9, q8, q8
498; CHECK-NEXT:    vorr q10, q8, q8
499; CHECK-NEXT:    vorr q11, q8, q8
500; CHECK-NEXT:    vst4.32 {d16[1], d18[1], d20[1], d22[1]}, [r0]
501; CHECK-NEXT:    mov pc, lr
502	%tmp1 = load <4 x float>, ptr %B
503	call void @llvm.arm.neon.vst4lane.p0.v4f32(ptr %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
504	ret void
505}
506
507; Make sure this doesn't crash; PR10258
; Inserts %b into <8 x i16> %a at the run-time index %c.  The expected output
; goes through a 16-byte-aligned stack slot: the vector is spilled with
; vst1.64, the index is masked with #7 and doubled to a byte offset, the
; halfword is written at that offset with strh, and the result is reloaded
; with vld1.64.
508define <8 x i16> @variable_insertelement(<8 x i16> %a, i16 %b, i32 %c) nounwind readnone {
509; CHECK-LABEL: variable_insertelement:
510; CHECK:       @ %bb.0:
511; CHECK-NEXT:    push {r11, lr}
512; CHECK-NEXT:    mov r11, sp
513; CHECK-NEXT:    sub sp, sp, #24
514; CHECK-NEXT:    bic sp, sp, #15
515; CHECK-NEXT:    ldr lr, [r11, #12]
516; CHECK-NEXT:    vmov d17, r2, r3
517; CHECK-NEXT:    vmov d16, r0, r1
518; CHECK-NEXT:    mov r1, sp
519; CHECK-NEXT:    and r0, lr, #7
520; CHECK-NEXT:    mov r2, r1
521; CHECK-NEXT:    ldrh r12, [r11, #8]
522; CHECK-NEXT:    lsl r0, r0, #1
523; CHECK-NEXT:    vst1.64 {d16, d17}, [r2:128], r0
524; CHECK-NEXT:    strh r12, [r2]
525; CHECK-NEXT:    vld1.64 {d16, d17}, [r1:128]
526; CHECK-NEXT:    vmov r0, r1, d16
527; CHECK-NEXT:    vmov r2, r3, d17
528; CHECK-NEXT:    mov sp, r11
529; CHECK-NEXT:    pop {r11, lr}
530; CHECK-NEXT:    mov pc, lr
531    %r = insertelement <8 x i16> %a, i16 %b, i32 %c
532    ret <8 x i16> %r
533}
534
; Declarations of the vst4lane intrinsics called by the tests above: a
; destination pointer, four source vectors, and two i32 operands.  As used
; in this file, the first i32 selects the lane and the second is the
; alignment.  The first group covers 64-bit vectors, the second group
; 128-bit vectors.
535declare void @llvm.arm.neon.vst4lane.p0.v8i8(ptr, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind
536declare void @llvm.arm.neon.vst4lane.p0.v4i16(ptr, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind
537declare void @llvm.arm.neon.vst4lane.p0.v2i32(ptr, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind
538declare void @llvm.arm.neon.vst4lane.p0.v2f32(ptr, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind
539
540declare void @llvm.arm.neon.vst4lane.p0.v8i16(ptr, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind
541declare void @llvm.arm.neon.vst4lane.p0.v4i32(ptr, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind
542declare void @llvm.arm.neon.vst4lane.p0.v4f32(ptr, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind
543