; xref: /llvm-project/llvm/test/CodeGen/ARM/vldlane.ll (revision bed1c7f061aa12417aa081e334afdba45767b938)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon | FileCheck %s --check-prefixes=CHECK,DEFAULT
; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft -mattr=+neon -regalloc=basic | FileCheck %s --check-prefixes=CHECK,BASIC
;Check the (default) alignment value.
define <8 x i8> @vld1lanei8(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld1lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.8 {d16[3]}, [r0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, ptr %B
  %tmp2 = load i8, ptr %A, align 8
  %tmp3 = insertelement <8 x i8> %tmp1, i8 %tmp2, i32 3
  ret <8 x i8> %tmp3
}

;Check the alignment value.  Max for this instruction is 16 bits:
define <4 x i16> @vld1lanei16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld1lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.16 {d16[2]}, [r0:16]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, ptr %B
  %tmp2 = load i16, ptr %A, align 8
  %tmp3 = insertelement <4 x i16> %tmp1, i16 %tmp2, i32 2
  ret <4 x i16> %tmp3
}

;Check the alignment value.  Max for this instruction is 32 bits:
define <2 x i32> @vld1lanei32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld1lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, ptr %B
  %tmp2 = load i32, ptr %A, align 8
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

;Check the alignment value.  Legal values are none or :32.
define <2 x i32> @vld1lanei32a32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld1lanei32a32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, ptr %B
  %tmp2 = load i32, ptr %A, align 4
  %tmp3 = insertelement <2 x i32> %tmp1, i32 %tmp2, i32 1
  ret <2 x i32> %tmp3
}

;Same as vld1lanei32 but with a float element; still gets the :32 suffix.
define <2 x float> @vld1lanef(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld1lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vld1.32 {d16[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, ptr %B
  %tmp2 = load float, ptr %A, align 4
  %tmp3 = insertelement <2 x float> %tmp1, float %tmp2, i32 1
  ret <2 x float> %tmp3
}

;Q-register lane load: lane 9 of <16 x i8> selects the high D register (d17[1]).
define <16 x i8> @vld1laneQi8(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld1laneQi8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.8 {d17[1]}, [r0]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <16 x i8>, ptr %B
  %tmp2 = load i8, ptr %A, align 8
  %tmp3 = insertelement <16 x i8> %tmp1, i8 %tmp2, i32 9
  ret <16 x i8> %tmp3
}

;Q-register lane load with 16-bit lanes; lane 5 maps to d17[1], with :16 alignment.
define <8 x i16> @vld1laneQi16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld1laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.16 {d17[1]}, [r0:16]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, ptr %B
  %tmp2 = load i16, ptr %A, align 8
  %tmp3 = insertelement <8 x i16> %tmp1, i16 %tmp2, i32 5
  ret <8 x i16> %tmp3
}

;Q-register lane load with 32-bit lanes; lane 3 maps to d17[1], with :32 alignment.
define <4 x i32> @vld1laneQi32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld1laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.32 {d17[1]}, [r0:32]
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, ptr %B
  %tmp2 = load i32, ptr %A, align 8
  %tmp3 = insertelement <4 x i32> %tmp1, i32 %tmp2, i32 3
  ret <4 x i32> %tmp3
}

;Lane 0 of <4 x float> stays in the low D register (d16[0]).
define <4 x float> @vld1laneQf(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld1laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32]
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, ptr %B
  %tmp2 = load float, ptr %A
  %tmp3 = insertelement <4 x float> %tmp1, float %tmp2, i32 0
  ret <4 x float> %tmp3
}

%struct.__neon_int8x8x2_t = type { <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x2_t = type { <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x2_t = type { <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x2_t = type { <2 x float>, <2 x float> }

%struct.__neon_int16x8x2_t = type { <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x2_t = type { <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x2_t = type { <4 x float>, <4 x float> }

;Check the alignment value.  Max for this instruction is 16 bits:
define <8 x i8> @vld2lanei8(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld2lanei8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.8 {d16[1], d17[1]}, [r0:16]
; CHECK-NEXT:    vadd.i8 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, ptr %B
  %tmp2 = call %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x2_t %tmp2, 1
  %tmp5 = add <8 x i8> %tmp3, %tmp4
  ret <8 x i8> %tmp5
}

;Check the alignment value.  Max for this instruction is 32 bits:
define <4 x i16> @vld2lanei16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld2lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.16 {d16[1], d17[1]}, [r0:32]
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, ptr %B
  %tmp2 = call %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x2_t %tmp2, 1
  %tmp5 = add <4 x i16> %tmp3, %tmp4
  ret <4 x i16> %tmp5
}

;Alignment 1: no alignment suffix on the address register.
define <2 x i32> @vld2lanei32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld2lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, ptr %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  ret <2 x i32> %tmp5
}

;Check for a post-increment updating load.
define <2 x i32> @vld2lanei32_update(ptr %ptr, ptr %B) nounwind {
; DEFAULT-LABEL: vld2lanei32_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    ldr r3, [r0]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vld2.32 {d16[1], d17[1]}, [r3]!
; DEFAULT-NEXT:    vadd.i32 d16, d16, d17
; DEFAULT-NEXT:    str r3, [r0]
; DEFAULT-NEXT:    vmov r2, r1, d16
; DEFAULT-NEXT:    mov r0, r2
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld2lanei32_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    mov r2, r1
; BASIC-NEXT:    mov r1, r0
; BASIC-NEXT:    vldr d16, [r2]
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr d17, d16, d16
; BASIC-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]!
; BASIC-NEXT:    vadd.i32 d16, d16, d17
; BASIC-NEXT:    str r0, [r1]
; BASIC-NEXT:    vmov r2, r3, d16
; BASIC-NEXT:    mov r0, r2
; BASIC-NEXT:    mov r1, r3
; BASIC-NEXT:    mov pc, lr
  %A = load ptr, ptr %ptr
  %tmp1 = load <2 x i32>, ptr %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32, ptr %A, i32 2
  store ptr %tmp6, ptr %ptr
  ret <2 x i32> %tmp5
}

;An increment of 12 bytes (3 x i32, not the transfer size) cannot use the "!"
;post-increment form, so a register increment is required.
define <2 x i32> @vld2lanei32_odd_update(ptr %ptr, ptr %B) nounwind {
; DEFAULT-LABEL: vld2lanei32_odd_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    mov r1, #12
; DEFAULT-NEXT:    ldr r3, [r0]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vld2.32 {d16[1], d17[1]}, [r3], r1
; DEFAULT-NEXT:    vadd.i32 d16, d16, d17
; DEFAULT-NEXT:    str r3, [r0]
; DEFAULT-NEXT:    vmov r2, r1, d16
; DEFAULT-NEXT:    mov r0, r2
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld2lanei32_odd_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    mov r2, r1
; BASIC-NEXT:    mov r1, r0
; BASIC-NEXT:    vldr d16, [r2]
; BASIC-NEXT:    mov r2, #12
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr d17, d16, d16
; BASIC-NEXT:    vld2.32 {d16[1], d17[1]}, [r0], r2
; BASIC-NEXT:    vadd.i32 d16, d16, d17
; BASIC-NEXT:    str r0, [r1]
; BASIC-NEXT:    vmov r2, r3, d16
; BASIC-NEXT:    mov r0, r2
; BASIC-NEXT:    mov r1, r3
; BASIC-NEXT:    mov pc, lr
  %A = load ptr, ptr %ptr
  %tmp1 = load <2 x i32>, ptr %B
  %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
  %tmp5 = add <2 x i32> %tmp3, %tmp4
  %tmp6 = getelementptr i32, ptr %A, i32 3
  store ptr %tmp6, ptr %ptr
  ret <2 x i32> %tmp5
}

;Float variant of vld2lanei32.
define <2 x float> @vld2lanef(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld2lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vld2.32 {d16[1], d17[1]}, [r0]
; CHECK-NEXT:    vadd.f32 d16, d16, d17
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, ptr %B
  %tmp2 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0(ptr %A, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x2_t %tmp2, 1
  %tmp5 = fadd <2 x float> %tmp3, %tmp4
  ret <2 x float> %tmp5
}

;Check the (default) alignment.
define <8 x i16> @vld2laneQi16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld2laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vld2.16 {d17[1], d19[1]}, [r0]
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, ptr %B
  %tmp2 = call %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1)
  %tmp3 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x2_t %tmp2, 1
  %tmp5 = add <8 x i16> %tmp3, %tmp4
  ret <8 x i16> %tmp5
}

;Check the alignment value.  Max for this instruction is 64 bits:
define <4 x i32> @vld2laneQi32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld2laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vld2.32 {d17[0], d19[0]}, [r0:64]
; CHECK-NEXT:    vadd.i32 q8, q8, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, ptr %B
  %tmp2 = call %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16)
  %tmp3 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 1
  %tmp5 = add <4 x i32> %tmp3, %tmp4
  ret <4 x i32> %tmp5
}

;Float Q-register variant.
define <4 x float> @vld2laneQf(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld2laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vld2.32 {d16[1], d18[1]}, [r0]
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, ptr %B
  %tmp2 = call %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0(ptr %A, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x2_t %tmp2, 1
  %tmp5 = fadd <4 x float> %tmp3, %tmp4
  ret <4 x float> %tmp5
}

declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x2_t @llvm.arm.neon.vld2lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2lane.v2f32.p0(ptr, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x2_t @llvm.arm.neon.vld2lane.v8i16.p0(ptr, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x2_t @llvm.arm.neon.vld2lane.v4i32.p0(ptr, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x2_t @llvm.arm.neon.vld2lane.v4f32.p0(ptr, <4 x float>, <4 x float>, i32, i32) nounwind readonly

%struct.__neon_int8x8x3_t = type { <8 x i8>,  <8 x i8>,  <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x3_t = type { <2 x i32>, <2 x i32>, <2 x i32> }
%struct.__neon_float32x2x3_t = type { <2 x float>, <2 x float>, <2 x float> }

%struct.__neon_int16x8x3_t = type { <8 x i16>, <8 x i16>, <8 x i16> }
%struct.__neon_int32x4x3_t = type { <4 x i32>, <4 x i32>, <4 x i32> }
%struct.__neon_float32x4x3_t = type { <4 x float>, <4 x float>, <4 x float> }

;Basic 3-element lane load; register choice differs per -regalloc mode.
define <8 x i8> @vld3lanei8(ptr %A, ptr %B) nounwind {
; DEFAULT-LABEL: vld3lanei8:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.8 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.i8 d20, d16, d17
; DEFAULT-NEXT:    vadd.i8 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanei8:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.8 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.i8 d16, d18, d19
; BASIC-NEXT:    vadd.i8 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp1 = load <8 x i8>, ptr %B
  %tmp2 = call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp2, 2
  %tmp6 = add <8 x i8> %tmp3, %tmp4
  %tmp7 = add <8 x i8> %tmp5, %tmp6
  ret <8 x i8> %tmp7
}

;Check the (default) alignment value.  VLD3 does not support alignment.
define <4 x i16> @vld3lanei16(ptr %A, ptr %B) nounwind {
; DEFAULT-LABEL: vld3lanei16:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.16 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.i16 d20, d16, d17
; DEFAULT-NEXT:    vadd.i16 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanei16:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.16 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.i16 d16, d18, d19
; BASIC-NEXT:    vadd.i16 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, ptr %B
  %tmp2 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x3_t %tmp2, 2
  %tmp6 = add <4 x i16> %tmp3, %tmp4
  %tmp7 = add <4 x i16> %tmp5, %tmp6
  ret <4 x i16> %tmp7
}

;32-bit lanes, no alignment suffix.
define <2 x i32> @vld3lanei32(ptr %A, ptr %B) nounwind {
; DEFAULT-LABEL: vld3lanei32:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.32 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.i32 d20, d16, d17
; DEFAULT-NEXT:    vadd.i32 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanei32:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.32 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.i32 d16, d18, d19
; BASIC-NEXT:    vadd.i32 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, ptr %B
  %tmp2 = call %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x3_t %tmp2, 2
  %tmp6 = add <2 x i32> %tmp3, %tmp4
  %tmp7 = add <2 x i32> %tmp5, %tmp6
  ret <2 x i32> %tmp7
}

;Float variant of vld3lanei32.
define <2 x float> @vld3lanef(ptr %A, ptr %B) nounwind {
; DEFAULT-LABEL: vld3lanef:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vldr d16, [r1]
; DEFAULT-NEXT:    vorr d17, d16, d16
; DEFAULT-NEXT:    vorr d18, d16, d16
; DEFAULT-NEXT:    vld3.32 {d16[1], d17[1], d18[1]}, [r0]
; DEFAULT-NEXT:    vadd.f32 d20, d16, d17
; DEFAULT-NEXT:    vadd.f32 d16, d18, d20
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3lanef:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vldr d18, [r1]
; BASIC-NEXT:    vorr d19, d18, d18
; BASIC-NEXT:    vorr d20, d18, d18
; BASIC-NEXT:    vld3.32 {d18[1], d19[1], d20[1]}, [r0]
; BASIC-NEXT:    vadd.f32 d16, d18, d19
; BASIC-NEXT:    vadd.f32 d16, d20, d16
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, ptr %B
  %tmp2 = call %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0(ptr %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x3_t %tmp2, 2
  %tmp6 = fadd <2 x float> %tmp3, %tmp4
  %tmp7 = fadd <2 x float> %tmp5, %tmp6
  ret <2 x float> %tmp7
}

;Check the (default) alignment value.  VLD3 does not support alignment.
define <8 x i16> @vld3laneQi16(ptr %A, ptr %B) nounwind {
; DEFAULT-LABEL: vld3laneQi16:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.16 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQi16:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.16 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT:    vadd.i16 q8, q9, q10
; BASIC-NEXT:    vadd.i16 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, ptr %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}

;Check for a post-increment updating load with register increment.
define <8 x i16> @vld3laneQi16_update(ptr %ptr, ptr %B, i32 %inc) nounwind {
; DEFAULT-LABEL: vld3laneQi16_update:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    .save {r11, lr}
; DEFAULT-NEXT:    push {r11, lr}
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    lsl r1, r2, #1
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    ldr lr, [r0]
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.16 {d16[1], d18[1], d20[1]}, [lr], r1
; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
; DEFAULT-NEXT:    str lr, [r0]
; DEFAULT-NEXT:    vmov r12, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov r0, r12
; DEFAULT-NEXT:    pop {r11, lr}
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQi16_update:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    .save {r11, lr}
; BASIC-NEXT:    push {r11, lr}
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    mov r3, r0
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    lsl r1, r2, #1
; BASIC-NEXT:    ldr r0, [r0]
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.16 {d18[1], d20[1], d22[1]}, [r0], r1
; BASIC-NEXT:    vadd.i16 q8, q9, q10
; BASIC-NEXT:    vadd.i16 q8, q11, q8
; BASIC-NEXT:    str r0, [r3]
; BASIC-NEXT:    vmov r1, lr, d16
; BASIC-NEXT:    vmov r2, r12, d17
; BASIC-NEXT:    mov r0, r1
; BASIC-NEXT:    mov r1, lr
; BASIC-NEXT:    mov r3, r12
; BASIC-NEXT:    pop {r11, lr}
; BASIC-NEXT:    mov pc, lr
  %A = load ptr, ptr %ptr
  %tmp1 = load <8 x i16>, ptr %B
  %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  %tmp8 = getelementptr i16, ptr %A, i32 %inc
  store ptr %tmp8, ptr %ptr
  ret <8 x i16> %tmp7
}

;Lane 3 of a <4 x i32> selects the odd D registers (d17/d19/d21).
define <4 x i32> @vld3laneQi32(ptr %A, ptr %B) nounwind {
; DEFAULT-LABEL: vld3laneQi32:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.32 {d17[1], d19[1], d21[1]}, [r0]
; DEFAULT-NEXT:    vadd.i32 q12, q8, q9
; DEFAULT-NEXT:    vadd.i32 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQi32:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.32 {d19[1], d21[1], d23[1]}, [r0]
; BASIC-NEXT:    vadd.i32 q8, q9, q10
; BASIC-NEXT:    vadd.i32 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, ptr %B
  %tmp2 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 3, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x3_t %tmp2, 2
  %tmp6 = add <4 x i32> %tmp3, %tmp4
  %tmp7 = add <4 x i32> %tmp5, %tmp6
  ret <4 x i32> %tmp7
}

;Float Q-register variant.
define <4 x float> @vld3laneQf(ptr %A, ptr %B) nounwind {
; DEFAULT-LABEL: vld3laneQf:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    vld1.64 {d16, d17}, [r1]
; DEFAULT-NEXT:    vorr q9, q8, q8
; DEFAULT-NEXT:    vorr q10, q8, q8
; DEFAULT-NEXT:    vld3.32 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT:    vadd.f32 q12, q8, q9
; DEFAULT-NEXT:    vadd.f32 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: vld3laneQf:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    vld1.64 {d18, d19}, [r1]
; BASIC-NEXT:    vorr q10, q9, q9
; BASIC-NEXT:    vorr q11, q9, q9
; BASIC-NEXT:    vld3.32 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT:    vadd.f32 q8, q9, q10
; BASIC-NEXT:    vadd.f32 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, ptr %B
  %tmp2 = call %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0(ptr %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x3_t %tmp2, 2
  %tmp6 = fadd <4 x float> %tmp3, %tmp4
  %tmp7 = fadd <4 x float> %tmp5, %tmp6
  ret <4 x float> %tmp7
}

declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x3_t @llvm.arm.neon.vld3lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x2x3_t @llvm.arm.neon.vld3lane.v2f32.p0(ptr, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly

declare %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0(ptr, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3lane.v4i32.p0(ptr, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
declare %struct.__neon_float32x4x3_t @llvm.arm.neon.vld3lane.v4f32.p0(ptr, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
657
658%struct.__neon_int8x8x4_t = type { <8 x i8>,  <8 x i8>,  <8 x i8>,  <8 x i8> }
659%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
660%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
661%struct.__neon_float32x2x4_t = type { <2 x float>, <2 x float>, <2 x float>, <2 x float> }
662
663%struct.__neon_int16x8x4_t = type { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> }
664%struct.__neon_int32x4x4_t = type { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> }
665%struct.__neon_float32x4x4_t = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
666
667;Check the alignment value.  Max for this instruction is 32 bits:
668define <8 x i8> @vld4lanei8(ptr %A, ptr %B) nounwind {
669; CHECK-LABEL: vld4lanei8:
670; CHECK:       @ %bb.0:
671; CHECK-NEXT:    vldr d16, [r1]
672; CHECK-NEXT:    vorr d17, d16, d16
673; CHECK-NEXT:    vorr d18, d16, d16
674; CHECK-NEXT:    vorr d19, d16, d16
675; CHECK-NEXT:    vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]
676; CHECK-NEXT:    vadd.i8 d16, d16, d17
677; CHECK-NEXT:    vadd.i8 d20, d18, d19
678; CHECK-NEXT:    vadd.i8 d16, d16, d20
679; CHECK-NEXT:    vmov r0, r1, d16
680; CHECK-NEXT:    mov pc, lr
681  %tmp1 = load <8 x i8>, ptr %B
682  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
683  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
684  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
685  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
686  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
687  %tmp7 = add <8 x i8> %tmp3, %tmp4
688  %tmp8 = add <8 x i8> %tmp5, %tmp6
689  %tmp9 = add <8 x i8> %tmp7, %tmp8
690  ret <8 x i8> %tmp9
691}
692
693;Check for a post-increment updating load.
694define <8 x i8> @vld4lanei8_update(ptr %ptr, ptr %B) nounwind {
695; DEFAULT-LABEL: vld4lanei8_update:
696; DEFAULT:       @ %bb.0:
697; DEFAULT-NEXT:    vldr d16, [r1]
698; DEFAULT-NEXT:    vorr d17, d16, d16
699; DEFAULT-NEXT:    ldr r3, [r0]
700; DEFAULT-NEXT:    vorr d18, d16, d16
701; DEFAULT-NEXT:    vorr d19, d16, d16
702; DEFAULT-NEXT:    vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r3:32]!
703; DEFAULT-NEXT:    vadd.i8 d16, d16, d17
704; DEFAULT-NEXT:    vadd.i8 d20, d18, d19
705; DEFAULT-NEXT:    str r3, [r0]
706; DEFAULT-NEXT:    vadd.i8 d16, d16, d20
707; DEFAULT-NEXT:    vmov r2, r1, d16
708; DEFAULT-NEXT:    mov r0, r2
709; DEFAULT-NEXT:    mov pc, lr
710;
711; BASIC-LABEL: vld4lanei8_update:
712; BASIC:       @ %bb.0:
713; BASIC-NEXT:    vldr d16, [r1]
714; BASIC-NEXT:    mov r3, r0
715; BASIC-NEXT:    vorr d17, d16, d16
716; BASIC-NEXT:    ldr r0, [r0]
717; BASIC-NEXT:    vorr d18, d16, d16
718; BASIC-NEXT:    vorr d19, d16, d16
719; BASIC-NEXT:    vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r0:32]!
720; BASIC-NEXT:    vadd.i8 d16, d16, d17
721; BASIC-NEXT:    vadd.i8 d20, d18, d19
722; BASIC-NEXT:    str r0, [r3]
723; BASIC-NEXT:    vadd.i8 d16, d16, d20
724; BASIC-NEXT:    vmov r1, r2, d16
725; BASIC-NEXT:    mov r0, r1
726; BASIC-NEXT:    mov r1, r2
727; BASIC-NEXT:    mov pc, lr
728  %A = load ptr, ptr %ptr
729  %tmp1 = load <8 x i8>, ptr %B
730  %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0(ptr %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
731  %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
732  %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
733  %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
734  %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
735  %tmp7 = add <8 x i8> %tmp3, %tmp4
736  %tmp8 = add <8 x i8> %tmp5, %tmp6
737  %tmp9 = add <8 x i8> %tmp7, %tmp8
738  %tmp10 = getelementptr i8, ptr %A, i32 4
739  store ptr %tmp10, ptr %ptr
740  ret <8 x i8> %tmp9
741}
742
743;Check that a power-of-two alignment smaller than the total size of the memory
744;being loaded is ignored.
define <4 x i16> @vld4lanei16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4lanei16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r0]
; CHECK-NEXT:    vadd.i16 d16, d16, d17
; CHECK-NEXT:    vadd.i16 d20, d18, d19
; CHECK-NEXT:    vadd.i16 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i16>, ptr %B
  ; The alignment operand is 4 bytes, smaller than the 8 bytes this vld4.16
  ; accesses, so the expected output above has a bare [r0] with no ":<bits>"
  ; address qualifier.
  %tmp2 = call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr %A, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 4)
  %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x4x4_t %tmp2, 3
  ; Sum all four returned vectors so every result register of the vld4 is used.
  %tmp7 = add <4 x i16> %tmp3, %tmp4
  %tmp8 = add <4 x i16> %tmp5, %tmp6
  %tmp9 = add <4 x i16> %tmp7, %tmp8
  ret <4 x i16> %tmp9
}
769
770;Check the alignment value.  An 8-byte alignment is allowed here even though
771;it is smaller than the total size of the memory being loaded.
define <2 x i32> @vld4lanei32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4lanei32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0:64]
; CHECK-NEXT:    vadd.i32 d16, d16, d17
; CHECK-NEXT:    vadd.i32 d20, d18, d19
; CHECK-NEXT:    vadd.i32 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x i32>, ptr %B
  ; The 8-byte alignment operand is accepted here even though the vld4.32
  ; touches 16 bytes; it appears as the 64-bit ":64" qualifier above.
  %tmp2 = call %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0(ptr %A, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 8)
  %tmp3 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x2x4_t %tmp2, 3
  ; Sum all four returned vectors so every result register of the vld4 is used.
  %tmp7 = add <2 x i32> %tmp3, %tmp4
  %tmp8 = add <2 x i32> %tmp5, %tmp6
  %tmp9 = add <2 x i32> %tmp7, %tmp8
  ret <2 x i32> %tmp9
}
796
;Check a float vld4lane with a 1-byte alignment operand: no address
;qualifier is expected on the instruction.
define <2 x float> @vld4lanef(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4lanef:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vldr d16, [r1]
; CHECK-NEXT:    vorr d17, d16, d16
; CHECK-NEXT:    vorr d18, d16, d16
; CHECK-NEXT:    vorr d19, d16, d16
; CHECK-NEXT:    vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r0]
; CHECK-NEXT:    vadd.f32 d16, d16, d17
; CHECK-NEXT:    vadd.f32 d20, d18, d19
; CHECK-NEXT:    vadd.f32 d16, d16, d20
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <2 x float>, ptr %B
  %tmp2 = call %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0(ptr %A, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, <2 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x2x4_t %tmp2, 3
  ; Sum all four returned vectors so every result register of the vld4 is used.
  %tmp7 = fadd <2 x float> %tmp3, %tmp4
  %tmp8 = fadd <2 x float> %tmp5, %tmp6
  %tmp9 = fadd <2 x float> %tmp7, %tmp8
  ret <2 x float> %tmp9
}
821
822;Check the alignment value.  Max for this instruction is 64 bits:
define <8 x i16> @vld4laneQi16(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4laneQi16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r0:64]
; CHECK-NEXT:    vadd.i16 q8, q8, q9
; CHECK-NEXT:    vadd.i16 q12, q10, q11
; CHECK-NEXT:    vadd.i16 q8, q8, q12
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <8 x i16>, ptr %B
  ; The 16-byte alignment operand exceeds this instruction's 64-bit maximum
  ; (see the header comment), so the expected output uses the ":64" qualifier.
  %tmp2 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0(ptr %A, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 16)
  %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int16x8x4_t %tmp2, 3
  ; Sum all four returned vectors so every result register of the vld4 is used.
  %tmp7 = add <8 x i16> %tmp3, %tmp4
  %tmp8 = add <8 x i16> %tmp5, %tmp6
  %tmp9 = add <8 x i16> %tmp7, %tmp8
  ret <8 x i16> %tmp9
}
848
849;Check the (default) alignment.
define <4 x i32> @vld4laneQi32(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4laneQi32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vld4.32 {d17[0], d19[0], d21[0], d23[0]}, [r0]
; CHECK-NEXT:    vadd.i32 q8, q8, q9
; CHECK-NEXT:    vadd.i32 q12, q10, q11
; CHECK-NEXT:    vadd.i32 q8, q8, q12
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x i32>, ptr %B
  ; Lane index 2 of a v4i32 falls in the high D register of each Q pair,
  ; hence the d17[0]/d19[0]/d21[0]/d23[0] operands in the expected output.
  ; The alignment operand of 1 leaves the address unqualified.
  %tmp2 = call %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0(ptr %A, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1)
  %tmp3 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_int32x4x4_t %tmp2, 3
  ; Sum all four returned vectors so every result register of the vld4 is used.
  %tmp7 = add <4 x i32> %tmp3, %tmp4
  %tmp8 = add <4 x i32> %tmp5, %tmp6
  %tmp9 = add <4 x i32> %tmp7, %tmp8
  ret <4 x i32> %tmp9
}
875
;Check a Q-register float vld4lane with a 1-byte alignment operand: no
;address qualifier is expected on the instruction.
define <4 x float> @vld4laneQf(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: vld4laneQf:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vld1.64 {d16, d17}, [r1]
; CHECK-NEXT:    vorr q9, q8, q8
; CHECK-NEXT:    vorr q10, q8, q8
; CHECK-NEXT:    vorr q11, q8, q8
; CHECK-NEXT:    vld4.32 {d16[1], d18[1], d20[1], d22[1]}, [r0]
; CHECK-NEXT:    vadd.f32 q8, q8, q9
; CHECK-NEXT:    vadd.f32 q12, q10, q11
; CHECK-NEXT:    vadd.f32 q8, q8, q12
; CHECK-NEXT:    vmov r0, r1, d16
; CHECK-NEXT:    vmov r2, r3, d17
; CHECK-NEXT:    mov pc, lr
  %tmp1 = load <4 x float>, ptr %B
  %tmp2 = call %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0(ptr %A, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1, i32 1)
  %tmp3 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 0
  %tmp4 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 1
  %tmp5 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 2
  %tmp6 = extractvalue %struct.__neon_float32x4x4_t %tmp2, 3
  ; Sum all four returned vectors so every result register of the vld4 is used.
  %tmp7 = fadd <4 x float> %tmp3, %tmp4
  %tmp8 = fadd <4 x float> %tmp5, %tmp6
  %tmp9 = fadd <4 x float> %tmp7, %tmp8
  ret <4 x float> %tmp9
}
901
902declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8.p0(ptr, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
903declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16.p0(ptr, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
904declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32.p0(ptr, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
905declare %struct.__neon_float32x2x4_t @llvm.arm.neon.vld4lane.v2f32.p0(ptr, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind readonly
906
907declare %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4lane.v8i16.p0(ptr, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind readonly
908declare %struct.__neon_int32x4x4_t @llvm.arm.neon.vld4lane.v4i32.p0(ptr, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind readonly
909declare %struct.__neon_float32x4x4_t @llvm.arm.neon.vld4lane.v4f32.p0(ptr, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind readonly
910
911; Radar 8776599: If one of the operands to a QQQQ REG_SEQUENCE is a register
912; in the QPR_VFP2 regclass, it needs to be copied to a QPR regclass because
913; we don't currently have a QQQQ_VFP2 super-regclass.  (The "0" for the low
914; part of %ins67 is supposed to be loaded by a VLDRS instruction in this test.)
define <8 x i16> @test_qqqq_regsequence_subreg([6 x i64] %b) nounwind {
; DEFAULT-LABEL: test_qqqq_regsequence_subreg:
; DEFAULT:       @ %bb.0:
; DEFAULT-NEXT:    add r0, sp, #24
; DEFAULT-NEXT:    vld1.32 {d21[0]}, [r0:32]
; DEFAULT-NEXT:    add r0, sp, #28
; DEFAULT-NEXT:    vmov.i32 d20, #0x0
; DEFAULT-NEXT:    vld1.32 {d21[1]}, [r0:32]
; DEFAULT-NEXT:    vld3.16 {d16[1], d18[1], d20[1]}, [r0]
; DEFAULT-NEXT:    vadd.i16 q12, q8, q9
; DEFAULT-NEXT:    vadd.i16 q8, q10, q12
; DEFAULT-NEXT:    vmov r0, r1, d16
; DEFAULT-NEXT:    vmov r2, r3, d17
; DEFAULT-NEXT:    mov pc, lr
;
; BASIC-LABEL: test_qqqq_regsequence_subreg:
; BASIC:       @ %bb.0:
; BASIC-NEXT:    add r0, sp, #24
; BASIC-NEXT:    vld1.32 {d23[0]}, [r0:32]
; BASIC-NEXT:    add r0, sp, #28
; BASIC-NEXT:    vmov.i32 d22, #0x0
; BASIC-NEXT:    vld1.32 {d23[1]}, [r0:32]
; BASIC-NEXT:    vld3.16 {d18[1], d20[1], d22[1]}, [r0]
; BASIC-NEXT:    vadd.i16 q8, q9, q10
; BASIC-NEXT:    vadd.i16 q8, q11, q8
; BASIC-NEXT:    vmov r0, r1, d16
; BASIC-NEXT:    vmov r2, r3, d17
; BASIC-NEXT:    mov pc, lr
  ; Build an i128 whose high 64 bits come from the last element of the stack
  ; argument and whose low 64 bits are the constant 0; that zero half is
  ; materialized separately (the vmov.i32 #0x0 in the expected output).
  %tmp63 = extractvalue [6 x i64] %b, 5
  %tmp64 = zext i64 %tmp63 to i128
  %tmp65 = shl i128 %tmp64, 64
  %ins67 = or i128 %tmp65, 0
  %tmp78 = bitcast i128 %ins67 to <8 x i16>
  ; Feed the mixed value as only the third operand of a vld3lane so it must
  ; be placed into a multi-register sequence with the two undef operands.
  %vld3_lane = tail call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16.p0(ptr undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> %tmp78, i32 1, i32 2)
  %tmp3 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 0
  %tmp4 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 1
  %tmp5 = extractvalue %struct.__neon_int16x8x3_t %vld3_lane, 2
  ; Sum all three results so each register of the sequence is used.
  %tmp6 = add <8 x i16> %tmp3, %tmp4
  %tmp7 = add <8 x i16> %tmp5, %tmp6
  ret <8 x i16> %tmp7
}
956
957declare void @llvm.trap() nounwind
958