; xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s

; i32

; <2 x i32> gather through a loaded vector of pointers; expanded to scalar ldr loads.
define arm_aapcs_vfpcc <2 x i32> @ptr_v2i32(ptr %offptr) {
; CHECK-LABEL: ptr_v2i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    ldr r1, [r1]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  %gather = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32> %gather
}

; <4 x i32> gather: pointers fit one q register, so a single MVE vldrw gather is emitted.
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i32(ptr %offptr) {
; CHECK-LABEL: ptr_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; <8 x i32> gather through 8 loaded pointers; scalarized into individual ldr loads.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i32(ptr %offptr) {
; CHECK-LABEL: ptr_v8i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r3, r12, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldr r7, [r2]
; CHECK-NEXT:    vmov r2, r4, d0
; CHECK-NEXT:    ldr r6, [r1]
; CHECK-NEXT:    ldr r3, [r3]
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr.w r1, [r12]
; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
; CHECK-NEXT:    ldr.w r5, [lr]
; CHECK-NEXT:    vmov q1[3], q1[1], r1, r7
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %gather
}

; <16 x i32> gather through 16 loaded pointers; fully scalarized into ldr loads.
define arm_aapcs_vfpcc <16 x i32> @ptr_v16i32(ptr %offptr) {
; CHECK-LABEL: ptr_v16i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldrw.u32 q2, [r0, #32]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r3, lr, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d1
; CHECK-NEXT:    ldr r7, [r2]
; CHECK-NEXT:    vmov r2, r6, d0
; CHECK-NEXT:    ldr.w r12, [r1]
; CHECK-NEXT:    ldr r3, [r3]
; CHECK-NEXT:    ldr r4, [r4]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov q3[2], q3[0], r3, r12
; CHECK-NEXT:    ldr.w r1, [lr]
; CHECK-NEXT:    vmov q3[3], q3[1], r1, r7
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r4
; CHECK-NEXT:    vmov r2, r4, d3
; CHECK-NEXT:    vmov q0[3], q0[1], r6, r5
; CHECK-NEXT:    vmov r6, r5, d2
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov q1[2], q1[0], r6, r2
; CHECK-NEXT:    ldr r6, [r4]
; CHECK-NEXT:    vmov r0, r2, d5
; CHECK-NEXT:    vmov q1[3], q1[1], r5, r6
; CHECK-NEXT:    vmov r6, r5, d4
; CHECK-NEXT:    ldr r0, [r0]
; CHECK-NEXT:    ldr r6, [r6]
; CHECK-NEXT:    ldr r2, [r2]
; CHECK-NEXT:    ldr r5, [r5]
; CHECK-NEXT:    vmov q2[2], q2[0], r6, r0
; CHECK-NEXT:    vmov q2[3], q2[1], r5, r2
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <16 x ptr>, ptr %offptr, align 4
  %gather = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
  ret <16 x i32> %gather
}

; f32

; <2 x float> gather; expanded to two scalar vldr loads.
define arm_aapcs_vfpcc <2 x float> @ptr_v2f32(ptr %offptr) {
; CHECK-LABEL: ptr_v2f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    vldr s1, [r1]
; CHECK-NEXT:    vldr s0, [r0]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  %gather = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %offs, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
  ret <2 x float> %gather
}

; <4 x float> gather; lowered to a single MVE vldrw gather.
define arm_aapcs_vfpcc <4 x float> @ptr_v4f32(ptr %offptr) {
; CHECK-LABEL: ptr_v4f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float> %gather
}

; <8 x float> gather; scalarized into individual vldr loads.
define arm_aapcs_vfpcc <8 x float> @ptr_v8f32(ptr %offptr) {
; CHECK-LABEL: ptr_v8f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r12, r2, d1
; CHECK-NEXT:    vmov lr, r1, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r0, r3, d1
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vldr s3, [r2]
; CHECK-NEXT:    vldr s2, [r12]
; CHECK-NEXT:    vldr s1, [r1]
; CHECK-NEXT:    vldr s0, [lr]
; CHECK-NEXT:    vldr s7, [r3]
; CHECK-NEXT:    vldr s6, [r0]
; CHECK-NEXT:    vldr s5, [r5]
; CHECK-NEXT:    vldr s4, [r4]
; CHECK-NEXT:    pop {r4, r5, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef)
  ret <8 x float> %gather
}

; i16

; <8 x i16> gather; scalarized into ldrh loads inserted lane-by-lane.
define arm_aapcs_vfpcc <8 x i16> @ptr_i16(ptr %offptr) {
; CHECK-LABEL: ptr_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vmov r3, r12, d1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r6, [r3]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrh.w r3, [lr]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    ldrh.w r12, [r12]
; CHECK-NEXT:    vmov.16 q0[3], r3
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r6
; CHECK-NEXT:    vmov.16 q0[7], r12
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %gather
}

; <2 x i16> gather sign-extended to <2 x i32>; scalar ldrsh loads plus asr for the high lanes.
define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_sext(ptr %offptr) {
; CHECK-LABEL: ptr_v2i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    ldrsh.w r1, [r1]
; CHECK-NEXT:    ldrsh.w r0, [r0]
; CHECK-NEXT:    vmov q0[2], q0[0], r0, r1
; CHECK-NEXT:    asrs r1, r1, #31
; CHECK-NEXT:    asrs r0, r0, #31
; CHECK-NEXT:    vmov q0[3], q0[1], r0, r1
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = sext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

; <2 x i16> gather zero-extended to <2 x i32>; scalar ldrh loads masked with 0xffff.
define arm_aapcs_vfpcc <2 x i32> @ptr_v2i16_zext(ptr %offptr) {
; CHECK-LABEL: ptr_v2i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldrd r0, r1, [r0]
; CHECK-NEXT:    vmov.i64 q0, #0xffff
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov q1[2], q1[0], r0, r1
; CHECK-NEXT:    vand q0, q1, q0
; CHECK-NEXT:    bx lr
entry:
  %offs = load <2 x ptr>, ptr %offptr, align 4
  %gather = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> %offs, i32 2, <2 x i1> <i1 true, i1 true>, <2 x i16> undef)
  %ext = zext <2 x i16> %gather to <2 x i32>
  ret <2 x i32> %ext
}

; <4 x i16> gather sign-extended to <4 x i32>; uses the extending vldrh.s32 gather.
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(ptr %offptr) {
; CHECK-LABEL: ptr_v4i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrh.s32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = sext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

; <4 x i16> gather zero-extended to <4 x i32>; uses the extending vldrh.u32 gather.
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(ptr %offptr) {
; CHECK-LABEL: ptr_v4i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrh.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  %ext = zext <4 x i16> %gather to <4 x i32>
  ret <4 x i32> %ext
}

; <4 x i16> gather returned as <4 x i16>; same vldrh.u32 gather as the zext case.
define arm_aapcs_vfpcc <4 x i16> @ptr_v4i16(ptr %offptr) {
; CHECK-LABEL: ptr_v4i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrh.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
  ret <4 x i16> %gather
}

; <8 x i16> gather sign-extended to <8 x i32>; scalarized, spilled to the stack, then re-loaded with vldrh.s32.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(ptr %offptr) {
; CHECK-LABEL: ptr_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrh r7, [r1]
; CHECK-NEXT:    ldrh.w r1, [r12]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r0]
; CHECK-NEXT:    vmov r0, r5, d0
; CHECK-NEXT:    ldrh.w r6, [lr]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    vmov.16 q0[3], r4
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q0, [r0]
; CHECK-NEXT:    vldrh.s32 q1, [r0, #8]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = sext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; <8 x i16> gather zero-extended to <8 x i32>; same stack round-trip as the sext case but with vldrh.u32.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_zext(ptr %offptr) {
; CHECK-LABEL: ptr_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrh r7, [r1]
; CHECK-NEXT:    ldrh.w r1, [r12]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r0]
; CHECK-NEXT:    vmov r0, r5, d0
; CHECK-NEXT:    ldrh.w r6, [lr]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r0
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    vmov.16 q0[3], r4
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vstrw.32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q0, [r0]
; CHECK-NEXT:    vldrh.u32 q1, [r0, #8]
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %ext = zext <8 x i16> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; f16

; <8 x half> gather; scalar vldr.16 loads paired into lanes with vins.f16.
define arm_aapcs_vfpcc <8 x half> @ptr_f16(ptr %offptr) {
; CHECK-LABEL: ptr_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vldr.16 s4, [r2]
; CHECK-NEXT:    vldr.16 s0, [r1]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vins.f16 s0, s4
; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
; CHECK-NEXT:    vldr.16 s1, [r1]
; CHECK-NEXT:    vldr.16 s2, [r2]
; CHECK-NEXT:    vmov r0, r1, d2
; CHECK-NEXT:    vins.f16 s1, s2
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vldr.16 s2, [r0]
; CHECK-NEXT:    vmov r0, r1, d3
; CHECK-NEXT:    vldr.16 s3, [r0]
; CHECK-NEXT:    vins.f16 s2, s4
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vins.f16 s3, s4
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x half> undef)
  ret <8 x half> %gather
}

; <4 x half> gather; four scalar vldr.16 loads combined with vins.f16.
define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(ptr %offptr) {
; CHECK-LABEL: ptr_v4f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, r1, d0
; CHECK-NEXT:    vldr.16 s4, [r1]
; CHECK-NEXT:    vldr.16 s0, [r0]
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vldr.16 s2, [r1]
; CHECK-NEXT:    vldr.16 s1, [r0]
; CHECK-NEXT:    vins.f16 s0, s4
; CHECK-NEXT:    vins.f16 s1, s2
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x half> undef)
  ret <4 x half> %gather
}

; i8

; <16 x i8> gather through 16 loaded pointers; scalarized into ldrb loads inserted lane-by-lane.
define arm_aapcs_vfpcc <16 x i8> @ptr_i8(ptr %offptr) {
; CHECK-LABEL: ptr_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #48]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q1, [r0, #32]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vmov r6, r7, d4
; CHECK-NEXT:    vmov r4, r3, d1
; CHECK-NEXT:    ldrb r5, [r1]
; CHECK-NEXT:    ldrb r1, [r2]
; CHECK-NEXT:    ldrb r2, [r6]
; CHECK-NEXT:    ldrb.w r12, [r3]
; CHECK-NEXT:    vmov.8 q0[0], r2
; CHECK-NEXT:    vmov r2, r3, d3
; CHECK-NEXT:    ldrb.w lr, [r4]
; CHECK-NEXT:    ldrb r4, [r2]
; CHECK-NEXT:    ldrb r2, [r3]
; CHECK-NEXT:    ldrb r3, [r7]
; CHECK-NEXT:    vmov.8 q0[1], r3
; CHECK-NEXT:    vmov r3, r6, d5
; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb r6, [r6]
; CHECK-NEXT:    vmov.8 q0[2], r3
; CHECK-NEXT:    vmov r0, r3, d4
; CHECK-NEXT:    vmov.8 q0[3], r6
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.8 q0[4], r0
; CHECK-NEXT:    vmov.8 q0[5], r3
; CHECK-NEXT:    vmov r0, r3, d5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.8 q0[6], r0
; CHECK-NEXT:    vmov.8 q0[7], r3
; CHECK-NEXT:    vmov r0, r3, d2
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.8 q0[8], r0
; CHECK-NEXT:    vmov.8 q0[9], r3
; CHECK-NEXT:    vmov.8 q0[10], r4
; CHECK-NEXT:    vmov.8 q0[11], r2
; CHECK-NEXT:    vmov.8 q0[12], r5
; CHECK-NEXT:    vmov.8 q0[13], r1
; CHECK-NEXT:    vmov.8 q0[14], lr
; CHECK-NEXT:    vmov.8 q0[15], r12
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <16 x ptr>, ptr %offptr, align 4
  %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  ret <16 x i8> %gather
}

; <8 x i8> gather sign-extended to <8 x i16>; scalar ldrb loads then a single vmovlb.s8.
define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_sext16(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_sext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrb r7, [r1]
; CHECK-NEXT:    ldrb.w r1, [r12]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb.w r6, [lr]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

; <8 x i8> gather zero-extended to <8 x i16>; identical to the sext variant except the final vmovlb.u8.
define arm_aapcs_vfpcc <8 x i16> @ptr_v8i8_zext16(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_zext16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r3, r1, d1
; CHECK-NEXT:    vmov r12, r2, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov lr, r0, d1
; CHECK-NEXT:    ldrb r7, [r1]
; CHECK-NEXT:    ldrb.w r1, [r12]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb.w r6, [lr]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[2], r6
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    vmov.16 q0[3], r0
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r3
; CHECK-NEXT:    vmov.16 q0[7], r7
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}

; <8 x i8> gather returned as <8 x i8>; scalar ldrb loads inserted into 16-bit lanes, no extend.
define arm_aapcs_vfpcc <8 x i8> @ptr_v8i8(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d0
; CHECK-NEXT:    vmov r3, r12, d1
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb r6, [r3]
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r4
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    ldrb.w r3, [lr]
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    ldrb.w r12, [r12]
; CHECK-NEXT:    vmov.16 q0[3], r3
; CHECK-NEXT:    vmov.16 q0[4], r1
; CHECK-NEXT:    vmov.16 q0[5], r2
; CHECK-NEXT:    vmov.16 q0[6], r6
; CHECK-NEXT:    vmov.16 q0[7], r12
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  ret <8 x i8> %gather
}

; <4 x i8> gather sign-extended to <4 x i32>; uses the extending vldrb.s32 gather.
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(ptr %offptr) {
; CHECK-LABEL: ptr_v4i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrb.s32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = sext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

; <4 x i8> gather zero-extended to <4 x i32>; uses the extending vldrb.u32 gather.
define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(ptr %offptr) {
; CHECK-LABEL: ptr_v4i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrb.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  %ext = zext <4 x i8> %gather to <4 x i32>
  ret <4 x i32> %ext
}

; <4 x i8> gather returned as <4 x i8>; same vldrb.u32 gather as the zext case.
define arm_aapcs_vfpcc <4 x i8> @ptr_v4i8(ptr %offptr) {
; CHECK-LABEL: ptr_v4i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrw.u32 q1, [r0]
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    vldrb.u32 q0, [r1, q1]
; CHECK-NEXT:    bx lr
entry:
  %offs = load <4 x ptr>, ptr %offptr, align 4
  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
  ret <4 x i8> %gather
}

; <8 x i8> gather sign-extended to <8 x i32>; scalar ldrb loads then vmovlb.s8/vmovlb.s16 per half.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_sext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r3, r12, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrb r7, [r2]
; CHECK-NEXT:    vmov r2, r4, d0
; CHECK-NEXT:    ldrb r6, [r1]
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb.w r1, [r12]
; CHECK-NEXT:    vmov q1[2], q1[0], r3, r6
; CHECK-NEXT:    ldrb.w r5, [lr]
; CHECK-NEXT:    vmov q1[3], q1[1], r1, r7
; CHECK-NEXT:    vmovlb.s8 q1, q1
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; <8 x i8> gather zero-extended to <8 x i32>; scalar ldrb loads masked with a 0xff splat.
define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_zext32(ptr %offptr) {
; CHECK-LABEL: ptr_v8i8_zext32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vmov r1, r2, d1
; CHECK-NEXT:    vmov r12, r3, d0
; CHECK-NEXT:    vldrw.u32 q0, [r0]
; CHECK-NEXT:    vmov r4, r5, d0
; CHECK-NEXT:    vmov r0, lr, d1
; CHECK-NEXT:    ldrb r7, [r2]
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    ldrb.w r2, [r12]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb.w r6, [lr]
; CHECK-NEXT:    vmov q0[2], q0[0], r4, r0
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov q2[3], q2[1], r3, r7
; CHECK-NEXT:    vmov q0[3], q0[1], r5, r6
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vand q1, q2, q1
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
entry:
  %offs = load <8 x ptr>, ptr %offptr, align 4
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %ext
}

; loops

; Vectorized loop: masked gather through loaded pointers plus masked store,
; predicated on the pointer being non-null (VPT block with vldrwt/vstrwt).
define void @foo_ptr_p_int32_t(ptr %dest, ptr %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_int32_t:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB26_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:  .LBB26_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB26_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %i = getelementptr inbounds ptr, ptr %src, i32 %index
  %wide.load = load <4 x ptr>, ptr %i, align 4
  %i2 = icmp ne <4 x ptr> %wide.load, zeroinitializer
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %wide.load, i32 4, <4 x i1> %i2, <4 x i32> undef)
  %i3 = getelementptr inbounds i32, ptr %dest, i32 %index
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %wide.masked.gather, ptr %i3, i32 4, <4 x i1> %i2)
  %index.next = add i32 %index, 4
  %i5 = icmp eq i32 %index.next, %and
  br i1 %i5, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

; Same predicated gather/store loop as foo_ptr_p_int32_t, via a (no-op) pointer bitcast;
; expected to produce identical codegen.
define void @foo_ptr_p_float(ptr %dest, ptr %src, i32 %n) {
; CHECK-LABEL: foo_ptr_p_float:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r2, r2, #15
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB27_1: @ %vector.body.preheader
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
; CHECK-NEXT:  .LBB27_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vptt.i32 ne, q0, zr
; CHECK-NEXT:    vldrwt.u32 q1, [q0]
; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
; CHECK-NEXT:    le lr, .LBB27_2
; CHECK-NEXT:  @ %bb.3: @ %for.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %and = and i32 %n, -16
  %cmp11 = icmp sgt i32 %and, 0
  br i1 %cmp11, label %vector.body, label %for.end

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ %index.next, %vector.body ], [ 0, %entry ]
  %i = getelementptr inbounds ptr, ptr %src, i32 %index
  %wide.load = load <4 x ptr>, ptr %i, align 4
  %i2 = icmp ne <4 x ptr> %wide.load, zeroinitializer
  %i3 = bitcast <4 x ptr> %wide.load to <4 x ptr>
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %i3, i32 4, <4 x i1> %i2, <4 x i32> undef)
  %i4 = getelementptr inbounds float, ptr %dest, i32 %index
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %wide.masked.gather, ptr %i4, i32 4, <4 x i1> %i2)
  %index.next = add i32 %index, 4
  %i6 = icmp eq i32 %index.next, %and
  br i1 %i6, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body, %entry
  ret void
}

; Gather from pointers advanced by a constant GEP (i32 index 4 = +16 bytes);
; the offset is folded into a vadd.i32 before the vldrw gather.
define arm_aapcs_vfpcc <4 x i32> @qi4(<4 x ptr> %p) {
; CHECK-LABEL: qi4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    movs r0, #16
; CHECK-NEXT:    vadd.i32 q1, q0, r0
; CHECK-NEXT:    vldrw.u32 q0, [q1]
; CHECK-NEXT:    bx lr
entry:
  %g = getelementptr inbounds i32, <4 x ptr> %p, i32 4
  %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %gather
}

; i8 offsets zero-extended, gathered with vldrb.u16 off a base register,
; then the gathered bytes sign-extended to <8 x i32> via lane shuffles and vmovlb.
define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(ptr %base, ptr %offptr) {
; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vldrb.u16 q0, [r1]
; CHECK-NEXT:    vldrb.u16 q1, [r0, q0]
; CHECK-NEXT:    vmov.u16 r0, q1[2]
; CHECK-NEXT:    vmov.u16 r1, q1[0]
; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[3]
; CHECK-NEXT:    vmov.u16 r1, q1[1]
; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[6]
; CHECK-NEXT:    vmov.u16 r1, q1[4]
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.u16 r0, q1[7]
; CHECK-NEXT:    vmov.u16 r1, q1[5]
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vmov q2[3], q2[1], r1, r0
; CHECK-NEXT:    vmovlb.s8 q1, q2
; CHECK-NEXT:    vmovlb.s16 q1, q1
; CHECK-NEXT:    bx lr
entry:
  %offs = load <8 x i8>, ptr %offptr, align 1
  %offs.zext = zext <8 x i8> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, ptr %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %gather.sext = sext <8 x i8> %gather to <8 x i32>
  ret <8 x i32> %gather.sext
}

; GEP with a constant vector of i32 indices off a scalar base: the indices
; <0,4,8,12> are materialised from a constant pool and used directly as the
; offset register of a scaled gather (vldrw.u32 [r0, q1, uxtw #2]).
define arm_aapcs_vfpcc <4 x i32> @gepconstoff_i32(ptr %base) {
; CHECK-LABEL: gepconstoff_i32:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    adr r1, .LCPI30_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI30_0:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 4 @ 0x4
; CHECK-NEXT:    .long 8 @ 0x8
; CHECK-NEXT:    .long 12 @ 0xc
bb:
  %a = getelementptr i32, ptr %base, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %a, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
  ret <4 x i32> %g
}
834
; Chain of constant GEPs (byte offsets <0,16,32,48>, a no-op ptr bitcast left
; from the typed->opaque pointer migration, then a uniform -1 i32 element
; step, i.e. -4 bytes) should collapse into a single unscaled offset vector
; <-4,12,28,44> in the constant pool (0xfffffffc is -4 as u32) feeding one
; vldrw.u32 gather.
define arm_aapcs_vfpcc <4 x i32> @gepconstoff_i8(ptr %base) {
; CHECK-LABEL: gepconstoff_i8:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    adr r1, .LCPI31_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI31_0:
; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
; CHECK-NEXT:    .long 12 @ 0xc
; CHECK-NEXT:    .long 28 @ 0x1c
; CHECK-NEXT:    .long 44 @ 0x2c
bb:
  %a = getelementptr i8, ptr %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
  ; No-op with opaque pointers; kept so the fold through bitcasts is tested.
  %b = bitcast <4 x ptr> %a to <4 x ptr>
  %c = getelementptr inbounds i32, <4 x ptr> %b, i32 -1
  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %c, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
  ret <4 x i32> %g
}
856
; Three stacked constant GEPs with different element sizes (i16-scaled
; <0,16,32,48>, then per-lane byte offsets <16,-10,-2,188>, then a uniform -1
; i32 step) should be summed at compile time into one unscaled byte-offset
; vector <12,18,58,280> (e.g. lane1: 16*2 - 10 - 4 = 18) used by a single
; vldrw.u32 gather.
define arm_aapcs_vfpcc <4 x i32> @gepconstoff3_i16(ptr %base) {
; CHECK-LABEL: gepconstoff3_i16:
; CHECK:       @ %bb.0: @ %bb
; CHECK-NEXT:    adr r1, .LCPI32_0
; CHECK-NEXT:    vldrw.u32 q1, [r1]
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.1:
; CHECK-NEXT:  .LCPI32_0:
; CHECK-NEXT:    .long 12 @ 0xc
; CHECK-NEXT:    .long 18 @ 0x12
; CHECK-NEXT:    .long 58 @ 0x3a
; CHECK-NEXT:    .long 280 @ 0x118
bb:
  %a = getelementptr i16, ptr %base, <4 x i32> <i32 0, i32 16, i32 32, i32 48>
  ; The bitcasts are no-ops with opaque pointers; retained so the offset
  ; accumulation is still exercised through intervening casts.
  %b = bitcast <4 x ptr> %a to <4 x ptr>
  %c = getelementptr i8, <4 x ptr> %b, <4 x i32> <i32 16, i32 -10, i32 -2, i32 188>
  %d = bitcast <4 x ptr> %c to <4 x ptr>
  %e = getelementptr inbounds i32, <4 x ptr> %d, i32 -1
  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %e, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> poison)
  ret <4 x i32> %g
}
880
881declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
882declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)
883declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>)
884declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>)
885declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>)
886declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>)
887declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>)
888declare <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i16>)
889declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>)
890declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>)
891declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>)
892declare <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x half>)
893declare <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x half>)
894declare <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x half>)
895declare <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i8>)
896declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>)
897declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>)
898declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)
899declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
900