xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s
3
4; i32
5
6; Expand
; <2 x i32> scatter: narrower than any MVE scatter form, so it is expanded
; into two scalar str instructions (see CHECK lines).
7define arm_aapcs_vfpcc void @ptr_v2i32(<2 x i32> %v, ptr %offptr) {
8; CHECK-LABEL: ptr_v2i32:
9; CHECK:       @ %bb.0: @ %entry
10; CHECK-NEXT:    vmov r2, s0
11; CHECK-NEXT:    ldrd r0, r1, [r0]
12; CHECK-NEXT:    str r2, [r0]
13; CHECK-NEXT:    vmov r0, s2
14; CHECK-NEXT:    str r0, [r1]
15; CHECK-NEXT:    bx lr
16entry:
17  %offs = load <2 x ptr>, ptr %offptr, align 4
18  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %v, <2 x ptr> %offs, i32 4, <2 x i1> <i1 true, i1 true>)
19  ret void
20}
21
22; VSTRW.32 Qd, [offs, 0]
; <4 x i32> scatter: selected directly as an MVE vector store that takes a
; vector of base pointers (vstrw.32 q0, [q1]).
23define arm_aapcs_vfpcc void @ptr_v4i32(<4 x i32> %v, ptr %offptr) {
24; CHECK-LABEL: ptr_v4i32:
25; CHECK:       @ %bb.0: @ %entry
26; CHECK-NEXT:    vldrw.u32 q1, [r0]
27; CHECK-NEXT:    vstrw.32 q0, [q1]
28; CHECK-NEXT:    bx lr
29entry:
30  %offs = load <4 x ptr>, ptr %offptr, align 4
31  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %v, <4 x ptr> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
32  ret void
33}
34
35; Expand
; <8 x i32> scatter: the eight 32-bit pointers need two Q registers, so the
; operation is expanded into eight scalar str instructions.
36define arm_aapcs_vfpcc void @ptr_v8i32(<8 x i32> %v, ptr %offptr) {
37; CHECK-LABEL: ptr_v8i32:
38; CHECK:       @ %bb.0: @ %entry
39; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
40; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
41; CHECK-NEXT:    vldrw.u32 q2, [r0]
42; CHECK-NEXT:    vmov r3, r4, d0
43; CHECK-NEXT:    vmov r1, r2, d4
44; CHECK-NEXT:    vmov lr, r12, d5
45; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
46; CHECK-NEXT:    vmov r0, r5, d1
47; CHECK-NEXT:    str r3, [r1]
48; CHECK-NEXT:    vmov r1, r7, d4
49; CHECK-NEXT:    str r4, [r2]
50; CHECK-NEXT:    vmov r2, r4, d5
51; CHECK-NEXT:    str.w r0, [lr]
52; CHECK-NEXT:    vmov r0, r3, d2
53; CHECK-NEXT:    str.w r5, [r12]
54; CHECK-NEXT:    vmov r5, r6, d3
55; CHECK-NEXT:    str r0, [r1]
56; CHECK-NEXT:    str r3, [r7]
57; CHECK-NEXT:    str r5, [r2]
58; CHECK-NEXT:    str r6, [r4]
59; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
60entry:
61  %offs = load <8 x ptr>, ptr %offptr, align 4
62  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %v, <8 x ptr> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
63  ret void
64}
65
66; Expand
; <16 x i32> scatter: fully expanded into scalar stores; the 16 pointers are
; loaded with four vldrw.u32 instructions and spilled across d8-d13 (vpush).
67define arm_aapcs_vfpcc void @ptr_v16i32(<16 x i32> %v, ptr %offptr) {
68; CHECK-LABEL: ptr_v16i32:
69; CHECK:       @ %bb.0: @ %entry
70; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
71; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
72; CHECK-NEXT:    .pad #4
73; CHECK-NEXT:    sub sp, #4
74; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
75; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
76; CHECK-NEXT:    vldrw.u32 q4, [r0]
77; CHECK-NEXT:    vmov r3, r4, d0
78; CHECK-NEXT:    vldrw.u32 q5, [r0, #32]
79; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
80; CHECK-NEXT:    vmov r1, r2, d8
81; CHECK-NEXT:    vmov lr, r12, d9
82; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
83; CHECK-NEXT:    vmov r0, r5, d1
84; CHECK-NEXT:    str r3, [r1]
85; CHECK-NEXT:    vmov r1, r3, d12
86; CHECK-NEXT:    str r4, [r2]
87; CHECK-NEXT:    vmov r2, r7, d13
88; CHECK-NEXT:    str.w r0, [lr]
89; CHECK-NEXT:    vmov r0, r4, d2
90; CHECK-NEXT:    str.w r5, [r12]
91; CHECK-NEXT:    vmov r5, r6, d3
92; CHECK-NEXT:    str r0, [r1]
93; CHECK-NEXT:    vmov r0, r1, d10
94; CHECK-NEXT:    str r4, [r3]
95; CHECK-NEXT:    vmov r3, r4, d11
96; CHECK-NEXT:    str r5, [r2]
97; CHECK-NEXT:    vmov r2, r5, d4
98; CHECK-NEXT:    str r6, [r7]
99; CHECK-NEXT:    vmov r7, r6, d5
100; CHECK-NEXT:    str r2, [r0]
101; CHECK-NEXT:    vmov r0, r2, d8
102; CHECK-NEXT:    str r5, [r1]
103; CHECK-NEXT:    vmov r1, r5, d9
104; CHECK-NEXT:    str r7, [r3]
105; CHECK-NEXT:    vmov r3, r7, d6
106; CHECK-NEXT:    str r6, [r4]
107; CHECK-NEXT:    vmov r6, r4, d7
108; CHECK-NEXT:    str r3, [r0]
109; CHECK-NEXT:    str r7, [r2]
110; CHECK-NEXT:    str r6, [r1]
111; CHECK-NEXT:    str r4, [r5]
112; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
113; CHECK-NEXT:    add sp, #4
114; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
115entry:
116  %offs = load <16 x ptr>, ptr %offptr, align 4
117  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %v, <16 x ptr> %offs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
118  ret void
119}
120
121; f32
122
123; Expand
; <2 x float> scatter: expanded into two scalar vstr instructions.
124define arm_aapcs_vfpcc void @ptr_v2f32(<2 x float> %v, ptr %offptr) {
125; CHECK-LABEL: ptr_v2f32:
126; CHECK:       @ %bb.0: @ %entry
127; CHECK-NEXT:    ldrd r0, r1, [r0]
128; CHECK-NEXT:    vstr s0, [r0]
129; CHECK-NEXT:    vstr s1, [r1]
130; CHECK-NEXT:    bx lr
131entry:
132  %offs = load <2 x ptr>, ptr %offptr, align 4
133  call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> %v, <2 x ptr> %offs, i32 4, <2 x i1> <i1 true, i1 true>)
134  ret void
135}
136
137; VSTRW.32 Qd, [offs, 0]
; <4 x float> scatter: selected as a single vstrw.32 with the loaded pointer
; vector supplying the addresses.
138define arm_aapcs_vfpcc void @ptr_v4f32(<4 x float> %v, ptr %offptr) {
139; CHECK-LABEL: ptr_v4f32:
140; CHECK:       @ %bb.0: @ %entry
141; CHECK-NEXT:    vldrw.u32 q1, [r0]
142; CHECK-NEXT:    vstrw.32 q0, [q1]
143; CHECK-NEXT:    bx lr
144entry:
145  %offs = load <4 x ptr>, ptr %offptr, align 4
146  call void @llvm.masked.scatter.v4f32.v4p0(<4 x float> %v, <4 x ptr> %offs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
147  ret void
148}
149
150; Expand
; <8 x float> scatter: expanded into eight scalar vstr instructions, with the
; pointers extracted from two Q registers into GPRs first.
151define arm_aapcs_vfpcc void @ptr_v8f32(<8 x float> %v, ptr %offptr) {
152; CHECK-LABEL: ptr_v8f32:
153; CHECK:       @ %bb.0: @ %entry
154; CHECK-NEXT:    .save {r4, r5, r7, lr}
155; CHECK-NEXT:    push {r4, r5, r7, lr}
156; CHECK-NEXT:    vldrw.u32 q2, [r0]
157; CHECK-NEXT:    vmov r1, lr, d4
158; CHECK-NEXT:    vmov r3, r12, d5
159; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
160; CHECK-NEXT:    vmov r0, r2, d4
161; CHECK-NEXT:    vmov r4, r5, d5
162; CHECK-NEXT:    vstr s0, [r1]
163; CHECK-NEXT:    vstr s1, [lr]
164; CHECK-NEXT:    vstr s2, [r3]
165; CHECK-NEXT:    vstr s3, [r12]
166; CHECK-NEXT:    vstr s4, [r0]
167; CHECK-NEXT:    vstr s5, [r2]
168; CHECK-NEXT:    vstr s6, [r4]
169; CHECK-NEXT:    vstr s7, [r5]
170; CHECK-NEXT:    pop {r4, r5, r7, pc}
171entry:
172  %offs = load <8 x ptr>, ptr %offptr, align 4
173  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %v, <8 x ptr> %offs, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
174  ret void
175}
176
177; i16
178
179; Expand.
; <8 x i16> scatter: expanded into eight scalar strh stores, one per lane,
; extracted with vmov.u16.
180define arm_aapcs_vfpcc void @ptr_i16(<8 x i16> %v, ptr %offptr) {
181; CHECK-LABEL: ptr_i16:
182; CHECK:       @ %bb.0: @ %entry
183; CHECK-NEXT:    .save {r4, r5, r6, lr}
184; CHECK-NEXT:    push {r4, r5, r6, lr}
185; CHECK-NEXT:    vldrw.u32 q1, [r0]
186; CHECK-NEXT:    vmov.u16 r6, q0[0]
187; CHECK-NEXT:    vmov r1, r2, d2
188; CHECK-NEXT:    vmov r3, r12, d3
189; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
190; CHECK-NEXT:    vmov r0, lr, d2
191; CHECK-NEXT:    vmov r4, r5, d3
192; CHECK-NEXT:    strh r6, [r1]
193; CHECK-NEXT:    vmov.u16 r1, q0[1]
194; CHECK-NEXT:    strh r1, [r2]
195; CHECK-NEXT:    vmov.u16 r1, q0[2]
196; CHECK-NEXT:    strh r1, [r3]
197; CHECK-NEXT:    vmov.u16 r1, q0[3]
198; CHECK-NEXT:    strh.w r1, [r12]
199; CHECK-NEXT:    vmov.u16 r1, q0[4]
200; CHECK-NEXT:    strh r1, [r0]
201; CHECK-NEXT:    vmov.u16 r0, q0[5]
202; CHECK-NEXT:    strh.w r0, [lr]
203; CHECK-NEXT:    vmov.u16 r0, q0[6]
204; CHECK-NEXT:    strh r0, [r4]
205; CHECK-NEXT:    vmov.u16 r0, q0[7]
206; CHECK-NEXT:    strh r0, [r5]
207; CHECK-NEXT:    pop {r4, r5, r6, pc}
208entry:
209  %offs = load <8 x ptr>, ptr %offptr, align 4
210  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %v, <8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
211  ret void
212}
213
214; Expand
; Truncating <2 x i32> -> <2 x i16> scatter: expanded into two scalar strh
; stores (the trunc is folded into the narrower store).
215define arm_aapcs_vfpcc void @ptr_v2i16_trunc(<2 x i32> %v, ptr %offptr) {
216; CHECK-LABEL: ptr_v2i16_trunc:
217; CHECK:       @ %bb.0: @ %entry
218; CHECK-NEXT:    vmov r2, s0
219; CHECK-NEXT:    ldrd r0, r1, [r0]
220; CHECK-NEXT:    strh r2, [r0]
221; CHECK-NEXT:    vmov r0, s2
222; CHECK-NEXT:    strh r0, [r1]
223; CHECK-NEXT:    bx lr
224entry:
225  %offs = load <2 x ptr>, ptr %offptr, align 4
226  %ext = trunc <2 x i32> %v to <2 x i16>
227  call void @llvm.masked.scatter.v2i16.v2p0(<2 x i16> %ext, <2 x ptr> %offs, i32 2, <2 x i1> <i1 true, i1 true>)
228  ret void
229}
230
; Truncating <4 x i32> -> <4 x i16> scatter: selected as vstrh.32 with a zero
; base register and the loaded pointer vector used as the offsets.
231define arm_aapcs_vfpcc void @ptr_v4i16_trunc(<4 x i32> %v, ptr %offptr) {
232; CHECK-LABEL: ptr_v4i16_trunc:
233; CHECK:       @ %bb.0: @ %entry
234; CHECK-NEXT:    vldrw.u32 q1, [r0]
235; CHECK-NEXT:    movs r0, #0
236; CHECK-NEXT:    vstrh.32 q0, [r0, q1]
237; CHECK-NEXT:    bx lr
238entry:
239  %offs = load <4 x ptr>, ptr %offptr, align 4
240  %ext = trunc <4 x i32> %v to <4 x i16>
241  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %ext, <4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
242  ret void
243}
244
; Splat of a truncated scalar scattered through a pointer vector argument;
; still selected as a single vstrh.32 (vdup + vmovlb feed the data register).
245define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x ptr> %offs) {
246; CHECK-LABEL: ptr_v4i16_dup:
247; CHECK:       @ %bb.0: @ %entry
248; CHECK-NEXT:    vdup.32 q1, r0
249; CHECK-NEXT:    movs r1, #0
250; CHECK-NEXT:    vmovlb.u16 q1, q1
251; CHECK-NEXT:    vstrh.32 q1, [r1, q0]
252; CHECK-NEXT:    bx lr
253entry:
254  %ext = trunc i32 %v to i16
255  %splatinsert = insertelement <4 x i16> poison, i16 %ext, i32 0
256  %splat = shufflevector <4 x i16> %splatinsert, <4 x i16> poison, <4 x i32> zeroinitializer
257  call void @llvm.masked.scatter.v4i16.v4p0(<4 x i16> %splat, <4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
258  ret void
259}
260
261; Expand
; Truncating <8 x i32> -> <8 x i16> scatter: expanded into eight scalar strh
; stores (same shape as ptr_v8i32, but with halfword stores).
262define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, ptr %offptr) {
263; CHECK-LABEL: ptr_v8i16_trunc:
264; CHECK:       @ %bb.0: @ %entry
265; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
266; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
267; CHECK-NEXT:    vldrw.u32 q2, [r0]
268; CHECK-NEXT:    vmov r3, r4, d0
269; CHECK-NEXT:    vmov r1, r2, d4
270; CHECK-NEXT:    vmov lr, r12, d5
271; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
272; CHECK-NEXT:    vmov r0, r5, d1
273; CHECK-NEXT:    strh r3, [r1]
274; CHECK-NEXT:    vmov r1, r7, d4
275; CHECK-NEXT:    strh r4, [r2]
276; CHECK-NEXT:    vmov r2, r4, d5
277; CHECK-NEXT:    strh.w r0, [lr]
278; CHECK-NEXT:    vmov r0, r3, d2
279; CHECK-NEXT:    strh.w r5, [r12]
280; CHECK-NEXT:    vmov r5, r6, d3
281; CHECK-NEXT:    strh r0, [r1]
282; CHECK-NEXT:    strh r3, [r7]
283; CHECK-NEXT:    strh r5, [r2]
284; CHECK-NEXT:    strh r6, [r4]
285; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
286entry:
287  %offs = load <8 x ptr>, ptr %offptr, align 4
288  %ext = trunc <8 x i32> %v to <8 x i16>
289  call void @llvm.masked.scatter.v8i16.v8p0(<8 x i16> %ext, <8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
290  ret void
291}
292
293; f16
294
295; Expand.
; <8 x half> scatter: expanded into eight vstr.16 stores; the odd-numbered
; lanes are extracted from the top half of each s register with vmovx.f16.
296define arm_aapcs_vfpcc void @ptr_f16(<8 x half> %v, ptr %offptr) {
297; CHECK-LABEL: ptr_f16:
298; CHECK:       @ %bb.0: @ %entry
299; CHECK-NEXT:    vldrw.u32 q2, [r0]
300; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
301; CHECK-NEXT:    vmov r0, r1, d4
302; CHECK-NEXT:    vstr.16 s0, [r0]
303; CHECK-NEXT:    vmovx.f16 s0, s0
304; CHECK-NEXT:    vstr.16 s0, [r1]
305; CHECK-NEXT:    vmov r0, r1, d5
306; CHECK-NEXT:    vmovx.f16 s0, s1
307; CHECK-NEXT:    vstr.16 s1, [r0]
308; CHECK-NEXT:    vstr.16 s0, [r1]
309; CHECK-NEXT:    vmov r0, r1, d2
310; CHECK-NEXT:    vmovx.f16 s0, s2
311; CHECK-NEXT:    vstr.16 s2, [r0]
312; CHECK-NEXT:    vstr.16 s0, [r1]
313; CHECK-NEXT:    vmov r0, r1, d3
314; CHECK-NEXT:    vmovx.f16 s0, s3
315; CHECK-NEXT:    vstr.16 s3, [r0]
316; CHECK-NEXT:    vstr.16 s0, [r1]
317; CHECK-NEXT:    bx lr
318entry:
319  %offs = load <8 x ptr>, ptr %offptr, align 4
320  call void @llvm.masked.scatter.v8f16.v8p0(<8 x half> %v, <8 x ptr> %offs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
321  ret void
322}
323
; <4 x half> scatter: expanded into four vstr.16 stores, with vmovx.f16
; extracting the odd lanes.
324define arm_aapcs_vfpcc void @ptr_v4f16(<4 x half> %v, ptr %offptr) {
325; CHECK-LABEL: ptr_v4f16:
326; CHECK:       @ %bb.0: @ %entry
327; CHECK-NEXT:    vldrw.u32 q1, [r0]
328; CHECK-NEXT:    vmov r0, r1, d2
329; CHECK-NEXT:    vstr.16 s0, [r0]
330; CHECK-NEXT:    vmovx.f16 s0, s0
331; CHECK-NEXT:    vstr.16 s0, [r1]
332; CHECK-NEXT:    vmov r0, r1, d3
333; CHECK-NEXT:    vmovx.f16 s0, s1
334; CHECK-NEXT:    vstr.16 s1, [r0]
335; CHECK-NEXT:    vstr.16 s0, [r1]
336; CHECK-NEXT:    bx lr
337entry:
338  %offs = load <4 x ptr>, ptr %offptr, align 4
339  call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> %v, <4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
340  ret void
341}
342
; Splatted half scattered to four pointers: no lane extraction is needed, so
; the same source register s0 is stored four times.
343define arm_aapcs_vfpcc void @ptr_v4f16_dup(half %v, <4 x ptr> %offs) {
344; CHECK-LABEL: ptr_v4f16_dup:
345; CHECK:       @ %bb.0: @ %entry
346; CHECK-NEXT:    vmov r0, r1, d2
347; CHECK-NEXT:    vmov r2, r3, d3
348; CHECK-NEXT:    vstr.16 s0, [r0]
349; CHECK-NEXT:    vstr.16 s0, [r1]
350; CHECK-NEXT:    vstr.16 s0, [r2]
351; CHECK-NEXT:    vstr.16 s0, [r3]
352; CHECK-NEXT:    bx lr
353entry:
354  %splatinsert = insertelement <4 x half> poison, half %v, i32 0
355  %splat = shufflevector <4 x half> %splatinsert, <4 x half> poison, <4 x i32> zeroinitializer
356  call void @llvm.masked.scatter.v4f16.v4p0(<4 x half> %splat, <4 x ptr> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
357  ret void
358}
359
360; i8
361
362; Expand.
; <16 x i8> scatter: expanded into 16 scalar strb stores; the 16 pointers are
; loaded with four vldrw.u32 instructions.
; NOTE(review): the scatter's alignment argument is 2 for byte elements; 1
; would be the natural value — harmless here, but confirm it is intentional.
363define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, ptr %offptr) {
364; CHECK-LABEL: ptr_i8:
365; CHECK:       @ %bb.0: @ %entry
366; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
367; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
368; CHECK-NEXT:    vldrw.u32 q1, [r0]
369; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
370; CHECK-NEXT:    vldrw.u32 q3, [r0, #32]
371; CHECK-NEXT:    vmov.u8 r6, q0[0]
372; CHECK-NEXT:    vmov r1, r2, d2
373; CHECK-NEXT:    vmov.u8 r5, q0[4]
374; CHECK-NEXT:    vmov r3, r12, d3
375; CHECK-NEXT:    vldrw.u32 q1, [r0, #48]
376; CHECK-NEXT:    vmov lr, r4, d4
377; CHECK-NEXT:    vmov.u8 r7, q0[6]
378; CHECK-NEXT:    vmov r0, r8, d5
379; CHECK-NEXT:    strb r6, [r1]
380; CHECK-NEXT:    vmov.u8 r1, q0[1]
381; CHECK-NEXT:    strb r1, [r2]
382; CHECK-NEXT:    vmov.u8 r6, q0[2]
383; CHECK-NEXT:    vmov r1, r9, d6
384; CHECK-NEXT:    strb r6, [r3]
385; CHECK-NEXT:    vmov.u8 r3, q0[3]
386; CHECK-NEXT:    vmov.u8 r2, q0[8]
387; CHECK-NEXT:    strb.w r3, [r12]
388; CHECK-NEXT:    vmov r3, r6, d7
389; CHECK-NEXT:    strb.w r5, [lr]
390; CHECK-NEXT:    vmov.u8 r5, q0[5]
391; CHECK-NEXT:    strb r5, [r4]
392; CHECK-NEXT:    vmov r5, r4, d2
393; CHECK-NEXT:    strb r7, [r0]
394; CHECK-NEXT:    vmov.u8 r0, q0[7]
395; CHECK-NEXT:    strb.w r0, [r8]
396; CHECK-NEXT:    vmov r0, r7, d3
397; CHECK-NEXT:    strb r2, [r1]
398; CHECK-NEXT:    vmov.u8 r1, q0[9]
399; CHECK-NEXT:    strb.w r1, [r9]
400; CHECK-NEXT:    vmov.u8 r1, q0[10]
401; CHECK-NEXT:    strb r1, [r3]
402; CHECK-NEXT:    vmov.u8 r1, q0[11]
403; CHECK-NEXT:    strb r1, [r6]
404; CHECK-NEXT:    vmov.u8 r1, q0[12]
405; CHECK-NEXT:    strb r1, [r5]
406; CHECK-NEXT:    vmov.u8 r1, q0[13]
407; CHECK-NEXT:    strb r1, [r4]
408; CHECK-NEXT:    vmov.u8 r1, q0[14]
409; CHECK-NEXT:    strb r1, [r0]
410; CHECK-NEXT:    vmov.u8 r0, q0[15]
411; CHECK-NEXT:    strb r0, [r7]
412; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
413entry:
414  %offs = load <16 x ptr>, ptr %offptr, align 4
415  call void @llvm.masked.scatter.v16i8.v16p0(<16 x i8> %v, <16 x ptr> %offs, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
416  ret void
417}
418
419; Expand
; Truncating <8 x i16> -> <8 x i8> scatter: expanded into eight scalar strb
; stores, with vmov.u16 extracting each lane.
420define arm_aapcs_vfpcc void @ptr_v8i8_trunc16(<8 x i16> %v, ptr %offptr) {
421; CHECK-LABEL: ptr_v8i8_trunc16:
422; CHECK:       @ %bb.0: @ %entry
423; CHECK-NEXT:    .save {r4, r5, r6, lr}
424; CHECK-NEXT:    push {r4, r5, r6, lr}
425; CHECK-NEXT:    vldrw.u32 q1, [r0]
426; CHECK-NEXT:    vmov.u16 r6, q0[0]
427; CHECK-NEXT:    vmov r1, r2, d2
428; CHECK-NEXT:    vmov r3, r12, d3
429; CHECK-NEXT:    vldrw.u32 q1, [r0, #16]
430; CHECK-NEXT:    vmov r0, lr, d2
431; CHECK-NEXT:    vmov r4, r5, d3
432; CHECK-NEXT:    strb r6, [r1]
433; CHECK-NEXT:    vmov.u16 r1, q0[1]
434; CHECK-NEXT:    strb r1, [r2]
435; CHECK-NEXT:    vmov.u16 r1, q0[2]
436; CHECK-NEXT:    strb r1, [r3]
437; CHECK-NEXT:    vmov.u16 r1, q0[3]
438; CHECK-NEXT:    strb.w r1, [r12]
439; CHECK-NEXT:    vmov.u16 r1, q0[4]
440; CHECK-NEXT:    strb r1, [r0]
441; CHECK-NEXT:    vmov.u16 r0, q0[5]
442; CHECK-NEXT:    strb.w r0, [lr]
443; CHECK-NEXT:    vmov.u16 r0, q0[6]
444; CHECK-NEXT:    strb r0, [r4]
445; CHECK-NEXT:    vmov.u16 r0, q0[7]
446; CHECK-NEXT:    strb r0, [r5]
447; CHECK-NEXT:    pop {r4, r5, r6, pc}
448entry:
449  %offs = load <8 x ptr>, ptr %offptr, align 4
450  %ext = trunc <8 x i16> %v to <8 x i8>
451  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %ext, <8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
452  ret void
453}
454
; Truncating <4 x i32> -> <4 x i8> scatter: selected as vstrb.32 with a zero
; base register and the loaded pointer vector used as the offsets.
455define arm_aapcs_vfpcc void @ptr_v4i8_trunc32(<4 x i32> %v, ptr %offptr) {
456; CHECK-LABEL: ptr_v4i8_trunc32:
457; CHECK:       @ %bb.0: @ %entry
458; CHECK-NEXT:    vldrw.u32 q1, [r0]
459; CHECK-NEXT:    movs r0, #0
460; CHECK-NEXT:    vstrb.32 q0, [r0, q1]
461; CHECK-NEXT:    bx lr
462entry:
463  %offs = load <4 x ptr>, ptr %offptr, align 4
464  %ext = trunc <4 x i32> %v to <4 x i8>
465  call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> %ext, <4 x ptr> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
466  ret void
467}
468
469; Expand
; Truncating <8 x i32> -> <8 x i8> scatter: expanded into eight scalar strb
; stores (same shape as ptr_v8i32, but with byte stores).
470define arm_aapcs_vfpcc void @ptr_v8i8_trunc32(<8 x i32> %v, ptr %offptr) {
471; CHECK-LABEL: ptr_v8i8_trunc32:
472; CHECK:       @ %bb.0: @ %entry
473; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
474; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
475; CHECK-NEXT:    vldrw.u32 q2, [r0]
476; CHECK-NEXT:    vmov r3, r4, d0
477; CHECK-NEXT:    vmov r1, r2, d4
478; CHECK-NEXT:    vmov lr, r12, d5
479; CHECK-NEXT:    vldrw.u32 q2, [r0, #16]
480; CHECK-NEXT:    vmov r0, r5, d1
481; CHECK-NEXT:    strb r3, [r1]
482; CHECK-NEXT:    vmov r1, r7, d4
483; CHECK-NEXT:    strb r4, [r2]
484; CHECK-NEXT:    vmov r2, r4, d5
485; CHECK-NEXT:    strb.w r0, [lr]
486; CHECK-NEXT:    vmov r0, r3, d2
487; CHECK-NEXT:    strb.w r5, [r12]
488; CHECK-NEXT:    vmov r5, r6, d3
489; CHECK-NEXT:    strb r0, [r1]
490; CHECK-NEXT:    strb r3, [r7]
491; CHECK-NEXT:    strb r5, [r2]
492; CHECK-NEXT:    strb r6, [r4]
493; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
494entry:
495  %offs = load <8 x ptr>, ptr %offptr, align 4
496  %ext = trunc <8 x i32> %v to <8 x i8>
497  call void @llvm.masked.scatter.v8i8.v8p0(<8 x i8> %ext, <8 x ptr> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
498  ret void
499}
500
501; loops
502
; Vectorized loop: load 4 pointers from %src, and where they are non-null,
; masked-load 4 i32s from %dest and scatter them through those pointers.
; Codegen forms a VPT block (vptt.i32 ne) with predicated vldrwt/vstrwt.
503define void @foo_ptr_p_int32_t(ptr %dest, ptr %src, i32 %n) {
504; CHECK-LABEL: foo_ptr_p_int32_t:
505; CHECK:       @ %bb.0: @ %entry
506; CHECK-NEXT:    bic r3, r2, #15
507; CHECK-NEXT:    cmp r3, #1
508; CHECK-NEXT:    it lt
509; CHECK-NEXT:    bxlt lr
510; CHECK-NEXT:  .LBB19_1: @ %vector.body
511; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
512; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
513; CHECK-NEXT:    subs r2, #4
514; CHECK-NEXT:    vptt.i32 ne, q0, zr
515; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
516; CHECK-NEXT:    vstrwt.32 q1, [q0]
517; CHECK-NEXT:    bne .LBB19_1
518; CHECK-NEXT:  @ %bb.2: @ %for.end
519; CHECK-NEXT:    bx lr
520entry:
521  %and = and i32 %n, -16
522  %cmp11 = icmp sgt i32 %and, 0
523  br i1 %cmp11, label %vector.body, label %for.end
524
525vector.body:                                      ; preds = %entry, %vector.body
526  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
527  %0 = getelementptr inbounds ptr, ptr %src, i32 %index
528  %wide.load = load <4 x ptr>, ptr %0, align 4
529  %1 = icmp ne <4 x ptr> %wide.load, zeroinitializer
530  %2 = getelementptr inbounds i32, ptr %dest, i32 %index
531  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
532  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %wide.masked.load, <4 x ptr> %wide.load, i32 4, <4 x i1> %1)
533  %index.next = add i32 %index, 4
534  %3 = icmp eq i32 %index.next, %n
535  br i1 %3, label %for.end, label %vector.body
536
537for.end:                                          ; preds = %vector.body, %entry
538  ret void
539}
540
; Same loop shape as foo_ptr_p_int32_t but indexing %dest as float; produces
; the identical predicated vldrwt/vstrwt VPT sequence.
; NOTE(review): %3 is a no-op <4 x ptr> -> <4 x ptr> bitcast — presumably a
; leftover from the typed-pointer to opaque-pointer test conversion.
541define void @foo_ptr_p_float(ptr %dest, ptr %src, i32 %n) {
542; CHECK-LABEL: foo_ptr_p_float:
543; CHECK:       @ %bb.0: @ %entry
544; CHECK-NEXT:    bic r3, r2, #15
545; CHECK-NEXT:    cmp r3, #1
546; CHECK-NEXT:    it lt
547; CHECK-NEXT:    bxlt lr
548; CHECK-NEXT:  .LBB20_1: @ %vector.body
549; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
550; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
551; CHECK-NEXT:    subs r2, #4
552; CHECK-NEXT:    vptt.i32 ne, q0, zr
553; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
554; CHECK-NEXT:    vstrwt.32 q1, [q0]
555; CHECK-NEXT:    bne .LBB20_1
556; CHECK-NEXT:  @ %bb.2: @ %for.end
557; CHECK-NEXT:    bx lr
558entry:
559  %and = and i32 %n, -16
560  %cmp11 = icmp sgt i32 %and, 0
561  br i1 %cmp11, label %vector.body, label %for.end
562
563vector.body:                                      ; preds = %entry, %vector.body
564  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
565  %0 = getelementptr inbounds ptr, ptr %src, i32 %index
566  %wide.load = load <4 x ptr>, ptr %0, align 4
567  %1 = icmp ne <4 x ptr> %wide.load, zeroinitializer
568  %2 = getelementptr inbounds float, ptr %dest, i32 %index
569  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0(ptr %2, i32 4, <4 x i1> %1, <4 x i32> undef)
570  %3 = bitcast <4 x ptr> %wide.load to <4 x ptr>
571  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %wide.masked.load, <4 x ptr> %3, i32 4, <4 x i1> %1)
572  %index.next = add i32 %index, 4
573  %4 = icmp eq i32 %index.next, %n
574  br i1 %4, label %for.end, label %vector.body
575
576for.end:                                          ; preds = %vector.body, %entry
577  ret void
578}
579
580; VLSTW.u32 Qd, [P, 4]
; Scatter through gep'd pointers: the constant offset (4 x i32 = 16 bytes) is
; folded into a vadd.i32 on the pointer vector before the vstrw.32.
581define arm_aapcs_vfpcc void @qi4(<4 x i32> %v, <4 x ptr> %p) {
582; CHECK-LABEL: qi4:
583; CHECK:       @ %bb.0: @ %entry
584; CHECK-NEXT:    movs r0, #16
585; CHECK-NEXT:    vadd.i32 q1, q1, r0
586; CHECK-NEXT:    vstrw.32 q0, [q1]
587; CHECK-NEXT:    bx lr
588entry:
589  %g = getelementptr inbounds i32, <4 x ptr> %p, i32 4
590  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %v, <4 x ptr> %g, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
591  ret void
592}
593
; Declarations of the masked scatter/load intrinsic variants exercised by the
; tests above (one per element type / vector width combination).
594declare void @llvm.masked.scatter.v2i16.v2p0(<2 x i16>, <2 x ptr>, i32, <2 x i1>)
595declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
596declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float>, <2 x ptr>, i32, <2 x i1>)
597declare void @llvm.masked.scatter.v4i8.v4p0(<4 x i8>, <4 x ptr>, i32, <4 x i1>)
598declare void @llvm.masked.scatter.v4i16.v4p0(<4 x i16>, <4 x ptr>, i32, <4 x i1>)
599declare void @llvm.masked.scatter.v4f16.v4p0(<4 x half>, <4 x ptr>, i32, <4 x i1>)
600declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
601declare void @llvm.masked.scatter.v4f32.v4p0(<4 x float>, <4 x ptr>, i32, <4 x i1>)
602declare void @llvm.masked.scatter.v8i8.v8p0(<8 x i8>, <8 x ptr>, i32, <8 x i1>)
603declare void @llvm.masked.scatter.v8i16.v8p0(<8 x i16>, <8 x ptr>, i32, <8 x i1>)
604declare void @llvm.masked.scatter.v8f16.v8p0(<8 x half>, <8 x ptr>, i32, <8 x i1>)
605declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
606declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32, <8 x i1>)
607declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32, <16 x i1>)
608declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
609declare <4 x i32> @llvm.masked.load.v4i32.v4p0(ptr, i32, <4 x i1>, <4 x i32>)
610