xref: /llvm-project/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll (revision b31fffbc7f1e0491bf599e82b7195e320d26e140)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | FileCheck %s
3
; i8 add reduction over two byte arrays, 16 lanes per vector iteration.
; Each iteration loads <16 x i8> from %a and %b under the mask produced by
; get.active.lane.mask(%index, %N), adds the two vectors, selects per lane
; between that sum and the accumulator phi, and calls vector.reduce.add.
; In this function the select and the reduction are inside vector.body.
; The return phi yields the last reduction result, or 0 when %N == 0.
; The expected assembly below is a dlstp.8/letp loop with no vctp/vpst
; instructions in the loop body.
4define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_add_add_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
5; CHECK-LABEL: one_loop_add_add_v16i8:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    push {r7, lr}
8; CHECK-NEXT:    cbz r2, .LBB0_4
9; CHECK-NEXT:  @ %bb.1: @ %vector.ph
10; CHECK-NEXT:    vmov.i32 q0, #0x0
11; CHECK-NEXT:    dlstp.8 lr, r2
12; CHECK-NEXT:  .LBB0_2: @ %vector.body
13; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
14; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
15; CHECK-NEXT:    vldrb.u8 q2, [r0], #16
16; CHECK-NEXT:    vadd.i8 q0, q2, q1
17; CHECK-NEXT:    vaddv.u8 r12, q0
18; CHECK-NEXT:    letp lr, .LBB0_2
19; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
20; CHECK-NEXT:    uxtb.w r0, r12
21; CHECK-NEXT:    pop {r7, pc}
22; CHECK-NEXT:  .LBB0_4:
23; CHECK-NEXT:    mov.w r12, #0
24; CHECK-NEXT:    uxtb.w r0, r12
25; CHECK-NEXT:    pop {r7, pc}
26entry:
  ; Bypass the loop when %N == 0.
27  %cmp11 = icmp eq i32 %N, 0
28  br i1 %cmp11, label %for.cond.cleanup, label %vector.ph
29
30vector.ph:                                        ; preds = %entry
  ; %n.vec is %N rounded up to a multiple of 16; it is the exit bound
  ; compared against %index.next.
31  %n.rnd.up = add i32 %N, 15
32  %n.vec = and i32 %n.rnd.up, -16
33  br label %vector.body
34
35vector.body:                                      ; preds = %vector.body, %vector.ph
36  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
37  %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
38  %i = getelementptr inbounds i8, ptr %a, i32 %index
39  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  ; Bitcast with identical source and destination type (ptr to ptr).
40  %i1 = bitcast ptr %i to ptr
41  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
42  %i2 = getelementptr inbounds i8, ptr %b, i32 %index
43  %i3 = bitcast ptr %i2 to ptr
44  %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
45  %i4 = add <16 x i8> %wide.masked.load, %wide.masked.load16
  ; Inactive lanes keep the value of the accumulator phi; the selected
  ; vector is both the next accumulator value and the reduction input.
46  %i5 = select <16 x i1> %active.lane.mask, <16 x i8> %i4, <16 x i8> %vec.phi
47  %i6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i5)
48  %index.next = add i32 %index, 16
49  %i7 = icmp eq i32 %index.next, %n.vec
50  br i1 %i7, label %middle.block, label %vector.body
51
52middle.block:                                     ; preds = %vector.body
53  br label %for.cond.cleanup
54
55for.cond.cleanup:                                 ; preds = %middle.block, %entry
56  %res.0.lcssa = phi i8 [ 0, %entry ], [ %i6, %middle.block ]
57  ret i8 %res.0.lcssa
58}
59
; i16 add reduction over two byte arrays, 8 lanes per vector iteration.
; Each iteration loads <8 x i8> from %a and %b under the active lane mask,
; zero-extends both to <8 x i16>, and adds both to the accumulator phi.
; The select on the lane mask and the vector.reduce.add are in middle.block,
; i.e. after the loop. Returns 0 when %N == 0.
; The expected assembly below is a dls/le loop with vctp.16 and a vpstt block
; covering the two vldrbt loads, followed by vpsel and vaddv.u16 after the loop.
60define dso_local arm_aapcs_vfpcc signext i16 @one_loop_add_add_v8i16(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
61; CHECK-LABEL: one_loop_add_add_v8i16:
62; CHECK:       @ %bb.0: @ %entry
63; CHECK-NEXT:    cmp r2, #0
64; CHECK-NEXT:    ittt eq
65; CHECK-NEXT:    moveq r0, #0
66; CHECK-NEXT:    sxtheq r0, r0
67; CHECK-NEXT:    bxeq lr
68; CHECK-NEXT:  .LBB1_1: @ %vector.ph
69; CHECK-NEXT:    push {r7, lr}
70; CHECK-NEXT:    adds r3, r2, #7
71; CHECK-NEXT:    vmov.i32 q1, #0x0
72; CHECK-NEXT:    bic r3, r3, #7
73; CHECK-NEXT:    sub.w r12, r3, #8
74; CHECK-NEXT:    movs r3, #1
75; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
76; CHECK-NEXT:    dls lr, r3
77; CHECK-NEXT:  .LBB1_2: @ %vector.body
78; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
79; CHECK-NEXT:    vctp.16 r2
80; CHECK-NEXT:    vmov q0, q1
81; CHECK-NEXT:    vpstt
82; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
83; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
84; CHECK-NEXT:    subs r2, #8
85; CHECK-NEXT:    vadd.i16 q1, q0, q1
86; CHECK-NEXT:    vadd.i16 q1, q1, q2
87; CHECK-NEXT:    le lr, .LBB1_2
88; CHECK-NEXT:  @ %bb.3: @ %middle.block
89; CHECK-NEXT:    vpsel q0, q1, q0
90; CHECK-NEXT:    vaddv.u16 r0, q0
91; CHECK-NEXT:    pop.w {r7, lr}
92; CHECK-NEXT:    sxth r0, r0
93; CHECK-NEXT:    bx lr
94entry:
  ; Bypass the loop when %N == 0.
95  %cmp12 = icmp eq i32 %N, 0
96  br i1 %cmp12, label %for.cond.cleanup, label %vector.ph
97
98vector.ph:                                        ; preds = %entry
  ; %n.vec is %N rounded up to a multiple of 8 (the exit bound for %index).
99  %n.rnd.up = add i32 %N, 7
100  %n.vec = and i32 %n.rnd.up, -8
101  br label %vector.body
102
103vector.body:                                      ; preds = %vector.body, %vector.ph
104  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
105  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
106  %i = getelementptr inbounds i8, ptr %a, i32 %index
107  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  ; Bitcast with identical source and destination type (ptr to ptr).
108  %i1 = bitcast ptr %i to ptr
109  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
110  %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
111  %i3 = getelementptr inbounds i8, ptr %b, i32 %index
112  %i4 = bitcast ptr %i3 to ptr
113  %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
114  %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
  ; Both widened loads are accumulated: acc + zext(a) + zext(b).
115  %i6 = add <8 x i16> %vec.phi, %i2
116  %i7 = add <8 x i16> %i6, %i5
117  %index.next = add i32 %index, 8
118  %i8 = icmp eq i32 %index.next, %n.vec
119  br i1 %i8, label %middle.block, label %vector.body
120
121middle.block:                                     ; preds = %vector.body
  ; Uses the mask, updated accumulator and previous accumulator of the
  ; final loop iteration.
122  %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
123  %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
124  br label %for.cond.cleanup
125
126for.cond.cleanup:                                 ; preds = %middle.block, %entry
127  %res.0.lcssa = phi i16 [ 0, %entry ], [ %i10, %middle.block ]
128  ret i16 %res.0.lcssa
129}
130
; i8 reduction of (a[i] - b[i]) over two byte arrays, 16 lanes per iteration.
; Each iteration loads <16 x i8> from %a and %b under the active lane mask,
; subtracts the %b vector from the %a vector, and adds the difference to the
; accumulator phi. The select on the lane mask and the vector.reduce.add are
; in middle.block, after the loop. Returns 0 when %N == 0.
; The expected assembly below is a dls/le loop with vctp.8 and a vpstt block
; covering the two vldrbt loads, followed by vpsel and vaddv.u8 after the loop.
131define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_sub_add_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
132; CHECK-LABEL: one_loop_sub_add_v16i8:
133; CHECK:       @ %bb.0: @ %entry
134; CHECK-NEXT:    cmp r2, #0
135; CHECK-NEXT:    ittt eq
136; CHECK-NEXT:    moveq r0, #0
137; CHECK-NEXT:    uxtbeq r0, r0
138; CHECK-NEXT:    bxeq lr
139; CHECK-NEXT:  .LBB2_1: @ %vector.ph
140; CHECK-NEXT:    push {r7, lr}
141; CHECK-NEXT:    add.w r3, r2, #15
142; CHECK-NEXT:    vmov.i32 q1, #0x0
143; CHECK-NEXT:    bic r3, r3, #15
144; CHECK-NEXT:    sub.w r12, r3, #16
145; CHECK-NEXT:    movs r3, #1
146; CHECK-NEXT:    add.w r3, r3, r12, lsr #4
147; CHECK-NEXT:    dls lr, r3
148; CHECK-NEXT:  .LBB2_2: @ %vector.body
149; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
150; CHECK-NEXT:    vctp.8 r2
151; CHECK-NEXT:    vmov q0, q1
152; CHECK-NEXT:    vpstt
153; CHECK-NEXT:    vldrbt.u8 q1, [r1], #16
154; CHECK-NEXT:    vldrbt.u8 q2, [r0], #16
155; CHECK-NEXT:    subs r2, #16
156; CHECK-NEXT:    vsub.i8 q1, q2, q1
157; CHECK-NEXT:    vadd.i8 q1, q1, q0
158; CHECK-NEXT:    le lr, .LBB2_2
159; CHECK-NEXT:  @ %bb.3: @ %middle.block
160; CHECK-NEXT:    vpsel q0, q1, q0
161; CHECK-NEXT:    vaddv.u8 r0, q0
162; CHECK-NEXT:    pop.w {r7, lr}
163; CHECK-NEXT:    uxtb r0, r0
164; CHECK-NEXT:    bx lr
165entry:
  ; Bypass the loop when %N == 0.
166  %cmp11 = icmp eq i32 %N, 0
167  br i1 %cmp11, label %for.cond.cleanup, label %vector.ph
168
169vector.ph:                                        ; preds = %entry
  ; %n.vec is %N rounded up to a multiple of 16 (the exit bound for %index).
170  %n.rnd.up = add i32 %N, 15
171  %n.vec = and i32 %n.rnd.up, -16
172  br label %vector.body
173
174vector.body:                                      ; preds = %vector.body, %vector.ph
175  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
176  %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
177  %i = getelementptr inbounds i8, ptr %a, i32 %index
178  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  ; Bitcast with identical source and destination type (ptr to ptr).
179  %i1 = bitcast ptr %i to ptr
180  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
181  %i2 = getelementptr inbounds i8, ptr %b, i32 %index
182  %i3 = bitcast ptr %i2 to ptr
183  %wide.masked.load16 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  ; Operand order: the %a vector minus the %b vector.
184  %i4 = sub <16 x i8> %wide.masked.load, %wide.masked.load16
185  %i5 = add <16 x i8> %i4, %vec.phi
186  %index.next = add i32 %index, 16
187  %i6 = icmp eq i32 %index.next, %n.vec
188  br i1 %i6, label %middle.block, label %vector.body
189
190middle.block:                                     ; preds = %vector.body
  ; Uses the mask, updated accumulator and previous accumulator of the
  ; final loop iteration.
191  %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi
192  %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7)
193  br label %for.cond.cleanup
194
195for.cond.cleanup:                                 ; preds = %middle.block, %entry
196  %res.0.lcssa = phi i8 [ 0, %entry ], [ %i8, %middle.block ]
197  ret i8 %res.0.lcssa
198}
199
; i16 reduction of (zext(b[i]) - zext(a[i])) over two byte arrays, 8 lanes per
; iteration. Each iteration loads <8 x i8> from %a and %b under the active
; lane mask, zero-extends both to <8 x i16>, subtracts the %a vector from the
; %b vector, and adds the difference to the accumulator phi. The select on the
; lane mask and the vector.reduce.add are in middle.block, after the loop.
; Returns 0 when %N == 0.
; The expected assembly below is a dls/le loop with vctp.16 and a vpstt block
; covering the two vldrbt loads, followed by vpsel and vaddv.u16 after the loop.
200define dso_local arm_aapcs_vfpcc signext i16 @one_loop_sub_add_v8i16(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
201; CHECK-LABEL: one_loop_sub_add_v8i16:
202; CHECK:       @ %bb.0: @ %entry
203; CHECK-NEXT:    cmp r2, #0
204; CHECK-NEXT:    ittt eq
205; CHECK-NEXT:    moveq r0, #0
206; CHECK-NEXT:    sxtheq r0, r0
207; CHECK-NEXT:    bxeq lr
208; CHECK-NEXT:  .LBB3_1: @ %vector.ph
209; CHECK-NEXT:    push {r7, lr}
210; CHECK-NEXT:    adds r3, r2, #7
211; CHECK-NEXT:    vmov.i32 q1, #0x0
212; CHECK-NEXT:    bic r3, r3, #7
213; CHECK-NEXT:    sub.w r12, r3, #8
214; CHECK-NEXT:    movs r3, #1
215; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
216; CHECK-NEXT:    dls lr, r3
217; CHECK-NEXT:  .LBB3_2: @ %vector.body
218; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
219; CHECK-NEXT:    vctp.16 r2
220; CHECK-NEXT:    vmov q0, q1
221; CHECK-NEXT:    vpstt
222; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
223; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
224; CHECK-NEXT:    subs r2, #8
225; CHECK-NEXT:    vsub.i16 q1, q2, q1
226; CHECK-NEXT:    vadd.i16 q1, q1, q0
227; CHECK-NEXT:    le lr, .LBB3_2
228; CHECK-NEXT:  @ %bb.3: @ %middle.block
229; CHECK-NEXT:    vpsel q0, q1, q0
230; CHECK-NEXT:    vaddv.u16 r0, q0
231; CHECK-NEXT:    pop.w {r7, lr}
232; CHECK-NEXT:    sxth r0, r0
233; CHECK-NEXT:    bx lr
234entry:
  ; Bypass the loop when %N == 0.
235  %cmp12 = icmp eq i32 %N, 0
236  br i1 %cmp12, label %for.cond.cleanup, label %vector.ph
237
238vector.ph:                                        ; preds = %entry
  ; %n.vec is %N rounded up to a multiple of 8 (the exit bound for %index).
239  %n.rnd.up = add i32 %N, 7
240  %n.vec = and i32 %n.rnd.up, -8
241  br label %vector.body
242
243vector.body:                                      ; preds = %vector.body, %vector.ph
244  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
245  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
246  %i = getelementptr inbounds i8, ptr %a, i32 %index
247  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  ; Bitcast with identical source and destination type (ptr to ptr).
248  %i1 = bitcast ptr %i to ptr
249  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
250  %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
251  %i3 = getelementptr inbounds i8, ptr %b, i32 %index
252  %i4 = bitcast ptr %i3 to ptr
253  %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
254  %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
  ; Operand order: the widened %b vector minus the widened %a vector.
255  %i6 = sub <8 x i16> %i5, %i2
256  %i7 = add <8 x i16> %i6, %vec.phi
257  %index.next = add i32 %index, 8
258  %i8 = icmp eq i32 %index.next, %n.vec
259  br i1 %i8, label %middle.block, label %vector.body
260
261middle.block:                                     ; preds = %vector.body
  ; Uses the mask, updated accumulator and previous accumulator of the
  ; final loop iteration.
262  %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
263  %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
264  br label %for.cond.cleanup
265
266for.cond.cleanup:                                 ; preds = %middle.block, %entry
267  %res.0.lcssa = phi i16 [ 0, %entry ], [ %i10, %middle.block ]
268  ret i16 %res.0.lcssa
269}
270
; i8 multiply-accumulate reduction over two byte arrays, 16 lanes per
; iteration. Each iteration loads <16 x i8> from %a and %b under the active
; lane mask, multiplies the two vectors, and adds the product to the
; accumulator phi. The select on the lane mask and the vector.reduce.add are
; in middle.block, after the loop. Returns 0 when %N == 0.
; The expected assembly below is a dls/le loop with vctp.8 and a vpstt block
; covering the two vldrbt loads, followed by vpsel and vaddv.u8 after the loop.
271define dso_local arm_aapcs_vfpcc zeroext i8 @one_loop_mul_add_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
272; CHECK-LABEL: one_loop_mul_add_v16i8:
273; CHECK:       @ %bb.0: @ %entry
274; CHECK-NEXT:    cmp r2, #0
275; CHECK-NEXT:    ittt eq
276; CHECK-NEXT:    moveq r0, #0
277; CHECK-NEXT:    uxtbeq r0, r0
278; CHECK-NEXT:    bxeq lr
279; CHECK-NEXT:  .LBB4_1: @ %vector.ph
280; CHECK-NEXT:    push {r7, lr}
281; CHECK-NEXT:    add.w r3, r2, #15
282; CHECK-NEXT:    vmov.i32 q1, #0x0
283; CHECK-NEXT:    bic r3, r3, #15
284; CHECK-NEXT:    sub.w r12, r3, #16
285; CHECK-NEXT:    movs r3, #1
286; CHECK-NEXT:    add.w r3, r3, r12, lsr #4
287; CHECK-NEXT:    dls lr, r3
288; CHECK-NEXT:  .LBB4_2: @ %vector.body
289; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
290; CHECK-NEXT:    vctp.8 r2
291; CHECK-NEXT:    vmov q0, q1
292; CHECK-NEXT:    vpstt
293; CHECK-NEXT:    vldrbt.u8 q1, [r0], #16
294; CHECK-NEXT:    vldrbt.u8 q2, [r1], #16
295; CHECK-NEXT:    subs r2, #16
296; CHECK-NEXT:    vmul.i8 q1, q2, q1
297; CHECK-NEXT:    vadd.i8 q1, q1, q0
298; CHECK-NEXT:    le lr, .LBB4_2
299; CHECK-NEXT:  @ %bb.3: @ %middle.block
300; CHECK-NEXT:    vpsel q0, q1, q0
301; CHECK-NEXT:    vaddv.u8 r0, q0
302; CHECK-NEXT:    pop.w {r7, lr}
303; CHECK-NEXT:    uxtb r0, r0
304; CHECK-NEXT:    bx lr
305entry:
  ; Bypass the loop when %N == 0.
306  %cmp10 = icmp eq i32 %N, 0
307  br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
308
309vector.ph:                                        ; preds = %entry
  ; %n.vec is %N rounded up to a multiple of 16 (the exit bound for %index).
310  %n.rnd.up = add i32 %N, 15
311  %n.vec = and i32 %n.rnd.up, -16
312  br label %vector.body
313
314vector.body:                                      ; preds = %vector.body, %vector.ph
315  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
316  %vec.phi = phi <16 x i8> [ zeroinitializer, %vector.ph ], [ %i5, %vector.body ]
317  %i = getelementptr inbounds i8, ptr %a, i32 %index
318  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  ; Bitcast with identical source and destination type (ptr to ptr).
319  %i1 = bitcast ptr %i to ptr
320  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
321  %i2 = getelementptr inbounds i8, ptr %b, i32 %index
322  %i3 = bitcast ptr %i2 to ptr
323  %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %i3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  ; Lane-wise product of the two loaded vectors, accumulated into the phi.
324  %i4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
325  %i5 = add <16 x i8> %i4, %vec.phi
326  %index.next = add i32 %index, 16
327  %i6 = icmp eq i32 %index.next, %n.vec
328  br i1 %i6, label %middle.block, label %vector.body
329
330middle.block:                                     ; preds = %vector.body
  ; Uses the mask, updated accumulator and previous accumulator of the
  ; final loop iteration.
331  %i7 = select <16 x i1> %active.lane.mask, <16 x i8> %i5, <16 x i8> %vec.phi
332  %i8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %i7)
333  br label %for.cond.cleanup
334
335for.cond.cleanup:                                 ; preds = %middle.block, %entry
336  %res.0.lcssa = phi i8 [ 0, %entry ], [ %i8, %middle.block ]
337  ret i8 %res.0.lcssa
338}
339
; i16 multiply-accumulate reduction over two byte arrays, 8 lanes per
; iteration. Each iteration loads <8 x i8> from %a and %b under the active
; lane mask, zero-extends both to <8 x i16>, multiplies them, and adds the
; product to the accumulator phi. The select on the lane mask and the
; vector.reduce.add are in middle.block, after the loop. Returns 0 when
; %N == 0.
; The expected assembly below is a dls/le loop with vctp.16 and a vpstt block
; covering the two vldrbt loads, followed by vpsel and vaddv.u16 after the loop.
340define dso_local arm_aapcs_vfpcc signext i16 @one_loop_mul_add_v8i16(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
341; CHECK-LABEL: one_loop_mul_add_v8i16:
342; CHECK:       @ %bb.0: @ %entry
343; CHECK-NEXT:    cmp r2, #0
344; CHECK-NEXT:    ittt eq
345; CHECK-NEXT:    moveq r0, #0
346; CHECK-NEXT:    sxtheq r0, r0
347; CHECK-NEXT:    bxeq lr
348; CHECK-NEXT:  .LBB5_1: @ %vector.ph
349; CHECK-NEXT:    push {r7, lr}
350; CHECK-NEXT:    adds r3, r2, #7
351; CHECK-NEXT:    vmov.i32 q1, #0x0
352; CHECK-NEXT:    bic r3, r3, #7
353; CHECK-NEXT:    sub.w r12, r3, #8
354; CHECK-NEXT:    movs r3, #1
355; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
356; CHECK-NEXT:    dls lr, r3
357; CHECK-NEXT:  .LBB5_2: @ %vector.body
358; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
359; CHECK-NEXT:    vctp.16 r2
360; CHECK-NEXT:    vmov q0, q1
361; CHECK-NEXT:    vpstt
362; CHECK-NEXT:    vldrbt.u16 q1, [r0], #8
363; CHECK-NEXT:    vldrbt.u16 q2, [r1], #8
364; CHECK-NEXT:    subs r2, #8
365; CHECK-NEXT:    vmul.i16 q1, q2, q1
366; CHECK-NEXT:    vadd.i16 q1, q1, q0
367; CHECK-NEXT:    le lr, .LBB5_2
368; CHECK-NEXT:  @ %bb.3: @ %middle.block
369; CHECK-NEXT:    vpsel q0, q1, q0
370; CHECK-NEXT:    vaddv.u16 r0, q0
371; CHECK-NEXT:    pop.w {r7, lr}
372; CHECK-NEXT:    sxth r0, r0
373; CHECK-NEXT:    bx lr
374entry:
  ; Bypass the loop when %N == 0.
375  %cmp12 = icmp eq i32 %N, 0
376  br i1 %cmp12, label %for.cond.cleanup, label %vector.ph
377
378vector.ph:                                        ; preds = %entry
  ; %n.vec is %N rounded up to a multiple of 8 (the exit bound for %index).
379  %n.rnd.up = add i32 %N, 7
380  %n.vec = and i32 %n.rnd.up, -8
381  br label %vector.body
382
383vector.body:                                      ; preds = %vector.body, %vector.ph
384  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
385  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
386  %i = getelementptr inbounds i8, ptr %a, i32 %index
387  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  ; Bitcast with identical source and destination type (ptr to ptr).
388  %i1 = bitcast ptr %i to ptr
389  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
390  %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
391  %i3 = getelementptr inbounds i8, ptr %b, i32 %index
392  %i4 = bitcast ptr %i3 to ptr
393  %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
394  %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
  ; Lane-wise product of the widened vectors, accumulated into the phi.
395  %i6 = mul <8 x i16> %i5, %i2
396  %i7 = add <8 x i16> %i6, %vec.phi
397  %index.next = add i32 %index, 8
398  %i8 = icmp eq i32 %index.next, %n.vec
399  br i1 %i8, label %middle.block, label %vector.body
400
401middle.block:                                     ; preds = %vector.body
  ; Uses the mask, updated accumulator and previous accumulator of the
  ; final loop iteration.
402  %i9 = select <8 x i1> %active.lane.mask, <8 x i16> %i7, <8 x i16> %vec.phi
403  %i10 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i9)
404  br label %for.cond.cleanup
405
406for.cond.cleanup:                                 ; preds = %middle.block, %entry
407  %res.0.lcssa = phi i16 [ 0, %entry ], [ %i10, %middle.block ]
408  ret i16 %res.0.lcssa
409}
410
; Two consecutive i32 multiply-accumulate reduction loops over the same two
; byte arrays, 4 lanes per iteration. Each loop loads <4 x i8> from %a and %b
; under the active lane mask, zero-extends to <4 x i32>, multiplies, and adds
; the product to its accumulator phi; the select on the lane mask and the
; vector.reduce.add follow each loop. The scalar result of the first loop is
; inserted into lane 0 of the second loop's initial accumulator (lanes 1-3
; are 0). The return phi takes 0 from entry, %i10 from middle.block and %i22
; from middle.block44.
; The expected assembly below has two dls/le loops, each with vctp.32 and a
; vpstt block covering the two vldrbt loads, and each followed by vpsel and
; vaddv.u32.
411define dso_local arm_aapcs_vfpcc i32 @two_loops_mul_add_v4i32(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
412; CHECK-LABEL: two_loops_mul_add_v4i32:
413; CHECK:       @ %bb.0: @ %entry
414; CHECK-NEXT:    cmp r2, #0
415; CHECK-NEXT:    itt eq
416; CHECK-NEXT:    moveq r0, #0
417; CHECK-NEXT:    bxeq lr
418; CHECK-NEXT:  .LBB6_1: @ %vector.ph
419; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
420; CHECK-NEXT:    adds r3, r2, #3
421; CHECK-NEXT:    vmov.i32 q1, #0x0
422; CHECK-NEXT:    bic r3, r3, #3
423; CHECK-NEXT:    mov r4, r0
424; CHECK-NEXT:    subs r7, r3, #4
425; CHECK-NEXT:    movs r3, #1
426; CHECK-NEXT:    mov r5, r1
427; CHECK-NEXT:    add.w r6, r3, r7, lsr #2
428; CHECK-NEXT:    mov r3, r2
429; CHECK-NEXT:    dls lr, r6
430; CHECK-NEXT:  .LBB6_2: @ %vector.body
431; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
432; CHECK-NEXT:    vctp.32 r3
433; CHECK-NEXT:    vmov q0, q1
434; CHECK-NEXT:    vpstt
435; CHECK-NEXT:    vldrbt.u32 q1, [r4], #4
436; CHECK-NEXT:    vldrbt.u32 q2, [r5], #4
437; CHECK-NEXT:    subs r3, #4
438; CHECK-NEXT:    vmul.i32 q1, q2, q1
439; CHECK-NEXT:    vadd.i32 q1, q1, q0
440; CHECK-NEXT:    le lr, .LBB6_2
441; CHECK-NEXT:  @ %bb.3: @ %middle.block
442; CHECK-NEXT:    vpsel q0, q1, q0
443; CHECK-NEXT:    vaddv.u32 r12, q0
444; CHECK-NEXT:    cbz r2, .LBB6_7
445; CHECK-NEXT:  @ %bb.4: @ %vector.ph47
446; CHECK-NEXT:    movs r3, #0
447; CHECK-NEXT:    vdup.32 q0, r3
448; CHECK-NEXT:    movs r3, #1
449; CHECK-NEXT:    add.w r3, r3, r7, lsr #2
450; CHECK-NEXT:    vmov.32 q0[0], r12
451; CHECK-NEXT:    dls lr, r3
452; CHECK-NEXT:  .LBB6_5: @ %vector.body46
453; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
454; CHECK-NEXT:    vctp.32 r2
455; CHECK-NEXT:    vmov q1, q0
456; CHECK-NEXT:    vpstt
457; CHECK-NEXT:    vldrbt.u32 q0, [r0], #4
458; CHECK-NEXT:    vldrbt.u32 q2, [r1], #4
459; CHECK-NEXT:    subs r2, #4
460; CHECK-NEXT:    vmul.i32 q0, q2, q0
461; CHECK-NEXT:    vadd.i32 q0, q0, q1
462; CHECK-NEXT:    le lr, .LBB6_5
463; CHECK-NEXT:  @ %bb.6: @ %middle.block44
464; CHECK-NEXT:    vpsel q0, q0, q1
465; CHECK-NEXT:    vaddv.u32 r12, q0
466; CHECK-NEXT:  .LBB6_7:
467; CHECK-NEXT:    pop.w {r4, r5, r6, r7, lr}
468; CHECK-NEXT:    mov r0, r12
469; CHECK-NEXT:    bx lr
470entry:
  ; Bypass both loops when %N == 0. %cmp35 is tested again in middle.block.
471  %cmp35 = icmp eq i32 %N, 0
472  br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph
473
474vector.ph:                                        ; preds = %entry
  ; %n.vec is %N rounded up to a multiple of 4 (exit bound of the first loop).
475  %n.rnd.up = add i32 %N, 3
476  %n.vec = and i32 %n.rnd.up, -4
477  br label %vector.body
478
  ; First loop: accumulator starts at zeroinitializer.
479vector.body:                                      ; preds = %vector.body, %vector.ph
480  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
481  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %i7, %vector.body ]
482  %i = getelementptr inbounds i8, ptr %a, i32 %index
483  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  ; Bitcast with identical source and destination type (ptr to ptr).
484  %i1 = bitcast ptr %i to ptr
485  %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %i1, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
486  %i2 = zext <4 x i8> %wide.masked.load to <4 x i32>
487  %i3 = getelementptr inbounds i8, ptr %b, i32 %index
488  %i4 = bitcast ptr %i3 to ptr
489  %wide.masked.load43 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %i4, i32 1, <4 x i1> %active.lane.mask, <4 x i8> undef)
490  %i5 = zext <4 x i8> %wide.masked.load43 to <4 x i32>
491  %i6 = mul nuw nsw <4 x i32> %i5, %i2
492  %i7 = add <4 x i32> %i6, %vec.phi
493  %index.next = add i32 %index, 4
494  %i8 = icmp eq i32 %index.next, %n.vec
495  br i1 %i8, label %middle.block, label %vector.body
496
497middle.block:                                     ; preds = %vector.body
  ; Reduce the first loop's result, then decide whether to run the second
  ; loop using the same %cmp35 computed in entry.
498  %i9 = select <4 x i1> %active.lane.mask, <4 x i32> %i7, <4 x i32> %vec.phi
499  %i10 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i9)
500  br i1 %cmp35, label %for.cond.cleanup7, label %vector.ph47
501
502vector.ph47:                                      ; preds = %middle.block
503  %n.rnd.up48 = add i32 %N, 3
504  %n.vec50 = and i32 %n.rnd.up48, -4
  ; Seed the second accumulator: lane 0 = first loop's scalar sum, lanes 1-3 = 0.
505  %i11 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %i10, i32 0
506  br label %vector.body46
507
  ; Second loop: same computation as the first, starting from %i11.
508vector.body46:                                    ; preds = %vector.body46, %vector.ph47
509  %index51 = phi i32 [ 0, %vector.ph47 ], [ %index.next52, %vector.body46 ]
510  %vec.phi60 = phi <4 x i32> [ %i11, %vector.ph47 ], [ %i19, %vector.body46 ]
511  %i12 = getelementptr inbounds i8, ptr %a, i32 %index51
512  %active.lane.mask61 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index51, i32 %N)
513  %i13 = bitcast ptr %i12 to ptr
514  %wide.masked.load62 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %i13, i32 1, <4 x i1> %active.lane.mask61, <4 x i8> undef)
515  %i14 = zext <4 x i8> %wide.masked.load62 to <4 x i32>
516  %i15 = getelementptr inbounds i8, ptr %b, i32 %index51
517  %i16 = bitcast ptr %i15 to ptr
518  %wide.masked.load63 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr %i16, i32 1, <4 x i1> %active.lane.mask61, <4 x i8> undef)
519  %i17 = zext <4 x i8> %wide.masked.load63 to <4 x i32>
520  %i18 = mul nuw nsw <4 x i32> %i17, %i14
521  %i19 = add <4 x i32> %i18, %vec.phi60
522  %index.next52 = add i32 %index51, 4
523  %i20 = icmp eq i32 %index.next52, %n.vec50
524  br i1 %i20, label %middle.block44, label %vector.body46
525
526middle.block44:                                   ; preds = %vector.body46
527  %i21 = select <4 x i1> %active.lane.mask61, <4 x i32> %i19, <4 x i32> %vec.phi60
528  %i22 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %i21)
529  br label %for.cond.cleanup7
530
531for.cond.cleanup7:                                ; preds = %middle.block44, %middle.block, %entry
532  %res.1.lcssa = phi i32 [ %i10, %middle.block ], [ 0, %entry ], [ %i22, %middle.block44 ]
533  ret i32 %res.1.lcssa
534}
535
; Two i16 reductions computed in a single loop over two byte arrays, 8 lanes
; per iteration. Each iteration loads <8 x i8> from %a and %b under the active
; lane mask and zero-extends both to <8 x i16>. One accumulator sums the
; lane-wise products, the other sums the lane-wise differences (%b vector
; minus %a vector). Both select/vector.reduce.add pairs are in middle.block.
; The product sum is truncated to i8 and stored to %a; the difference sum is
; truncated to i8 and stored to %b. When %N == 0 both stored values are 0.
; The expected assembly below is a dls/le loop with vctp.16 and a vpstt block
; covering the two vldrbt loads, followed by two vpsel/vaddv.u16 pairs.
536define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) local_unnamed_addr {
537; CHECK-LABEL: two_reductions_mul_add_v8i16:
538; CHECK:       @ %bb.0: @ %entry
539; CHECK-NEXT:    push {r4, lr}
540; CHECK-NEXT:    vpush {d8, d9}
541; CHECK-NEXT:    cbz r2, .LBB7_4
542; CHECK-NEXT:  @ %bb.1: @ %vector.ph
543; CHECK-NEXT:    adds r3, r2, #7
544; CHECK-NEXT:    vmov.i32 q1, #0x0
545; CHECK-NEXT:    bic r3, r3, #7
546; CHECK-NEXT:    movs r4, #1
547; CHECK-NEXT:    subs r3, #8
548; CHECK-NEXT:    vmov q3, q1
549; CHECK-NEXT:    add.w r12, r4, r3, lsr #3
550; CHECK-NEXT:    mov r3, r0
551; CHECK-NEXT:    mov r4, r1
552; CHECK-NEXT:    dls lr, r12
553; CHECK-NEXT:  .LBB7_2: @ %vector.body
554; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
555; CHECK-NEXT:    vctp.16 r2
556; CHECK-NEXT:    vmov q0, q1
557; CHECK-NEXT:    vpstt
558; CHECK-NEXT:    vldrbt.u16 q1, [r3], #8
559; CHECK-NEXT:    vldrbt.u16 q4, [r4], #8
560; CHECK-NEXT:    vmov q2, q3
561; CHECK-NEXT:    subs r2, #8
562; CHECK-NEXT:    vsub.i16 q3, q4, q1
563; CHECK-NEXT:    vmul.i16 q1, q4, q1
564; CHECK-NEXT:    vadd.i16 q3, q3, q2
565; CHECK-NEXT:    vadd.i16 q1, q1, q0
566; CHECK-NEXT:    le lr, .LBB7_2
567; CHECK-NEXT:  @ %bb.3: @ %middle.block
568; CHECK-NEXT:    vpsel q2, q3, q2
569; CHECK-NEXT:    vpsel q0, q1, q0
570; CHECK-NEXT:    vaddv.u16 r4, q2
571; CHECK-NEXT:    vaddv.u16 r2, q0
572; CHECK-NEXT:    b .LBB7_5
573; CHECK-NEXT:  .LBB7_4:
574; CHECK-NEXT:    movs r2, #0
575; CHECK-NEXT:    movs r4, #0
576; CHECK-NEXT:  .LBB7_5: @ %for.cond.cleanup
577; CHECK-NEXT:    strb r2, [r0]
578; CHECK-NEXT:    strb r4, [r1]
579; CHECK-NEXT:    vpop {d8, d9}
580; CHECK-NEXT:    pop {r4, pc}
581entry:
  ; Bypass the loop when %N == 0.
582  %cmp12 = icmp eq i32 %N, 0
583  br i1 %cmp12, label %for.cond.cleanup, label %vector.ph
584
585vector.ph:                                        ; preds = %entry
  ; %n.vec is %N rounded up to a multiple of 8 (the exit bound for %index).
586  %n.rnd.up = add i32 %N, 7
587  %n.vec = and i32 %n.rnd.up, -8
588  br label %vector.body
589
590vector.body:                                      ; preds = %vector.body, %vector.ph
591  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; %vec.phi accumulates the products, %vec.phi.1 accumulates the differences.
592  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i8, %vector.body ]
593  %vec.phi.1 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %i9, %vector.body ]
594  %i = getelementptr inbounds i8, ptr %a, i32 %index
595  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  ; Bitcast with identical source and destination type (ptr to ptr).
596  %i1 = bitcast ptr %i to ptr
597  %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i1, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
598  %i2 = zext <8 x i8> %wide.masked.load to <8 x i16>
599  %i3 = getelementptr inbounds i8, ptr %b, i32 %index
600  %i4 = bitcast ptr %i3 to ptr
601  %wide.masked.load17 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %i4, i32 1, <8 x i1> %active.lane.mask, <8 x i8> undef)
602  %i5 = zext <8 x i8> %wide.masked.load17 to <8 x i16>
603  %i6 = mul <8 x i16> %i5, %i2
604  %i7 = sub <8 x i16> %i5, %i2
605  %i8 = add <8 x i16> %i6, %vec.phi
606  %i9 = add <8 x i16> %i7, %vec.phi.1
607  %index.next = add i32 %index, 8
608  %i10 = icmp eq i32 %index.next, %n.vec
609  br i1 %i10, label %middle.block, label %vector.body
610
611middle.block:                                     ; preds = %vector.body
  ; Both reductions use the same lane mask from the final loop iteration.
612  %i11 = select <8 x i1> %active.lane.mask, <8 x i16> %i8, <8 x i16> %vec.phi
613  %i12 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i11)
614  %i13 = select <8 x i1> %active.lane.mask, <8 x i16> %i9, <8 x i16> %vec.phi.1
615  %i14 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %i13)
616  br label %for.cond.cleanup
617
618for.cond.cleanup:                                 ; preds = %middle.block, %entry
619  %res.0.lcssa = phi i16 [ 0, %entry ], [ %i12, %middle.block ]
620  %res.1.lcssa = phi i16 [ 0, %entry ], [ %i14, %middle.block ]
  ; Only the low 8 bits of each i16 sum are stored.
621  %trunc.res.0 = trunc i16 %res.0.lcssa to i8
622  store i8 %trunc.res.0, ptr %a
623  %trunc.res.1 = trunc i16 %res.1.lcssa to i8
624  store i8 %trunc.res.1, ptr %b
625  ret void
626}
627
; Record of four i32 fields; @wrongop reads fields 0, 1 and 2 of it.
628%struct.date = type { i32, i32, i32, i32 }
; Constant table of 2 rows x 13 i32 values. The two rows differ only at
; index 2 (28 in row 0, 29 in row 1); index 0 of each row is 0.
629@days = internal unnamed_addr constant [2 x [13 x i32]] [[13 x i32] [i32 0, i32 31, i32 28, i32 31, i32 30, i32 31, i32 30, i32 31, i32 31, i32 30, i32 31, i32 30, i32 31], [13 x i32] [i32 0, i32 31, i32 29, i32 31, i32 30, i32 31, i32 30, i32 31, i32 31, i32 30, i32 31, i32 30, i32 31]], align 4
; i32 add reduction over one row of @days, 4 lanes per iteration, with a
; non-zero initial accumulator.
; Reads field 0 (%0), field 2 (%1) and field 1 (%4) of the %struct.date at
; %pd. The row index %3 is 1 when (%1 & 3) == 0 and %1 srem 100 != 0;
; otherwise it is 1 exactly when %1 srem 400 == 0, else 0.
; NOTE(review): this looks like a leap-year test selecting the 28- or 29-day
; row -- confirm against the original C source.
; If %4 > 0, the loop sums @days[%3][index] for the lanes enabled by
; get.active.lane.mask(%index, %4), starting from an accumulator whose lane 0
; is %0 and whose other lanes are 0; the reduced sum is returned. If %4 <= 0,
; %0 is returned unchanged.
; The expected assembly below is a dls/le loop with vctp.32 and a vpst block
; covering the vldrwt load, followed by vpsel and vaddv.u32.
630define i32 @wrongop(ptr nocapture readonly %pd) {
631; CHECK-LABEL: wrongop:
632; CHECK:       @ %bb.0: @ %entry
633; CHECK-NEXT:    push {r4, lr}
634; CHECK-NEXT:    mov r1, r0
635; CHECK-NEXT:    movw r12, #47184
636; CHECK-NEXT:    movw r3, #23593
637; CHECK-NEXT:    ldrd r2, lr, [r1, #4]
638; CHECK-NEXT:    movt r12, #1310
639; CHECK-NEXT:    movt r3, #49807
640; CHECK-NEXT:    mla r3, lr, r3, r12
641; CHECK-NEXT:    movw r1, #55051
642; CHECK-NEXT:    movw r4, #23593
643; CHECK-NEXT:    movt r1, #163
644; CHECK-NEXT:    ldr r0, [r0]
645; CHECK-NEXT:    movt r4, #655
646; CHECK-NEXT:    ror.w r12, r3, #4
647; CHECK-NEXT:    cmp r12, r1
648; CHECK-NEXT:    cset r1, lo
649; CHECK-NEXT:    ror.w r3, r3, #2
650; CHECK-NEXT:    mov.w r12, #1
651; CHECK-NEXT:    cmp r3, r4
652; CHECK-NEXT:    csel r3, r1, r12, lo
653; CHECK-NEXT:    lsls.w r4, lr, #30
654; CHECK-NEXT:    csel r1, r1, r3, ne
655; CHECK-NEXT:    cmp r2, #1
656; CHECK-NEXT:    it lt
657; CHECK-NEXT:    poplt {r4, pc}
658; CHECK-NEXT:  .LBB8_1: @ %vector.ph
659; CHECK-NEXT:    movw r3, :lower16:days
660; CHECK-NEXT:    movs r4, #52
661; CHECK-NEXT:    movt r3, :upper16:days
662; CHECK-NEXT:    smlabb r1, r1, r4, r3
663; CHECK-NEXT:    movs r3, #0
664; CHECK-NEXT:    vdup.32 q0, r3
665; CHECK-NEXT:    vmov.32 q0[0], r0
666; CHECK-NEXT:    adds r0, r2, #3
667; CHECK-NEXT:    bic r0, r0, #3
668; CHECK-NEXT:    subs r0, #4
669; CHECK-NEXT:    add.w r0, r12, r0, lsr #2
670; CHECK-NEXT:    dls lr, r0
671; CHECK-NEXT:  .LBB8_2: @ %vector.body
672; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
673; CHECK-NEXT:    vctp.32 r2
674; CHECK-NEXT:    vmov q1, q0
675; CHECK-NEXT:    vpst
676; CHECK-NEXT:    vldrwt.u32 q0, [r1], #16
677; CHECK-NEXT:    subs r2, #4
678; CHECK-NEXT:    vadd.i32 q0, q0, q1
679; CHECK-NEXT:    le lr, .LBB8_2
680; CHECK-NEXT:  @ %bb.3: @ %middle.block
681; CHECK-NEXT:    vpsel q0, q0, q1
682; CHECK-NEXT:    vaddv.u32 r0, q0
683; CHECK-NEXT:    pop {r4, pc}
684entry:
  ; %0 = field 0 and %1 = field 2 of the struct at %pd.
685  %day1 = getelementptr inbounds %struct.date, ptr %pd, i32 0, i32 0
686  %0 = load i32, ptr %day1, align 4
687  %year = getelementptr inbounds %struct.date, ptr %pd, i32 0, i32 2
688  %1 = load i32, ptr %year, align 4
  ; Go to lor.rhs when %1 is not a multiple of 4 or is a multiple of 100;
  ; otherwise fall straight to lor.end, where the phi yields 1.
689  %2 = and i32 %1, 3
690  %cmp = icmp ne i32 %2, 0
691  %rem3 = srem i32 %1, 100
692  %cmp4.not = icmp eq i32 %rem3, 0
693  %or.cond = or i1 %cmp, %cmp4.not
694  br i1 %or.cond, label %lor.rhs, label %lor.end
695
696lor.rhs:                                          ; preds = %entry
  ; Row index becomes 1 only if %1 is a multiple of 400, else 0.
697  %rem6 = srem i32 %1, 400
698  %cmp7 = icmp eq i32 %rem6, 0
699  %phi.cast = zext i1 %cmp7 to i32
700  br label %lor.end
701
702lor.end:                                          ; preds = %entry, %lor.rhs
  ; %3 is the row index into @days; %4 (field 1) is the element count.
703  %3 = phi i32 [ %phi.cast, %lor.rhs ], [ 1, %entry ]
704  %month = getelementptr inbounds %struct.date, ptr %pd, i32 0, i32 1
705  %4 = load i32, ptr %month, align 4
706  %cmp820 = icmp sgt i32 %4, 0
707  br i1 %cmp820, label %vector.ph, label %for.end
708
709vector.ph:                                        ; preds = %lor.end
  ; %n.vec is %4 rounded up to a multiple of 4 (the exit bound for %index).
710  %n.rnd.up = add i32 %4, 3
711  %n.vec = and i32 %n.rnd.up, -4
  ; Initial accumulator: lane 0 = %0, lanes 1-3 = 0.
712  %5 = insertelement <4 x i32> <i32 undef, i32 0, i32 0, i32 0>, i32 %0, i32 0
713  br label %vector.body
714
715vector.body:                                      ; preds = %vector.body, %vector.ph
716  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
717  %vec.phi = phi <4 x i32> [ %5, %vector.ph ], [ %8, %vector.body ]
718  %6 = getelementptr inbounds [2 x [13 x i32]], ptr @days, i32 0, i32 %3, i32 %index
719  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %4)
  ; Bitcast with identical source and destination type (ptr to ptr).
720  %7 = bitcast ptr %6 to ptr
721  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %7, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
722  %8 = add <4 x i32> %wide.masked.load, %vec.phi
723  %index.next = add i32 %index, 4
724  %9 = icmp eq i32 %index.next, %n.vec
725  br i1 %9, label %middle.block, label %vector.body
726
727middle.block:                                     ; preds = %vector.body
  ; Uses the mask, updated accumulator and previous accumulator of the
  ; final loop iteration.
728  %10 = select <4 x i1> %active.lane.mask, <4 x i32> %8, <4 x i32> %vec.phi
729  %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10)
730  br label %for.end
731
732for.end:                                          ; preds = %middle.block, %lor.end
733  %day.0.lcssa = phi i32 [ %0, %lor.end ], [ %11, %middle.block ]
734  ret i32 %day.0.lcssa
735}
736
; Declarations of the intrinsics called by the functions in this file:
; masked loads, get.active.lane.mask and vector.reduce.add, for the
; 16-, 8- and 4-lane vector shapes used above.
737declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
738declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
739declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
740declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
741declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
742declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32 immarg, <8 x i1>, <8 x i8>)
743declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
744declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
745declare <4 x i8> @llvm.masked.load.v4i8.p0(ptr, i32 immarg, <4 x i1>, <4 x i8>)
746declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
747