xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-vecreduce-loops.ll (revision b31fffbc7f1e0491bf599e82b7195e320d26e140)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=enabled -verify-machineinstrs %s -o - | FileCheck %s
3
; add_i32: integer sum of the first %n i32 values at %x; yields 0 when %n <= 0.
; The IR is already in vectorized form: a 4-wide loop (vector.body) folds each
; loaded <4 x i32> into a scalar accumulator with llvm.vector.reduce.add inside
; the loop, and a scalar loop (for.body) handles the elements past %n.vec.
; The autogenerated assertions below expect the in-loop reduction to be emitted
; as a single vaddva.u32 and both loops to close with an `le` loop-end branch.
4define i32 @add_i32(ptr nocapture readonly %x, i32 %n) {
5; CHECK-LABEL: add_i32:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r7, lr}
8; CHECK-NEXT:    push {r7, lr}
9; CHECK-NEXT:    cmp r1, #1
10; CHECK-NEXT:    blt .LBB0_3
11; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
12; CHECK-NEXT:    mov r12, r0
13; CHECK-NEXT:    cmp r1, #4
14; CHECK-NEXT:    bhs .LBB0_4
15; CHECK-NEXT:  @ %bb.2:
16; CHECK-NEXT:    movs r3, #0
17; CHECK-NEXT:    movs r0, #0
18; CHECK-NEXT:    b .LBB0_7
19; CHECK-NEXT:  .LBB0_3:
20; CHECK-NEXT:    movs r0, #0
21; CHECK-NEXT:    pop {r7, pc}
22; CHECK-NEXT:  .LBB0_4: @ %vector.ph
23; CHECK-NEXT:    bic r3, r1, #3
24; CHECK-NEXT:    movs r2, #1
25; CHECK-NEXT:    subs r0, r3, #4
26; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
27; CHECK-NEXT:    movs r0, #0
28; CHECK-NEXT:    mov r2, r12
29; CHECK-NEXT:  .LBB0_5: @ %vector.body
30; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
31; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
32; CHECK-NEXT:    vaddva.u32 r0, q0
33; CHECK-NEXT:    le lr, .LBB0_5
34; CHECK-NEXT:  @ %bb.6: @ %middle.block
35; CHECK-NEXT:    cmp r3, r1
36; CHECK-NEXT:    it eq
37; CHECK-NEXT:    popeq {r7, pc}
38; CHECK-NEXT:  .LBB0_7: @ %for.body.preheader1
39; CHECK-NEXT:    sub.w lr, r1, r3
40; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
41; CHECK-NEXT:  .LBB0_8: @ %for.body
42; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
43; CHECK-NEXT:    ldr r1, [r2], #4
44; CHECK-NEXT:    add r0, r1
45; CHECK-NEXT:    le lr, .LBB0_8
46; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
47; CHECK-NEXT:    pop {r7, pc}
48entry:
  ; Non-positive %n skips both loops; for.cond.cleanup then returns 0.
49  %cmp6 = icmp sgt i32 %n, 0
50  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
51
52for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
53  %min.iters.check = icmp ult i32 %n, 4
54  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
55
56vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n with its low two bits cleared, i.e. rounded down to a multiple of 4.
57  %n.vec = and i32 %n, -4
58  br label %vector.body
59
60vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %index advances by 4 elements per iteration; %vec.phi is the scalar running sum.
61  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
62  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
63  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: a no-op that leaves the address unchanged.
64  %1 = bitcast ptr %0 to ptr
65  %wide.load = load <4 x i32>, ptr %1, align 4
  ; Sum the four lanes and add them to the running total inside the loop.
66  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
67  %3 = add i32 %2, %vec.phi
68  %index.next = add i32 %index, 4
69  %4 = icmp eq i32 %index.next, %n.vec
70  br i1 %4, label %middle.block, label %vector.body
71
72middle.block:                                     ; preds = %vector.body
  ; If %n was already a multiple of 4 there is no remainder to process.
73  %cmp.n = icmp eq i32 %n.vec, %n
74  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
75
76for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Start index and partial sum for the scalar loop: (0, 0) when the vector
  ; loop was bypassed, (%n.vec, %3) when arriving from middle.block.
77  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
78  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
79  br label %for.body
80
81for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar loop: one element per iteration until %inc reaches %n.
82  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
83  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
84  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
85  %5 = load i32, ptr %arrayidx, align 4
86  %add = add nsw i32 %5, %r.07
87  %inc = add nuw nsw i32 %i.08, 1
88  %exitcond = icmp eq i32 %inc, %n
89  br i1 %exitcond, label %for.cond.cleanup, label %for.body
90
91for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result: 0 for the empty case, else the vector-loop sum or the scalar-loop sum.
92  %r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
93  ret i32 %r.0.lcssa
94}
95
; mul_i32: integer product of the first %n i32 values at %x; yields 1 when %n <= 0.
; Unlike add_i32, the vector loop keeps a <4 x i32> accumulator (seeded with
; ones) and llvm.vector.reduce.mul runs once, after the loop, in middle.block.
; The autogenerated assertions below expect vmul.i32 in the loop and the final
; reduction expanded into two vmov lane-pair extracts plus three scalar mul
; instructions, with a scalar muls loop for the remainder.
96define i32 @mul_i32(ptr nocapture readonly %x, i32 %n) {
97; CHECK-LABEL: mul_i32:
98; CHECK:       @ %bb.0: @ %entry
99; CHECK-NEXT:    .save {r4, lr}
100; CHECK-NEXT:    push {r4, lr}
101; CHECK-NEXT:    movs r2, #1
102; CHECK-NEXT:    cmp r1, #1
103; CHECK-NEXT:    blt .LBB1_8
104; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
105; CHECK-NEXT:    cmp r1, #4
106; CHECK-NEXT:    bhs .LBB1_3
107; CHECK-NEXT:  @ %bb.2:
108; CHECK-NEXT:    mov.w r12, #0
109; CHECK-NEXT:    b .LBB1_6
110; CHECK-NEXT:  .LBB1_3: @ %vector.ph
111; CHECK-NEXT:    bic r12, r1, #3
112; CHECK-NEXT:    vmov.i32 q0, #0x1
113; CHECK-NEXT:    sub.w r3, r12, #4
114; CHECK-NEXT:    add.w lr, r2, r3, lsr #2
115; CHECK-NEXT:    mov r2, r0
116; CHECK-NEXT:  .LBB1_4: @ %vector.body
117; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
118; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
119; CHECK-NEXT:    vmul.i32 q0, q1, q0
120; CHECK-NEXT:    le lr, .LBB1_4
121; CHECK-NEXT:  @ %bb.5: @ %middle.block
122; CHECK-NEXT:    vmov lr, r3, d1
123; CHECK-NEXT:    cmp r12, r1
124; CHECK-NEXT:    vmov r2, r4, d0
125; CHECK-NEXT:    mul r3, lr, r3
126; CHECK-NEXT:    mul r2, r4, r2
127; CHECK-NEXT:    mul r2, r3, r2
128; CHECK-NEXT:    beq .LBB1_8
129; CHECK-NEXT:  .LBB1_6: @ %for.body.preheader1
130; CHECK-NEXT:    sub.w lr, r1, r12
131; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
132; CHECK-NEXT:  .LBB1_7: @ %for.body
133; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
134; CHECK-NEXT:    ldr r1, [r0], #4
135; CHECK-NEXT:    muls r2, r1, r2
136; CHECK-NEXT:    le lr, .LBB1_7
137; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
138; CHECK-NEXT:    mov r0, r2
139; CHECK-NEXT:    pop {r4, pc}
140entry:
  ; Non-positive %n skips both loops; for.cond.cleanup then returns 1.
141  %cmp6 = icmp sgt i32 %n, 0
142  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
143
144for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
145  %min.iters.check = icmp ult i32 %n, 4
146  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
147
148vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n with its low two bits cleared, i.e. rounded down to a multiple of 4.
149  %n.vec = and i32 %n, -4
150  br label %vector.body
151
152vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %vec.phi holds four independent per-lane products, each starting at 1.
153  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
154  %vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
155  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: a no-op that leaves the address unchanged.
156  %1 = bitcast ptr %0 to ptr
157  %wide.load = load <4 x i32>, ptr %1, align 4
158  %2 = mul <4 x i32> %wide.load, %vec.phi
159  %index.next = add i32 %index, 4
160  %3 = icmp eq i32 %index.next, %n.vec
161  br i1 %3, label %middle.block, label %vector.body
162
163middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane products into one scalar, once, after the loop.
164  %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
  ; If %n was already a multiple of 4 there is no remainder to process.
165  %cmp.n = icmp eq i32 %n.vec, %n
166  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
167
168for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Start index and partial product for the scalar loop: (0, 1) when the vector
  ; loop was bypassed, (%n.vec, %4) when arriving from middle.block.
169  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
170  %r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
171  br label %for.body
172
173for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar loop: one element per iteration until %inc reaches %n.
174  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
175  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
176  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
177  %5 = load i32, ptr %arrayidx, align 4
  ; Note: the value is named %add but holds the running product.
178  %add = mul nsw i32 %5, %r.07
179  %inc = add nuw nsw i32 %i.08, 1
180  %exitcond = icmp eq i32 %inc, %n
181  br i1 %exitcond, label %for.cond.cleanup, label %for.body
182
183for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result: 1 for the empty case, else the reduced vector product or the scalar-loop product.
184  %r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
185  ret i32 %r.0.lcssa
186}
187
; and_i32: bitwise AND of the first %n i32 values at %x; yields -1 (all bits
; set) when %n <= 0.
; The vector loop keeps a <4 x i32> accumulator seeded with all-ones lanes and
; llvm.vector.reduce.and runs once, after the loop, in middle.block.
; The autogenerated assertions below expect vand in the loop and the final
; reduction expanded into two vmov lane-pair extracts plus three scalar and.w
; instructions, with a scalar ands loop for the remainder.
188define i32 @and_i32(ptr nocapture readonly %x, i32 %n) {
189; CHECK-LABEL: and_i32:
190; CHECK:       @ %bb.0: @ %entry
191; CHECK-NEXT:    .save {r4, lr}
192; CHECK-NEXT:    push {r4, lr}
193; CHECK-NEXT:    cmp r1, #1
194; CHECK-NEXT:    blt .LBB2_3
195; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
196; CHECK-NEXT:    cmp r1, #4
197; CHECK-NEXT:    bhs .LBB2_4
198; CHECK-NEXT:  @ %bb.2:
199; CHECK-NEXT:    mov.w r2, #-1
200; CHECK-NEXT:    movs r3, #0
201; CHECK-NEXT:    b .LBB2_7
202; CHECK-NEXT:  .LBB2_3:
203; CHECK-NEXT:    mov.w r2, #-1
204; CHECK-NEXT:    mov r0, r2
205; CHECK-NEXT:    pop {r4, pc}
206; CHECK-NEXT:  .LBB2_4: @ %vector.ph
207; CHECK-NEXT:    bic r3, r1, #3
208; CHECK-NEXT:    movs r2, #1
209; CHECK-NEXT:    sub.w r12, r3, #4
210; CHECK-NEXT:    vmov.i8 q0, #0xff
211; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
212; CHECK-NEXT:    mov r2, r0
213; CHECK-NEXT:  .LBB2_5: @ %vector.body
214; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
215; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
216; CHECK-NEXT:    vand q0, q1, q0
217; CHECK-NEXT:    le lr, .LBB2_5
218; CHECK-NEXT:  @ %bb.6: @ %middle.block
219; CHECK-NEXT:    vmov lr, r12, d1
220; CHECK-NEXT:    cmp r3, r1
221; CHECK-NEXT:    vmov r2, r4, d0
222; CHECK-NEXT:    and.w r12, r12, lr
223; CHECK-NEXT:    and.w r2, r2, r4
224; CHECK-NEXT:    and.w r2, r2, r12
225; CHECK-NEXT:    beq .LBB2_9
226; CHECK-NEXT:  .LBB2_7: @ %for.body.preheader1
227; CHECK-NEXT:    sub.w lr, r1, r3
228; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
229; CHECK-NEXT:  .LBB2_8: @ %for.body
230; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
231; CHECK-NEXT:    ldr r1, [r0], #4
232; CHECK-NEXT:    ands r2, r1
233; CHECK-NEXT:    le lr, .LBB2_8
234; CHECK-NEXT:  .LBB2_9: @ %for.cond.cleanup
235; CHECK-NEXT:    mov r0, r2
236; CHECK-NEXT:    pop {r4, pc}
237entry:
  ; Non-positive %n skips both loops; for.cond.cleanup then returns -1.
238  %cmp6 = icmp sgt i32 %n, 0
239  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
240
241for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
242  %min.iters.check = icmp ult i32 %n, 4
243  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
244
245vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n with its low two bits cleared, i.e. rounded down to a multiple of 4.
246  %n.vec = and i32 %n, -4
247  br label %vector.body
248
249vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %vec.phi holds four independent per-lane AND results, each starting at -1.
250  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
251  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
252  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: a no-op that leaves the address unchanged.
253  %1 = bitcast ptr %0 to ptr
254  %wide.load = load <4 x i32>, ptr %1, align 4
255  %2 = and <4 x i32> %wide.load, %vec.phi
256  %index.next = add i32 %index, 4
257  %3 = icmp eq i32 %index.next, %n.vec
258  br i1 %3, label %middle.block, label %vector.body
259
260middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane results into one scalar, once, after the loop.
261  %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
  ; If %n was already a multiple of 4 there is no remainder to process.
262  %cmp.n = icmp eq i32 %n.vec, %n
263  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
264
265for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Start index and partial result for the scalar loop: (0, -1) when the vector
  ; loop was bypassed, (%n.vec, %4) when arriving from middle.block.
266  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
267  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
268  br label %for.body
269
270for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar loop: one element per iteration until %inc reaches %n.
271  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
272  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
273  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
274  %5 = load i32, ptr %arrayidx, align 4
  ; Note: the value is named %add but holds the running AND.
275  %add = and i32 %5, %r.07
276  %inc = add nuw nsw i32 %i.08, 1
277  %exitcond = icmp eq i32 %inc, %n
278  br i1 %exitcond, label %for.cond.cleanup, label %for.body
279
280for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result: -1 for the empty case, else the reduced vector value or the scalar-loop value.
281  %r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
282  ret i32 %r.0.lcssa
283}
284
; or_i32: bitwise OR of the first %n i32 values at %x; yields 0 when %n <= 0.
; The vector loop keeps a <4 x i32> accumulator seeded with zeros and
; llvm.vector.reduce.or runs once, after the loop, in middle.block.
; The autogenerated assertions below expect vorr in the loop and the final
; reduction expanded into two vmov lane-pair extracts plus three scalar orr.w
; instructions, with a scalar orrs loop for the remainder.
285define i32 @or_i32(ptr nocapture readonly %x, i32 %n) {
286; CHECK-LABEL: or_i32:
287; CHECK:       @ %bb.0: @ %entry
288; CHECK-NEXT:    .save {r4, lr}
289; CHECK-NEXT:    push {r4, lr}
290; CHECK-NEXT:    cmp r1, #1
291; CHECK-NEXT:    blt .LBB3_3
292; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
293; CHECK-NEXT:    cmp r1, #4
294; CHECK-NEXT:    bhs .LBB3_4
295; CHECK-NEXT:  @ %bb.2:
296; CHECK-NEXT:    movs r3, #0
297; CHECK-NEXT:    movs r2, #0
298; CHECK-NEXT:    b .LBB3_7
299; CHECK-NEXT:  .LBB3_3:
300; CHECK-NEXT:    movs r2, #0
301; CHECK-NEXT:    mov r0, r2
302; CHECK-NEXT:    pop {r4, pc}
303; CHECK-NEXT:  .LBB3_4: @ %vector.ph
304; CHECK-NEXT:    bic r3, r1, #3
305; CHECK-NEXT:    movs r2, #1
306; CHECK-NEXT:    sub.w r12, r3, #4
307; CHECK-NEXT:    vmov.i32 q0, #0x0
308; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
309; CHECK-NEXT:    mov r2, r0
310; CHECK-NEXT:  .LBB3_5: @ %vector.body
311; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
312; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
313; CHECK-NEXT:    vorr q0, q1, q0
314; CHECK-NEXT:    le lr, .LBB3_5
315; CHECK-NEXT:  @ %bb.6: @ %middle.block
316; CHECK-NEXT:    vmov lr, r12, d1
317; CHECK-NEXT:    cmp r3, r1
318; CHECK-NEXT:    vmov r2, r4, d0
319; CHECK-NEXT:    orr.w r12, r12, lr
320; CHECK-NEXT:    orr.w r2, r2, r4
321; CHECK-NEXT:    orr.w r2, r2, r12
322; CHECK-NEXT:    beq .LBB3_9
323; CHECK-NEXT:  .LBB3_7: @ %for.body.preheader1
324; CHECK-NEXT:    sub.w lr, r1, r3
325; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
326; CHECK-NEXT:  .LBB3_8: @ %for.body
327; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
328; CHECK-NEXT:    ldr r1, [r0], #4
329; CHECK-NEXT:    orrs r2, r1
330; CHECK-NEXT:    le lr, .LBB3_8
331; CHECK-NEXT:  .LBB3_9: @ %for.cond.cleanup
332; CHECK-NEXT:    mov r0, r2
333; CHECK-NEXT:    pop {r4, pc}
334entry:
  ; Non-positive %n skips both loops; for.cond.cleanup then returns 0.
335  %cmp6 = icmp sgt i32 %n, 0
336  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
337
338for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
339  %min.iters.check = icmp ult i32 %n, 4
340  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
341
342vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n with its low two bits cleared, i.e. rounded down to a multiple of 4.
343  %n.vec = and i32 %n, -4
344  br label %vector.body
345
346vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %vec.phi holds four independent per-lane OR results, each starting at 0.
347  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
348  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
349  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: a no-op that leaves the address unchanged.
350  %1 = bitcast ptr %0 to ptr
351  %wide.load = load <4 x i32>, ptr %1, align 4
352  %2 = or <4 x i32> %wide.load, %vec.phi
353  %index.next = add i32 %index, 4
354  %3 = icmp eq i32 %index.next, %n.vec
355  br i1 %3, label %middle.block, label %vector.body
356
357middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane results into one scalar, once, after the loop.
358  %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
  ; If %n was already a multiple of 4 there is no remainder to process.
359  %cmp.n = icmp eq i32 %n.vec, %n
360  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
361
362for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Start index and partial result for the scalar loop: (0, 0) when the vector
  ; loop was bypassed, (%n.vec, %4) when arriving from middle.block.
363  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
364  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
365  br label %for.body
366
367for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar loop: one element per iteration until %inc reaches %n.
368  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
369  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
370  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
371  %5 = load i32, ptr %arrayidx, align 4
  ; Note: the value is named %add but holds the running OR.
372  %add = or i32 %5, %r.07
373  %inc = add nuw nsw i32 %i.08, 1
374  %exitcond = icmp eq i32 %inc, %n
375  br i1 %exitcond, label %for.cond.cleanup, label %for.body
376
377for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result: 0 for the empty case, else the reduced vector value or the scalar-loop value.
378  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
379  ret i32 %r.0.lcssa
380}
381
; xor_i32: bitwise XOR of the first %n i32 values at %x; yields 0 when %n <= 0.
; The vector loop keeps a <4 x i32> accumulator seeded with zeros and
; llvm.vector.reduce.xor runs once, after the loop, in middle.block.
; The autogenerated assertions below expect veor in the loop and the final
; reduction expanded into two vmov lane-pair extracts plus three scalar eor.w
; instructions, with a scalar eors loop for the remainder.
382define i32 @xor_i32(ptr nocapture readonly %x, i32 %n) {
383; CHECK-LABEL: xor_i32:
384; CHECK:       @ %bb.0: @ %entry
385; CHECK-NEXT:    .save {r4, lr}
386; CHECK-NEXT:    push {r4, lr}
387; CHECK-NEXT:    cmp r1, #1
388; CHECK-NEXT:    blt .LBB4_3
389; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
390; CHECK-NEXT:    cmp r1, #4
391; CHECK-NEXT:    bhs .LBB4_4
392; CHECK-NEXT:  @ %bb.2:
393; CHECK-NEXT:    movs r3, #0
394; CHECK-NEXT:    movs r2, #0
395; CHECK-NEXT:    b .LBB4_7
396; CHECK-NEXT:  .LBB4_3:
397; CHECK-NEXT:    movs r2, #0
398; CHECK-NEXT:    mov r0, r2
399; CHECK-NEXT:    pop {r4, pc}
400; CHECK-NEXT:  .LBB4_4: @ %vector.ph
401; CHECK-NEXT:    bic r3, r1, #3
402; CHECK-NEXT:    movs r2, #1
403; CHECK-NEXT:    sub.w r12, r3, #4
404; CHECK-NEXT:    vmov.i32 q0, #0x0
405; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
406; CHECK-NEXT:    mov r2, r0
407; CHECK-NEXT:  .LBB4_5: @ %vector.body
408; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
409; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
410; CHECK-NEXT:    veor q0, q1, q0
411; CHECK-NEXT:    le lr, .LBB4_5
412; CHECK-NEXT:  @ %bb.6: @ %middle.block
413; CHECK-NEXT:    vmov lr, r12, d1
414; CHECK-NEXT:    cmp r3, r1
415; CHECK-NEXT:    vmov r2, r4, d0
416; CHECK-NEXT:    eor.w r12, r12, lr
417; CHECK-NEXT:    eor.w r2, r2, r4
418; CHECK-NEXT:    eor.w r2, r2, r12
419; CHECK-NEXT:    beq .LBB4_9
420; CHECK-NEXT:  .LBB4_7: @ %for.body.preheader1
421; CHECK-NEXT:    sub.w lr, r1, r3
422; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
423; CHECK-NEXT:  .LBB4_8: @ %for.body
424; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
425; CHECK-NEXT:    ldr r1, [r0], #4
426; CHECK-NEXT:    eors r2, r1
427; CHECK-NEXT:    le lr, .LBB4_8
428; CHECK-NEXT:  .LBB4_9: @ %for.cond.cleanup
429; CHECK-NEXT:    mov r0, r2
430; CHECK-NEXT:    pop {r4, pc}
431entry:
  ; Non-positive %n skips both loops; for.cond.cleanup then returns 0.
432  %cmp6 = icmp sgt i32 %n, 0
433  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
434
435for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
436  %min.iters.check = icmp ult i32 %n, 4
437  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
438
439vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n with its low two bits cleared, i.e. rounded down to a multiple of 4.
440  %n.vec = and i32 %n, -4
441  br label %vector.body
442
443vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %vec.phi holds four independent per-lane XOR results, each starting at 0.
444  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
445  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
446  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: a no-op that leaves the address unchanged.
447  %1 = bitcast ptr %0 to ptr
448  %wide.load = load <4 x i32>, ptr %1, align 4
449  %2 = xor <4 x i32> %wide.load, %vec.phi
450  %index.next = add i32 %index, 4
451  %3 = icmp eq i32 %index.next, %n.vec
452  br i1 %3, label %middle.block, label %vector.body
453
454middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane results into one scalar, once, after the loop.
455  %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
  ; If %n was already a multiple of 4 there is no remainder to process.
456  %cmp.n = icmp eq i32 %n.vec, %n
457  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
458
459for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Start index and partial result for the scalar loop: (0, 0) when the vector
  ; loop was bypassed, (%n.vec, %4) when arriving from middle.block.
460  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
461  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
462  br label %for.body
463
464for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar loop: one element per iteration until %inc reaches %n.
465  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
466  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
467  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
468  %5 = load i32, ptr %arrayidx, align 4
  ; Note: the value is named %add but holds the running XOR.
469  %add = xor i32 %5, %r.07
470  %inc = add nuw nsw i32 %i.08, 1
471  %exitcond = icmp eq i32 %inc, %n
472  br i1 %exitcond, label %for.cond.cleanup, label %for.body
473
474for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result: 0 for the empty case, else the reduced vector value or the scalar-loop value.
475  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
476  ret i32 %r.0.lcssa
477}
478
; fadd_f32: floating-point sum of the first %n float values at %x; yields 0.0
; when %n <= 0. Every fadd and the reduction call carry the `fast` flag.
; The vector loop keeps a <4 x float> accumulator seeded with zeros and
; llvm.vector.reduce.fadd (start value 0.0) runs once, after the loop, in
; middle.block.
; The autogenerated assertions below expect vadd.f32 on q registers in the
; loop, the final reduction as three scalar vadd.f32 on s0..s3, and the scalar
; 0.0 start value loaded from a literal pool entry (.LCPI5_0).
479define float @fadd_f32(ptr nocapture readonly %x, i32 %n) {
480; CHECK-LABEL: fadd_f32:
481; CHECK:       @ %bb.0: @ %entry
482; CHECK-NEXT:    .save {r7, lr}
483; CHECK-NEXT:    push {r7, lr}
484; CHECK-NEXT:    cmp r1, #1
485; CHECK-NEXT:    blt .LBB5_3
486; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
487; CHECK-NEXT:    cmp r1, #4
488; CHECK-NEXT:    bhs .LBB5_4
489; CHECK-NEXT:  @ %bb.2:
490; CHECK-NEXT:    vldr s0, .LCPI5_0
491; CHECK-NEXT:    movs r2, #0
492; CHECK-NEXT:    b .LBB5_7
493; CHECK-NEXT:  .LBB5_3:
494; CHECK-NEXT:    vldr s0, .LCPI5_0
495; CHECK-NEXT:    vmov r0, s0
496; CHECK-NEXT:    pop {r7, pc}
497; CHECK-NEXT:  .LBB5_4: @ %vector.ph
498; CHECK-NEXT:    bic r2, r1, #3
499; CHECK-NEXT:    movs r3, #1
500; CHECK-NEXT:    sub.w r12, r2, #4
501; CHECK-NEXT:    vmov.i32 q0, #0x0
502; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
503; CHECK-NEXT:    mov r3, r0
504; CHECK-NEXT:  .LBB5_5: @ %vector.body
505; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
506; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
507; CHECK-NEXT:    vadd.f32 q0, q1, q0
508; CHECK-NEXT:    le lr, .LBB5_5
509; CHECK-NEXT:  @ %bb.6: @ %middle.block
510; CHECK-NEXT:    vadd.f32 s2, s2, s3
511; CHECK-NEXT:    cmp r2, r1
512; CHECK-NEXT:    vadd.f32 s0, s0, s1
513; CHECK-NEXT:    vadd.f32 s0, s0, s2
514; CHECK-NEXT:    beq .LBB5_9
515; CHECK-NEXT:  .LBB5_7: @ %for.body.preheader1
516; CHECK-NEXT:    sub.w lr, r1, r2
517; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
518; CHECK-NEXT:  .LBB5_8: @ %for.body
519; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
520; CHECK-NEXT:    vldmia r0!, {s2}
521; CHECK-NEXT:    vadd.f32 s0, s2, s0
522; CHECK-NEXT:    le lr, .LBB5_8
523; CHECK-NEXT:  .LBB5_9: @ %for.cond.cleanup
524; CHECK-NEXT:    vmov r0, s0
525; CHECK-NEXT:    pop {r7, pc}
526; CHECK-NEXT:    .p2align 2
527; CHECK-NEXT:  @ %bb.10:
528; CHECK-NEXT:  .LCPI5_0:
529; CHECK-NEXT:    .long 0x00000000 @ float 0
530entry:
  ; Non-positive %n skips both loops; for.cond.cleanup then returns 0.0.
531  %cmp6 = icmp sgt i32 %n, 0
532  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
533
534for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
535  %min.iters.check = icmp ult i32 %n, 4
536  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
537
538vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n with its low two bits cleared, i.e. rounded down to a multiple of 4.
539  %n.vec = and i32 %n, -4
540  br label %vector.body
541
542vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %vec.phi holds four independent per-lane sums, each starting at 0.0.
543  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
544  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
545  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: a no-op that leaves the address unchanged.
546  %1 = bitcast ptr %0 to ptr
547  %wide.load = load <4 x float>, ptr %1, align 4
548  %2 = fadd fast <4 x float> %wide.load, %vec.phi
549  %index.next = add i32 %index, 4
550  %3 = icmp eq i32 %index.next, %n.vec
551  br i1 %3, label %middle.block, label %vector.body
552
553middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane sums into one scalar, once, after the loop.
554  %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
  ; If %n was already a multiple of 4 there is no remainder to process.
555  %cmp.n = icmp eq i32 %n.vec, %n
556  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
557
558for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Start index and partial sum for the scalar loop: (0, 0.0) when the vector
  ; loop was bypassed, (%n.vec, %4) when arriving from middle.block.
559  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
560  %r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
561  br label %for.body
562
563for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar loop: one element per iteration until %inc reaches %n.
564  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
565  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
566  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
567  %5 = load float, ptr %arrayidx, align 4
568  %add = fadd fast float %5, %r.07
569  %inc = add nuw nsw i32 %i.08, 1
570  %exitcond = icmp eq i32 %inc, %n
571  br i1 %exitcond, label %for.cond.cleanup, label %for.body
572
573for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result: 0.0 for the empty case, else the reduced vector sum or the scalar-loop sum.
574  %r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
575  ret float %r.0.lcssa
576}
577
; fmul_f32: floating-point product of the first %n float values at %x; yields
; 1.0 when %n <= 0. Every fmul and the reduction call carry the `fast` flag.
; The vector loop keeps a <4 x float> accumulator seeded with 1.0 in each lane
; and llvm.vector.reduce.fmul (start value 1.0) runs once, after the loop, in
; middle.block.
; The autogenerated assertions below expect vmul.f32 on q registers in the
; loop, the final reduction as three scalar vmul.f32 on s0..s3, and the 1.0
; start values materialized with vmov.f32 immediates.
578define float @fmul_f32(ptr nocapture readonly %x, i32 %n) {
579; CHECK-LABEL: fmul_f32:
580; CHECK:       @ %bb.0: @ %entry
581; CHECK-NEXT:    .save {r7, lr}
582; CHECK-NEXT:    push {r7, lr}
583; CHECK-NEXT:    cmp r1, #1
584; CHECK-NEXT:    blt .LBB6_3
585; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
586; CHECK-NEXT:    cmp r1, #4
587; CHECK-NEXT:    bhs .LBB6_4
588; CHECK-NEXT:  @ %bb.2:
589; CHECK-NEXT:    vmov.f32 s0, #1.000000e+00
590; CHECK-NEXT:    movs r2, #0
591; CHECK-NEXT:    b .LBB6_7
592; CHECK-NEXT:  .LBB6_3:
593; CHECK-NEXT:    vmov.f32 s0, #1.000000e+00
594; CHECK-NEXT:    vmov r0, s0
595; CHECK-NEXT:    pop {r7, pc}
596; CHECK-NEXT:  .LBB6_4: @ %vector.ph
597; CHECK-NEXT:    bic r2, r1, #3
598; CHECK-NEXT:    movs r3, #1
599; CHECK-NEXT:    sub.w r12, r2, #4
600; CHECK-NEXT:    vmov.f32 q0, #1.000000e+00
601; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
602; CHECK-NEXT:    mov r3, r0
603; CHECK-NEXT:  .LBB6_5: @ %vector.body
604; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
605; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
606; CHECK-NEXT:    vmul.f32 q0, q1, q0
607; CHECK-NEXT:    le lr, .LBB6_5
608; CHECK-NEXT:  @ %bb.6: @ %middle.block
609; CHECK-NEXT:    vmul.f32 s2, s2, s3
610; CHECK-NEXT:    cmp r2, r1
611; CHECK-NEXT:    vmul.f32 s0, s0, s1
612; CHECK-NEXT:    vmul.f32 s0, s0, s2
613; CHECK-NEXT:    beq .LBB6_9
614; CHECK-NEXT:  .LBB6_7: @ %for.body.preheader1
615; CHECK-NEXT:    sub.w lr, r1, r2
616; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
617; CHECK-NEXT:  .LBB6_8: @ %for.body
618; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
619; CHECK-NEXT:    vldmia r0!, {s2}
620; CHECK-NEXT:    vmul.f32 s0, s2, s0
621; CHECK-NEXT:    le lr, .LBB6_8
622; CHECK-NEXT:  .LBB6_9: @ %for.cond.cleanup
623; CHECK-NEXT:    vmov r0, s0
624; CHECK-NEXT:    pop {r7, pc}
625entry:
  ; Non-positive %n skips both loops; for.cond.cleanup then returns 1.0.
626  %cmp6 = icmp sgt i32 %n, 0
627  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
628
629for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
630  %min.iters.check = icmp ult i32 %n, 4
631  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
632
633vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n with its low two bits cleared, i.e. rounded down to a multiple of 4.
634  %n.vec = and i32 %n, -4
635  br label %vector.body
636
637vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %vec.phi holds four independent per-lane products, each starting at 1.0.
638  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
639  %vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
640  %0 = getelementptr inbounds float, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: a no-op that leaves the address unchanged.
641  %1 = bitcast ptr %0 to ptr
642  %wide.load = load <4 x float>, ptr %1, align 4
643  %2 = fmul fast <4 x float> %wide.load, %vec.phi
644  %index.next = add i32 %index, 4
645  %3 = icmp eq i32 %index.next, %n.vec
646  br i1 %3, label %middle.block, label %vector.body
647
648middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane products into one scalar, once, after the loop.
649  %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
  ; If %n was already a multiple of 4 there is no remainder to process.
650  %cmp.n = icmp eq i32 %n.vec, %n
651  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
652
653for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Start index and partial product for the scalar loop: (0, 1.0) when the
  ; vector loop was bypassed, (%n.vec, %4) when arriving from middle.block.
654  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
655  %r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
656  br label %for.body
657
658for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar loop: one element per iteration until %inc reaches %n.
659  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
660  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
661  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
662  %5 = load float, ptr %arrayidx, align 4
  ; Note: the value is named %add but holds the running product.
663  %add = fmul fast float %5, %r.07
664  %inc = add nuw nsw i32 %i.08, 1
665  %exitcond = icmp eq i32 %inc, %n
666  br i1 %exitcond, label %for.cond.cleanup, label %for.body
667
668for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result: 1.0 for the empty case, else the reduced vector product or the scalar-loop product.
669  %r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
670  ret float %r.0.lcssa
671}
672
; smin_i32: signed minimum of the first %n i32 values at %x; yields 2147483647
; (the largest signed i32) when %n <= 0.
; The vector loop keeps a <4 x i32> accumulator seeded with 2147483647 in each
; lane and updates it with an icmp slt + select pair; llvm.vector.reduce.smin
; runs once, after the loop, in middle.block.
; The autogenerated assertions below expect the compare/select pair to be
; emitted as vmin.s32, the final reduction as vminv.s32 into a register
; preloaded with 0x7fffffff, and the scalar loop as cmp + csel.
673define i32 @smin_i32(ptr nocapture readonly %x, i32 %n) {
674; CHECK-LABEL: smin_i32:
675; CHECK:       @ %bb.0: @ %entry
676; CHECK-NEXT:    .save {r7, lr}
677; CHECK-NEXT:    push {r7, lr}
678; CHECK-NEXT:    cmp r1, #1
679; CHECK-NEXT:    blt .LBB7_3
680; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
681; CHECK-NEXT:    cmp r1, #4
682; CHECK-NEXT:    bhs .LBB7_4
683; CHECK-NEXT:  @ %bb.2:
684; CHECK-NEXT:    mvn r2, #-2147483648
685; CHECK-NEXT:    movs r3, #0
686; CHECK-NEXT:    b .LBB7_7
687; CHECK-NEXT:  .LBB7_3:
688; CHECK-NEXT:    mvn r2, #-2147483648
689; CHECK-NEXT:    mov r0, r2
690; CHECK-NEXT:    pop {r7, pc}
691; CHECK-NEXT:  .LBB7_4: @ %vector.ph
692; CHECK-NEXT:    bic r3, r1, #3
693; CHECK-NEXT:    movs r2, #1
694; CHECK-NEXT:    sub.w r12, r3, #4
695; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
696; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
697; CHECK-NEXT:    mov r2, r0
698; CHECK-NEXT:  .LBB7_5: @ %vector.body
699; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
700; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
701; CHECK-NEXT:    vmin.s32 q0, q0, q1
702; CHECK-NEXT:    le lr, .LBB7_5
703; CHECK-NEXT:  @ %bb.6: @ %middle.block
704; CHECK-NEXT:    mvn r2, #-2147483648
705; CHECK-NEXT:    cmp r3, r1
706; CHECK-NEXT:    vminv.s32 r2, q0
707; CHECK-NEXT:    beq .LBB7_9
708; CHECK-NEXT:  .LBB7_7: @ %for.body.preheader1
709; CHECK-NEXT:    sub.w lr, r1, r3
710; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
711; CHECK-NEXT:  .LBB7_8: @ %for.body
712; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
713; CHECK-NEXT:    ldr r1, [r0], #4
714; CHECK-NEXT:    cmp r2, r1
715; CHECK-NEXT:    csel r2, r2, r1, lt
716; CHECK-NEXT:    le lr, .LBB7_8
717; CHECK-NEXT:  .LBB7_9: @ %for.cond.cleanup
718; CHECK-NEXT:    mov r0, r2
719; CHECK-NEXT:    pop {r7, pc}
720entry:
  ; Non-positive %n skips both loops; for.cond.cleanup then returns 2147483647.
721  %cmp6 = icmp sgt i32 %n, 0
722  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
723
724for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
725  %min.iters.check = icmp ult i32 %n, 4
726  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
727
728vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec is %n with its low two bits cleared, i.e. rounded down to a multiple of 4.
729  %n.vec = and i32 %n, -4
730  br label %vector.body
731
732vector.body:                                      ; preds = %vector.body, %vector.ph
  ; %vec.phi holds four independent per-lane minima, each starting at 2147483647.
733  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
734  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
735  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: a no-op that leaves the address unchanged.
736  %1 = bitcast ptr %0 to ptr
737  %wide.load = load <4 x i32>, ptr %1, align 4
  ; Lane-wise signed min written as compare + select: keep the accumulator lane
  ; where it is smaller, otherwise take the loaded lane.
738  %2 = icmp slt <4 x i32> %vec.phi, %wide.load
739  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
740  %index.next = add i32 %index, 4
741  %4 = icmp eq i32 %index.next, %n.vec
742  br i1 %4, label %middle.block, label %vector.body
743
744middle.block:                                     ; preds = %vector.body
  ; Collapse the four lane minima into one scalar, once, after the loop.
745  %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
  ; If %n was already a multiple of 4 there is no remainder to process.
746  %cmp.n = icmp eq i32 %n.vec, %n
747  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
748
749for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
  ; Start index and partial minimum for the scalar loop: (0, 2147483647) when
  ; the vector loop was bypassed, (%n.vec, %5) when arriving from middle.block.
750  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
751  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
752  br label %for.body
753
754for.body:                                         ; preds = %for.body.preheader1, %for.body
  ; Scalar loop: one element per iteration until %inc reaches %n.
755  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
756  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
757  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
758  %6 = load i32, ptr %arrayidx, align 4
  ; Note: the value is named %add but holds the running signed minimum.
759  %c = icmp slt i32 %r.07, %6
760  %add = select i1 %c, i32 %r.07, i32 %6
761  %inc = add nuw nsw i32 %i.08, 1
762  %exitcond = icmp eq i32 %inc, %n
763  br i1 %exitcond, label %for.cond.cleanup, label %for.body
764
765for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ; Result: 2147483647 for the empty case, else the reduced vector minimum or the scalar-loop minimum.
766  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
767  ret i32 %r.0.lcssa
768}
769
; smin_i32_inloop: signed-minimum reduction over the %n i32 elements at %x,
; with the vector reduction performed inside the vector loop. Each vector.body
; iteration reduces the loaded <4 x i32> with llvm.vector.reduce.smin and folds
; the result into a scalar accumulator (seeded with 2147483647, INT32_MAX) via
; icmp slt + select; the expected assembly shows this as one accumulating
; "vminv.s32 r0, q0" per iteration. Elements past %n.vec (%n rounded down to a
; multiple of 4) are handled by the scalar for.body loop, and %n <= 0 returns
; the seed value.
770define i32 @smin_i32_inloop(ptr nocapture readonly %x, i32 %n) {
771; CHECK-LABEL: smin_i32_inloop:
772; CHECK:       @ %bb.0: @ %entry
773; CHECK-NEXT:    .save {r7, lr}
774; CHECK-NEXT:    push {r7, lr}
775; CHECK-NEXT:    cmp r1, #1
776; CHECK-NEXT:    blt .LBB8_3
777; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
778; CHECK-NEXT:    mov r12, r0
779; CHECK-NEXT:    cmp r1, #4
780; CHECK-NEXT:    bhs .LBB8_4
781; CHECK-NEXT:  @ %bb.2:
782; CHECK-NEXT:    mvn r0, #-2147483648
783; CHECK-NEXT:    movs r3, #0
784; CHECK-NEXT:    b .LBB8_7
785; CHECK-NEXT:  .LBB8_3:
786; CHECK-NEXT:    mvn r0, #-2147483648
787; CHECK-NEXT:    pop {r7, pc}
788; CHECK-NEXT:  .LBB8_4: @ %vector.ph
789; CHECK-NEXT:    bic r3, r1, #3
790; CHECK-NEXT:    movs r2, #1
791; CHECK-NEXT:    subs r0, r3, #4
792; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
793; CHECK-NEXT:    mvn r0, #-2147483648
794; CHECK-NEXT:    mov r2, r12
795; CHECK-NEXT:  .LBB8_5: @ %vector.body
796; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
797; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
798; CHECK-NEXT:    vminv.s32 r0, q0
799; CHECK-NEXT:    le lr, .LBB8_5
800; CHECK-NEXT:  @ %bb.6: @ %middle.block
801; CHECK-NEXT:    cmp r3, r1
802; CHECK-NEXT:    it eq
803; CHECK-NEXT:    popeq {r7, pc}
804; CHECK-NEXT:  .LBB8_7: @ %for.body.preheader1
805; CHECK-NEXT:    sub.w lr, r1, r3
806; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
807; CHECK-NEXT:  .LBB8_8: @ %for.body
808; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
809; CHECK-NEXT:    ldr r1, [r2], #4
810; CHECK-NEXT:    cmp r0, r1
811; CHECK-NEXT:    csel r0, r0, r1, lt
812; CHECK-NEXT:    le lr, .LBB8_8
813; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
814; CHECK-NEXT:    pop {r7, pc}
815entry:
816  %cmp6 = icmp sgt i32 %n, 0
817  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
818
819for.body.preheader:                               ; preds = %entry
820  %min.iters.check = icmp ult i32 %n, 4
821  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
822
823vector.ph:                                        ; preds = %for.body.preheader
824  %n.vec = and i32 %n, -4
825  br label %vector.body
826
; In-loop reduction: %vec.phi is a scalar i32 accumulator, and every iteration
; reduces the freshly loaded vector before combining it with that scalar.
827vector.body:                                      ; preds = %vector.body, %vector.ph
828  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
829  %vec.phi = phi i32 [ 2147483647, %vector.ph ], [ %3, %vector.body ]
830  %0 = getelementptr inbounds i32, ptr %x, i32 %index
831  %1 = bitcast ptr %0 to ptr
832  %wide.load = load <4 x i32>, ptr %1, align 4
833  %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
834  %2 = icmp slt i32 %vec.phi, %l5
835  %3 = select i1 %2, i32 %vec.phi, i32 %l5
836  %index.next = add i32 %index, 4
837  %4 = icmp eq i32 %index.next, %n.vec
838  br i1 %4, label %middle.block, label %vector.body
839
; The single-entry phi only forwards the final accumulator out of the loop.
840middle.block:                                     ; preds = %vector.body
841  %5 = phi i32 [ %3, %vector.body ]
842  %cmp.n = icmp eq i32 %n.vec, %n
843  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
844
845for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
846  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
847  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
848  br label %for.body
849
; Scalar remainder loop: folds elements %i.08.ph .. %n-1 into the running
; minimum one at a time.
850for.body:                                         ; preds = %for.body.preheader1, %for.body
851  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
852  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
853  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
854  %6 = load i32, ptr %arrayidx, align 4
855  %c = icmp slt i32 %r.07, %6
856  %add = select i1 %c, i32 %r.07, i32 %6
857  %inc = add nuw nsw i32 %i.08, 1
858  %exitcond = icmp eq i32 %inc, %n
859  br i1 %exitcond, label %for.cond.cleanup, label %for.body
860
861for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
862  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
863  ret i32 %r.0.lcssa
864}
865
; smax_i32: signed-maximum reduction over the %n i32 elements at %x, with the
; vector reduction performed after the vector loop. vector.body keeps a
; <4 x i32> accumulator (every lane seeded with -2147483648, INT32_MIN) that is
; updated lane-wise via icmp sgt + select, which the expected assembly shows as
; "vmax.s32 q0, q0, q1". middle.block then collapses the accumulator with
; llvm.vector.reduce.smax ("vmaxv.s32 r2, q0", with r2 preloaded with the
; seed). Elements past %n.vec (%n rounded down to a multiple of 4) are folded
; in by the scalar for.body loop, and %n <= 0 returns the seed value.
866define i32 @smax_i32(ptr nocapture readonly %x, i32 %n) {
867; CHECK-LABEL: smax_i32:
868; CHECK:       @ %bb.0: @ %entry
869; CHECK-NEXT:    .save {r7, lr}
870; CHECK-NEXT:    push {r7, lr}
871; CHECK-NEXT:    cmp r1, #1
872; CHECK-NEXT:    blt .LBB9_3
873; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
874; CHECK-NEXT:    cmp r1, #4
875; CHECK-NEXT:    bhs .LBB9_4
876; CHECK-NEXT:  @ %bb.2:
877; CHECK-NEXT:    mov.w r2, #-2147483648
878; CHECK-NEXT:    movs r3, #0
879; CHECK-NEXT:    b .LBB9_7
880; CHECK-NEXT:  .LBB9_3:
881; CHECK-NEXT:    mov.w r2, #-2147483648
882; CHECK-NEXT:    mov r0, r2
883; CHECK-NEXT:    pop {r7, pc}
884; CHECK-NEXT:  .LBB9_4: @ %vector.ph
885; CHECK-NEXT:    bic r3, r1, #3
886; CHECK-NEXT:    movs r2, #1
887; CHECK-NEXT:    sub.w r12, r3, #4
888; CHECK-NEXT:    vmov.i32 q0, #0x80000000
889; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
890; CHECK-NEXT:    mov r2, r0
891; CHECK-NEXT:  .LBB9_5: @ %vector.body
892; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
893; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
894; CHECK-NEXT:    vmax.s32 q0, q0, q1
895; CHECK-NEXT:    le lr, .LBB9_5
896; CHECK-NEXT:  @ %bb.6: @ %middle.block
897; CHECK-NEXT:    mov.w r2, #-2147483648
898; CHECK-NEXT:    cmp r3, r1
899; CHECK-NEXT:    vmaxv.s32 r2, q0
900; CHECK-NEXT:    beq .LBB9_9
901; CHECK-NEXT:  .LBB9_7: @ %for.body.preheader1
902; CHECK-NEXT:    sub.w lr, r1, r3
903; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
904; CHECK-NEXT:  .LBB9_8: @ %for.body
905; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
906; CHECK-NEXT:    ldr r1, [r0], #4
907; CHECK-NEXT:    cmp r2, r1
908; CHECK-NEXT:    csel r2, r2, r1, gt
909; CHECK-NEXT:    le lr, .LBB9_8
910; CHECK-NEXT:  .LBB9_9: @ %for.cond.cleanup
911; CHECK-NEXT:    mov r0, r2
912; CHECK-NEXT:    pop {r7, pc}
913entry:
914  %cmp6 = icmp sgt i32 %n, 0
915  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
916
917for.body.preheader:                               ; preds = %entry
918  %min.iters.check = icmp ult i32 %n, 4
919  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
920
921vector.ph:                                        ; preds = %for.body.preheader
922  %n.vec = and i32 %n, -4
923  br label %vector.body
924
; Out-of-loop reduction: %vec.phi is a <4 x i32> accumulator holding a
; per-lane running maximum; no cross-lane reduction happens in this block.
925vector.body:                                      ; preds = %vector.body, %vector.ph
926  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
927  %vec.phi = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %vector.ph ], [ %3, %vector.body ]
928  %0 = getelementptr inbounds i32, ptr %x, i32 %index
929  %1 = bitcast ptr %0 to ptr
930  %wide.load = load <4 x i32>, ptr %1, align 4
931  %2 = icmp sgt <4 x i32> %vec.phi, %wide.load
932  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
933  %index.next = add i32 %index, 4
934  %4 = icmp eq i32 %index.next, %n.vec
935  br i1 %4, label %middle.block, label %vector.body
936
; The only cross-lane reduction: collapse the four per-lane maxima to a scalar.
937middle.block:                                     ; preds = %vector.body
938  %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3)
939  %cmp.n = icmp eq i32 %n.vec, %n
940  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
941
942for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
943  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
944  %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
945  br label %for.body
946
; Scalar remainder loop: folds elements %i.08.ph .. %n-1 into the running
; maximum one at a time.
947for.body:                                         ; preds = %for.body.preheader1, %for.body
948  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
949  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
950  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
951  %6 = load i32, ptr %arrayidx, align 4
952  %c = icmp sgt i32 %r.07, %6
953  %add = select i1 %c, i32 %r.07, i32 %6
954  %inc = add nuw nsw i32 %i.08, 1
955  %exitcond = icmp eq i32 %inc, %n
956  br i1 %exitcond, label %for.cond.cleanup, label %for.body
957
958for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
959  %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
960  ret i32 %r.0.lcssa
961}
962
; smax_i32_inloop: signed-maximum reduction over the %n i32 elements at %x,
; with the vector reduction performed inside the vector loop. Each vector.body
; iteration reduces the loaded <4 x i32> with llvm.vector.reduce.smax and folds
; the result into a scalar accumulator (seeded with -2147483648, INT32_MIN) via
; icmp sgt + select; the expected assembly shows this as one accumulating
; "vmaxv.s32 r0, q0" per iteration. Elements past %n.vec (%n rounded down to a
; multiple of 4) are handled by the scalar for.body loop, and %n <= 0 returns
; the seed value.
963define i32 @smax_i32_inloop(ptr nocapture readonly %x, i32 %n) {
964; CHECK-LABEL: smax_i32_inloop:
965; CHECK:       @ %bb.0: @ %entry
966; CHECK-NEXT:    .save {r7, lr}
967; CHECK-NEXT:    push {r7, lr}
968; CHECK-NEXT:    cmp r1, #1
969; CHECK-NEXT:    blt .LBB10_3
970; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
971; CHECK-NEXT:    mov r12, r0
972; CHECK-NEXT:    cmp r1, #4
973; CHECK-NEXT:    bhs .LBB10_4
974; CHECK-NEXT:  @ %bb.2:
975; CHECK-NEXT:    mov.w r0, #-2147483648
976; CHECK-NEXT:    movs r3, #0
977; CHECK-NEXT:    b .LBB10_7
978; CHECK-NEXT:  .LBB10_3:
979; CHECK-NEXT:    mov.w r0, #-2147483648
980; CHECK-NEXT:    pop {r7, pc}
981; CHECK-NEXT:  .LBB10_4: @ %vector.ph
982; CHECK-NEXT:    bic r3, r1, #3
983; CHECK-NEXT:    movs r2, #1
984; CHECK-NEXT:    subs r0, r3, #4
985; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
986; CHECK-NEXT:    mov.w r0, #-2147483648
987; CHECK-NEXT:    mov r2, r12
988; CHECK-NEXT:  .LBB10_5: @ %vector.body
989; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
990; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
991; CHECK-NEXT:    vmaxv.s32 r0, q0
992; CHECK-NEXT:    le lr, .LBB10_5
993; CHECK-NEXT:  @ %bb.6: @ %middle.block
994; CHECK-NEXT:    cmp r3, r1
995; CHECK-NEXT:    it eq
996; CHECK-NEXT:    popeq {r7, pc}
997; CHECK-NEXT:  .LBB10_7: @ %for.body.preheader1
998; CHECK-NEXT:    sub.w lr, r1, r3
999; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
1000; CHECK-NEXT:  .LBB10_8: @ %for.body
1001; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1002; CHECK-NEXT:    ldr r1, [r2], #4
1003; CHECK-NEXT:    cmp r0, r1
1004; CHECK-NEXT:    csel r0, r0, r1, gt
1005; CHECK-NEXT:    le lr, .LBB10_8
1006; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
1007; CHECK-NEXT:    pop {r7, pc}
1008entry:
1009  %cmp6 = icmp sgt i32 %n, 0
1010  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1011
1012for.body.preheader:                               ; preds = %entry
1013  %min.iters.check = icmp ult i32 %n, 4
1014  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1015
1016vector.ph:                                        ; preds = %for.body.preheader
1017  %n.vec = and i32 %n, -4
1018  br label %vector.body
1019
; In-loop reduction: %vec.phi is a scalar i32 accumulator, and every iteration
; reduces the freshly loaded vector before combining it with that scalar.
1020vector.body:                                      ; preds = %vector.body, %vector.ph
1021  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1022  %vec.phi = phi i32 [ -2147483648, %vector.ph ], [ %3, %vector.body ]
1023  %0 = getelementptr inbounds i32, ptr %x, i32 %index
1024  %1 = bitcast ptr %0 to ptr
1025  %wide.load = load <4 x i32>, ptr %1, align 4
1026  %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
1027  %2 = icmp sgt i32 %vec.phi, %l5
1028  %3 = select i1 %2, i32 %vec.phi, i32 %l5
1029  %index.next = add i32 %index, 4
1030  %4 = icmp eq i32 %index.next, %n.vec
1031  br i1 %4, label %middle.block, label %vector.body
1032
; The single-entry phi only forwards the final accumulator out of the loop.
1033middle.block:                                     ; preds = %vector.body
1034  %5 = phi i32 [ %3, %vector.body ]
1035  %cmp.n = icmp eq i32 %n.vec, %n
1036  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1037
1038for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
1039  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1040  %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
1041  br label %for.body
1042
; Scalar remainder loop: folds elements %i.08.ph .. %n-1 into the running
; maximum one at a time.
1043for.body:                                         ; preds = %for.body.preheader1, %for.body
1044  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1045  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1046  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
1047  %6 = load i32, ptr %arrayidx, align 4
1048  %c = icmp sgt i32 %r.07, %6
1049  %add = select i1 %c, i32 %r.07, i32 %6
1050  %inc = add nuw nsw i32 %i.08, 1
1051  %exitcond = icmp eq i32 %inc, %n
1052  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1053
1054for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1055  %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1056  ret i32 %r.0.lcssa
1057}
1058
; umin_i32: unsigned-minimum reduction over the %n i32 elements at %x, with the
; vector reduction performed after the vector loop. vector.body keeps a
; <4 x i32> accumulator (every lane seeded with -1, i.e. all bits set) that is
; updated lane-wise via icmp ult + select, which the expected assembly shows as
; "vmin.u32 q0, q0, q1". middle.block then collapses the accumulator with
; llvm.vector.reduce.umin ("vminv.u32 r2, q0", with r2 preloaded with the
; seed). Elements past %n.vec (%n rounded down to a multiple of 4) are folded
; in by the scalar for.body loop, and %n <= 0 returns the seed value.
1059define i32 @umin_i32(ptr nocapture readonly %x, i32 %n) {
1060; CHECK-LABEL: umin_i32:
1061; CHECK:       @ %bb.0: @ %entry
1062; CHECK-NEXT:    .save {r7, lr}
1063; CHECK-NEXT:    push {r7, lr}
1064; CHECK-NEXT:    cmp r1, #1
1065; CHECK-NEXT:    blt .LBB11_3
1066; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1067; CHECK-NEXT:    cmp r1, #4
1068; CHECK-NEXT:    bhs .LBB11_4
1069; CHECK-NEXT:  @ %bb.2:
1070; CHECK-NEXT:    mov.w r2, #-1
1071; CHECK-NEXT:    movs r3, #0
1072; CHECK-NEXT:    b .LBB11_7
1073; CHECK-NEXT:  .LBB11_3:
1074; CHECK-NEXT:    mov.w r2, #-1
1075; CHECK-NEXT:    mov r0, r2
1076; CHECK-NEXT:    pop {r7, pc}
1077; CHECK-NEXT:  .LBB11_4: @ %vector.ph
1078; CHECK-NEXT:    bic r3, r1, #3
1079; CHECK-NEXT:    movs r2, #1
1080; CHECK-NEXT:    sub.w r12, r3, #4
1081; CHECK-NEXT:    vmov.i8 q0, #0xff
1082; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
1083; CHECK-NEXT:    mov r2, r0
1084; CHECK-NEXT:  .LBB11_5: @ %vector.body
1085; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1086; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
1087; CHECK-NEXT:    vmin.u32 q0, q0, q1
1088; CHECK-NEXT:    le lr, .LBB11_5
1089; CHECK-NEXT:  @ %bb.6: @ %middle.block
1090; CHECK-NEXT:    mov.w r2, #-1
1091; CHECK-NEXT:    cmp r3, r1
1092; CHECK-NEXT:    vminv.u32 r2, q0
1093; CHECK-NEXT:    beq .LBB11_9
1094; CHECK-NEXT:  .LBB11_7: @ %for.body.preheader1
1095; CHECK-NEXT:    sub.w lr, r1, r3
1096; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
1097; CHECK-NEXT:  .LBB11_8: @ %for.body
1098; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1099; CHECK-NEXT:    ldr r1, [r0], #4
1100; CHECK-NEXT:    cmp r2, r1
1101; CHECK-NEXT:    csel r2, r2, r1, lo
1102; CHECK-NEXT:    le lr, .LBB11_8
1103; CHECK-NEXT:  .LBB11_9: @ %for.cond.cleanup
1104; CHECK-NEXT:    mov r0, r2
1105; CHECK-NEXT:    pop {r7, pc}
1106entry:
1107  %cmp6 = icmp sgt i32 %n, 0
1108  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1109
1110for.body.preheader:                               ; preds = %entry
1111  %min.iters.check = icmp ult i32 %n, 4
1112  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1113
1114vector.ph:                                        ; preds = %for.body.preheader
1115  %n.vec = and i32 %n, -4
1116  br label %vector.body
1117
; Out-of-loop reduction: %vec.phi is a <4 x i32> accumulator holding a
; per-lane running minimum; no cross-lane reduction happens in this block.
1118vector.body:                                      ; preds = %vector.body, %vector.ph
1119  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1120  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %3, %vector.body ]
1121  %0 = getelementptr inbounds i32, ptr %x, i32 %index
1122  %1 = bitcast ptr %0 to ptr
1123  %wide.load = load <4 x i32>, ptr %1, align 4
1124  %2 = icmp ult <4 x i32> %vec.phi, %wide.load
1125  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
1126  %index.next = add i32 %index, 4
1127  %4 = icmp eq i32 %index.next, %n.vec
1128  br i1 %4, label %middle.block, label %vector.body
1129
; The only cross-lane reduction: collapse the four per-lane minima to a scalar.
1130middle.block:                                     ; preds = %vector.body
1131  %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3)
1132  %cmp.n = icmp eq i32 %n.vec, %n
1133  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1134
1135for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
1136  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1137  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
1138  br label %for.body
1139
; Scalar remainder loop: folds elements %i.08.ph .. %n-1 into the running
; minimum one at a time.
1140for.body:                                         ; preds = %for.body.preheader1, %for.body
1141  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1142  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1143  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
1144  %6 = load i32, ptr %arrayidx, align 4
1145  %c = icmp ult i32 %r.07, %6
1146  %add = select i1 %c, i32 %r.07, i32 %6
1147  %inc = add nuw nsw i32 %i.08, 1
1148  %exitcond = icmp eq i32 %inc, %n
1149  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1150
1151for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1152  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1153  ret i32 %r.0.lcssa
1154}
1155
; umin_i32_inloop: unsigned-minimum reduction over the %n i32 elements at %x,
; with the vector reduction performed inside the vector loop. Each vector.body
; iteration reduces the loaded <4 x i32> with llvm.vector.reduce.umin and folds
; the result into a scalar accumulator (seeded with -1, i.e. all bits set) via
; icmp ult + select; the expected assembly shows this as one accumulating
; "vminv.u32 r0, q0" per iteration. Elements past %n.vec (%n rounded down to a
; multiple of 4) are handled by the scalar for.body loop, and %n <= 0 returns
; the seed value.
;
; The scalar remainder loop selects with icmp ult, the same predicate as the
; vector part, so the whole function computes an unsigned minimum; the matching
; conditional select in the expected assembly therefore uses the "lo"
; condition.
1156define i32 @umin_i32_inloop(ptr nocapture readonly %x, i32 %n) {
1157; CHECK-LABEL: umin_i32_inloop:
1158; CHECK:       @ %bb.0: @ %entry
1159; CHECK-NEXT:    .save {r7, lr}
1160; CHECK-NEXT:    push {r7, lr}
1161; CHECK-NEXT:    cmp r1, #1
1162; CHECK-NEXT:    blt .LBB12_3
1163; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1164; CHECK-NEXT:    mov r12, r0
1165; CHECK-NEXT:    cmp r1, #4
1166; CHECK-NEXT:    bhs .LBB12_4
1167; CHECK-NEXT:  @ %bb.2:
1168; CHECK-NEXT:    mov.w r0, #-1
1169; CHECK-NEXT:    movs r3, #0
1170; CHECK-NEXT:    b .LBB12_7
1171; CHECK-NEXT:  .LBB12_3:
1172; CHECK-NEXT:    mov.w r0, #-1
1173; CHECK-NEXT:    pop {r7, pc}
1174; CHECK-NEXT:  .LBB12_4: @ %vector.ph
1175; CHECK-NEXT:    bic r3, r1, #3
1176; CHECK-NEXT:    movs r2, #1
1177; CHECK-NEXT:    subs r0, r3, #4
1178; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
1179; CHECK-NEXT:    mov.w r0, #-1
1180; CHECK-NEXT:    mov r2, r12
1181; CHECK-NEXT:  .LBB12_5: @ %vector.body
1182; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1183; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
1184; CHECK-NEXT:    vminv.u32 r0, q0
1185; CHECK-NEXT:    le lr, .LBB12_5
1186; CHECK-NEXT:  @ %bb.6: @ %middle.block
1187; CHECK-NEXT:    cmp r3, r1
1188; CHECK-NEXT:    it eq
1189; CHECK-NEXT:    popeq {r7, pc}
1190; CHECK-NEXT:  .LBB12_7: @ %for.body.preheader1
1191; CHECK-NEXT:    sub.w lr, r1, r3
1192; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
1193; CHECK-NEXT:  .LBB12_8: @ %for.body
1194; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1195; CHECK-NEXT:    ldr r1, [r2], #4
1196; CHECK-NEXT:    cmp r0, r1
1197; CHECK-NEXT:    csel r0, r0, r1, lo
1198; CHECK-NEXT:    le lr, .LBB12_8
1199; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
1200; CHECK-NEXT:    pop {r7, pc}
1201entry:
1202  %cmp6 = icmp sgt i32 %n, 0
1203  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1204
1205for.body.preheader:                               ; preds = %entry
1206  %min.iters.check = icmp ult i32 %n, 4
1207  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1208
1209vector.ph:                                        ; preds = %for.body.preheader
1210  %n.vec = and i32 %n, -4
1211  br label %vector.body
1212
; In-loop reduction: %vec.phi is a scalar i32 accumulator, and every iteration
; reduces the freshly loaded vector before combining it with that scalar.
1213vector.body:                                      ; preds = %vector.body, %vector.ph
1214  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1215  %vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
1216  %0 = getelementptr inbounds i32, ptr %x, i32 %index
1217  %1 = bitcast ptr %0 to ptr
1218  %wide.load = load <4 x i32>, ptr %1, align 4
1219  %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
1220  %2 = icmp ult i32 %vec.phi, %l5
1221  %3 = select i1 %2, i32 %vec.phi, i32 %l5
1222  %index.next = add i32 %index, 4
1223  %4 = icmp eq i32 %index.next, %n.vec
1224  br i1 %4, label %middle.block, label %vector.body
1225
; The single-entry phi only forwards the final accumulator out of the loop.
1226middle.block:                                     ; preds = %vector.body
1227  %5 = phi i32 [ %3, %vector.body ]
1228  %cmp.n = icmp eq i32 %n.vec, %n
1229  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1230
1231for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
1232  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1233  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
1234  br label %for.body
1235
; Scalar remainder loop: folds elements %i.08.ph .. %n-1 into the running
; minimum one at a time (unsigned less-than keeps the smaller value).
1236for.body:                                         ; preds = %for.body.preheader1, %for.body
1237  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1238  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1239  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
1240  %6 = load i32, ptr %arrayidx, align 4
1241  %c = icmp ult i32 %r.07, %6
1242  %add = select i1 %c, i32 %r.07, i32 %6
1243  %inc = add nuw nsw i32 %i.08, 1
1244  %exitcond = icmp eq i32 %inc, %n
1245  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1246
1247for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1248  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1249  ret i32 %r.0.lcssa
1250}
1251
; umax_i32: unsigned-maximum reduction over the %n i32 elements at %x, with the
; vector reduction performed after the vector loop. vector.body keeps a
; <4 x i32> accumulator (seeded with zeroinitializer) that is updated lane-wise
; via icmp ugt + select, which the expected assembly shows as
; "vmax.u32 q0, q0, q1". middle.block then collapses the accumulator with
; llvm.vector.reduce.umax ("vmaxv.u32 r2, q0", with r2 preloaded with 0).
; Elements past %n.vec (%n rounded down to a multiple of 4) are folded in by
; the scalar for.body loop, and %n <= 0 returns 0.
1252define i32 @umax_i32(ptr nocapture readonly %x, i32 %n) {
1253; CHECK-LABEL: umax_i32:
1254; CHECK:       @ %bb.0: @ %entry
1255; CHECK-NEXT:    .save {r7, lr}
1256; CHECK-NEXT:    push {r7, lr}
1257; CHECK-NEXT:    cmp r1, #1
1258; CHECK-NEXT:    blt .LBB13_3
1259; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1260; CHECK-NEXT:    cmp r1, #4
1261; CHECK-NEXT:    bhs .LBB13_4
1262; CHECK-NEXT:  @ %bb.2:
1263; CHECK-NEXT:    movs r3, #0
1264; CHECK-NEXT:    movs r2, #0
1265; CHECK-NEXT:    b .LBB13_7
1266; CHECK-NEXT:  .LBB13_3:
1267; CHECK-NEXT:    movs r2, #0
1268; CHECK-NEXT:    mov r0, r2
1269; CHECK-NEXT:    pop {r7, pc}
1270; CHECK-NEXT:  .LBB13_4: @ %vector.ph
1271; CHECK-NEXT:    bic r3, r1, #3
1272; CHECK-NEXT:    movs r2, #1
1273; CHECK-NEXT:    sub.w r12, r3, #4
1274; CHECK-NEXT:    vmov.i32 q0, #0x0
1275; CHECK-NEXT:    add.w lr, r2, r12, lsr #2
1276; CHECK-NEXT:    mov r2, r0
1277; CHECK-NEXT:  .LBB13_5: @ %vector.body
1278; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1279; CHECK-NEXT:    vldrw.u32 q1, [r2], #16
1280; CHECK-NEXT:    vmax.u32 q0, q0, q1
1281; CHECK-NEXT:    le lr, .LBB13_5
1282; CHECK-NEXT:  @ %bb.6: @ %middle.block
1283; CHECK-NEXT:    movs r2, #0
1284; CHECK-NEXT:    cmp r3, r1
1285; CHECK-NEXT:    vmaxv.u32 r2, q0
1286; CHECK-NEXT:    beq .LBB13_9
1287; CHECK-NEXT:  .LBB13_7: @ %for.body.preheader1
1288; CHECK-NEXT:    sub.w lr, r1, r3
1289; CHECK-NEXT:    add.w r0, r0, r3, lsl #2
1290; CHECK-NEXT:  .LBB13_8: @ %for.body
1291; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1292; CHECK-NEXT:    ldr r1, [r0], #4
1293; CHECK-NEXT:    cmp r2, r1
1294; CHECK-NEXT:    csel r2, r2, r1, hi
1295; CHECK-NEXT:    le lr, .LBB13_8
1296; CHECK-NEXT:  .LBB13_9: @ %for.cond.cleanup
1297; CHECK-NEXT:    mov r0, r2
1298; CHECK-NEXT:    pop {r7, pc}
1299entry:
1300  %cmp6 = icmp sgt i32 %n, 0
1301  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1302
1303for.body.preheader:                               ; preds = %entry
1304  %min.iters.check = icmp ult i32 %n, 4
1305  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1306
1307vector.ph:                                        ; preds = %for.body.preheader
1308  %n.vec = and i32 %n, -4
1309  br label %vector.body
1310
; Out-of-loop reduction: %vec.phi is a <4 x i32> accumulator holding a
; per-lane running maximum; no cross-lane reduction happens in this block.
1311vector.body:                                      ; preds = %vector.body, %vector.ph
1312  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1313  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1314  %0 = getelementptr inbounds i32, ptr %x, i32 %index
1315  %1 = bitcast ptr %0 to ptr
1316  %wide.load = load <4 x i32>, ptr %1, align 4
1317  %2 = icmp ugt <4 x i32> %vec.phi, %wide.load
1318  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
1319  %index.next = add i32 %index, 4
1320  %4 = icmp eq i32 %index.next, %n.vec
1321  br i1 %4, label %middle.block, label %vector.body
1322
; The only cross-lane reduction: collapse the four per-lane maxima to a scalar.
1323middle.block:                                     ; preds = %vector.body
1324  %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3)
1325  %cmp.n = icmp eq i32 %n.vec, %n
1326  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1327
1328for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
1329  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1330  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
1331  br label %for.body
1332
; Scalar remainder loop: folds elements %i.08.ph .. %n-1 into the running
; maximum one at a time.
1333for.body:                                         ; preds = %for.body.preheader1, %for.body
1334  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1335  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1336  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
1337  %6 = load i32, ptr %arrayidx, align 4
1338  %c = icmp ugt i32 %r.07, %6
1339  %add = select i1 %c, i32 %r.07, i32 %6
1340  %inc = add nuw nsw i32 %i.08, 1
1341  %exitcond = icmp eq i32 %inc, %n
1342  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1343
1344for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1345  %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1346  ret i32 %r.0.lcssa
1347}
1348
; umax_i32_inloop: unsigned-maximum reduction over the %n i32 elements at %x,
; with the vector reduction performed inside the vector loop. Each vector.body
; iteration reduces the loaded <4 x i32> with llvm.vector.reduce.umax and folds
; the result into a scalar accumulator (seeded with 0) via icmp ugt + select;
; the expected assembly shows this as one accumulating "vmaxv.u32 r0, q0" per
; iteration. Elements past %n.vec (%n rounded down to a multiple of 4) are
; handled by the scalar for.body loop, and %n <= 0 returns 0.
1349define i32 @umax_i32_inloop(ptr nocapture readonly %x, i32 %n) {
1350; CHECK-LABEL: umax_i32_inloop:
1351; CHECK:       @ %bb.0: @ %entry
1352; CHECK-NEXT:    .save {r7, lr}
1353; CHECK-NEXT:    push {r7, lr}
1354; CHECK-NEXT:    cmp r1, #1
1355; CHECK-NEXT:    blt .LBB14_3
1356; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1357; CHECK-NEXT:    mov r12, r0
1358; CHECK-NEXT:    cmp r1, #4
1359; CHECK-NEXT:    bhs .LBB14_4
1360; CHECK-NEXT:  @ %bb.2:
1361; CHECK-NEXT:    movs r3, #0
1362; CHECK-NEXT:    movs r0, #0
1363; CHECK-NEXT:    b .LBB14_7
1364; CHECK-NEXT:  .LBB14_3:
1365; CHECK-NEXT:    movs r0, #0
1366; CHECK-NEXT:    pop {r7, pc}
1367; CHECK-NEXT:  .LBB14_4: @ %vector.ph
1368; CHECK-NEXT:    bic r3, r1, #3
1369; CHECK-NEXT:    movs r2, #1
1370; CHECK-NEXT:    subs r0, r3, #4
1371; CHECK-NEXT:    add.w lr, r2, r0, lsr #2
1372; CHECK-NEXT:    movs r0, #0
1373; CHECK-NEXT:    mov r2, r12
1374; CHECK-NEXT:  .LBB14_5: @ %vector.body
1375; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1376; CHECK-NEXT:    vldrw.u32 q0, [r2], #16
1377; CHECK-NEXT:    vmaxv.u32 r0, q0
1378; CHECK-NEXT:    le lr, .LBB14_5
1379; CHECK-NEXT:  @ %bb.6: @ %middle.block
1380; CHECK-NEXT:    cmp r3, r1
1381; CHECK-NEXT:    it eq
1382; CHECK-NEXT:    popeq {r7, pc}
1383; CHECK-NEXT:  .LBB14_7: @ %for.body.preheader1
1384; CHECK-NEXT:    sub.w lr, r1, r3
1385; CHECK-NEXT:    add.w r2, r12, r3, lsl #2
1386; CHECK-NEXT:  .LBB14_8: @ %for.body
1387; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1388; CHECK-NEXT:    ldr r1, [r2], #4
1389; CHECK-NEXT:    cmp r0, r1
1390; CHECK-NEXT:    csel r0, r0, r1, hi
1391; CHECK-NEXT:    le lr, .LBB14_8
1392; CHECK-NEXT:  @ %bb.9: @ %for.cond.cleanup
1393; CHECK-NEXT:    pop {r7, pc}
1394entry:
1395  %cmp6 = icmp sgt i32 %n, 0
1396  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1397
1398for.body.preheader:                               ; preds = %entry
1399  %min.iters.check = icmp ult i32 %n, 4
1400  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1401
1402vector.ph:                                        ; preds = %for.body.preheader
1403  %n.vec = and i32 %n, -4
1404  br label %vector.body
1405
; In-loop reduction: %vec.phi is a scalar i32 accumulator, and every iteration
; reduces the freshly loaded vector before combining it with that scalar.
1406vector.body:                                      ; preds = %vector.body, %vector.ph
1407  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1408  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
1409  %0 = getelementptr inbounds i32, ptr %x, i32 %index
1410  %1 = bitcast ptr %0 to ptr
1411  %wide.load = load <4 x i32>, ptr %1, align 4
1412  %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
1413  %2 = icmp ugt i32 %vec.phi, %l5
1414  %3 = select i1 %2, i32 %vec.phi, i32 %l5
1415  %index.next = add i32 %index, 4
1416  %4 = icmp eq i32 %index.next, %n.vec
1417  br i1 %4, label %middle.block, label %vector.body
1418
; The single-entry phi only forwards the final accumulator out of the loop.
1419middle.block:                                     ; preds = %vector.body
1420  %5 = phi i32 [ %3, %vector.body ]
1421  %cmp.n = icmp eq i32 %n.vec, %n
1422  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1423
1424for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
1425  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1426  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
1427  br label %for.body
1428
; Scalar remainder loop: folds elements %i.08.ph .. %n-1 into the running
; maximum one at a time.
1429for.body:                                         ; preds = %for.body.preheader1, %for.body
1430  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1431  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1432  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
1433  %6 = load i32, ptr %arrayidx, align 4
1434  %c = icmp ugt i32 %r.07, %6
1435  %add = select i1 %c, i32 %r.07, i32 %6
1436  %inc = add nuw nsw i32 %i.08, 1
1437  %exitcond = icmp eq i32 %inc, %n
1438  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1439
1440for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1441  %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1442  ret i32 %r.0.lcssa
1443}
1444
; fmin_f32: minimum-style float reduction over the n elements at %x, written as
; a 4-wide vector loop followed by a scalar remainder loop.
;   - Returns 0.0 when n <= 0 (signed): entry branches straight to
;     for.cond.cleanup.
;   - The running value starts at 0.0 in both loops; each step keeps the
;     accumulator when `fcmp ult acc, elt` holds and otherwise takes the element.
;   - The <4 x float> accumulator is collapsed with llvm.vector.reduce.fmin in
;     middle.block.
; The autogenerated assertions expect vcmp.f32 + vpsel in the vector loop, three
; scalar vminnm.f32 for the reduction, and `le lr` closing both loops.
1445define float @fmin_f32(ptr nocapture readonly %x, i32 %n) {
1446; CHECK-LABEL: fmin_f32:
1447; CHECK:       @ %bb.0: @ %entry
1448; CHECK-NEXT:    .save {r7, lr}
1449; CHECK-NEXT:    push {r7, lr}
1450; CHECK-NEXT:    cmp r1, #1
1451; CHECK-NEXT:    blt .LBB15_3
1452; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1453; CHECK-NEXT:    cmp r1, #4
1454; CHECK-NEXT:    bhs .LBB15_4
1455; CHECK-NEXT:  @ %bb.2:
1456; CHECK-NEXT:    vldr s0, .LCPI15_0
1457; CHECK-NEXT:    movs r2, #0
1458; CHECK-NEXT:    b .LBB15_7
1459; CHECK-NEXT:  .LBB15_3:
1460; CHECK-NEXT:    vldr s0, .LCPI15_0
1461; CHECK-NEXT:    vmov r0, s0
1462; CHECK-NEXT:    pop {r7, pc}
1463; CHECK-NEXT:  .LBB15_4: @ %vector.ph
1464; CHECK-NEXT:    bic r2, r1, #3
1465; CHECK-NEXT:    movs r3, #1
1466; CHECK-NEXT:    sub.w r12, r2, #4
1467; CHECK-NEXT:    vmov.i32 q0, #0x0
1468; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
1469; CHECK-NEXT:    mov r3, r0
1470; CHECK-NEXT:  .LBB15_5: @ %vector.body
1471; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1472; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
1473; CHECK-NEXT:    vcmp.f32 lt, q0, q1
1474; CHECK-NEXT:    vpsel q0, q0, q1
1475; CHECK-NEXT:    le lr, .LBB15_5
1476; CHECK-NEXT:  @ %bb.6: @ %middle.block
1477; CHECK-NEXT:    vminnm.f32 s2, s2, s3
1478; CHECK-NEXT:    vminnm.f32 s0, s0, s1
1479; CHECK-NEXT:    vminnm.f32 s0, s0, s2
1480; CHECK-NEXT:    cmp r2, r1
1481; CHECK-NEXT:    beq .LBB15_9
1482; CHECK-NEXT:  .LBB15_7: @ %for.body.preheader1
1483; CHECK-NEXT:    sub.w lr, r1, r2
1484; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
1485; CHECK-NEXT:  .LBB15_8: @ %for.body
1486; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1487; CHECK-NEXT:    vldmia r0!, {s2}
1488; CHECK-NEXT:    vcmp.f32 s0, s2
1489; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
1490; CHECK-NEXT:    vselge.f32 s0, s2, s0
1491; CHECK-NEXT:    le lr, .LBB15_8
1492; CHECK-NEXT:  .LBB15_9: @ %for.cond.cleanup
1493; CHECK-NEXT:    vmov r0, s0
1494; CHECK-NEXT:    pop {r7, pc}
1495; CHECK-NEXT:    .p2align 2
1496; CHECK-NEXT:  @ %bb.10:
1497; CHECK-NEXT:  .LCPI15_0:
1498; CHECK-NEXT:    .long 0x00000000 @ float 0
1499entry:
1500  %cmp6 = icmp sgt i32 %n, 0
1501  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1502
1503for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements (unsigned compare): bypass the vector loop and let the
  ; scalar loop handle everything.
1504  %min.iters.check = icmp ult i32 %n, 4
1505  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1506
1507vector.ph:                                        ; preds = %for.body.preheader
  ; n.vec is n with its low two bits cleared: the element count covered by the
  ; 4-wide vector loop.
1508  %n.vec = and i32 %n, -4
1509  br label %vector.body
1510
1511vector.body:                                      ; preds = %vector.body, %vector.ph
1512  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1513  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1514  %0 = getelementptr inbounds float, ptr %x, i32 %index
1515  %1 = bitcast ptr %0 to ptr
1516  %wide.load = load <4 x float>, ptr %1, align 4
1517  %2 = fcmp ult <4 x float> %vec.phi, %wide.load
1518  %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
1519  %index.next = add i32 %index, 4
1520  %4 = icmp eq i32 %index.next, %n.vec
1521  br i1 %4, label %middle.block, label %vector.body
1522
1523middle.block:                                     ; preds = %vector.body
1524  %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3)
  ; When n.vec == n there is no remainder, so the scalar loop is skipped.
1525  %cmp.n = icmp eq i32 %n.vec, %n
1526  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1527
1528for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
1529  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1530  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
1531  br label %for.body
1532
1533for.body:                                         ; preds = %for.body.preheader1, %for.body
1534  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1535  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1536  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
1537  %6 = load float, ptr %arrayidx, align 4
1538  %c = fcmp ult float %r.07, %6
1539  %add = select i1 %c, float %r.07, float %6
1540  %inc = add nuw nsw i32 %i.08, 1
1541  %exitcond = icmp eq i32 %inc, %n
1542  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1543
1544for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1545  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1546  ret float %r.0.lcssa
1547}
1548
; fmax_f32: maximum-style float reduction over the n elements at %x, written as
; a 4-wide vector loop followed by a scalar remainder loop.
;   - Returns 0.0 when n <= 0 (signed): entry branches straight to
;     for.cond.cleanup.
;   - The running value starts at 0.0 in both loops; each step keeps the
;     accumulator when `fcmp ugt acc, elt` holds and otherwise takes the element.
;   - The <4 x float> accumulator is collapsed with llvm.vector.reduce.fmax in
;     middle.block.
; The autogenerated assertions expect vcmp.f32 + vpsel in the vector loop, three
; scalar vmaxnm.f32 for the reduction, and `le lr` closing both loops.
1549define float @fmax_f32(ptr nocapture readonly %x, i32 %n) {
1550; CHECK-LABEL: fmax_f32:
1551; CHECK:       @ %bb.0: @ %entry
1552; CHECK-NEXT:    .save {r7, lr}
1553; CHECK-NEXT:    push {r7, lr}
1554; CHECK-NEXT:    cmp r1, #1
1555; CHECK-NEXT:    blt .LBB16_3
1556; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1557; CHECK-NEXT:    cmp r1, #4
1558; CHECK-NEXT:    bhs .LBB16_4
1559; CHECK-NEXT:  @ %bb.2:
1560; CHECK-NEXT:    vldr s0, .LCPI16_0
1561; CHECK-NEXT:    movs r2, #0
1562; CHECK-NEXT:    b .LBB16_7
1563; CHECK-NEXT:  .LBB16_3:
1564; CHECK-NEXT:    vldr s0, .LCPI16_0
1565; CHECK-NEXT:    vmov r0, s0
1566; CHECK-NEXT:    pop {r7, pc}
1567; CHECK-NEXT:  .LBB16_4: @ %vector.ph
1568; CHECK-NEXT:    bic r2, r1, #3
1569; CHECK-NEXT:    movs r3, #1
1570; CHECK-NEXT:    sub.w r12, r2, #4
1571; CHECK-NEXT:    vmov.i32 q0, #0x0
1572; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
1573; CHECK-NEXT:    mov r3, r0
1574; CHECK-NEXT:  .LBB16_5: @ %vector.body
1575; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1576; CHECK-NEXT:    vldrw.u32 q1, [r3], #16
1577; CHECK-NEXT:    vcmp.f32 lt, q1, q0
1578; CHECK-NEXT:    vpsel q0, q0, q1
1579; CHECK-NEXT:    le lr, .LBB16_5
1580; CHECK-NEXT:  @ %bb.6: @ %middle.block
1581; CHECK-NEXT:    vmaxnm.f32 s2, s2, s3
1582; CHECK-NEXT:    vmaxnm.f32 s0, s0, s1
1583; CHECK-NEXT:    vmaxnm.f32 s0, s0, s2
1584; CHECK-NEXT:    cmp r2, r1
1585; CHECK-NEXT:    beq .LBB16_9
1586; CHECK-NEXT:  .LBB16_7: @ %for.body.preheader1
1587; CHECK-NEXT:    sub.w lr, r1, r2
1588; CHECK-NEXT:    add.w r0, r0, r2, lsl #2
1589; CHECK-NEXT:  .LBB16_8: @ %for.body
1590; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1591; CHECK-NEXT:    vldmia r0!, {s2}
1592; CHECK-NEXT:    vcmp.f32 s2, s0
1593; CHECK-NEXT:    vmrs APSR_nzcv, fpscr
1594; CHECK-NEXT:    vselge.f32 s0, s2, s0
1595; CHECK-NEXT:    le lr, .LBB16_8
1596; CHECK-NEXT:  .LBB16_9: @ %for.cond.cleanup
1597; CHECK-NEXT:    vmov r0, s0
1598; CHECK-NEXT:    pop {r7, pc}
1599; CHECK-NEXT:    .p2align 2
1600; CHECK-NEXT:  @ %bb.10:
1601; CHECK-NEXT:  .LCPI16_0:
1602; CHECK-NEXT:    .long 0x00000000 @ float 0
1603entry:
1604  %cmp6 = icmp sgt i32 %n, 0
1605  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1606
1607for.body.preheader:                               ; preds = %entry
  ; Fewer than 4 elements (unsigned compare): bypass the vector loop and let the
  ; scalar loop handle everything.
1608  %min.iters.check = icmp ult i32 %n, 4
1609  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1610
1611vector.ph:                                        ; preds = %for.body.preheader
  ; n.vec is n with its low two bits cleared: the element count covered by the
  ; 4-wide vector loop.
1612  %n.vec = and i32 %n, -4
1613  br label %vector.body
1614
1615vector.body:                                      ; preds = %vector.body, %vector.ph
1616  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1617  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1618  %0 = getelementptr inbounds float, ptr %x, i32 %index
1619  %1 = bitcast ptr %0 to ptr
1620  %wide.load = load <4 x float>, ptr %1, align 4
1621  %2 = fcmp ugt <4 x float> %vec.phi, %wide.load
1622  %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
1623  %index.next = add i32 %index, 4
1624  %4 = icmp eq i32 %index.next, %n.vec
1625  br i1 %4, label %middle.block, label %vector.body
1626
1627middle.block:                                     ; preds = %vector.body
1628  %5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
  ; When n.vec == n there is no remainder, so the scalar loop is skipped.
1629  %cmp.n = icmp eq i32 %n.vec, %n
1630  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1631
1632for.body.preheader1:                              ; preds = %middle.block, %for.body.preheader
1633  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1634  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
1635  br label %for.body
1636
1637for.body:                                         ; preds = %for.body.preheader1, %for.body
1638  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1639  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1640  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
1641  %6 = load float, ptr %arrayidx, align 4
1642  %c = fcmp ugt float %r.07, %6
1643  %add = select i1 %c, float %r.07, float %6
1644  %inc = add nuw nsw i32 %i.08, 1
1645  %exitcond = icmp eq i32 %inc, %n
1646  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1647
1648for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1649  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1650  ret float %r.0.lcssa
1651}
1652
; add4i32: sums the n i32 elements at %x into an i32 using a single predicated
; 4-wide loop with no scalar remainder.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does a masked <4 x i32> load, zeroes the inactive lanes,
;     reduces with llvm.vector.reduce.add and adds the result to a scalar
;     accumulator.
; The autogenerated assertions expect this to become a dlstp.32/letp loop whose
; body is one vldrw.u32 feeding vaddva.u32.
1653define i32 @add4i32(ptr noalias nocapture readonly %x, i32 %n) {
1654; CHECK-LABEL: add4i32:
1655; CHECK:       @ %bb.0: @ %entry
1656; CHECK-NEXT:    .save {r7, lr}
1657; CHECK-NEXT:    push {r7, lr}
1658; CHECK-NEXT:    cbz r1, .LBB17_4
1659; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1660; CHECK-NEXT:    movs r2, #0
1661; CHECK-NEXT:    dlstp.32 lr, r1
1662; CHECK-NEXT:  .LBB17_2: @ %vector.body
1663; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1664; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
1665; CHECK-NEXT:    vaddva.u32 r2, q0
1666; CHECK-NEXT:    letp lr, .LBB17_2
1667; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1668; CHECK-NEXT:    mov r0, r2
1669; CHECK-NEXT:    pop {r7, pc}
1670; CHECK-NEXT:  .LBB17_4:
1671; CHECK-NEXT:    movs r2, #0
1672; CHECK-NEXT:    mov r0, r2
1673; CHECK-NEXT:    pop {r7, pc}
1674entry:
1675  %cmp6.not = icmp eq i32 %n, 0
1676  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
1677
1678vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 3) & -4, i.e. n rounded up to a multiple of 4; it is the loop's
  ; exit bound, while the lane mask below is computed against the real n.
1679  %n.rnd.up = add i32 %n, 3
1680  %n.vec = and i32 %n.rnd.up, -4
1681  br label %vector.body
1682
1683vector.body:                                      ; preds = %vector.body, %vector.ph
1684  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1685  %vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
1686  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
1687  %0 = getelementptr inbounds i32, ptr %x, i32 %index
1688  %1 = bitcast ptr %0 to ptr
1689  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; The masked load's passthru is undef, so inactive lanes are forced to zero
  ; before they reach the add reduction.
1690  %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
1691  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
1692  %4 = add i32 %3, %vec.phi
1693  %index.next = add i32 %index, 4
1694  %5 = icmp eq i32 %index.next, %n.vec
1695  br i1 %5, label %for.cond.cleanup, label %vector.body
1696
1697for.cond.cleanup:                                 ; preds = %vector.body, %entry
1698  %s.0.lcssa = phi i32 [ 0, %entry ], [ %4, %vector.body ]
1699  ret i32 %s.0.lcssa
1700}
1701
; mla4i32: i32 dot product of the n-element i32 arrays at %x and %y using a
; single predicated 4-wide loop with no scalar remainder.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does two masked <4 x i32> loads, multiplies them (mul nsw),
;     zeroes the inactive lanes, reduces with llvm.vector.reduce.add and adds
;     the result to a scalar accumulator.
; The autogenerated assertions expect this to become a dlstp.32/letp loop whose
; body is two vldrw.u32 feeding vmlava.u32.
1702define i32 @mla4i32(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
1703; CHECK-LABEL: mla4i32:
1704; CHECK:       @ %bb.0: @ %entry
1705; CHECK-NEXT:    .save {r7, lr}
1706; CHECK-NEXT:    push {r7, lr}
1707; CHECK-NEXT:    cbz r2, .LBB18_4
1708; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1709; CHECK-NEXT:    mov.w r12, #0
1710; CHECK-NEXT:    dlstp.32 lr, r2
1711; CHECK-NEXT:  .LBB18_2: @ %vector.body
1712; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1713; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
1714; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
1715; CHECK-NEXT:    vmlava.u32 r12, q1, q0
1716; CHECK-NEXT:    letp lr, .LBB18_2
1717; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1718; CHECK-NEXT:    mov r0, r12
1719; CHECK-NEXT:    pop {r7, pc}
1720; CHECK-NEXT:  .LBB18_4:
1721; CHECK-NEXT:    mov.w r12, #0
1722; CHECK-NEXT:    mov r0, r12
1723; CHECK-NEXT:    pop {r7, pc}
1724entry:
1725  %cmp8.not = icmp eq i32 %n, 0
1726  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
1727
1728vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 3) & -4, i.e. n rounded up to a multiple of 4; it is the loop's
  ; exit bound, while the lane mask below is computed against the real n.
1729  %n.rnd.up = add i32 %n, 3
1730  %n.vec = and i32 %n.rnd.up, -4
1731  br label %vector.body
1732
1733vector.body:                                      ; preds = %vector.body, %vector.ph
1734  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1735  %vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ]
1736  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
1737  %0 = getelementptr inbounds i32, ptr %x, i32 %index
1738  %1 = bitcast ptr %0 to ptr
1739  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
1740  %2 = getelementptr inbounds i32, ptr %y, i32 %index
1741  %3 = bitcast ptr %2 to ptr
1742  %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
1743  %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
  ; Both masked loads use an undef passthru, so the products in inactive lanes
  ; are forced to zero before they reach the add reduction.
1744  %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer
1745  %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
1746  %7 = add i32 %6, %vec.phi
1747  %index.next = add i32 %index, 4
1748  %8 = icmp eq i32 %index.next, %n.vec
1749  br i1 %8, label %for.cond.cleanup, label %vector.body
1750
1751for.cond.cleanup:                                 ; preds = %vector.body, %entry
1752  %s.0.lcssa = phi i32 [ 0, %entry ], [ %7, %vector.body ]
1753  ret i32 %s.0.lcssa
1754}
1755
; add8i32: sums the n i16 elements at %x, each sign-extended to i32, into an i32
; using a single predicated 8-wide loop with no scalar remainder.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does a masked <8 x i16> load, sext to <8 x i32>, zeroes the
;     inactive lanes, reduces with llvm.vector.reduce.add.v8i32 and adds the
;     result to a scalar accumulator.
; The autogenerated assertions expect the extend and reduction to fold into a
; dlstp.16/letp loop whose body is one vldrh.u16 feeding vaddva.s16.
1756define i32 @add8i32(ptr noalias nocapture readonly %x, i32 %n) {
1757; CHECK-LABEL: add8i32:
1758; CHECK:       @ %bb.0: @ %entry
1759; CHECK-NEXT:    .save {r7, lr}
1760; CHECK-NEXT:    push {r7, lr}
1761; CHECK-NEXT:    cbz r1, .LBB19_4
1762; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1763; CHECK-NEXT:    movs r2, #0
1764; CHECK-NEXT:    dlstp.16 lr, r1
1765; CHECK-NEXT:  .LBB19_2: @ %vector.body
1766; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1767; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
1768; CHECK-NEXT:    vaddva.s16 r2, q0
1769; CHECK-NEXT:    letp lr, .LBB19_2
1770; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1771; CHECK-NEXT:    mov r0, r2
1772; CHECK-NEXT:    pop {r7, pc}
1773; CHECK-NEXT:  .LBB19_4:
1774; CHECK-NEXT:    movs r2, #0
1775; CHECK-NEXT:    mov r0, r2
1776; CHECK-NEXT:    pop {r7, pc}
1777entry:
1778  %cmp6.not = icmp eq i32 %n, 0
1779  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
1780
1781vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 7) & -8, i.e. n rounded up to a multiple of 8; it is the loop's
  ; exit bound, while the lane mask below is computed against the real n.
1782  %n.rnd.up = add i32 %n, 7
1783  %n.vec = and i32 %n.rnd.up, -8
1784  br label %vector.body
1785
1786vector.body:                                      ; preds = %vector.body, %vector.ph
1787  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1788  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
1789  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1790  %0 = getelementptr inbounds i16, ptr %x, i32 %index
1791  %1 = bitcast ptr %0 to ptr
1792  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1793  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
  ; The masked load's passthru is undef, so inactive lanes are forced to zero
  ; before they reach the add reduction.
1794  %3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer
1795  %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
1796  %5 = add i32 %4, %vec.phi
1797  %index.next = add i32 %index, 8
1798  %6 = icmp eq i32 %index.next, %n.vec
1799  br i1 %6, label %for.cond.cleanup, label %vector.body
1800
1801for.cond.cleanup:                                 ; preds = %vector.body, %entry
1802  %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
1803  ret i32 %s.0.lcssa
1804}
1805
; mla8i32: i32 dot product of the n-element i16 arrays at %x and %y, with both
; operands sign-extended to i32 before the multiply, using a single predicated
; 8-wide loop with no scalar remainder.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does two masked <8 x i16> loads, sext to <8 x i32>,
;     mul nsw, zeroes the inactive lanes, reduces with
;     llvm.vector.reduce.add.v8i32 and adds the result to a scalar accumulator.
; The autogenerated assertions expect the extends, multiply and reduction to
; fold into a dlstp.16/letp loop with two vldrh.u16 feeding vmlava.s16.
1806define i32 @mla8i32(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
1807; CHECK-LABEL: mla8i32:
1808; CHECK:       @ %bb.0: @ %entry
1809; CHECK-NEXT:    .save {r7, lr}
1810; CHECK-NEXT:    push {r7, lr}
1811; CHECK-NEXT:    cbz r2, .LBB20_4
1812; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1813; CHECK-NEXT:    mov.w r12, #0
1814; CHECK-NEXT:    dlstp.16 lr, r2
1815; CHECK-NEXT:  .LBB20_2: @ %vector.body
1816; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1817; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
1818; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
1819; CHECK-NEXT:    vmlava.s16 r12, q1, q0
1820; CHECK-NEXT:    letp lr, .LBB20_2
1821; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1822; CHECK-NEXT:    mov r0, r12
1823; CHECK-NEXT:    pop {r7, pc}
1824; CHECK-NEXT:  .LBB20_4:
1825; CHECK-NEXT:    mov.w r12, #0
1826; CHECK-NEXT:    mov r0, r12
1827; CHECK-NEXT:    pop {r7, pc}
1828entry:
1829  %cmp9.not = icmp eq i32 %n, 0
1830  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
1831
1832vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 7) & -8, i.e. n rounded up to a multiple of 8; it is the loop's
  ; exit bound, while the lane mask below is computed against the real n.
1833  %n.rnd.up = add i32 %n, 7
1834  %n.vec = and i32 %n.rnd.up, -8
1835  br label %vector.body
1836
1837vector.body:                                      ; preds = %vector.body, %vector.ph
1838  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1839  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
1840  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1841  %0 = getelementptr inbounds i16, ptr %x, i32 %index
1842  %1 = bitcast ptr %0 to ptr
1843  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1844  %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
1845  %3 = getelementptr inbounds i16, ptr %y, i32 %index
1846  %4 = bitcast ptr %3 to ptr
1847  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1848  %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32>
1849  %6 = mul nsw <8 x i32> %5, %2
  ; Both masked loads use an undef passthru, so the products in inactive lanes
  ; are forced to zero before they reach the add reduction.
1850  %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer
1851  %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
1852  %9 = add i32 %8, %vec.phi
1853  %index.next = add i32 %index, 8
1854  %10 = icmp eq i32 %index.next, %n.vec
1855  br i1 %10, label %for.cond.cleanup, label %vector.body
1856
1857for.cond.cleanup:                                 ; preds = %vector.body, %entry
1858  %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
1859  ret i32 %s.0.lcssa
1860}
1861
; add16i32: sums the n i8 elements at %x, each zero-extended to i32, into an i32
; using a single predicated 16-wide loop with no scalar remainder.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does a masked <16 x i8> load, zext to <16 x i32>, zeroes
;     the inactive lanes, reduces with llvm.vector.reduce.add.v16i32 and adds
;     the result to a scalar accumulator.
; The autogenerated assertions expect the extend and reduction to fold into a
; dlstp.8/letp loop whose body is one vldrb.u8 feeding vaddva.u8.
1862define i32 @add16i32(ptr noalias nocapture readonly %x, i32 %n) {
1863; CHECK-LABEL: add16i32:
1864; CHECK:       @ %bb.0: @ %entry
1865; CHECK-NEXT:    .save {r7, lr}
1866; CHECK-NEXT:    push {r7, lr}
1867; CHECK-NEXT:    cbz r1, .LBB21_4
1868; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1869; CHECK-NEXT:    movs r2, #0
1870; CHECK-NEXT:    dlstp.8 lr, r1
1871; CHECK-NEXT:  .LBB21_2: @ %vector.body
1872; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1873; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
1874; CHECK-NEXT:    vaddva.u8 r2, q0
1875; CHECK-NEXT:    letp lr, .LBB21_2
1876; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1877; CHECK-NEXT:    mov r0, r2
1878; CHECK-NEXT:    pop {r7, pc}
1879; CHECK-NEXT:  .LBB21_4:
1880; CHECK-NEXT:    movs r2, #0
1881; CHECK-NEXT:    mov r0, r2
1882; CHECK-NEXT:    pop {r7, pc}
1883entry:
1884  %cmp6.not = icmp eq i32 %n, 0
1885  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
1886
1887vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 15) & -16, i.e. n rounded up to a multiple of 16; it is the
  ; loop's exit bound, while the lane mask below is computed against the real n.
1888  %n.rnd.up = add i32 %n, 15
1889  %n.vec = and i32 %n.rnd.up, -16
1890  br label %vector.body
1891
1892vector.body:                                      ; preds = %vector.body, %vector.ph
1893  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1894  %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
1895  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
1896  %0 = getelementptr inbounds i8, ptr %x, i32 %index
1897  %1 = bitcast ptr %0 to ptr
1898  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1899  %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
  ; The masked load's passthru is undef, so inactive lanes are forced to zero
  ; before they reach the add reduction.
1900  %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer
1901  %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
1902  %5 = add i32 %4, %vec.phi
1903  %index.next = add i32 %index, 16
1904  %6 = icmp eq i32 %index.next, %n.vec
1905  br i1 %6, label %for.cond.cleanup, label %vector.body
1906
1907for.cond.cleanup:                                 ; preds = %vector.body, %entry
1908  %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
1909  ret i32 %s.0.lcssa
1910}
1911
; mla16i32: i32 dot product of the n-element i8 arrays at %x and %y, with both
; operands zero-extended to i32 before the multiply, using a single predicated
; 16-wide loop with no scalar remainder.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does two masked <16 x i8> loads, zext to <16 x i32>,
;     mul nuw nsw, zeroes the inactive lanes, reduces with
;     llvm.vector.reduce.add.v16i32 and adds the result to a scalar accumulator.
; The autogenerated assertions expect the extends, multiply and reduction to
; fold into a dlstp.8/letp loop with two vldrb.u8 feeding vmlava.u8.
1912define i32 @mla16i32(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
1913; CHECK-LABEL: mla16i32:
1914; CHECK:       @ %bb.0: @ %entry
1915; CHECK-NEXT:    .save {r7, lr}
1916; CHECK-NEXT:    push {r7, lr}
1917; CHECK-NEXT:    cbz r2, .LBB22_4
1918; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1919; CHECK-NEXT:    mov.w r12, #0
1920; CHECK-NEXT:    dlstp.8 lr, r2
1921; CHECK-NEXT:  .LBB22_2: @ %vector.body
1922; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1923; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
1924; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
1925; CHECK-NEXT:    vmlava.u8 r12, q1, q0
1926; CHECK-NEXT:    letp lr, .LBB22_2
1927; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1928; CHECK-NEXT:    mov r0, r12
1929; CHECK-NEXT:    pop {r7, pc}
1930; CHECK-NEXT:  .LBB22_4:
1931; CHECK-NEXT:    mov.w r12, #0
1932; CHECK-NEXT:    mov r0, r12
1933; CHECK-NEXT:    pop {r7, pc}
1934entry:
1935  %cmp9.not = icmp eq i32 %n, 0
1936  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
1937
1938vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 15) & -16, i.e. n rounded up to a multiple of 16; it is the
  ; loop's exit bound, while the lane mask below is computed against the real n.
1939  %n.rnd.up = add i32 %n, 15
1940  %n.vec = and i32 %n.rnd.up, -16
1941  br label %vector.body
1942
1943vector.body:                                      ; preds = %vector.body, %vector.ph
1944  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1945  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
1946  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
1947  %0 = getelementptr inbounds i8, ptr %x, i32 %index
1948  %1 = bitcast ptr %0 to ptr
1949  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1950  %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
1951  %3 = getelementptr inbounds i8, ptr %y, i32 %index
1952  %4 = bitcast ptr %3 to ptr
1953  %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1954  %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32>
1955  %6 = mul nuw nsw <16 x i32> %5, %2
  ; Both masked loads use an undef passthru, so the products in inactive lanes
  ; are forced to zero before they reach the add reduction.
1956  %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer
1957  %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
1958  %9 = add i32 %8, %vec.phi
1959  %index.next = add i32 %index, 16
1960  %10 = icmp eq i32 %index.next, %n.vec
1961  br i1 %10, label %for.cond.cleanup, label %vector.body
1962
1963for.cond.cleanup:                                 ; preds = %vector.body, %entry
1964  %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
1965  ret i32 %s.0.lcssa
1966}
1967
; add8i16: sums the n i16 elements at %x into an i16 (no widening) using a
; single predicated 8-wide loop with no scalar remainder; the result is returned
; as `signext i16`.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does a masked <8 x i16> load, zeroes the inactive lanes,
;     reduces with llvm.vector.reduce.add.v8i16 and adds the result to an i16
;     accumulator.
; The autogenerated assertions expect a dlstp.16/letp loop with one vldrh.u16
; feeding vaddva.u16, followed by sxth on the accumulator before returning.
1968define signext i16 @add8i16(ptr noalias nocapture readonly %x, i32 %n) {
1969; CHECK-LABEL: add8i16:
1970; CHECK:       @ %bb.0: @ %entry
1971; CHECK-NEXT:    .save {r7, lr}
1972; CHECK-NEXT:    push {r7, lr}
1973; CHECK-NEXT:    cbz r1, .LBB23_4
1974; CHECK-NEXT:  @ %bb.1: @ %vector.ph
1975; CHECK-NEXT:    movs r2, #0
1976; CHECK-NEXT:    dlstp.16 lr, r1
1977; CHECK-NEXT:  .LBB23_2: @ %vector.body
1978; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1979; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
1980; CHECK-NEXT:    vaddva.u16 r2, q0
1981; CHECK-NEXT:    letp lr, .LBB23_2
1982; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
1983; CHECK-NEXT:    sxth r0, r2
1984; CHECK-NEXT:    pop {r7, pc}
1985; CHECK-NEXT:  .LBB23_4:
1986; CHECK-NEXT:    movs r2, #0
1987; CHECK-NEXT:    sxth r0, r2
1988; CHECK-NEXT:    pop {r7, pc}
1989entry:
1990  %cmp8.not = icmp eq i32 %n, 0
1991  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
1992
1993vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 7) & -8, i.e. n rounded up to a multiple of 8; it is the loop's
  ; exit bound, while the lane mask below is computed against the real n.
1994  %n.rnd.up = add i32 %n, 7
1995  %n.vec = and i32 %n.rnd.up, -8
1996  br label %vector.body
1997
1998vector.body:                                      ; preds = %vector.body, %vector.ph
1999  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2000  %vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ]
2001  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
2002  %0 = getelementptr inbounds i16, ptr %x, i32 %index
2003  %1 = bitcast ptr %0 to ptr
2004  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
  ; The masked load's passthru is undef, so inactive lanes are forced to zero
  ; before they reach the add reduction.
2005  %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer
2006  %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
2007  %4 = add i16 %3, %vec.phi
2008  %index.next = add i32 %index, 8
2009  %5 = icmp eq i32 %index.next, %n.vec
2010  br i1 %5, label %for.cond.cleanup, label %vector.body
2011
2012for.cond.cleanup:                                 ; preds = %vector.body, %entry
2013  %s.0.lcssa = phi i16 [ 0, %entry ], [ %4, %vector.body ]
2014  ret i16 %s.0.lcssa
2015}
2016
; mla8i16: i16 dot product of the n-element i16 arrays at %x and %y (no
; widening) using a single predicated 8-wide loop with no scalar remainder; the
; result is returned as `signext i16`.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does two masked <8 x i16> loads, multiplies them in i16,
;     zeroes the inactive lanes, reduces with llvm.vector.reduce.add.v8i16 and
;     adds the result to an i16 accumulator.
; The autogenerated assertions expect a dlstp.16/letp loop with two vldrh.u16
; feeding vmlava.u16, followed by sxth.w on the accumulator before returning.
2017define signext i16 @mla8i16(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
2018; CHECK-LABEL: mla8i16:
2019; CHECK:       @ %bb.0: @ %entry
2020; CHECK-NEXT:    .save {r7, lr}
2021; CHECK-NEXT:    push {r7, lr}
2022; CHECK-NEXT:    cbz r2, .LBB24_4
2023; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2024; CHECK-NEXT:    mov.w r12, #0
2025; CHECK-NEXT:    dlstp.16 lr, r2
2026; CHECK-NEXT:  .LBB24_2: @ %vector.body
2027; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2028; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
2029; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
2030; CHECK-NEXT:    vmlava.u16 r12, q1, q0
2031; CHECK-NEXT:    letp lr, .LBB24_2
2032; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2033; CHECK-NEXT:    sxth.w r0, r12
2034; CHECK-NEXT:    pop {r7, pc}
2035; CHECK-NEXT:  .LBB24_4:
2036; CHECK-NEXT:    mov.w r12, #0
2037; CHECK-NEXT:    sxth.w r0, r12
2038; CHECK-NEXT:    pop {r7, pc}
2039entry:
2040  %cmp11.not = icmp eq i32 %n, 0
2041  br i1 %cmp11.not, label %for.cond.cleanup, label %vector.ph
2042
2043vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 7) & -8, i.e. n rounded up to a multiple of 8; it is the loop's
  ; exit bound, while the lane mask below is computed against the real n.
2044  %n.rnd.up = add i32 %n, 7
2045  %n.vec = and i32 %n.rnd.up, -8
2046  br label %vector.body
2047
2048vector.body:                                      ; preds = %vector.body, %vector.ph
2049  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2050  %vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ]
2051  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
2052  %0 = getelementptr inbounds i16, ptr %x, i32 %index
2053  %1 = bitcast ptr %0 to ptr
2054  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2055  %2 = getelementptr inbounds i16, ptr %y, i32 %index
2056  %3 = bitcast ptr %2 to ptr
2057  %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2058  %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load
  ; Both masked loads use an undef passthru, so the products in inactive lanes
  ; are forced to zero before they reach the add reduction.
2059  %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer
2060  %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5)
2061  %7 = add i16 %6, %vec.phi
2062  %index.next = add i32 %index, 8
2063  %8 = icmp eq i32 %index.next, %n.vec
2064  br i1 %8, label %for.cond.cleanup, label %vector.body
2065
2066for.cond.cleanup:                                 ; preds = %vector.body, %entry
2067  %s.0.lcssa = phi i16 [ 0, %entry ], [ %7, %vector.body ]
2068  ret i16 %s.0.lcssa
2069}
2070
; add16i16: sums the n i8 elements at %x, each zero-extended to i16, into an i16
; using a single predicated 16-wide loop with no scalar remainder; the result is
; returned as `signext i16`.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does a masked <16 x i8> load, zext to <16 x i16>, zeroes
;     the inactive lanes, reduces with llvm.vector.reduce.add.v16i16 and adds
;     the result to an i16 accumulator.
; The autogenerated assertions expect a dlstp.8/letp loop with one vldrb.u8
; feeding vaddva.u8, followed by sxth on the accumulator before returning.
2071define signext i16 @add16i16(ptr noalias nocapture readonly %x, i32 %n) {
2072; CHECK-LABEL: add16i16:
2073; CHECK:       @ %bb.0: @ %entry
2074; CHECK-NEXT:    .save {r7, lr}
2075; CHECK-NEXT:    push {r7, lr}
2076; CHECK-NEXT:    cbz r1, .LBB25_4
2077; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2078; CHECK-NEXT:    movs r2, #0
2079; CHECK-NEXT:    dlstp.8 lr, r1
2080; CHECK-NEXT:  .LBB25_2: @ %vector.body
2081; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2082; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2083; CHECK-NEXT:    vaddva.u8 r2, q0
2084; CHECK-NEXT:    letp lr, .LBB25_2
2085; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2086; CHECK-NEXT:    sxth r0, r2
2087; CHECK-NEXT:    pop {r7, pc}
2088; CHECK-NEXT:  .LBB25_4:
2089; CHECK-NEXT:    movs r2, #0
2090; CHECK-NEXT:    sxth r0, r2
2091; CHECK-NEXT:    pop {r7, pc}
2092entry:
2093  %cmp8.not = icmp eq i32 %n, 0
2094  br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
2095
2096vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 15) & -16, i.e. n rounded up to a multiple of 16; it is the
  ; loop's exit bound, while the lane mask below is computed against the real n.
2097  %n.rnd.up = add i32 %n, 15
2098  %n.vec = and i32 %n.rnd.up, -16
2099  br label %vector.body
2100
2101vector.body:                                      ; preds = %vector.body, %vector.ph
2102  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2103  %vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ]
2104  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2105  %0 = getelementptr inbounds i8, ptr %x, i32 %index
2106  %1 = bitcast ptr %0 to ptr
2107  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2108  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
  ; The masked load's passthru is undef, so inactive lanes are forced to zero
  ; before they reach the add reduction.
2109  %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer
2110  %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
2111  %5 = add i16 %4, %vec.phi
2112  %index.next = add i32 %index, 16
2113  %6 = icmp eq i32 %index.next, %n.vec
2114  br i1 %6, label %for.cond.cleanup, label %vector.body
2115
2116for.cond.cleanup:                                 ; preds = %vector.body, %entry
2117  %s.0.lcssa = phi i16 [ 0, %entry ], [ %5, %vector.body ]
2118  ret i16 %s.0.lcssa
2119}
2120
; mla16i16: i16 dot product of the n-element i8 arrays at %x and %y, with both
; operands zero-extended to i16 before the multiply, using a single predicated
; 16-wide loop with no scalar remainder; the result is returned as
; `signext i16`.
;   - Returns 0 when n == 0 (entry branches straight to for.cond.cleanup).
;   - Each iteration does two masked <16 x i8> loads, zext to <16 x i16>,
;     mul nuw, zeroes the inactive lanes, reduces with
;     llvm.vector.reduce.add.v16i16 and adds the result to an i16 accumulator.
; The autogenerated assertions expect a dlstp.8/letp loop with two vldrb.u8
; feeding vmlava.u8, followed by sxth.w on the accumulator before returning.
2121define signext i16 @mla16i16(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
2122; CHECK-LABEL: mla16i16:
2123; CHECK:       @ %bb.0: @ %entry
2124; CHECK-NEXT:    .save {r7, lr}
2125; CHECK-NEXT:    push {r7, lr}
2126; CHECK-NEXT:    cbz r2, .LBB26_4
2127; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2128; CHECK-NEXT:    mov.w r12, #0
2129; CHECK-NEXT:    dlstp.8 lr, r2
2130; CHECK-NEXT:  .LBB26_2: @ %vector.body
2131; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2132; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2133; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
2134; CHECK-NEXT:    vmlava.u8 r12, q1, q0
2135; CHECK-NEXT:    letp lr, .LBB26_2
2136; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2137; CHECK-NEXT:    sxth.w r0, r12
2138; CHECK-NEXT:    pop {r7, pc}
2139; CHECK-NEXT:  .LBB26_4:
2140; CHECK-NEXT:    mov.w r12, #0
2141; CHECK-NEXT:    sxth.w r0, r12
2142; CHECK-NEXT:    pop {r7, pc}
2143entry:
2144  %cmp13.not = icmp eq i32 %n, 0
2145  br i1 %cmp13.not, label %for.cond.cleanup, label %vector.ph
2146
2147vector.ph:                                        ; preds = %entry
  ; n.vec = (n + 15) & -16, i.e. n rounded up to a multiple of 16; it is the
  ; loop's exit bound, while the lane mask below is computed against the real n.
2148  %n.rnd.up = add i32 %n, 15
2149  %n.vec = and i32 %n.rnd.up, -16
2150  br label %vector.body
2151
2152vector.body:                                      ; preds = %vector.body, %vector.ph
2153  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2154  %vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ]
2155  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2156  %0 = getelementptr inbounds i8, ptr %x, i32 %index
2157  %1 = bitcast ptr %0 to ptr
2158  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2159  %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
2160  %3 = getelementptr inbounds i8, ptr %y, i32 %index
2161  %4 = bitcast ptr %3 to ptr
2162  %wide.masked.load18 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2163  %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16>
2164  %6 = mul nuw <16 x i16> %5, %2
  ; Both masked loads use an undef passthru, so the products in inactive lanes
  ; are forced to zero before they reach the add reduction.
2165  %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer
2166  %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
2167  %9 = add i16 %8, %vec.phi
2168  %index.next = add i32 %index, 16
2169  %10 = icmp eq i32 %index.next, %n.vec
2170  br i1 %10, label %for.cond.cleanup, label %vector.body
2171
2172for.cond.cleanup:                                 ; preds = %vector.body, %entry
2173  %s.0.lcssa = phi i16 [ 0, %entry ], [ %9, %vector.body ]
2174  ret i16 %s.0.lcssa
2175}
2176
; add16i8: wrapping i8 sum of the %n bytes starting at %x, returned
; zero-extended. Returns 0 when %n == 0 (the entry block branches straight to
; %for.cond.cleanup, whose phi yields 0).
;
; The loop is written in predicated form: each iteration handles 16 lanes
; under @llvm.get.active.lane.mask, so there is no scalar remainder loop.
; The CHECK lines expect this to lower to a dlstp.8/letp loop accumulating
; with vaddva.u8, followed by uxtb for the zeroext return.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2177define zeroext i8 @add16i8(ptr noalias nocapture readonly %x, i32 %n) {
2178; CHECK-LABEL: add16i8:
2179; CHECK:       @ %bb.0: @ %entry
2180; CHECK-NEXT:    .save {r7, lr}
2181; CHECK-NEXT:    push {r7, lr}
2182; CHECK-NEXT:    cbz r1, .LBB27_4
2183; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2184; CHECK-NEXT:    movs r2, #0
2185; CHECK-NEXT:    dlstp.8 lr, r1
2186; CHECK-NEXT:  .LBB27_2: @ %vector.body
2187; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2188; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2189; CHECK-NEXT:    vaddva.u8 r2, q0
2190; CHECK-NEXT:    letp lr, .LBB27_2
2191; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2192; CHECK-NEXT:    uxtb r0, r2
2193; CHECK-NEXT:    pop {r7, pc}
2194; CHECK-NEXT:  .LBB27_4:
2195; CHECK-NEXT:    movs r2, #0
2196; CHECK-NEXT:    uxtb r0, r2
2197; CHECK-NEXT:    pop {r7, pc}
2198entry:
2199  %cmp7.not = icmp eq i32 %n, 0
2200  br i1 %cmp7.not, label %for.cond.cleanup, label %vector.ph
2201
2202vector.ph:                                        ; preds = %entry
  ; %n.vec = (%n + 15) & -16: %n rounded up to a multiple of the 16-lane
  ; vector width; it is the loop's exit bound.
2203  %n.rnd.up = add i32 %n, 15
2204  %n.vec = and i32 %n.rnd.up, -16
2205  br label %vector.body
2206
2207vector.body:                                      ; preds = %vector.body, %vector.ph
2208  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2209  %vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ]
  ; Per-lane predicate computed from the current %index and the element
  ; count %n; it guards the load and the reduction input below.
2210  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2211  %0 = getelementptr inbounds i8, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: source and destination types are identical (no-op).
2212  %1 = bitcast ptr %0 to ptr
2213  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  ; The load's passthru is undef, so masked-off lanes are replaced with zero
  ; before the reduction and therefore add nothing to the sum.
2214  %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer
2215  %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
2216  %4 = add i8 %3, %vec.phi
2217  %index.next = add i32 %index, 16
2218  %5 = icmp eq i32 %index.next, %n.vec
2219  br i1 %5, label %for.cond.cleanup, label %vector.body
2220
2221for.cond.cleanup:                                 ; preds = %vector.body, %entry
2222  %s.0.lcssa = phi i8 [ 0, %entry ], [ %4, %vector.body ]
2223  ret i8 %s.0.lcssa
2224}
2225
; mla16i8: wrapping i8 dot product of the %n bytes at %x and %y, i.e. the sum
; of x[i] * y[i] computed entirely in i8, returned zero-extended. Returns 0
; when %n == 0 (the entry block branches straight to %for.cond.cleanup).
;
; The loop is written in predicated form (16 lanes per iteration under
; @llvm.get.active.lane.mask). The CHECK lines expect the multiply, select and
; reduction to lower to a single vmlava.u8 inside a dlstp.8/letp loop,
; followed by uxtb.w for the zeroext return.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2226define zeroext i8 @mla16i8(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
2227; CHECK-LABEL: mla16i8:
2228; CHECK:       @ %bb.0: @ %entry
2229; CHECK-NEXT:    .save {r7, lr}
2230; CHECK-NEXT:    push {r7, lr}
2231; CHECK-NEXT:    cbz r2, .LBB28_4
2232; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2233; CHECK-NEXT:    mov.w r12, #0
2234; CHECK-NEXT:    dlstp.8 lr, r2
2235; CHECK-NEXT:  .LBB28_2: @ %vector.body
2236; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2237; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
2238; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
2239; CHECK-NEXT:    vmlava.u8 r12, q1, q0
2240; CHECK-NEXT:    letp lr, .LBB28_2
2241; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
2242; CHECK-NEXT:    uxtb.w r0, r12
2243; CHECK-NEXT:    pop {r7, pc}
2244; CHECK-NEXT:  .LBB28_4:
2245; CHECK-NEXT:    mov.w r12, #0
2246; CHECK-NEXT:    uxtb.w r0, r12
2247; CHECK-NEXT:    pop {r7, pc}
2248entry:
2249  %cmp10.not = icmp eq i32 %n, 0
2250  br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph
2251
2252vector.ph:                                        ; preds = %entry
  ; %n.vec = (%n + 15) & -16: %n rounded up to a multiple of the 16-lane
  ; vector width; it is the loop's exit bound.
2253  %n.rnd.up = add i32 %n, 15
2254  %n.vec = and i32 %n.rnd.up, -16
2255  br label %vector.body
2256
2257vector.body:                                      ; preds = %vector.body, %vector.ph
2258  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2259  %vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ]
  ; One predicate, computed from %index and %n, guards both loads and the
  ; reduction input.
2260  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2261  %0 = getelementptr inbounds i8, ptr %x, i32 %index
  ; ptr-to-ptr bitcasts here and below have identical source/destination
  ; types (no-ops).
2262  %1 = bitcast ptr %0 to ptr
2263  %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2264  %2 = getelementptr inbounds i8, ptr %y, i32 %index
2265  %3 = bitcast ptr %2 to ptr
2266  %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
  ; Plain i8 multiply (no nuw/nsw flags): products wrap at 8 bits.
2267  %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
  ; Both loads use an undef passthru, so masked-off products are replaced
  ; with zero before the reduction.
2268  %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer
2269  %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
2270  %7 = add i8 %6, %vec.phi
2271  %index.next = add i32 %index, 16
2272  %8 = icmp eq i32 %index.next, %n.vec
2273  br i1 %8, label %for.cond.cleanup, label %vector.body
2274
2275for.cond.cleanup:                                 ; preds = %vector.body, %entry
2276  %s.0.lcssa = phi i8 [ 0, %entry ], [ %7, %vector.body ]
2277  ret i8 %s.0.lcssa
2278}
2279
; add4i64: i64 sum of the %n i32 elements at %x, each sign-extended to i64
; before being added. Returns 0 when %n == 0 (the entry block branches
; straight to %for.cond.cleanup).
;
; The loop is written in predicated form (4 lanes per iteration under
; @llvm.get.active.lane.mask). The CHECK lines expect the sext, select and
; v4i64 reduction to lower to vaddlva.s32 inside a dlstp.32/letp loop; the
; r2/r3 accumulator registers are copied to r0/r1 for the return.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2280define i64 @add4i64(ptr noalias nocapture readonly %x, i32 %n) {
2281; CHECK-LABEL: add4i64:
2282; CHECK:       @ %bb.0: @ %entry
2283; CHECK-NEXT:    .save {r7, lr}
2284; CHECK-NEXT:    push {r7, lr}
2285; CHECK-NEXT:    cbz r1, .LBB29_3
2286; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2287; CHECK-NEXT:    movs r2, #0
2288; CHECK-NEXT:    mov r3, r2
2289; CHECK-NEXT:    dlstp.32 lr, r1
2290; CHECK-NEXT:  .LBB29_2: @ %vector.body
2291; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2292; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
2293; CHECK-NEXT:    vaddlva.s32 r2, r3, q0
2294; CHECK-NEXT:    letp lr, .LBB29_2
2295; CHECK-NEXT:    b .LBB29_4
2296; CHECK-NEXT:  .LBB29_3:
2297; CHECK-NEXT:    movs r2, #0
2298; CHECK-NEXT:    mov r3, r2
2299; CHECK-NEXT:  .LBB29_4: @ %for.cond.cleanup
2300; CHECK-NEXT:    mov r0, r2
2301; CHECK-NEXT:    mov r1, r3
2302; CHECK-NEXT:    pop {r7, pc}
2303entry:
2304  %cmp6.not = icmp eq i32 %n, 0
2305  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
2306
2307vector.ph:                                        ; preds = %entry
  ; %n.vec = (%n + 3) & -4: %n rounded up to a multiple of the 4-lane vector
  ; width; it is the loop's exit bound.
2308  %n.rnd.up = add i32 %n, 3
2309  %n.vec = and i32 %n.rnd.up, -4
2310  br label %vector.body
2311
2312vector.body:                                      ; preds = %vector.body, %vector.ph
2313  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2314  %vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ]
  ; Per-lane predicate computed from the current %index and the element
  ; count %n; it guards the load and the reduction input below.
2315  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
2316  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  ; ptr-to-ptr bitcast: source and destination types are identical (no-op).
2317  %1 = bitcast ptr %0 to ptr
2318  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Widen to i64 before reducing so the sum is accumulated at 64 bits.
2319  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
  ; The load's passthru is undef, so masked-off lanes are replaced with zero
  ; before the reduction.
2320  %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer
2321  %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
2322  %5 = add i64 %4, %vec.phi
2323  %index.next = add i32 %index, 4
2324  %6 = icmp eq i32 %index.next, %n.vec
2325  br i1 %6, label %for.cond.cleanup, label %vector.body
2326
2327for.cond.cleanup:                                 ; preds = %vector.body, %entry
2328  %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %vector.body ]
2329  ret i64 %s.0.lcssa
2330}
2331
; mla4i64: i64 dot product of the %n i32 elements at %x and %y. Both operands
; are sign-extended to i64 before the multiply, and the products are summed
; at 64 bits. Returns 0 when %n == 0 (the entry block branches straight to
; %for.cond.cleanup).
;
; The loop is written in predicated form (4 lanes per iteration under
; @llvm.get.active.lane.mask). The CHECK lines expect the sext/mul/select/
; reduce chain to lower to vmlalva.s32 inside a dlstp.32/letp loop; the
; r12/r3 accumulator registers are copied to r0/r1 for the return.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2332define i64 @mla4i64(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
2333; CHECK-LABEL: mla4i64:
2334; CHECK:       @ %bb.0: @ %entry
2335; CHECK-NEXT:    .save {r7, lr}
2336; CHECK-NEXT:    push {r7, lr}
2337; CHECK-NEXT:    cbz r2, .LBB30_3
2338; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2339; CHECK-NEXT:    mov.w r12, #0
2340; CHECK-NEXT:    mov r3, r12
2341; CHECK-NEXT:    dlstp.32 lr, r2
2342; CHECK-NEXT:  .LBB30_2: @ %vector.body
2343; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2344; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
2345; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
2346; CHECK-NEXT:    vmlalva.s32 r12, r3, q1, q0
2347; CHECK-NEXT:    letp lr, .LBB30_2
2348; CHECK-NEXT:    b .LBB30_4
2349; CHECK-NEXT:  .LBB30_3:
2350; CHECK-NEXT:    mov.w r12, #0
2351; CHECK-NEXT:    mov r3, r12
2352; CHECK-NEXT:  .LBB30_4: @ %for.cond.cleanup
2353; CHECK-NEXT:    mov r0, r12
2354; CHECK-NEXT:    mov r1, r3
2355; CHECK-NEXT:    pop {r7, pc}
2356entry:
2357  %cmp9.not = icmp eq i32 %n, 0
2358  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
2359
2360vector.ph:                                        ; preds = %entry
  ; %n.vec = (%n + 3) & -4: %n rounded up to a multiple of the 4-lane vector
  ; width; it is the loop's exit bound.
2361  %n.rnd.up = add i32 %n, 3
2362  %n.vec = and i32 %n.rnd.up, -4
2363  br label %vector.body
2364
2365vector.body:                                      ; preds = %vector.body, %vector.ph
2366  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2367  %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
  ; One predicate, computed from %index and %n, guards both loads and the
  ; reduction input.
2368  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
2369  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  ; ptr-to-ptr bitcasts here and below have identical source/destination
  ; types (no-ops).
2370  %1 = bitcast ptr %0 to ptr
2371  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2372  %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
2373  %3 = getelementptr inbounds i32, ptr %y, i32 %index
2374  %4 = bitcast ptr %3 to ptr
2375  %wide.masked.load14 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2376  %5 = sext <4 x i32> %wide.masked.load14 to <4 x i64>
  ; Widening multiply: both operands are i32 values sign-extended to i64.
2377  %6 = mul nsw <4 x i64> %5, %2
  ; Both loads use an undef passthru, so masked-off products are replaced
  ; with zero before the reduction.
2378  %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer
2379  %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7)
2380  %9 = add i64 %8, %vec.phi
2381  %index.next = add i32 %index, 4
2382  %10 = icmp eq i32 %index.next, %n.vec
2383  br i1 %10, label %for.cond.cleanup, label %vector.body
2384
2385for.cond.cleanup:                                 ; preds = %vector.body, %entry
2386  %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
2387  ret i64 %s.0.lcssa
2388}
2389
; mla8i64: i64 dot product of the %n i16 elements at %x and %y. Both operands
; are sign-extended from i16 directly to i64 before the multiply, and the
; products are summed at 64 bits. Returns 0 when %n == 0 (the entry block
; branches straight to %for.cond.cleanup).
;
; The loop is written in predicated form (8 lanes per iteration under
; @llvm.get.active.lane.mask). The CHECK lines expect the sext/mul/select/
; reduce chain over <8 x i64> to lower to vmlalva.s16 inside a dlstp.16/letp
; loop; the r12/r3 accumulator registers are copied to r0/r1 for the return.
;
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
2390define i64 @mla8i64(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
2391; CHECK-LABEL: mla8i64:
2392; CHECK:       @ %bb.0: @ %entry
2393; CHECK-NEXT:    .save {r7, lr}
2394; CHECK-NEXT:    push {r7, lr}
2395; CHECK-NEXT:    cbz r2, .LBB31_3
2396; CHECK-NEXT:  @ %bb.1: @ %vector.ph
2397; CHECK-NEXT:    mov.w r12, #0
2398; CHECK-NEXT:    mov r3, r12
2399; CHECK-NEXT:    dlstp.16 lr, r2
2400; CHECK-NEXT:  .LBB31_2: @ %vector.body
2401; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
2402; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
2403; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
2404; CHECK-NEXT:    vmlalva.s16 r12, r3, q1, q0
2405; CHECK-NEXT:    letp lr, .LBB31_2
2406; CHECK-NEXT:    b .LBB31_4
2407; CHECK-NEXT:  .LBB31_3:
2408; CHECK-NEXT:    mov.w r12, #0
2409; CHECK-NEXT:    mov r3, r12
2410; CHECK-NEXT:  .LBB31_4: @ %for.cond.cleanup
2411; CHECK-NEXT:    mov r0, r12
2412; CHECK-NEXT:    mov r1, r3
2413; CHECK-NEXT:    pop {r7, pc}
2414entry:
2415  %cmp9.not = icmp eq i32 %n, 0
2416  br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
2417
2418vector.ph:                                        ; preds = %entry
  ; %n.vec = (%n + 7) & -8: %n rounded up to a multiple of the 8-lane vector
  ; width; it is the loop's exit bound.
2419  %n.rnd.up = add i32 %n, 7
2420  %n.vec = and i32 %n.rnd.up, -8
2421  br label %vector.body
2422
2423vector.body:                                      ; preds = %vector.body, %vector.ph
2424  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2425  %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
  ; One predicate, computed from %index and %n, guards both loads and the
  ; reduction input.
2426  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
2427  %0 = getelementptr inbounds i16, ptr %x, i32 %index
  ; ptr-to-ptr bitcasts here and below have identical source/destination
  ; types (no-ops).
2428  %1 = bitcast ptr %0 to ptr
2429  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2430  %2 = sext <8 x i16> %wide.masked.load to <8 x i64>
2431  %3 = getelementptr inbounds i16, ptr %y, i32 %index
2432  %4 = bitcast ptr %3 to ptr
2433  %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2434  %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64>
  ; Widening multiply: both operands are i16 values sign-extended to i64.
2435  %6 = mul nsw <8 x i64> %5, %2
  ; Both loads use an undef passthru, so masked-off products are replaced
  ; with zero before the reduction.
2436  %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer
2437  %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7)
2438  %9 = add i64 %8, %vec.phi
2439  %index.next = add i32 %index, 8
2440  %10 = icmp eq i32 %index.next, %n.vec
2441  br i1 %10, label %for.cond.cleanup, label %vector.body
2442
2443for.cond.cleanup:                                 ; preds = %vector.body, %entry
2444  %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
2445  ret i64 %s.0.lcssa
2446}
2447
; Intrinsic declarations used by the test functions in this file.
;
; First group: lane-mask, masked-load and integer add-reduction intrinsics
; for the 4-, 8- and 16-lane predicated loops.
; NOTE(review): these reference attribute groups #1, #2 and #3, but no
; `attributes #N = { ... }` definitions are visible in this part of the
; file - confirm whether they are defined elsewhere or are leftovers.
2448declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
2449declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #2
2450declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
2451declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>) #2
2452declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3
2453declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1
2454declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>) #2
2455declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3
2456declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3
2457declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3
2458declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3
2459declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3
2460declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3
2461
; Second group: 4-lane i32 and float reductions (add, mul, and/or/xor,
; fadd/fmul, signed/unsigned min/max, fmin/fmax), declared without attribute
; groups. Presumably used by test functions earlier in the file - none of
; the functions immediately above call them.
2462declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
2463declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
2464declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
2465declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
2466declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
2467declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
2468declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
2469declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
2470declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
2471declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
2472declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
2473declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
2474declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
2475