xref: /llvm-project/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll (revision 2f7ccaf4a8565628a4c7d2b5a49bb45478940be6)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 %s -o - | FileCheck %s
3
; float_float_mul(a, b, c, N): stores a[i] * b[i] to c[i] for i in [0, N).
;
; The input IR contains three loops:
;   * vector.body   - 4 floats per iteration (<4 x float> load/fmul/store),
;                     entered only when N >= 4 and the runtime overlap test in
;                     vector.memcheck reports no conflict between c and a or b.
;   * for.body.prol - scalar loop, one element per iteration, run N & 3 times.
;   * for.body      - scalar loop whose body is repeated four times per
;                     iteration.
;
; The autogenerated FileCheck assertions below pin the llc output: the
; for.body.prol loop is expected to be entered through `wls` and closed with
; `le`, while vector.body and for.body compute lr explicitly and are closed
; with `le`. Regenerate the assertions with utils/update_llc_test_checks.py
; instead of editing them by hand.
4define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
5; CHECK-LABEL: float_float_mul:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
8; CHECK-NEXT:    cmp r3, #0
9; CHECK-NEXT:    beq .LBB0_10
10; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
11; CHECK-NEXT:    cmp r3, #3
12; CHECK-NEXT:    bhi .LBB0_3
13; CHECK-NEXT:  @ %bb.2:
14; CHECK-NEXT:    mov.w r12, #0
15; CHECK-NEXT:    b .LBB0_4
16; CHECK-NEXT:  .LBB0_3: @ %vector.memcheck
17; CHECK-NEXT:    add.w r7, r1, r3, lsl #2
18; CHECK-NEXT:    add.w r6, r2, r3, lsl #2
19; CHECK-NEXT:    cmp r7, r2
20; CHECK-NEXT:    add.w r5, r0, r3, lsl #2
21; CHECK-NEXT:    cset r7, hi
22; CHECK-NEXT:    cmp r6, r1
23; CHECK-NEXT:    csel r7, zr, r7, ls
24; CHECK-NEXT:    cmp r6, r0
25; CHECK-NEXT:    cset r6, hi
26; CHECK-NEXT:    cmp r5, r2
27; CHECK-NEXT:    cset r5, hi
28; CHECK-NEXT:    mov.w r12, #0
29; CHECK-NEXT:    tst r5, r6
30; CHECK-NEXT:    it eq
31; CHECK-NEXT:    cmpeq r7, #0
32; CHECK-NEXT:    beq .LBB0_11
33; CHECK-NEXT:  .LBB0_4: @ %for.body.preheader22
34; CHECK-NEXT:    mvn.w r7, r12
35; CHECK-NEXT:    adds r4, r7, r3
36; CHECK-NEXT:    and r7, r3, #3
37; CHECK-NEXT:    add.w r8, r12, r7
38; CHECK-NEXT:    wls lr, r7, .LBB0_7
39; CHECK-NEXT:  @ %bb.5: @ %for.body.prol.preheader
40; CHECK-NEXT:    add.w r6, r0, r12, lsl #2
41; CHECK-NEXT:    add.w r7, r1, r12, lsl #2
42; CHECK-NEXT:    add.w r5, r2, r12, lsl #2
43; CHECK-NEXT:    mov r12, r8
44; CHECK-NEXT:  .LBB0_6: @ %for.body.prol
45; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
46; CHECK-NEXT:    vldmia r7!, {s0}
47; CHECK-NEXT:    vldmia r6!, {s2}
48; CHECK-NEXT:    vmul.f32 s0, s2, s0
49; CHECK-NEXT:    vstmia r5!, {s0}
50; CHECK-NEXT:    le lr, .LBB0_6
51; CHECK-NEXT:  .LBB0_7: @ %for.body.prol.loopexit
52; CHECK-NEXT:    cmp r4, #3
53; CHECK-NEXT:    blo .LBB0_10
54; CHECK-NEXT:  @ %bb.8: @ %for.body.preheader1
55; CHECK-NEXT:    sub.w r3, r8, r3
56; CHECK-NEXT:    movs r7, #1
57; CHECK-NEXT:    rsb r3, r3, r3, lsl #30
58; CHECK-NEXT:    subs r3, #4
59; CHECK-NEXT:    add.w lr, r7, r3, lsr #2
60; CHECK-NEXT:    lsl.w r3, r12, #2
61; CHECK-NEXT:  .LBB0_9: @ %for.body
62; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
63; CHECK-NEXT:    adds r7, r1, r3
64; CHECK-NEXT:    adds r6, r0, r3
65; CHECK-NEXT:    adds r5, r2, r3
66; CHECK-NEXT:    adds r0, #16
67; CHECK-NEXT:    vldr s0, [r7]
68; CHECK-NEXT:    adds r1, #16
69; CHECK-NEXT:    vldr s2, [r6]
70; CHECK-NEXT:    adds r2, #16
71; CHECK-NEXT:    vmul.f32 s0, s2, s0
72; CHECK-NEXT:    vstr s0, [r5]
73; CHECK-NEXT:    vldr s0, [r7, #4]
74; CHECK-NEXT:    vldr s2, [r6, #4]
75; CHECK-NEXT:    vmul.f32 s0, s2, s0
76; CHECK-NEXT:    vstr s0, [r5, #4]
77; CHECK-NEXT:    vldr s0, [r7, #8]
78; CHECK-NEXT:    vldr s2, [r6, #8]
79; CHECK-NEXT:    vmul.f32 s0, s2, s0
80; CHECK-NEXT:    vstr s0, [r5, #8]
81; CHECK-NEXT:    vldr s0, [r7, #12]
82; CHECK-NEXT:    vldr s2, [r6, #12]
83; CHECK-NEXT:    vmul.f32 s0, s2, s0
84; CHECK-NEXT:    vstr s0, [r5, #12]
85; CHECK-NEXT:    le lr, .LBB0_9
86; CHECK-NEXT:  .LBB0_10: @ %for.cond.cleanup
87; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
88; CHECK-NEXT:  .LBB0_11: @ %vector.ph
89; CHECK-NEXT:    bic r12, r3, #3
90; CHECK-NEXT:    movs r6, #1
91; CHECK-NEXT:    sub.w r7, r12, #4
92; CHECK-NEXT:    mov r4, r0
93; CHECK-NEXT:    mov r5, r1
94; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
95; CHECK-NEXT:    mov r6, r2
96; CHECK-NEXT:  .LBB0_12: @ %vector.body
97; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
98; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
99; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
100; CHECK-NEXT:    vmul.f32 q0, q1, q0
101; CHECK-NEXT:    vstrb.8 q0, [r6], #16
102; CHECK-NEXT:    le lr, .LBB0_12
103; CHECK-NEXT:  @ %bb.13: @ %middle.block
104; CHECK-NEXT:    cmp r12, r3
105; CHECK-NEXT:    bne .LBB0_4
106; CHECK-NEXT:    b .LBB0_10
; Nothing to do when N == 0.
107entry:
108  %cmp8 = icmp eq i32 %N, 0
109  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
110
; With fewer than 4 elements the vector loop is bypassed entirely.
111for.body.preheader:                               ; preds = %entry
112  %min.iters.check = icmp ult i32 %N, 4
113  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
114
; Scalar path setup. %i.09.ph is the first index still to be processed
; (0, or %n.vec when arriving from middle.block), %1 = N - 1 - %i.09.ph
; (xor with -1 is a bitwise NOT), and %xtraiter = N & 3 is the trip count
; of the prologue loop.
115for.body.preheader22:                             ; preds = %middle.block, %vector.memcheck, %for.body.preheader
116  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
117  %0 = xor i32 %i.09.ph, -1
118  %1 = add i32 %0, %N
119  %xtraiter = and i32 %N, 3
120  %lcmp.mod = icmp eq i32 %xtraiter, 0
121  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
122
; Prologue loop: one element per iteration, counting %prol.iter down from
; %xtraiter to 0.
123for.body.prol:                                    ; preds = %for.body.preheader22, %for.body.prol
124  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
125  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
126  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
127  %2 = load float, ptr %arrayidx.prol, align 4
128  %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
129  %3 = load float, ptr %arrayidx1.prol, align 4
130  %mul.prol = fmul float %2, %3
131  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
132  store float %mul.prol, ptr %arrayidx2.prol, align 4
133  %inc.prol = add nuw i32 %i.09.prol, 1
134  %prol.iter.sub = add i32 %prol.iter, -1
135  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
136  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
137
; The 4x-unrolled loop is skipped when %1 is (unsigned) below 3.
138for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader22
139  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
140  %4 = icmp ult i32 %1, 3
141  br i1 %4, label %for.cond.cleanup, label %for.body
142
; Runtime overlap test: a conflict is reported when [c, c+N) overlaps
; [a, a+N) or [b, b+N); on conflict the scalar path is used instead of the
; vector loop.
143vector.memcheck:                                  ; preds = %for.body.preheader
144  %scevgep = getelementptr float, ptr %c, i32 %N
145  %scevgep13 = getelementptr float, ptr %a, i32 %N
146  %scevgep16 = getelementptr float, ptr %b, i32 %N
147  %bound0 = icmp ugt ptr %scevgep13, %c
148  %bound1 = icmp ugt ptr %scevgep, %a
149  %found.conflict = and i1 %bound0, %bound1
150  %bound018 = icmp ugt ptr %scevgep16, %c
151  %bound119 = icmp ugt ptr %scevgep, %b
152  %found.conflict20 = and i1 %bound018, %bound119
153  %conflict.rdx = or i1 %found.conflict, %found.conflict20
154  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
155
; %n.vec is N rounded down to a multiple of 4 (N & -4).
156vector.ph:                                        ; preds = %vector.memcheck
157  %n.vec = and i32 %N, -4
158  br label %vector.body
159
; Vector loop: 4 floats per iteration until %index reaches %n.vec.
160vector.body:                                      ; preds = %vector.body, %vector.ph
161  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
162  %5 = getelementptr inbounds float, ptr %a, i32 %index
163  %wide.load = load <4 x float>, ptr %5, align 4
164  %6 = getelementptr inbounds float, ptr %b, i32 %index
165  %wide.load21 = load <4 x float>, ptr %6, align 4
166  %7 = fmul <4 x float> %wide.load, %wide.load21
167  %8 = getelementptr inbounds float, ptr %c, i32 %index
168  store <4 x float> %7, ptr %8, align 4
169  %index.next = add i32 %index, 4
170  %9 = icmp eq i32 %index.next, %n.vec
171  br i1 %9, label %middle.block, label %vector.body
172
; Finished if the vector loop covered all N elements; otherwise the tail
; starting at %n.vec is handled by the scalar path.
173middle.block:                                     ; preds = %vector.body
174  %cmp.n = icmp eq i32 %n.vec, %N
175  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
176
177for.cond.cleanup:                                 ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
178  ret void
179
; Main scalar loop: handles elements %i.09 .. %i.09+3 per iteration and
; exits when %i.09 + 4 == N.
180for.body:                                         ; preds = %for.body.prol.loopexit, %for.body
181  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
182  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
183  %10 = load float, ptr %arrayidx, align 4
184  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
185  %11 = load float, ptr %arrayidx1, align 4
186  %mul = fmul float %10, %11
187  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
188  store float %mul, ptr %arrayidx2, align 4
189  %inc = add nuw i32 %i.09, 1
190  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
191  %12 = load float, ptr %arrayidx.1, align 4
192  %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
193  %13 = load float, ptr %arrayidx1.1, align 4
194  %mul.1 = fmul float %12, %13
195  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
196  store float %mul.1, ptr %arrayidx2.1, align 4
197  %inc.1 = add nuw i32 %i.09, 2
198  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
199  %14 = load float, ptr %arrayidx.2, align 4
200  %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
201  %15 = load float, ptr %arrayidx1.2, align 4
202  %mul.2 = fmul float %14, %15
203  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
204  store float %mul.2, ptr %arrayidx2.2, align 4
205  %inc.2 = add nuw i32 %i.09, 3
206  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
207  %16 = load float, ptr %arrayidx.3, align 4
208  %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
209  %17 = load float, ptr %arrayidx1.3, align 4
210  %mul.3 = fmul float %16, %17
211  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
212  store float %mul.3, ptr %arrayidx2.3, align 4
213  %inc.3 = add nuw i32 %i.09, 4
214  %exitcond.3 = icmp eq i32 %inc.3, %N
215  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
216}
217
; float_float_add(a, b, c, N): stores a[i] + b[i] to c[i] for i in [0, N).
;
; The input IR contains three loops:
;   * vector.body   - 4 floats per iteration (<4 x float> load/fadd/store),
;                     entered only when N >= 4 and the runtime overlap test in
;                     vector.memcheck reports no conflict between c and a or b.
;   * for.body.prol - scalar loop, one element per iteration, run N & 3 times.
;   * for.body      - scalar loop whose body is repeated four times per
;                     iteration.
;
; The autogenerated FileCheck assertions below pin the llc output: the
; for.body.prol loop is expected to be entered through `wls` and closed with
; `le`, while vector.body and for.body compute lr explicitly and are closed
; with `le`. Regenerate the assertions with utils/update_llc_test_checks.py
; instead of editing them by hand.
218define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
219; CHECK-LABEL: float_float_add:
220; CHECK:       @ %bb.0: @ %entry
221; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
222; CHECK-NEXT:    cmp r3, #0
223; CHECK-NEXT:    beq .LBB1_10
224; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
225; CHECK-NEXT:    cmp r3, #3
226; CHECK-NEXT:    bhi .LBB1_3
227; CHECK-NEXT:  @ %bb.2:
228; CHECK-NEXT:    mov.w r12, #0
229; CHECK-NEXT:    b .LBB1_4
230; CHECK-NEXT:  .LBB1_3: @ %vector.memcheck
231; CHECK-NEXT:    add.w r7, r1, r3, lsl #2
232; CHECK-NEXT:    add.w r6, r2, r3, lsl #2
233; CHECK-NEXT:    cmp r7, r2
234; CHECK-NEXT:    add.w r5, r0, r3, lsl #2
235; CHECK-NEXT:    cset r7, hi
236; CHECK-NEXT:    cmp r6, r1
237; CHECK-NEXT:    csel r7, zr, r7, ls
238; CHECK-NEXT:    cmp r6, r0
239; CHECK-NEXT:    cset r6, hi
240; CHECK-NEXT:    cmp r5, r2
241; CHECK-NEXT:    cset r5, hi
242; CHECK-NEXT:    mov.w r12, #0
243; CHECK-NEXT:    tst r5, r6
244; CHECK-NEXT:    it eq
245; CHECK-NEXT:    cmpeq r7, #0
246; CHECK-NEXT:    beq .LBB1_11
247; CHECK-NEXT:  .LBB1_4: @ %for.body.preheader22
248; CHECK-NEXT:    mvn.w r7, r12
249; CHECK-NEXT:    adds r4, r7, r3
250; CHECK-NEXT:    and r7, r3, #3
251; CHECK-NEXT:    add.w r8, r12, r7
252; CHECK-NEXT:    wls lr, r7, .LBB1_7
253; CHECK-NEXT:  @ %bb.5: @ %for.body.prol.preheader
254; CHECK-NEXT:    add.w r6, r0, r12, lsl #2
255; CHECK-NEXT:    add.w r7, r1, r12, lsl #2
256; CHECK-NEXT:    add.w r5, r2, r12, lsl #2
257; CHECK-NEXT:    mov r12, r8
258; CHECK-NEXT:  .LBB1_6: @ %for.body.prol
259; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
260; CHECK-NEXT:    vldmia r7!, {s0}
261; CHECK-NEXT:    vldmia r6!, {s2}
262; CHECK-NEXT:    vadd.f32 s0, s2, s0
263; CHECK-NEXT:    vstmia r5!, {s0}
264; CHECK-NEXT:    le lr, .LBB1_6
265; CHECK-NEXT:  .LBB1_7: @ %for.body.prol.loopexit
266; CHECK-NEXT:    cmp r4, #3
267; CHECK-NEXT:    blo .LBB1_10
268; CHECK-NEXT:  @ %bb.8: @ %for.body.preheader1
269; CHECK-NEXT:    sub.w r3, r8, r3
270; CHECK-NEXT:    movs r7, #1
271; CHECK-NEXT:    rsb r3, r3, r3, lsl #30
272; CHECK-NEXT:    subs r3, #4
273; CHECK-NEXT:    add.w lr, r7, r3, lsr #2
274; CHECK-NEXT:    lsl.w r3, r12, #2
275; CHECK-NEXT:  .LBB1_9: @ %for.body
276; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
277; CHECK-NEXT:    adds r7, r1, r3
278; CHECK-NEXT:    adds r6, r0, r3
279; CHECK-NEXT:    adds r5, r2, r3
280; CHECK-NEXT:    adds r0, #16
281; CHECK-NEXT:    vldr s0, [r7]
282; CHECK-NEXT:    adds r1, #16
283; CHECK-NEXT:    vldr s2, [r6]
284; CHECK-NEXT:    adds r2, #16
285; CHECK-NEXT:    vadd.f32 s0, s2, s0
286; CHECK-NEXT:    vstr s0, [r5]
287; CHECK-NEXT:    vldr s0, [r7, #4]
288; CHECK-NEXT:    vldr s2, [r6, #4]
289; CHECK-NEXT:    vadd.f32 s0, s2, s0
290; CHECK-NEXT:    vstr s0, [r5, #4]
291; CHECK-NEXT:    vldr s0, [r7, #8]
292; CHECK-NEXT:    vldr s2, [r6, #8]
293; CHECK-NEXT:    vadd.f32 s0, s2, s0
294; CHECK-NEXT:    vstr s0, [r5, #8]
295; CHECK-NEXT:    vldr s0, [r7, #12]
296; CHECK-NEXT:    vldr s2, [r6, #12]
297; CHECK-NEXT:    vadd.f32 s0, s2, s0
298; CHECK-NEXT:    vstr s0, [r5, #12]
299; CHECK-NEXT:    le lr, .LBB1_9
300; CHECK-NEXT:  .LBB1_10: @ %for.cond.cleanup
301; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
302; CHECK-NEXT:  .LBB1_11: @ %vector.ph
303; CHECK-NEXT:    bic r12, r3, #3
304; CHECK-NEXT:    movs r6, #1
305; CHECK-NEXT:    sub.w r7, r12, #4
306; CHECK-NEXT:    mov r4, r0
307; CHECK-NEXT:    mov r5, r1
308; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
309; CHECK-NEXT:    mov r6, r2
310; CHECK-NEXT:  .LBB1_12: @ %vector.body
311; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
312; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
313; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
314; CHECK-NEXT:    vadd.f32 q0, q1, q0
315; CHECK-NEXT:    vstrb.8 q0, [r6], #16
316; CHECK-NEXT:    le lr, .LBB1_12
317; CHECK-NEXT:  @ %bb.13: @ %middle.block
318; CHECK-NEXT:    cmp r12, r3
319; CHECK-NEXT:    bne .LBB1_4
320; CHECK-NEXT:    b .LBB1_10
; Nothing to do when N == 0.
321entry:
322  %cmp8 = icmp eq i32 %N, 0
323  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
324
; With fewer than 4 elements the vector loop is bypassed entirely.
325for.body.preheader:                               ; preds = %entry
326  %min.iters.check = icmp ult i32 %N, 4
327  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
328
; Scalar path setup. %i.09.ph is the first index still to be processed
; (0, or %n.vec when arriving from middle.block), %1 = N - 1 - %i.09.ph
; (xor with -1 is a bitwise NOT), and %xtraiter = N & 3 is the trip count
; of the prologue loop.
329for.body.preheader22:                             ; preds = %middle.block, %vector.memcheck, %for.body.preheader
330  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
331  %0 = xor i32 %i.09.ph, -1
332  %1 = add i32 %0, %N
333  %xtraiter = and i32 %N, 3
334  %lcmp.mod = icmp eq i32 %xtraiter, 0
335  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
336
; Prologue loop: one element per iteration, counting %prol.iter down from
; %xtraiter to 0.
337for.body.prol:                                    ; preds = %for.body.preheader22, %for.body.prol
338  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
339  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
340  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
341  %2 = load float, ptr %arrayidx.prol, align 4
342  %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
343  %3 = load float, ptr %arrayidx1.prol, align 4
344  %add.prol = fadd float %2, %3
345  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
346  store float %add.prol, ptr %arrayidx2.prol, align 4
347  %inc.prol = add nuw i32 %i.09.prol, 1
348  %prol.iter.sub = add i32 %prol.iter, -1
349  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
350  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
351
; The 4x-unrolled loop is skipped when %1 is (unsigned) below 3.
352for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader22
353  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
354  %4 = icmp ult i32 %1, 3
355  br i1 %4, label %for.cond.cleanup, label %for.body
356
; Runtime overlap test: a conflict is reported when [c, c+N) overlaps
; [a, a+N) or [b, b+N); on conflict the scalar path is used instead of the
; vector loop.
357vector.memcheck:                                  ; preds = %for.body.preheader
358  %scevgep = getelementptr float, ptr %c, i32 %N
359  %scevgep13 = getelementptr float, ptr %a, i32 %N
360  %scevgep16 = getelementptr float, ptr %b, i32 %N
361  %bound0 = icmp ugt ptr %scevgep13, %c
362  %bound1 = icmp ugt ptr %scevgep, %a
363  %found.conflict = and i1 %bound0, %bound1
364  %bound018 = icmp ugt ptr %scevgep16, %c
365  %bound119 = icmp ugt ptr %scevgep, %b
366  %found.conflict20 = and i1 %bound018, %bound119
367  %conflict.rdx = or i1 %found.conflict, %found.conflict20
368  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
369
; %n.vec is N rounded down to a multiple of 4 (N & -4).
370vector.ph:                                        ; preds = %vector.memcheck
371  %n.vec = and i32 %N, -4
372  br label %vector.body
373
; Vector loop: 4 floats per iteration until %index reaches %n.vec.
374vector.body:                                      ; preds = %vector.body, %vector.ph
375  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
376  %5 = getelementptr inbounds float, ptr %a, i32 %index
377  %wide.load = load <4 x float>, ptr %5, align 4
378  %6 = getelementptr inbounds float, ptr %b, i32 %index
379  %wide.load21 = load <4 x float>, ptr %6, align 4
380  %7 = fadd <4 x float> %wide.load, %wide.load21
381  %8 = getelementptr inbounds float, ptr %c, i32 %index
382  store <4 x float> %7, ptr %8, align 4
383  %index.next = add i32 %index, 4
384  %9 = icmp eq i32 %index.next, %n.vec
385  br i1 %9, label %middle.block, label %vector.body
386
; Finished if the vector loop covered all N elements; otherwise the tail
; starting at %n.vec is handled by the scalar path.
387middle.block:                                     ; preds = %vector.body
388  %cmp.n = icmp eq i32 %n.vec, %N
389  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
390
391for.cond.cleanup:                                 ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
392  ret void
393
; Main scalar loop: handles elements %i.09 .. %i.09+3 per iteration and
; exits when %i.09 + 4 == N.
394for.body:                                         ; preds = %for.body.prol.loopexit, %for.body
395  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
396  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
397  %10 = load float, ptr %arrayidx, align 4
398  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
399  %11 = load float, ptr %arrayidx1, align 4
400  %add = fadd float %10, %11
401  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
402  store float %add, ptr %arrayidx2, align 4
403  %inc = add nuw i32 %i.09, 1
404  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
405  %12 = load float, ptr %arrayidx.1, align 4
406  %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
407  %13 = load float, ptr %arrayidx1.1, align 4
408  %add.1 = fadd float %12, %13
409  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
410  store float %add.1, ptr %arrayidx2.1, align 4
411  %inc.1 = add nuw i32 %i.09, 2
412  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
413  %14 = load float, ptr %arrayidx.2, align 4
414  %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
415  %15 = load float, ptr %arrayidx1.2, align 4
416  %add.2 = fadd float %14, %15
417  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
418  store float %add.2, ptr %arrayidx2.2, align 4
419  %inc.2 = add nuw i32 %i.09, 3
420  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
421  %16 = load float, ptr %arrayidx.3, align 4
422  %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
423  %17 = load float, ptr %arrayidx1.3, align 4
424  %add.3 = fadd float %16, %17
425  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
426  store float %add.3, ptr %arrayidx2.3, align 4
427  %inc.3 = add nuw i32 %i.09, 4
428  %exitcond.3 = icmp eq i32 %inc.3, %N
429  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
430}
431
; float_float_sub(a, b, c, N): stores a[i] - b[i] to c[i] for i in [0, N).
;
; The input IR contains three loops:
;   * vector.body   - 4 floats per iteration (<4 x float> load/fsub/store),
;                     entered only when N >= 4 and the runtime overlap test in
;                     vector.memcheck reports no conflict between c and a or b.
;   * for.body.prol - scalar loop, one element per iteration, run N & 3 times.
;   * for.body      - scalar loop whose body is repeated four times per
;                     iteration.
;
; The autogenerated FileCheck assertions below pin the llc output: the
; for.body.prol loop is expected to be entered through `wls` and closed with
; `le`, while vector.body and for.body compute lr explicitly and are closed
; with `le`. Regenerate the assertions with utils/update_llc_test_checks.py
; instead of editing them by hand.
432define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
433; CHECK-LABEL: float_float_sub:
434; CHECK:       @ %bb.0: @ %entry
435; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
436; CHECK-NEXT:    cmp r3, #0
437; CHECK-NEXT:    beq .LBB2_10
438; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
439; CHECK-NEXT:    cmp r3, #3
440; CHECK-NEXT:    bhi .LBB2_3
441; CHECK-NEXT:  @ %bb.2:
442; CHECK-NEXT:    mov.w r12, #0
443; CHECK-NEXT:    b .LBB2_4
444; CHECK-NEXT:  .LBB2_3: @ %vector.memcheck
445; CHECK-NEXT:    add.w r7, r1, r3, lsl #2
446; CHECK-NEXT:    add.w r6, r2, r3, lsl #2
447; CHECK-NEXT:    cmp r7, r2
448; CHECK-NEXT:    add.w r5, r0, r3, lsl #2
449; CHECK-NEXT:    cset r7, hi
450; CHECK-NEXT:    cmp r6, r1
451; CHECK-NEXT:    csel r7, zr, r7, ls
452; CHECK-NEXT:    cmp r6, r0
453; CHECK-NEXT:    cset r6, hi
454; CHECK-NEXT:    cmp r5, r2
455; CHECK-NEXT:    cset r5, hi
456; CHECK-NEXT:    mov.w r12, #0
457; CHECK-NEXT:    tst r5, r6
458; CHECK-NEXT:    it eq
459; CHECK-NEXT:    cmpeq r7, #0
460; CHECK-NEXT:    beq .LBB2_11
461; CHECK-NEXT:  .LBB2_4: @ %for.body.preheader22
462; CHECK-NEXT:    mvn.w r7, r12
463; CHECK-NEXT:    adds r4, r7, r3
464; CHECK-NEXT:    and r7, r3, #3
465; CHECK-NEXT:    add.w r8, r12, r7
466; CHECK-NEXT:    wls lr, r7, .LBB2_7
467; CHECK-NEXT:  @ %bb.5: @ %for.body.prol.preheader
468; CHECK-NEXT:    add.w r6, r0, r12, lsl #2
469; CHECK-NEXT:    add.w r7, r1, r12, lsl #2
470; CHECK-NEXT:    add.w r5, r2, r12, lsl #2
471; CHECK-NEXT:    mov r12, r8
472; CHECK-NEXT:  .LBB2_6: @ %for.body.prol
473; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
474; CHECK-NEXT:    vldmia r7!, {s0}
475; CHECK-NEXT:    vldmia r6!, {s2}
476; CHECK-NEXT:    vsub.f32 s0, s2, s0
477; CHECK-NEXT:    vstmia r5!, {s0}
478; CHECK-NEXT:    le lr, .LBB2_6
479; CHECK-NEXT:  .LBB2_7: @ %for.body.prol.loopexit
480; CHECK-NEXT:    cmp r4, #3
481; CHECK-NEXT:    blo .LBB2_10
482; CHECK-NEXT:  @ %bb.8: @ %for.body.preheader1
483; CHECK-NEXT:    sub.w r3, r8, r3
484; CHECK-NEXT:    movs r7, #1
485; CHECK-NEXT:    rsb r3, r3, r3, lsl #30
486; CHECK-NEXT:    subs r3, #4
487; CHECK-NEXT:    add.w lr, r7, r3, lsr #2
488; CHECK-NEXT:    lsl.w r3, r12, #2
489; CHECK-NEXT:  .LBB2_9: @ %for.body
490; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
491; CHECK-NEXT:    adds r7, r1, r3
492; CHECK-NEXT:    adds r6, r0, r3
493; CHECK-NEXT:    adds r5, r2, r3
494; CHECK-NEXT:    adds r0, #16
495; CHECK-NEXT:    vldr s0, [r7]
496; CHECK-NEXT:    adds r1, #16
497; CHECK-NEXT:    vldr s2, [r6]
498; CHECK-NEXT:    adds r2, #16
499; CHECK-NEXT:    vsub.f32 s0, s2, s0
500; CHECK-NEXT:    vstr s0, [r5]
501; CHECK-NEXT:    vldr s0, [r7, #4]
502; CHECK-NEXT:    vldr s2, [r6, #4]
503; CHECK-NEXT:    vsub.f32 s0, s2, s0
504; CHECK-NEXT:    vstr s0, [r5, #4]
505; CHECK-NEXT:    vldr s0, [r7, #8]
506; CHECK-NEXT:    vldr s2, [r6, #8]
507; CHECK-NEXT:    vsub.f32 s0, s2, s0
508; CHECK-NEXT:    vstr s0, [r5, #8]
509; CHECK-NEXT:    vldr s0, [r7, #12]
510; CHECK-NEXT:    vldr s2, [r6, #12]
511; CHECK-NEXT:    vsub.f32 s0, s2, s0
512; CHECK-NEXT:    vstr s0, [r5, #12]
513; CHECK-NEXT:    le lr, .LBB2_9
514; CHECK-NEXT:  .LBB2_10: @ %for.cond.cleanup
515; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
516; CHECK-NEXT:  .LBB2_11: @ %vector.ph
517; CHECK-NEXT:    bic r12, r3, #3
518; CHECK-NEXT:    movs r6, #1
519; CHECK-NEXT:    sub.w r7, r12, #4
520; CHECK-NEXT:    mov r4, r0
521; CHECK-NEXT:    mov r5, r1
522; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
523; CHECK-NEXT:    mov r6, r2
524; CHECK-NEXT:  .LBB2_12: @ %vector.body
525; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
526; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
527; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
528; CHECK-NEXT:    vsub.f32 q0, q1, q0
529; CHECK-NEXT:    vstrb.8 q0, [r6], #16
530; CHECK-NEXT:    le lr, .LBB2_12
531; CHECK-NEXT:  @ %bb.13: @ %middle.block
532; CHECK-NEXT:    cmp r12, r3
533; CHECK-NEXT:    bne .LBB2_4
534; CHECK-NEXT:    b .LBB2_10
; Nothing to do when N == 0.
535entry:
536  %cmp8 = icmp eq i32 %N, 0
537  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
538
; With fewer than 4 elements the vector loop is bypassed entirely.
539for.body.preheader:                               ; preds = %entry
540  %min.iters.check = icmp ult i32 %N, 4
541  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck
542
; Scalar path setup. %i.09.ph is the first index still to be processed
; (0, or %n.vec when arriving from middle.block), %1 = N - 1 - %i.09.ph
; (xor with -1 is a bitwise NOT), and %xtraiter = N & 3 is the trip count
; of the prologue loop.
543for.body.preheader22:                             ; preds = %middle.block, %vector.memcheck, %for.body.preheader
544  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
545  %0 = xor i32 %i.09.ph, -1
546  %1 = add i32 %0, %N
547  %xtraiter = and i32 %N, 3
548  %lcmp.mod = icmp eq i32 %xtraiter, 0
549  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
550
; Prologue loop: one element per iteration, counting %prol.iter down from
; %xtraiter to 0.
551for.body.prol:                                    ; preds = %for.body.preheader22, %for.body.prol
552  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
553  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
554  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
555  %2 = load float, ptr %arrayidx.prol, align 4
556  %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
557  %3 = load float, ptr %arrayidx1.prol, align 4
558  %sub.prol = fsub float %2, %3
559  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
560  store float %sub.prol, ptr %arrayidx2.prol, align 4
561  %inc.prol = add nuw i32 %i.09.prol, 1
562  %prol.iter.sub = add i32 %prol.iter, -1
563  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
564  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
565
; The 4x-unrolled loop is skipped when %1 is (unsigned) below 3.
566for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader22
567  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
568  %4 = icmp ult i32 %1, 3
569  br i1 %4, label %for.cond.cleanup, label %for.body
570
; Runtime overlap test: a conflict is reported when [c, c+N) overlaps
; [a, a+N) or [b, b+N); on conflict the scalar path is used instead of the
; vector loop.
571vector.memcheck:                                  ; preds = %for.body.preheader
572  %scevgep = getelementptr float, ptr %c, i32 %N
573  %scevgep13 = getelementptr float, ptr %a, i32 %N
574  %scevgep16 = getelementptr float, ptr %b, i32 %N
575  %bound0 = icmp ugt ptr %scevgep13, %c
576  %bound1 = icmp ugt ptr %scevgep, %a
577  %found.conflict = and i1 %bound0, %bound1
578  %bound018 = icmp ugt ptr %scevgep16, %c
579  %bound119 = icmp ugt ptr %scevgep, %b
580  %found.conflict20 = and i1 %bound018, %bound119
581  %conflict.rdx = or i1 %found.conflict, %found.conflict20
582  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph
583
; %n.vec is N rounded down to a multiple of 4 (N & -4).
584vector.ph:                                        ; preds = %vector.memcheck
585  %n.vec = and i32 %N, -4
586  br label %vector.body
587
; Vector loop: 4 floats per iteration until %index reaches %n.vec.
588vector.body:                                      ; preds = %vector.body, %vector.ph
589  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
590  %5 = getelementptr inbounds float, ptr %a, i32 %index
591  %wide.load = load <4 x float>, ptr %5, align 4
592  %6 = getelementptr inbounds float, ptr %b, i32 %index
593  %wide.load21 = load <4 x float>, ptr %6, align 4
594  %7 = fsub <4 x float> %wide.load, %wide.load21
595  %8 = getelementptr inbounds float, ptr %c, i32 %index
596  store <4 x float> %7, ptr %8, align 4
597  %index.next = add i32 %index, 4
598  %9 = icmp eq i32 %index.next, %n.vec
599  br i1 %9, label %middle.block, label %vector.body
600
; Finished if the vector loop covered all N elements; otherwise the tail
; starting at %n.vec is handled by the scalar path.
601middle.block:                                     ; preds = %vector.body
602  %cmp.n = icmp eq i32 %n.vec, %N
603  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22
604
605for.cond.cleanup:                                 ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
606  ret void
607
; Main scalar loop: handles elements %i.09 .. %i.09+3 per iteration and
; exits when %i.09 + 4 == N.
608for.body:                                         ; preds = %for.body.prol.loopexit, %for.body
609  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
610  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
611  %10 = load float, ptr %arrayidx, align 4
612  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
613  %11 = load float, ptr %arrayidx1, align 4
614  %sub = fsub float %10, %11
615  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
616  store float %sub, ptr %arrayidx2, align 4
617  %inc = add nuw i32 %i.09, 1
618  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
619  %12 = load float, ptr %arrayidx.1, align 4
620  %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
621  %13 = load float, ptr %arrayidx1.1, align 4
622  %sub.1 = fsub float %12, %13
623  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
624  store float %sub.1, ptr %arrayidx2.1, align 4
625  %inc.1 = add nuw i32 %i.09, 2
626  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
627  %14 = load float, ptr %arrayidx.2, align 4
628  %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
629  %15 = load float, ptr %arrayidx1.2, align 4
630  %sub.2 = fsub float %14, %15
631  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
632  store float %sub.2, ptr %arrayidx2.2, align 4
633  %inc.2 = add nuw i32 %i.09, 3
634  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
635  %16 = load float, ptr %arrayidx.3, align 4
636  %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
637  %17 = load float, ptr %arrayidx1.3, align 4
638  %sub.3 = fsub float %16, %17
639  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
640  store float %sub.3, ptr %arrayidx2.3, align 4
641  %inc.3 = add nuw i32 %i.09, 4
642  %exitcond.3 = icmp eq i32 %inc.3, %N
643  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
644}
645
646define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
647; CHECK-LABEL: float_int_mul:
648; CHECK:       @ %bb.0: @ %entry
649; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
650; CHECK-NEXT:    cmp r3, #0
651; CHECK-NEXT:    beq.w .LBB3_13
652; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
653; CHECK-NEXT:    cmp r3, #3
654; CHECK-NEXT:    bls .LBB3_6
655; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
656; CHECK-NEXT:    add.w r7, r0, r3, lsl #2
657; CHECK-NEXT:    cmp r7, r2
658; CHECK-NEXT:    itt hi
659; CHECK-NEXT:    addhi.w r7, r2, r3, lsl #2
660; CHECK-NEXT:    cmphi r7, r0
661; CHECK-NEXT:    bhi .LBB3_6
662; CHECK-NEXT:  @ %bb.3: @ %vector.ph
663; CHECK-NEXT:    bic r12, r3, #3
664; CHECK-NEXT:    movs r6, #1
665; CHECK-NEXT:    sub.w r7, r12, #4
666; CHECK-NEXT:    mov r4, r0
667; CHECK-NEXT:    mov r5, r1
668; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
669; CHECK-NEXT:    mov r6, r2
670; CHECK-NEXT:  .LBB3_4: @ %vector.body
671; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
672; CHECK-NEXT:    vldrw.u32 q0, [r5], #16
673; CHECK-NEXT:    vldrw.u32 q1, [r4], #16
674; CHECK-NEXT:    vcvt.f32.s32 q0, q0
675; CHECK-NEXT:    vmul.f32 q0, q1, q0
676; CHECK-NEXT:    vstrb.8 q0, [r6], #16
677; CHECK-NEXT:    le lr, .LBB3_4
678; CHECK-NEXT:  @ %bb.5: @ %middle.block
679; CHECK-NEXT:    cmp r12, r3
680; CHECK-NEXT:    bne .LBB3_7
681; CHECK-NEXT:    b .LBB3_13
682; CHECK-NEXT:  .LBB3_6:
683; CHECK-NEXT:    mov.w r12, #0
684; CHECK-NEXT:  .LBB3_7: @ %for.body.preheader16
685; CHECK-NEXT:    mvn.w r7, r12
686; CHECK-NEXT:    add.w r9, r7, r3
687; CHECK-NEXT:    and r7, r3, #3
688; CHECK-NEXT:    add.w r8, r12, r7
689; CHECK-NEXT:    wls lr, r7, .LBB3_10
690; CHECK-NEXT:  @ %bb.8: @ %for.body.prol.preheader
691; CHECK-NEXT:    add.w r6, r0, r12, lsl #2
692; CHECK-NEXT:    add.w r7, r1, r12, lsl #2
693; CHECK-NEXT:    add.w r5, r2, r12, lsl #2
694; CHECK-NEXT:    mov r12, r8
695; CHECK-NEXT:  .LBB3_9: @ %for.body.prol
696; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
697; CHECK-NEXT:    ldr r4, [r7], #4
698; CHECK-NEXT:    vldmia r6!, {s2}
699; CHECK-NEXT:    vmov s0, r4
700; CHECK-NEXT:    vcvt.f32.s32 s0, s0
701; CHECK-NEXT:    vmul.f32 s0, s2, s0
702; CHECK-NEXT:    vstmia r5!, {s0}
703; CHECK-NEXT:    le lr, .LBB3_9
704; CHECK-NEXT:  .LBB3_10: @ %for.body.prol.loopexit
705; CHECK-NEXT:    cmp.w r9, #3
706; CHECK-NEXT:    blo .LBB3_13
707; CHECK-NEXT:  @ %bb.11: @ %for.body.preheader1
708; CHECK-NEXT:    sub.w r3, r8, r3
709; CHECK-NEXT:    add.w r1, r1, r12, lsl #2
710; CHECK-NEXT:    movs r7, #1
711; CHECK-NEXT:    adds r1, #8
712; CHECK-NEXT:    rsb r3, r3, r3, lsl #30
713; CHECK-NEXT:    subs r3, #4
714; CHECK-NEXT:    add.w lr, r7, r3, lsr #2
715; CHECK-NEXT:    lsl.w r3, r12, #2
716; CHECK-NEXT:  .LBB3_12: @ %for.body
717; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
718; CHECK-NEXT:    vldr s0, [r1, #-8]
719; CHECK-NEXT:    adds r7, r0, r3
720; CHECK-NEXT:    adds r6, r2, r3
721; CHECK-NEXT:    adds r0, #16
722; CHECK-NEXT:    vcvt.f32.s32 s0, s0
723; CHECK-NEXT:    vldr s2, [r7]
724; CHECK-NEXT:    adds r2, #16
725; CHECK-NEXT:    vmul.f32 s0, s2, s0
726; CHECK-NEXT:    vstr s0, [r6]
727; CHECK-NEXT:    vldr s0, [r1, #-4]
728; CHECK-NEXT:    vldr s2, [r7, #4]
729; CHECK-NEXT:    vcvt.f32.s32 s0, s0
730; CHECK-NEXT:    vmul.f32 s0, s2, s0
731; CHECK-NEXT:    vstr s0, [r6, #4]
732; CHECK-NEXT:    vldr s0, [r1]
733; CHECK-NEXT:    vldr s2, [r7, #8]
734; CHECK-NEXT:    vcvt.f32.s32 s0, s0
735; CHECK-NEXT:    vmul.f32 s0, s2, s0
736; CHECK-NEXT:    vstr s0, [r6, #8]
737; CHECK-NEXT:    vldr s0, [r1, #4]
738; CHECK-NEXT:    adds r1, #16
739; CHECK-NEXT:    vldr s2, [r7, #12]
740; CHECK-NEXT:    vcvt.f32.s32 s0, s0
741; CHECK-NEXT:    vmul.f32 s0, s2, s0
742; CHECK-NEXT:    vstr s0, [r6, #12]
743; CHECK-NEXT:    le lr, .LBB3_12
744; CHECK-NEXT:  .LBB3_13: @ %for.cond.cleanup
745; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
746entry:
747  %cmp8 = icmp eq i32 %N, 0
748  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
749
750for.body.preheader:                               ; preds = %entry
751  %min.iters.check = icmp ult i32 %N, 4
752  br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck
753
754for.body.preheader16:                             ; preds = %middle.block, %vector.memcheck, %for.body.preheader
755  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
756  %0 = xor i32 %i.09.ph, -1
757  %1 = add i32 %0, %N
758  %xtraiter = and i32 %N, 3
759  %lcmp.mod = icmp eq i32 %xtraiter, 0
760  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
761
762for.body.prol:                                    ; preds = %for.body.preheader16, %for.body.prol
763  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader16 ]
764  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader16 ]
765  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
766  %2 = load float, ptr %arrayidx.prol, align 4
767  %arrayidx1.prol = getelementptr inbounds i32, ptr %b, i32 %i.09.prol
768  %3 = load i32, ptr %arrayidx1.prol, align 4
769  %conv.prol = sitofp i32 %3 to float
770  %mul.prol = fmul float %2, %conv.prol
771  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
772  store float %mul.prol, ptr %arrayidx2.prol, align 4
773  %inc.prol = add nuw i32 %i.09.prol, 1
774  %prol.iter.sub = add i32 %prol.iter, -1
775  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
776  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
777
778for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader16
779  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
780  %4 = icmp ult i32 %1, 3
781  br i1 %4, label %for.cond.cleanup, label %for.body
782
783vector.memcheck:                                  ; preds = %for.body.preheader
784  %scevgep = getelementptr float, ptr %c, i32 %N
785  %scevgep13 = getelementptr float, ptr %a, i32 %N
786  %bound0 = icmp ugt ptr %scevgep13, %c
787  %bound1 = icmp ugt ptr %scevgep, %a
788  %found.conflict = and i1 %bound0, %bound1
789  br i1 %found.conflict, label %for.body.preheader16, label %vector.ph
790
791vector.ph:                                        ; preds = %vector.memcheck
792  %n.vec = and i32 %N, -4
793  br label %vector.body
794
795vector.body:                                      ; preds = %vector.body, %vector.ph
796  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
797  %5 = getelementptr inbounds float, ptr %a, i32 %index
798  %wide.load = load <4 x float>, ptr %5, align 4
799  %6 = getelementptr inbounds i32, ptr %b, i32 %index
800  %wide.load15 = load <4 x i32>, ptr %6, align 4
801  %7 = sitofp <4 x i32> %wide.load15 to <4 x float>
802  %8 = fmul <4 x float> %wide.load, %7
803  %9 = getelementptr inbounds float, ptr %c, i32 %index
804  store <4 x float> %8, ptr %9, align 4
805  %index.next = add i32 %index, 4
806  %10 = icmp eq i32 %index.next, %n.vec
807  br i1 %10, label %middle.block, label %vector.body
808
809middle.block:                                     ; preds = %vector.body
810  %cmp.n = icmp eq i32 %n.vec, %N
811  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16
812
813for.cond.cleanup:                                 ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
814  ret void
815
816for.body:                                         ; preds = %for.body.prol.loopexit, %for.body
817  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
818  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
819  %11 = load float, ptr %arrayidx, align 4
820  %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09
821  %12 = load i32, ptr %arrayidx1, align 4
822  %conv = sitofp i32 %12 to float
823  %mul = fmul float %11, %conv
824  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
825  store float %mul, ptr %arrayidx2, align 4
826  %inc = add nuw i32 %i.09, 1
827  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
828  %13 = load float, ptr %arrayidx.1, align 4
829  %arrayidx1.1 = getelementptr inbounds i32, ptr %b, i32 %inc
830  %14 = load i32, ptr %arrayidx1.1, align 4
831  %conv.1 = sitofp i32 %14 to float
832  %mul.1 = fmul float %13, %conv.1
833  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
834  store float %mul.1, ptr %arrayidx2.1, align 4
835  %inc.1 = add nuw i32 %i.09, 2
836  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
837  %15 = load float, ptr %arrayidx.2, align 4
838  %arrayidx1.2 = getelementptr inbounds i32, ptr %b, i32 %inc.1
839  %16 = load i32, ptr %arrayidx1.2, align 4
840  %conv.2 = sitofp i32 %16 to float
841  %mul.2 = fmul float %15, %conv.2
842  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
843  store float %mul.2, ptr %arrayidx2.2, align 4
844  %inc.2 = add nuw i32 %i.09, 3
845  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
846  %17 = load float, ptr %arrayidx.3, align 4
847  %arrayidx1.3 = getelementptr inbounds i32, ptr %b, i32 %inc.2
848  %18 = load i32, ptr %arrayidx1.3, align 4
849  %conv.3 = sitofp i32 %18 to float
850  %mul.3 = fmul float %17, %conv.3
851  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
852  store float %mul.3, ptr %arrayidx2.3, align 4
853  %inc.3 = add nuw i32 %i.09, 4
854  %exitcond.3 = icmp eq i32 %inc.3, %N
855  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
856}
857
; float_int_int_mul: c[i] = (float)(a[i] * b[i]) for i in [0, N), where a and b
; are read as i32 and c is written as float. The IR is already vectorized: a
; <4 x i32> vector.body covers the first (N & -4) elements and a scalar
; for.body covers the rest (or everything when N < 4). This function has no
; runtime pointer-overlap test before the vector loop.
; The expected assembly below closes both loops with an `le lr` loop-end branch.
858define arm_aapcs_vfpcc void @float_int_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
859; CHECK-LABEL: float_int_int_mul:
860; CHECK:       @ %bb.0: @ %entry
861; CHECK-NEXT:    push {r4, r5, r6, lr}
862; CHECK-NEXT:    cbz r3, .LBB4_8
863; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
864; CHECK-NEXT:    cmp r3, #3
865; CHECK-NEXT:    bhi .LBB4_3
866; CHECK-NEXT:  @ %bb.2:
867; CHECK-NEXT:    mov.w r12, #0
868; CHECK-NEXT:    b .LBB4_6
869; CHECK-NEXT:  .LBB4_3: @ %vector.ph
870; CHECK-NEXT:    bic r12, r3, #3
871; CHECK-NEXT:    movs r5, #1
872; CHECK-NEXT:    sub.w r6, r12, #4
873; CHECK-NEXT:    mov r4, r0
874; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
875; CHECK-NEXT:    mov r5, r1
876; CHECK-NEXT:    mov r6, r2
877; CHECK-NEXT:  .LBB4_4: @ %vector.body
878; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
879; CHECK-NEXT:    vldrw.u32 q0, [r4], #16
880; CHECK-NEXT:    vldrw.u32 q1, [r5], #16
881; CHECK-NEXT:    vmul.i32 q0, q1, q0
882; CHECK-NEXT:    vcvt.f32.s32 q0, q0
883; CHECK-NEXT:    vstrb.8 q0, [r6], #16
884; CHECK-NEXT:    le lr, .LBB4_4
885; CHECK-NEXT:  @ %bb.5: @ %middle.block
886; CHECK-NEXT:    cmp r12, r3
887; CHECK-NEXT:    it eq
888; CHECK-NEXT:    popeq {r4, r5, r6, pc}
889; CHECK-NEXT:  .LBB4_6: @ %for.body.preheader11
890; CHECK-NEXT:    sub.w lr, r3, r12
891; CHECK-NEXT:    add.w r0, r0, r12, lsl #2
892; CHECK-NEXT:    add.w r1, r1, r12, lsl #2
893; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
894; CHECK-NEXT:  .LBB4_7: @ %for.body
895; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
896; CHECK-NEXT:    ldr r3, [r0], #4
897; CHECK-NEXT:    ldr r6, [r1], #4
898; CHECK-NEXT:    muls r3, r6, r3
899; CHECK-NEXT:    vmov s0, r3
900; CHECK-NEXT:    vcvt.f32.s32 s0, s0
901; CHECK-NEXT:    vstmia r2!, {s0}
902; CHECK-NEXT:    le lr, .LBB4_7
903; CHECK-NEXT:  .LBB4_8: @ %for.cond.cleanup
904; CHECK-NEXT:    pop {r4, r5, r6, pc}
; N == 0 returns immediately without touching memory.
905entry:
906  %cmp8 = icmp eq i32 %N, 0
907  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
908
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
909for.body.preheader:                               ; preds = %entry
910  %min.iters.check = icmp ult i32 %N, 4
911  br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
912
; Scalar loop start index: 0 if the vector loop was skipped, n.vec otherwise.
913for.body.preheader11:                             ; preds = %middle.block, %for.body.preheader
914  %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
915  br label %for.body
916
; n.vec = N with the low two bits cleared, i.e. the vector loop's trip bound.
917vector.ph:                                        ; preds = %for.body.preheader
918  %n.vec = and i32 %N, -4
919  br label %vector.body
920
; Four lanes per iteration: integer multiply first, then signed int-to-float.
921vector.body:                                      ; preds = %vector.body, %vector.ph
922  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
923  %0 = getelementptr inbounds i32, ptr %a, i32 %index
924  %wide.load = load <4 x i32>, ptr %0, align 4
925  %1 = getelementptr inbounds i32, ptr %b, i32 %index
926  %wide.load10 = load <4 x i32>, ptr %1, align 4
927  %2 = mul nsw <4 x i32> %wide.load10, %wide.load
928  %3 = sitofp <4 x i32> %2 to <4 x float>
929  %4 = getelementptr inbounds float, ptr %c, i32 %index
930  store <4 x float> %3, ptr %4, align 4
931  %index.next = add i32 %index, 4
932  %5 = icmp eq i32 %index.next, %n.vec
933  br i1 %5, label %middle.block, label %vector.body
934
; If the vector loop consumed every element there is no scalar remainder.
935middle.block:                                     ; preds = %vector.body
936  %cmp.n = icmp eq i32 %n.vec, %N
937  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
938
939for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
940  ret void
941
; Scalar remainder: one element per iteration, same multiply-then-convert order.
942for.body:                                         ; preds = %for.body.preheader11, %for.body
943  %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
944  %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.09
945  %6 = load i32, ptr %arrayidx, align 4
946  %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09
947  %7 = load i32, ptr %arrayidx1, align 4
948  %mul = mul nsw i32 %7, %6
949  %conv = sitofp i32 %mul to float
950  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
951  store float %conv, ptr %arrayidx2, align 4
952  %inc = add nuw i32 %i.09, 1
953  %exitcond = icmp eq i32 %inc, %N
954  br i1 %exitcond, label %for.cond.cleanup, label %for.body
955}
956
; half_half_mul: c[i] = (float)(a[i] * b[i]) for i in [0, N), where a and b are
; read as half and c is written as float. The multiply is done in half
; precision and the product is then extended to float. A <4 x half>
; vector.body covers the first (N & -4) elements; a scalar for.body covers the
; rest (or everything when N < 4).
; In the expected assembly the vector body fetches each 8-byte input with two
; `ldr`s plus `vmov.32` lane inserts, multiplies with `vmul.f16`, and widens the
; four results with scalar `vcvtb`/`vcvtt` conversions before the 16-byte store.
957define arm_aapcs_vfpcc void @half_half_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
958; CHECK-LABEL: half_half_mul:
959; CHECK:       @ %bb.0: @ %entry
960; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
961; CHECK-NEXT:    cmp r3, #0
962; CHECK-NEXT:    beq .LBB5_8
963; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
964; CHECK-NEXT:    cmp r3, #3
965; CHECK-NEXT:    bhi .LBB5_3
966; CHECK-NEXT:  @ %bb.2:
967; CHECK-NEXT:    mov.w r12, #0
968; CHECK-NEXT:    b .LBB5_6
969; CHECK-NEXT:  .LBB5_3: @ %vector.ph
970; CHECK-NEXT:    bic r12, r3, #3
971; CHECK-NEXT:    movs r5, #1
972; CHECK-NEXT:    sub.w r6, r12, #4
973; CHECK-NEXT:    mov r4, r0
974; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
975; CHECK-NEXT:    mov r5, r1
976; CHECK-NEXT:    mov r6, r2
977; CHECK-NEXT:  .LBB5_4: @ %vector.body
978; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
979; CHECK-NEXT:    ldr.w r9, [r4]
980; CHECK-NEXT:    ldr r7, [r5]
981; CHECK-NEXT:    ldr.w r8, [r4, #4]
982; CHECK-NEXT:    vmov.32 q0[0], r9
983; CHECK-NEXT:    ldr.w r10, [r5, #4]
984; CHECK-NEXT:    vmov.32 q1[0], r7
985; CHECK-NEXT:    vmov.32 q0[1], r8
986; CHECK-NEXT:    adds r4, #8
987; CHECK-NEXT:    vmov.32 q1[1], r10
988; CHECK-NEXT:    adds r5, #8
989; CHECK-NEXT:    vmul.f16 q0, q0, q1
990; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
991; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
992; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
993; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
994; CHECK-NEXT:    vstrb.8 q0, [r6], #16
995; CHECK-NEXT:    le lr, .LBB5_4
996; CHECK-NEXT:  @ %bb.5: @ %middle.block
997; CHECK-NEXT:    cmp r12, r3
998; CHECK-NEXT:    beq .LBB5_8
999; CHECK-NEXT:  .LBB5_6: @ %for.body.preheader11
1000; CHECK-NEXT:    sub.w lr, r3, r12
1001; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
1002; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
1003; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
1004; CHECK-NEXT:  .LBB5_7: @ %for.body
1005; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1006; CHECK-NEXT:    vldr.16 s0, [r1]
1007; CHECK-NEXT:    vldr.16 s2, [r0]
1008; CHECK-NEXT:    adds r0, #2
1009; CHECK-NEXT:    adds r1, #2
1010; CHECK-NEXT:    vmul.f16 s0, s2, s0
1011; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
1012; CHECK-NEXT:    vstmia r2!, {s0}
1013; CHECK-NEXT:    le lr, .LBB5_7
1014; CHECK-NEXT:  .LBB5_8: @ %for.cond.cleanup
1015; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; N == 0 returns immediately without touching memory.
1016entry:
1017  %cmp8 = icmp eq i32 %N, 0
1018  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1019
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1020for.body.preheader:                               ; preds = %entry
1021  %min.iters.check = icmp ult i32 %N, 4
1022  br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
1023
; Scalar loop start index: 0 if the vector loop was skipped, n.vec otherwise.
1024for.body.preheader11:                             ; preds = %middle.block, %for.body.preheader
1025  %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1026  br label %for.body
1027
; n.vec = N with the low two bits cleared, i.e. the vector loop's trip bound.
1028vector.ph:                                        ; preds = %for.body.preheader
1029  %n.vec = and i32 %N, -4
1030  br label %vector.body
1031
; Four half lanes per iteration: multiply in half, then extend to float.
1032vector.body:                                      ; preds = %vector.body, %vector.ph
1033  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1034  %0 = getelementptr inbounds half, ptr %a, i32 %index
1035  %wide.load = load <4 x half>, ptr %0, align 2
1036  %1 = getelementptr inbounds half, ptr %b, i32 %index
1037  %wide.load10 = load <4 x half>, ptr %1, align 2
1038  %2 = fmul <4 x half> %wide.load, %wide.load10
1039  %3 = fpext <4 x half> %2 to <4 x float>
1040  %4 = getelementptr inbounds float, ptr %c, i32 %index
1041  store <4 x float> %3, ptr %4, align 4
1042  %index.next = add i32 %index, 4
1043  %5 = icmp eq i32 %index.next, %n.vec
1044  br i1 %5, label %middle.block, label %vector.body
1045
; If the vector loop consumed every element there is no scalar remainder.
1046middle.block:                                     ; preds = %vector.body
1047  %cmp.n = icmp eq i32 %n.vec, %N
1048  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1049
1050for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1051  ret void
1052
; Scalar remainder: one half multiply and one extend-to-float per iteration.
1053for.body:                                         ; preds = %for.body.preheader11, %for.body
1054  %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1055  %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
1056  %6 = load half, ptr %arrayidx, align 2
1057  %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
1058  %7 = load half, ptr %arrayidx1, align 2
1059  %mul = fmul half %6, %7
1060  %conv = fpext half %mul to float
1061  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
1062  store float %conv, ptr %arrayidx2, align 4
1063  %inc = add nuw i32 %i.09, 1
1064  %exitcond = icmp eq i32 %inc, %N
1065  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1066}
1067
; half_half_add: c[i] = (float)(a[i] + b[i]) for i in [0, N), where a and b are
; read as half and c is written as float. The addition is done in half
; precision and the sum is then extended to float. A <4 x half> vector.body
; covers the first (N & -4) elements; a scalar for.body covers the rest (or
; everything when N < 4).
; In the expected assembly the vector body fetches each 8-byte input with two
; `ldr`s plus `vmov.32` lane inserts, adds with `vadd.f16`, and widens the four
; results with scalar `vcvtb`/`vcvtt` conversions before the 16-byte store.
1068define arm_aapcs_vfpcc void @half_half_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
1069; CHECK-LABEL: half_half_add:
1070; CHECK:       @ %bb.0: @ %entry
1071; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1072; CHECK-NEXT:    cmp r3, #0
1073; CHECK-NEXT:    beq .LBB6_8
1074; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1075; CHECK-NEXT:    cmp r3, #3
1076; CHECK-NEXT:    bhi .LBB6_3
1077; CHECK-NEXT:  @ %bb.2:
1078; CHECK-NEXT:    mov.w r12, #0
1079; CHECK-NEXT:    b .LBB6_6
1080; CHECK-NEXT:  .LBB6_3: @ %vector.ph
1081; CHECK-NEXT:    bic r12, r3, #3
1082; CHECK-NEXT:    movs r5, #1
1083; CHECK-NEXT:    sub.w r6, r12, #4
1084; CHECK-NEXT:    mov r4, r0
1085; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
1086; CHECK-NEXT:    mov r5, r1
1087; CHECK-NEXT:    mov r6, r2
1088; CHECK-NEXT:  .LBB6_4: @ %vector.body
1089; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1090; CHECK-NEXT:    ldr.w r9, [r4]
1091; CHECK-NEXT:    ldr r7, [r5]
1092; CHECK-NEXT:    ldr.w r8, [r4, #4]
1093; CHECK-NEXT:    vmov.32 q0[0], r9
1094; CHECK-NEXT:    ldr.w r10, [r5, #4]
1095; CHECK-NEXT:    vmov.32 q1[0], r7
1096; CHECK-NEXT:    vmov.32 q0[1], r8
1097; CHECK-NEXT:    adds r4, #8
1098; CHECK-NEXT:    vmov.32 q1[1], r10
1099; CHECK-NEXT:    adds r5, #8
1100; CHECK-NEXT:    vadd.f16 q0, q0, q1
1101; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
1102; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
1103; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
1104; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
1105; CHECK-NEXT:    vstrb.8 q0, [r6], #16
1106; CHECK-NEXT:    le lr, .LBB6_4
1107; CHECK-NEXT:  @ %bb.5: @ %middle.block
1108; CHECK-NEXT:    cmp r12, r3
1109; CHECK-NEXT:    beq .LBB6_8
1110; CHECK-NEXT:  .LBB6_6: @ %for.body.preheader11
1111; CHECK-NEXT:    sub.w lr, r3, r12
1112; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
1113; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
1114; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
1115; CHECK-NEXT:  .LBB6_7: @ %for.body
1116; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1117; CHECK-NEXT:    vldr.16 s0, [r1]
1118; CHECK-NEXT:    vldr.16 s2, [r0]
1119; CHECK-NEXT:    adds r0, #2
1120; CHECK-NEXT:    adds r1, #2
1121; CHECK-NEXT:    vadd.f16 s0, s2, s0
1122; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
1123; CHECK-NEXT:    vstmia r2!, {s0}
1124; CHECK-NEXT:    le lr, .LBB6_7
1125; CHECK-NEXT:  .LBB6_8: @ %for.cond.cleanup
1126; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; N == 0 returns immediately without touching memory.
1127entry:
1128  %cmp8 = icmp eq i32 %N, 0
1129  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1130
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1131for.body.preheader:                               ; preds = %entry
1132  %min.iters.check = icmp ult i32 %N, 4
1133  br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
1134
; Scalar loop start index: 0 if the vector loop was skipped, n.vec otherwise.
1135for.body.preheader11:                             ; preds = %middle.block, %for.body.preheader
1136  %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1137  br label %for.body
1138
; n.vec = N with the low two bits cleared, i.e. the vector loop's trip bound.
1139vector.ph:                                        ; preds = %for.body.preheader
1140  %n.vec = and i32 %N, -4
1141  br label %vector.body
1142
; Four half lanes per iteration: add in half, then extend to float.
1143vector.body:                                      ; preds = %vector.body, %vector.ph
1144  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1145  %0 = getelementptr inbounds half, ptr %a, i32 %index
1146  %wide.load = load <4 x half>, ptr %0, align 2
1147  %1 = getelementptr inbounds half, ptr %b, i32 %index
1148  %wide.load10 = load <4 x half>, ptr %1, align 2
1149  %2 = fadd <4 x half> %wide.load, %wide.load10
1150  %3 = fpext <4 x half> %2 to <4 x float>
1151  %4 = getelementptr inbounds float, ptr %c, i32 %index
1152  store <4 x float> %3, ptr %4, align 4
1153  %index.next = add i32 %index, 4
1154  %5 = icmp eq i32 %index.next, %n.vec
1155  br i1 %5, label %middle.block, label %vector.body
1156
; If the vector loop consumed every element there is no scalar remainder.
1157middle.block:                                     ; preds = %vector.body
1158  %cmp.n = icmp eq i32 %n.vec, %N
1159  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1160
1161for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1162  ret void
1163
; Scalar remainder: one half add and one extend-to-float per iteration.
1164for.body:                                         ; preds = %for.body.preheader11, %for.body
1165  %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1166  %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
1167  %6 = load half, ptr %arrayidx, align 2
1168  %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
1169  %7 = load half, ptr %arrayidx1, align 2
1170  %add = fadd half %6, %7
1171  %conv = fpext half %add to float
1172  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
1173  store float %conv, ptr %arrayidx2, align 4
1174  %inc = add nuw i32 %i.09, 1
1175  %exitcond = icmp eq i32 %inc, %N
1176  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1177}
1178
; half_half_sub: c[i] = (float)(a[i] - b[i]) for i in [0, N), where a and b are
; read as half and c is written as float. The subtraction is done in half
; precision (a is the minuend, b the subtrahend) and the difference is then
; extended to float. A <4 x half> vector.body covers the first (N & -4)
; elements; a scalar for.body covers the rest (or everything when N < 4).
; In the expected assembly the vector body fetches each 8-byte input with two
; `ldr`s plus `vmov.32` lane inserts, subtracts with `vsub.f16`, and widens the
; four results with scalar `vcvtb`/`vcvtt` conversions before the 16-byte store.
1179define arm_aapcs_vfpcc void @half_half_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
1180; CHECK-LABEL: half_half_sub:
1181; CHECK:       @ %bb.0: @ %entry
1182; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1183; CHECK-NEXT:    cmp r3, #0
1184; CHECK-NEXT:    beq .LBB7_8
1185; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1186; CHECK-NEXT:    cmp r3, #3
1187; CHECK-NEXT:    bhi .LBB7_3
1188; CHECK-NEXT:  @ %bb.2:
1189; CHECK-NEXT:    mov.w r12, #0
1190; CHECK-NEXT:    b .LBB7_6
1191; CHECK-NEXT:  .LBB7_3: @ %vector.ph
1192; CHECK-NEXT:    bic r12, r3, #3
1193; CHECK-NEXT:    movs r5, #1
1194; CHECK-NEXT:    sub.w r6, r12, #4
1195; CHECK-NEXT:    mov r4, r0
1196; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
1197; CHECK-NEXT:    mov r5, r1
1198; CHECK-NEXT:    mov r6, r2
1199; CHECK-NEXT:  .LBB7_4: @ %vector.body
1200; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1201; CHECK-NEXT:    ldr.w r9, [r4]
1202; CHECK-NEXT:    ldr r7, [r5]
1203; CHECK-NEXT:    ldr.w r8, [r4, #4]
1204; CHECK-NEXT:    vmov.32 q0[0], r9
1205; CHECK-NEXT:    ldr.w r10, [r5, #4]
1206; CHECK-NEXT:    vmov.32 q1[0], r7
1207; CHECK-NEXT:    vmov.32 q0[1], r8
1208; CHECK-NEXT:    adds r4, #8
1209; CHECK-NEXT:    vmov.32 q1[1], r10
1210; CHECK-NEXT:    adds r5, #8
1211; CHECK-NEXT:    vsub.f16 q0, q0, q1
1212; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
1213; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
1214; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
1215; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
1216; CHECK-NEXT:    vstrb.8 q0, [r6], #16
1217; CHECK-NEXT:    le lr, .LBB7_4
1218; CHECK-NEXT:  @ %bb.5: @ %middle.block
1219; CHECK-NEXT:    cmp r12, r3
1220; CHECK-NEXT:    beq .LBB7_8
1221; CHECK-NEXT:  .LBB7_6: @ %for.body.preheader11
1222; CHECK-NEXT:    sub.w lr, r3, r12
1223; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
1224; CHECK-NEXT:    add.w r1, r1, r12, lsl #1
1225; CHECK-NEXT:    add.w r2, r2, r12, lsl #2
1226; CHECK-NEXT:  .LBB7_7: @ %for.body
1227; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1228; CHECK-NEXT:    vldr.16 s0, [r1]
1229; CHECK-NEXT:    vldr.16 s2, [r0]
1230; CHECK-NEXT:    adds r0, #2
1231; CHECK-NEXT:    adds r1, #2
1232; CHECK-NEXT:    vsub.f16 s0, s2, s0
1233; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
1234; CHECK-NEXT:    vstmia r2!, {s0}
1235; CHECK-NEXT:    le lr, .LBB7_7
1236; CHECK-NEXT:  .LBB7_8: @ %for.cond.cleanup
1237; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; N == 0 returns immediately without touching memory.
1238entry:
1239  %cmp8 = icmp eq i32 %N, 0
1240  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1241
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1242for.body.preheader:                               ; preds = %entry
1243  %min.iters.check = icmp ult i32 %N, 4
1244  br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph
1245
; Scalar loop start index: 0 if the vector loop was skipped, n.vec otherwise.
1246for.body.preheader11:                             ; preds = %middle.block, %for.body.preheader
1247  %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1248  br label %for.body
1249
; n.vec = N with the low two bits cleared, i.e. the vector loop's trip bound.
1250vector.ph:                                        ; preds = %for.body.preheader
1251  %n.vec = and i32 %N, -4
1252  br label %vector.body
1253
; Four half lanes per iteration: subtract in half, then extend to float.
1254vector.body:                                      ; preds = %vector.body, %vector.ph
1255  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1256  %0 = getelementptr inbounds half, ptr %a, i32 %index
1257  %wide.load = load <4 x half>, ptr %0, align 2
1258  %1 = getelementptr inbounds half, ptr %b, i32 %index
1259  %wide.load10 = load <4 x half>, ptr %1, align 2
1260  %2 = fsub <4 x half> %wide.load, %wide.load10
1261  %3 = fpext <4 x half> %2 to <4 x float>
1262  %4 = getelementptr inbounds float, ptr %c, i32 %index
1263  store <4 x float> %3, ptr %4, align 4
1264  %index.next = add i32 %index, 4
1265  %5 = icmp eq i32 %index.next, %n.vec
1266  br i1 %5, label %middle.block, label %vector.body
1267
; If the vector loop consumed every element there is no scalar remainder.
1268middle.block:                                     ; preds = %vector.body
1269  %cmp.n = icmp eq i32 %n.vec, %N
1270  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11
1271
1272for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1273  ret void
1274
; Scalar remainder: one half subtract and one extend-to-float per iteration.
1275for.body:                                         ; preds = %for.body.preheader11, %for.body
1276  %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ]
1277  %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09
1278  %6 = load half, ptr %arrayidx, align 2
1279  %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09
1280  %7 = load half, ptr %arrayidx1, align 2
1281  %sub = fsub half %6, %7
1282  %conv = fpext half %sub to float
1283  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
1284  store float %conv, ptr %arrayidx2, align 4
1285  %inc = add nuw i32 %i.09, 1
1286  %exitcond = icmp eq i32 %inc, %N
1287  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1288}
1289
; half_short_mul: c[i] = (float)(a[i] * (half)b[i]) for i in [0, N), where a is
; read as half, b as signed i16, and c is written as float. Each b element is
; converted to half with a signed int-to-float conversion, multiplied by a[i] in
; half precision, and the product is extended to float. A 4-lane vector.body
; covers the first (N & -4) elements; a scalar for.body covers the rest (or
; everything when N < 4).
; The expected assembly reserves 16 bytes of stack (`sub sp, #16`). In the
; vector body the widened i16 lanes are written to that slot with `vstrh.32` and
; read back with `vldrw.u32` before `vcvt.f16.s16`; the scalar loop instead
; loads with `ldrsh` and converts with `vcvt.f16.s32`.
1290define arm_aapcs_vfpcc void @half_short_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
1291; CHECK-LABEL: half_short_mul:
1292; CHECK:       @ %bb.0: @ %entry
1293; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
1294; CHECK-NEXT:    sub sp, #16
1295; CHECK-NEXT:    cmp r3, #0
1296; CHECK-NEXT:    beq .LBB8_8
1297; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1298; CHECK-NEXT:    mov r8, r2
1299; CHECK-NEXT:    mov r9, r1
1300; CHECK-NEXT:    cmp r3, #3
1301; CHECK-NEXT:    bhi .LBB8_3
1302; CHECK-NEXT:  @ %bb.2:
1303; CHECK-NEXT:    mov.w r12, #0
1304; CHECK-NEXT:    b .LBB8_6
1305; CHECK-NEXT:  .LBB8_3: @ %vector.ph
1306; CHECK-NEXT:    bic r12, r3, #3
1307; CHECK-NEXT:    movs r6, #1
1308; CHECK-NEXT:    sub.w r7, r12, #4
1309; CHECK-NEXT:    mov r1, sp
1310; CHECK-NEXT:    mov r5, r0
1311; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
1312; CHECK-NEXT:    mov r6, r9
1313; CHECK-NEXT:    mov r7, r8
1314; CHECK-NEXT:  .LBB8_4: @ %vector.body
1315; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1316; CHECK-NEXT:    vldrh.u32 q0, [r6], #8
1317; CHECK-NEXT:    ldr r4, [r5]
1318; CHECK-NEXT:    ldr r2, [r5, #4]
1319; CHECK-NEXT:    adds r5, #8
1320; CHECK-NEXT:    vstrh.32 q0, [r1]
1321; CHECK-NEXT:    vmov.32 q1[0], r4
1322; CHECK-NEXT:    vldrw.u32 q0, [r1]
1323; CHECK-NEXT:    vmov.32 q1[1], r2
1324; CHECK-NEXT:    vcvt.f16.s16 q0, q0
1325; CHECK-NEXT:    vmul.f16 q0, q1, q0
1326; CHECK-NEXT:    vcvtt.f32.f16 s3, s1
1327; CHECK-NEXT:    vcvtb.f32.f16 s2, s1
1328; CHECK-NEXT:    vcvtt.f32.f16 s1, s0
1329; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
1330; CHECK-NEXT:    vstrb.8 q0, [r7], #16
1331; CHECK-NEXT:    le lr, .LBB8_4
1332; CHECK-NEXT:  @ %bb.5: @ %middle.block
1333; CHECK-NEXT:    cmp r12, r3
1334; CHECK-NEXT:    beq .LBB8_8
1335; CHECK-NEXT:  .LBB8_6: @ %for.body.preheader13
1336; CHECK-NEXT:    sub.w lr, r3, r12
1337; CHECK-NEXT:    add.w r0, r0, r12, lsl #1
1338; CHECK-NEXT:    add.w r1, r9, r12, lsl #1
1339; CHECK-NEXT:    add.w r2, r8, r12, lsl #2
1340; CHECK-NEXT:  .LBB8_7: @ %for.body
1341; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1342; CHECK-NEXT:    ldrsh r3, [r1], #2
1343; CHECK-NEXT:    vldr.16 s0, [r0]
1344; CHECK-NEXT:    adds r0, #2
1345; CHECK-NEXT:    vmov s2, r3
1346; CHECK-NEXT:    vcvt.f16.s32 s2, s2
1347; CHECK-NEXT:    vmul.f16 s0, s0, s2
1348; CHECK-NEXT:    vcvtb.f32.f16 s0, s0
1349; CHECK-NEXT:    vstmia r2!, {s0}
1350; CHECK-NEXT:    le lr, .LBB8_7
1351; CHECK-NEXT:  .LBB8_8: @ %for.cond.cleanup
1352; CHECK-NEXT:    add sp, #16
1353; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
; N == 0 returns immediately without touching memory.
1354entry:
1355  %cmp10 = icmp eq i32 %N, 0
1356  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
1357
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1358for.body.preheader:                               ; preds = %entry
1359  %min.iters.check = icmp ult i32 %N, 4
1360  br i1 %min.iters.check, label %for.body.preheader13, label %vector.ph
1361
; Scalar loop start index: 0 if the vector loop was skipped, n.vec otherwise.
1362for.body.preheader13:                             ; preds = %middle.block, %for.body.preheader
1363  %i.011.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1364  br label %for.body
1365
; n.vec = N with the low two bits cleared, i.e. the vector loop's trip bound.
1366vector.ph:                                        ; preds = %for.body.preheader
1367  %n.vec = and i32 %N, -4
1368  br label %vector.body
1369
; Four lanes per iteration: convert b's i16 lanes to half, multiply by a's half
; lanes, then extend the products to float.
1370vector.body:                                      ; preds = %vector.body, %vector.ph
1371  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1372  %0 = getelementptr inbounds half, ptr %a, i32 %index
1373  %wide.load = load <4 x half>, ptr %0, align 2
1374  %1 = getelementptr inbounds i16, ptr %b, i32 %index
1375  %wide.load12 = load <4 x i16>, ptr %1, align 2
1376  %2 = sitofp <4 x i16> %wide.load12 to <4 x half>
1377  %3 = fmul <4 x half> %wide.load, %2
1378  %4 = fpext <4 x half> %3 to <4 x float>
1379  %5 = getelementptr inbounds float, ptr %c, i32 %index
1380  store <4 x float> %4, ptr %5, align 4
1381  %index.next = add i32 %index, 4
1382  %6 = icmp eq i32 %index.next, %n.vec
1383  br i1 %6, label %middle.block, label %vector.body
1384
; If the vector loop consumed every element there is no scalar remainder.
1385middle.block:                                     ; preds = %vector.body
1386  %cmp.n = icmp eq i32 %n.vec, %N
1387  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13
1388
1389for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
1390  ret void
1391
; Scalar remainder: same convert, multiply, extend sequence on one element.
1392for.body:                                         ; preds = %for.body.preheader13, %for.body
1393  %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader13 ]
1394  %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011
1395  %7 = load half, ptr %arrayidx, align 2
1396  %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.011
1397  %8 = load i16, ptr %arrayidx1, align 2
1398  %conv2 = sitofp i16 %8 to half
1399  %mul = fmul half %7, %conv2
1400  %conv3 = fpext half %mul to float
1401  %arrayidx4 = getelementptr inbounds float, ptr %c, i32 %i.011
1402  store float %conv3, ptr %arrayidx4, align 4
1403  %inc = add nuw i32 %i.011, 1
1404  %exitcond = icmp eq i32 %inc, %N
1405  br i1 %exitcond, label %for.cond.cleanup, label %for.body
1406}
1407
; half_half_mac: returns, as a float, the sum over i in [0, %N) of
; fpext(a[i] * b[i]). %a and %b are read as arrays of half; each product is
; formed in half precision and the running sum is kept in float.
; The IR is already unrolled by four (for.body) and has a remainder loop
; (for.body.epil) that runs (%N & 3) times. The expected Thumb2 output below
; keeps both loops scalar (vldr.16 / vmul.f16 / vcvtb.f32.f16 / vadd.f32); the
; unrolled loop is closed with `le lr` and the remainder loop is entered
; through `wls lr`.
1408define arm_aapcs_vfpcc float @half_half_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
1409; CHECK-LABEL: half_half_mac:
1410; CHECK:       @ %bb.0: @ %entry
1411; CHECK-NEXT:    push {r4, r5, r7, lr}
1412; CHECK-NEXT:    cbz r2, .LBB9_3
1413; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1414; CHECK-NEXT:    subs r3, r2, #1
1415; CHECK-NEXT:    and r12, r2, #3
1416; CHECK-NEXT:    cmp r3, #3
1417; CHECK-NEXT:    bhs .LBB9_4
1418; CHECK-NEXT:  @ %bb.2:
1419; CHECK-NEXT:    vldr s0, .LCPI9_0
1420; CHECK-NEXT:    movs r2, #0
1421; CHECK-NEXT:    b .LBB9_6
1422; CHECK-NEXT:  .LBB9_3:
1423; CHECK-NEXT:    vldr s0, .LCPI9_0
1424; CHECK-NEXT:    pop {r4, r5, r7, pc}
1425; CHECK-NEXT:  .LBB9_4: @ %for.body.preheader.new
1426; CHECK-NEXT:    bic r2, r2, #3
1427; CHECK-NEXT:    movs r3, #1
1428; CHECK-NEXT:    subs r2, #4
1429; CHECK-NEXT:    vldr s0, .LCPI9_0
1430; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
1431; CHECK-NEXT:    movs r3, #0
1432; CHECK-NEXT:    movs r2, #0
1433; CHECK-NEXT:  .LBB9_5: @ %for.body
1434; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1435; CHECK-NEXT:    adds r5, r0, r3
1436; CHECK-NEXT:    adds r4, r1, r3
1437; CHECK-NEXT:    vldr.16 s2, [r4, #6]
1438; CHECK-NEXT:    vldr.16 s4, [r5, #6]
1439; CHECK-NEXT:    vldr.16 s6, [r5, #4]
1440; CHECK-NEXT:    vldr.16 s8, [r5, #2]
1441; CHECK-NEXT:    vmul.f16 s2, s4, s2
1442; CHECK-NEXT:    vldr.16 s4, [r4, #4]
1443; CHECK-NEXT:    vldr.16 s10, [r5]
1444; CHECK-NEXT:    vcvtb.f32.f16 s2, s2
1445; CHECK-NEXT:    vmul.f16 s4, s6, s4
1446; CHECK-NEXT:    vldr.16 s6, [r4, #2]
1447; CHECK-NEXT:    vcvtb.f32.f16 s4, s4
1448; CHECK-NEXT:    adds r3, #8
1449; CHECK-NEXT:    vmul.f16 s6, s8, s6
1450; CHECK-NEXT:    vldr.16 s8, [r4]
1451; CHECK-NEXT:    vcvtb.f32.f16 s6, s6
1452; CHECK-NEXT:    adds r2, #4
1453; CHECK-NEXT:    vmul.f16 s8, s10, s8
1454; CHECK-NEXT:    vcvtb.f32.f16 s8, s8
1455; CHECK-NEXT:    vadd.f32 s0, s0, s8
1456; CHECK-NEXT:    vadd.f32 s0, s0, s6
1457; CHECK-NEXT:    vadd.f32 s0, s0, s4
1458; CHECK-NEXT:    vadd.f32 s0, s0, s2
1459; CHECK-NEXT:    le lr, .LBB9_5
1460; CHECK-NEXT:  .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1461; CHECK-NEXT:    wls lr, r12, .LBB9_9
1462; CHECK-NEXT:  @ %bb.7: @ %for.body.epil.preheader
1463; CHECK-NEXT:    add.w r0, r0, r2, lsl #1
1464; CHECK-NEXT:    add.w r1, r1, r2, lsl #1
1465; CHECK-NEXT:  .LBB9_8: @ %for.body.epil
1466; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1467; CHECK-NEXT:    vldr.16 s2, [r1]
1468; CHECK-NEXT:    vldr.16 s4, [r0]
1469; CHECK-NEXT:    adds r0, #2
1470; CHECK-NEXT:    adds r1, #2
1471; CHECK-NEXT:    vmul.f16 s2, s4, s2
1472; CHECK-NEXT:    vcvtb.f32.f16 s2, s2
1473; CHECK-NEXT:    vadd.f32 s0, s0, s2
1474; CHECK-NEXT:    le lr, .LBB9_8
1475; CHECK-NEXT:  .LBB9_9: @ %for.cond.cleanup
1476; CHECK-NEXT:    pop {r4, r5, r7, pc}
1477; CHECK-NEXT:    .p2align 2
1478; CHECK-NEXT:  @ %bb.10:
1479; CHECK-NEXT:  .LCPI9_0:
1480; CHECK-NEXT:    .long 0x00000000 @ float 0
1481entry:
  ; %N == 0 goes straight to for.cond.cleanup, whose phi then yields 0.0.
1482  %cmp8 = icmp eq i32 %N, 0
1483  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1484
1485for.body.preheader:                               ; preds = %entry
  ; %xtraiter (%N & 3) is the trip count of the remainder loop.
  ; When %N - 1 < 3 (i.e. %N is 1, 2 or 3) the unrolled loop is bypassed.
1486  %0 = add i32 %N, -1
1487  %xtraiter = and i32 %N, 3
1488  %1 = icmp ult i32 %0, 3
1489  br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1490
1491for.body.preheader.new:                           ; preds = %for.body.preheader
  ; %unroll_iter = %N with the low two bits removed: the number of elements
  ; consumed by the unrolled loop (it counts down by 4 per iteration).
1492  %unroll_iter = sub i32 %N, %xtraiter
1493  br label %for.body
1494
1495for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  ; The undef input of %add.lcssa.ph is only selected on the edge from
  ; %for.body.preheader, and %add.lcssa.ph is only returned when
  ; %xtraiter == 0. That edge requires %N in 1..3, where %xtraiter is
  ; nonzero, so the undef value never reaches the return.
1496  %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
1497  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1498  %res.09.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
1499  %lcmp.mod = icmp eq i32 %xtraiter, 0
1500  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1501
1502for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
  ; Remainder loop: one element per iteration, continuing from the index and
  ; partial sum left by the unrolled loop; %epil.iter counts down from
  ; %xtraiter to 0.
1503  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1504  %res.09.epil = phi float [ %add.epil, %for.body.epil ], [ %res.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1505  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1506  %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.010.epil
1507  %2 = load half, ptr %arrayidx.epil, align 2
1508  %arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.010.epil
1509  %3 = load half, ptr %arrayidx1.epil, align 2
  ; Multiply in half, then widen the product to float before accumulating.
1510  %mul.epil = fmul half %2, %3
1511  %conv.epil = fpext half %mul.epil to float
1512  %add.epil = fadd float %res.09.epil, %conv.epil
1513  %inc.epil = add nuw i32 %i.010.epil, 1
1514  %epil.iter.sub = add i32 %epil.iter, -1
1515  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1516  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1517
1518for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
  ; Result: 0.0 when %N == 0, otherwise the sum from whichever loop ran last.
1519  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
1520  ret float %res.0.lcssa
1521
1522for.body:                                         ; preds = %for.body, %for.body.preheader.new
  ; Unrolled loop: four load / fmul half / fpext / fadd float steps per
  ; iteration, accumulated in element order (%add, %add.1, %add.2, %add.3).
  ; %i.010 starts at 0 and advances by 4, so `or disjoint` with 1, 2 and 3
  ; produces the same indices as adding those constants would.
1523  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1524  %res.09 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
1525  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1526  %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.010
1527  %4 = load half, ptr %arrayidx, align 2
1528  %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.010
1529  %5 = load half, ptr %arrayidx1, align 2
1530  %mul = fmul half %4, %5
1531  %conv = fpext half %mul to float
1532  %add = fadd float %res.09, %conv
1533  %inc = or disjoint i32 %i.010, 1
1534  %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
1535  %6 = load half, ptr %arrayidx.1, align 2
1536  %arrayidx1.1 = getelementptr inbounds half, ptr %b, i32 %inc
1537  %7 = load half, ptr %arrayidx1.1, align 2
1538  %mul.1 = fmul half %6, %7
1539  %conv.1 = fpext half %mul.1 to float
1540  %add.1 = fadd float %add, %conv.1
1541  %inc.1 = or disjoint i32 %i.010, 2
1542  %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
1543  %8 = load half, ptr %arrayidx.2, align 2
1544  %arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
1545  %9 = load half, ptr %arrayidx1.2, align 2
1546  %mul.2 = fmul half %8, %9
1547  %conv.2 = fpext half %mul.2 to float
1548  %add.2 = fadd float %add.1, %conv.2
1549  %inc.2 = or disjoint i32 %i.010, 3
1550  %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
1551  %10 = load half, ptr %arrayidx.3, align 2
1552  %arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
1553  %11 = load half, ptr %arrayidx1.3, align 2
1554  %mul.3 = fmul half %10, %11
1555  %conv.3 = fpext half %mul.3 to float
1556  %add.3 = fadd float %add.2, %conv.3
1557  %inc.3 = add nuw i32 %i.010, 4
  ; %niter counts the remaining unrolled elements down by 4 until it hits 0.
1558  %niter.nsub.3 = add i32 %niter, -4
1559  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1560  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1561}
1562
; half_half_acc: returns, as a float, the sum over i in [0, %N) of
; fpext(a[i] + b[i]). %a and %b are read as arrays of half; each pairwise sum
; is formed in half precision and the running total is kept in float.
; The IR has the same shape as an unroll-by-four with a remainder loop:
; for.body handles four elements per iteration and for.body.epil runs
; (%N & 3) times. The expected Thumb2 output below keeps both loops scalar
; (vldr.16 / vadd.f16 / vcvtb.f32.f16 / vadd.f32); the unrolled loop is closed
; with `le lr` and the remainder loop is entered through `wls lr`.
1563define arm_aapcs_vfpcc float @half_half_acc(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
1564; CHECK-LABEL: half_half_acc:
1565; CHECK:       @ %bb.0: @ %entry
1566; CHECK-NEXT:    push {r4, r5, r7, lr}
1567; CHECK-NEXT:    cbz r2, .LBB10_3
1568; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1569; CHECK-NEXT:    subs r3, r2, #1
1570; CHECK-NEXT:    and r12, r2, #3
1571; CHECK-NEXT:    cmp r3, #3
1572; CHECK-NEXT:    bhs .LBB10_4
1573; CHECK-NEXT:  @ %bb.2:
1574; CHECK-NEXT:    vldr s0, .LCPI10_0
1575; CHECK-NEXT:    movs r2, #0
1576; CHECK-NEXT:    b .LBB10_6
1577; CHECK-NEXT:  .LBB10_3:
1578; CHECK-NEXT:    vldr s0, .LCPI10_0
1579; CHECK-NEXT:    pop {r4, r5, r7, pc}
1580; CHECK-NEXT:  .LBB10_4: @ %for.body.preheader.new
1581; CHECK-NEXT:    bic r2, r2, #3
1582; CHECK-NEXT:    movs r3, #1
1583; CHECK-NEXT:    subs r2, #4
1584; CHECK-NEXT:    vldr s0, .LCPI10_0
1585; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
1586; CHECK-NEXT:    movs r3, #0
1587; CHECK-NEXT:    movs r2, #0
1588; CHECK-NEXT:  .LBB10_5: @ %for.body
1589; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1590; CHECK-NEXT:    adds r5, r0, r3
1591; CHECK-NEXT:    adds r4, r1, r3
1592; CHECK-NEXT:    vldr.16 s2, [r4, #6]
1593; CHECK-NEXT:    vldr.16 s4, [r5, #6]
1594; CHECK-NEXT:    vldr.16 s6, [r5, #4]
1595; CHECK-NEXT:    vldr.16 s8, [r5, #2]
1596; CHECK-NEXT:    vadd.f16 s2, s4, s2
1597; CHECK-NEXT:    vldr.16 s4, [r4, #4]
1598; CHECK-NEXT:    vldr.16 s10, [r5]
1599; CHECK-NEXT:    vcvtb.f32.f16 s2, s2
1600; CHECK-NEXT:    vadd.f16 s4, s6, s4
1601; CHECK-NEXT:    vldr.16 s6, [r4, #2]
1602; CHECK-NEXT:    vcvtb.f32.f16 s4, s4
1603; CHECK-NEXT:    adds r3, #8
1604; CHECK-NEXT:    vadd.f16 s6, s8, s6
1605; CHECK-NEXT:    vldr.16 s8, [r4]
1606; CHECK-NEXT:    vcvtb.f32.f16 s6, s6
1607; CHECK-NEXT:    adds r2, #4
1608; CHECK-NEXT:    vadd.f16 s8, s10, s8
1609; CHECK-NEXT:    vcvtb.f32.f16 s8, s8
1610; CHECK-NEXT:    vadd.f32 s0, s0, s8
1611; CHECK-NEXT:    vadd.f32 s0, s0, s6
1612; CHECK-NEXT:    vadd.f32 s0, s0, s4
1613; CHECK-NEXT:    vadd.f32 s0, s0, s2
1614; CHECK-NEXT:    le lr, .LBB10_5
1615; CHECK-NEXT:  .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1616; CHECK-NEXT:    wls lr, r12, .LBB10_9
1617; CHECK-NEXT:  @ %bb.7: @ %for.body.epil.preheader
1618; CHECK-NEXT:    add.w r0, r0, r2, lsl #1
1619; CHECK-NEXT:    add.w r1, r1, r2, lsl #1
1620; CHECK-NEXT:  .LBB10_8: @ %for.body.epil
1621; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1622; CHECK-NEXT:    vldr.16 s2, [r1]
1623; CHECK-NEXT:    vldr.16 s4, [r0]
1624; CHECK-NEXT:    adds r0, #2
1625; CHECK-NEXT:    adds r1, #2
1626; CHECK-NEXT:    vadd.f16 s2, s4, s2
1627; CHECK-NEXT:    vcvtb.f32.f16 s2, s2
1628; CHECK-NEXT:    vadd.f32 s0, s0, s2
1629; CHECK-NEXT:    le lr, .LBB10_8
1630; CHECK-NEXT:  .LBB10_9: @ %for.cond.cleanup
1631; CHECK-NEXT:    pop {r4, r5, r7, pc}
1632; CHECK-NEXT:    .p2align 2
1633; CHECK-NEXT:  @ %bb.10:
1634; CHECK-NEXT:  .LCPI10_0:
1635; CHECK-NEXT:    .long 0x00000000 @ float 0
1636entry:
  ; %N == 0 goes straight to for.cond.cleanup, whose phi then yields 0.0.
1637  %cmp9 = icmp eq i32 %N, 0
1638  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
1639
1640for.body.preheader:                               ; preds = %entry
  ; %xtraiter (%N & 3) is the trip count of the remainder loop.
  ; When %N - 1 < 3 (i.e. %N is 1, 2 or 3) the unrolled loop is bypassed.
1641  %0 = add i32 %N, -1
1642  %xtraiter = and i32 %N, 3
1643  %1 = icmp ult i32 %0, 3
1644  br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1645
1646for.body.preheader.new:                           ; preds = %for.body.preheader
  ; %unroll_iter = %N with the low two bits removed: the number of elements
  ; consumed by the unrolled loop (it counts down by 4 per iteration).
1647  %unroll_iter = sub i32 %N, %xtraiter
1648  br label %for.body
1649
1650for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  ; The undef input of %add2.lcssa.ph is only selected on the edge from
  ; %for.body.preheader, and %add2.lcssa.ph is only returned when
  ; %xtraiter == 0. That edge requires %N in 1..3, where %xtraiter is
  ; nonzero, so the undef value never reaches the return.
1651  %add2.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add2.3, %for.body ]
1652  %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1653  %res.010.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add2.3, %for.body ]
1654  %lcmp.mod = icmp eq i32 %xtraiter, 0
1655  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1656
1657for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
  ; Remainder loop: one element per iteration, continuing from the index and
  ; partial sum left by the unrolled loop; %epil.iter counts down from
  ; %xtraiter to 0.
1658  %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1659  %res.010.epil = phi float [ %add2.epil, %for.body.epil ], [ %res.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1660  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1661  %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.011.epil
1662  %2 = load half, ptr %arrayidx.epil, align 2
1663  %arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.011.epil
1664  %3 = load half, ptr %arrayidx1.epil, align 2
  ; Add the two elements in half, then widen to float before accumulating.
1665  %add.epil = fadd half %2, %3
1666  %conv.epil = fpext half %add.epil to float
1667  %add2.epil = fadd float %res.010.epil, %conv.epil
1668  %inc.epil = add nuw i32 %i.011.epil, 1
1669  %epil.iter.sub = add i32 %epil.iter, -1
1670  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1671  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1672
1673for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
  ; Result: 0.0 when %N == 0, otherwise the sum from whichever loop ran last.
1674  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add2.epil, %for.body.epil ]
1675  ret float %res.0.lcssa
1676
1677for.body:                                         ; preds = %for.body, %for.body.preheader.new
  ; Unrolled loop: four load / fadd half / fpext / fadd float steps per
  ; iteration, accumulated in element order (%add2, %add2.1, %add2.2,
  ; %add2.3). %i.011 starts at 0 and advances by 4, so `or disjoint` with
  ; 1, 2 and 3 produces the same indices as adding those constants would.
1678  %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1679  %res.010 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add2.3, %for.body ]
1680  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1681  %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011
1682  %4 = load half, ptr %arrayidx, align 2
1683  %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.011
1684  %5 = load half, ptr %arrayidx1, align 2
1685  %add = fadd half %4, %5
1686  %conv = fpext half %add to float
1687  %add2 = fadd float %res.010, %conv
1688  %inc = or disjoint i32 %i.011, 1
1689  %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
1690  %6 = load half, ptr %arrayidx.1, align 2
1691  %arrayidx1.1 = getelementptr inbounds half, ptr %b, i32 %inc
1692  %7 = load half, ptr %arrayidx1.1, align 2
1693  %add.1 = fadd half %6, %7
1694  %conv.1 = fpext half %add.1 to float
1695  %add2.1 = fadd float %add2, %conv.1
1696  %inc.1 = or disjoint i32 %i.011, 2
1697  %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
1698  %8 = load half, ptr %arrayidx.2, align 2
1699  %arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
1700  %9 = load half, ptr %arrayidx1.2, align 2
1701  %add.2 = fadd half %8, %9
1702  %conv.2 = fpext half %add.2 to float
1703  %add2.2 = fadd float %add2.1, %conv.2
1704  %inc.2 = or disjoint i32 %i.011, 3
1705  %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
1706  %10 = load half, ptr %arrayidx.3, align 2
1707  %arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
1708  %11 = load half, ptr %arrayidx1.3, align 2
1709  %add.3 = fadd half %10, %11
1710  %conv.3 = fpext half %add.3 to float
1711  %add2.3 = fadd float %add2.2, %conv.3
1712  %inc.3 = add nuw i32 %i.011, 4
  ; %niter counts the remaining unrolled elements down by 4 until it hits 0.
1713  %niter.nsub.3 = add i32 %niter, -4
1714  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1715  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1716}
1717
; half_short_mac: returns, as a float, the sum over i in [0, %N) of
; fpext(a[i] * sitofp(b[i])). %a is read as an array of half and %b as an
; array of i16; each i16 is converted to half, the product is formed in half
; precision and the running sum is kept in float.
; The IR is already unrolled by four (for.body) and has a remainder loop
; (for.body.epil) that runs (%N & 3) times. The expected Thumb2 output below
; keeps both loops scalar: ldrsh + vmov + vcvt.f16.s32 for the integer
; operand, then vmul.f16 / vcvtb.f32.f16 / vadd.f32. The unrolled loop is
; closed with `le lr` and the remainder loop is entered through `wls lr`.
1718define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
1719; CHECK-LABEL: half_short_mac:
1720; CHECK:       @ %bb.0: @ %entry
1721; CHECK-NEXT:    push {r4, r5, r6, lr}
1722; CHECK-NEXT:    cbz r2, .LBB11_3
1723; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1724; CHECK-NEXT:    subs r3, r2, #1
1725; CHECK-NEXT:    and r12, r2, #3
1726; CHECK-NEXT:    cmp r3, #3
1727; CHECK-NEXT:    bhs .LBB11_4
1728; CHECK-NEXT:  @ %bb.2:
1729; CHECK-NEXT:    vldr s0, .LCPI11_0
1730; CHECK-NEXT:    movs r2, #0
1731; CHECK-NEXT:    b .LBB11_6
1732; CHECK-NEXT:  .LBB11_3:
1733; CHECK-NEXT:    vldr s0, .LCPI11_0
1734; CHECK-NEXT:    pop {r4, r5, r6, pc}
1735; CHECK-NEXT:  .LBB11_4: @ %for.body.preheader.new
1736; CHECK-NEXT:    bic r2, r2, #3
1737; CHECK-NEXT:    movs r3, #1
1738; CHECK-NEXT:    subs r2, #4
1739; CHECK-NEXT:    vldr s0, .LCPI11_0
1740; CHECK-NEXT:    adds r4, r0, #4
1741; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
1742; CHECK-NEXT:    adds r3, r1, #4
1743; CHECK-NEXT:    movs r2, #0
1744; CHECK-NEXT:  .LBB11_5: @ %for.body
1745; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1746; CHECK-NEXT:    ldrsh.w r5, [r3, #2]
1747; CHECK-NEXT:    vldr.16 s2, [r4, #2]
1748; CHECK-NEXT:    adds r2, #4
1749; CHECK-NEXT:    vmov s4, r5
1750; CHECK-NEXT:    ldrsh r5, [r3], #8
1751; CHECK-NEXT:    vcvt.f16.s32 s4, s4
1752; CHECK-NEXT:    ldrsh r6, [r3, #-10]
1753; CHECK-NEXT:    vmul.f16 s2, s2, s4
1754; CHECK-NEXT:    vmov s6, r5
1755; CHECK-NEXT:    vldr.16 s4, [r4]
1756; CHECK-NEXT:    vcvt.f16.s32 s6, s6
1757; CHECK-NEXT:    ldrsh r5, [r3, #-12]
1758; CHECK-NEXT:    vmul.f16 s4, s4, s6
1759; CHECK-NEXT:    vmov s8, r6
1760; CHECK-NEXT:    vldr.16 s6, [r4, #-2]
1761; CHECK-NEXT:    vcvt.f16.s32 s8, s8
1762; CHECK-NEXT:    vmov s10, r5
1763; CHECK-NEXT:    vcvtb.f32.f16 s4, s4
1764; CHECK-NEXT:    vmul.f16 s6, s6, s8
1765; CHECK-NEXT:    vldr.16 s8, [r4, #-4]
1766; CHECK-NEXT:    vcvt.f16.s32 s10, s10
1767; CHECK-NEXT:    vcvtb.f32.f16 s6, s6
1768; CHECK-NEXT:    vmul.f16 s8, s8, s10
1769; CHECK-NEXT:    vcvtb.f32.f16 s2, s2
1770; CHECK-NEXT:    vcvtb.f32.f16 s8, s8
1771; CHECK-NEXT:    adds r4, #8
1772; CHECK-NEXT:    vadd.f32 s0, s0, s8
1773; CHECK-NEXT:    vadd.f32 s0, s0, s6
1774; CHECK-NEXT:    vadd.f32 s0, s0, s4
1775; CHECK-NEXT:    vadd.f32 s0, s0, s2
1776; CHECK-NEXT:    le lr, .LBB11_5
1777; CHECK-NEXT:  .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
1778; CHECK-NEXT:    wls lr, r12, .LBB11_9
1779; CHECK-NEXT:  @ %bb.7: @ %for.body.epil.preheader
1780; CHECK-NEXT:    add.w r0, r0, r2, lsl #1
1781; CHECK-NEXT:    add.w r1, r1, r2, lsl #1
1782; CHECK-NEXT:  .LBB11_8: @ %for.body.epil
1783; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1784; CHECK-NEXT:    ldrsh r2, [r1], #2
1785; CHECK-NEXT:    vldr.16 s2, [r0]
1786; CHECK-NEXT:    adds r0, #2
1787; CHECK-NEXT:    vmov s4, r2
1788; CHECK-NEXT:    vcvt.f16.s32 s4, s4
1789; CHECK-NEXT:    vmul.f16 s2, s2, s4
1790; CHECK-NEXT:    vcvtb.f32.f16 s2, s2
1791; CHECK-NEXT:    vadd.f32 s0, s0, s2
1792; CHECK-NEXT:    le lr, .LBB11_8
1793; CHECK-NEXT:  .LBB11_9: @ %for.cond.cleanup
1794; CHECK-NEXT:    pop {r4, r5, r6, pc}
1795; CHECK-NEXT:    .p2align 2
1796; CHECK-NEXT:  @ %bb.10:
1797; CHECK-NEXT:  .LCPI11_0:
1798; CHECK-NEXT:    .long 0x00000000 @ float 0
1799entry:
  ; %N == 0 goes straight to for.cond.cleanup, whose phi then yields 0.0.
1800  %cmp10 = icmp eq i32 %N, 0
1801  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader
1802
1803for.body.preheader:                               ; preds = %entry
  ; %xtraiter (%N & 3) is the trip count of the remainder loop.
  ; When %N - 1 < 3 (i.e. %N is 1, 2 or 3) the unrolled loop is bypassed.
1804  %0 = add i32 %N, -1
1805  %xtraiter = and i32 %N, 3
1806  %1 = icmp ult i32 %0, 3
1807  br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1808
1809for.body.preheader.new:                           ; preds = %for.body.preheader
  ; %unroll_iter = %N with the low two bits removed: the number of elements
  ; consumed by the unrolled loop (it counts down by 4 per iteration).
1810  %unroll_iter = sub i32 %N, %xtraiter
1811  br label %for.body
1812
1813for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  ; The undef input of %add.lcssa.ph is only selected on the edge from
  ; %for.body.preheader, and %add.lcssa.ph is only returned when
  ; %xtraiter == 0. That edge requires %N in 1..3, where %xtraiter is
  ; nonzero, so the undef value never reaches the return.
1814  %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
1815  %i.012.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1816  %res.011.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
1817  %lcmp.mod = icmp eq i32 %xtraiter, 0
1818  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1819
1820for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
  ; Remainder loop: one element per iteration, continuing from the index and
  ; partial sum left by the unrolled loop; %epil.iter counts down from
  ; %xtraiter to 0.
1821  %i.012.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.012.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1822  %res.011.epil = phi float [ %add.epil, %for.body.epil ], [ %res.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1823  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1824  %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.012.epil
1825  %2 = load half, ptr %arrayidx.epil, align 2
1826  %arrayidx1.epil = getelementptr inbounds i16, ptr %b, i32 %i.012.epil
1827  %3 = load i16, ptr %arrayidx1.epil, align 2
  ; Signed i16 -> half, multiply in half, then widen the product to float.
1828  %conv2.epil = sitofp i16 %3 to half
1829  %mul.epil = fmul half %2, %conv2.epil
1830  %conv3.epil = fpext half %mul.epil to float
1831  %add.epil = fadd float %res.011.epil, %conv3.epil
1832  %inc.epil = add nuw i32 %i.012.epil, 1
1833  %epil.iter.sub = add i32 %epil.iter, -1
1834  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1835  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1836
1837for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
  ; Result: 0.0 when %N == 0, otherwise the sum from whichever loop ran last.
1838  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
1839  ret float %res.0.lcssa
1840
1841for.body:                                         ; preds = %for.body, %for.body.preheader.new
  ; Unrolled loop: four load / sitofp / fmul half / fpext / fadd float steps
  ; per iteration, accumulated in element order (%add, %add.1, %add.2,
  ; %add.3). %i.012 starts at 0 and advances by 4, so `or disjoint` with
  ; 1, 2 and 3 produces the same indices as adding those constants would.
1842  %i.012 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1843  %res.011 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
1844  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1845  %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.012
1846  %4 = load half, ptr %arrayidx, align 2
1847  %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.012
1848  %5 = load i16, ptr %arrayidx1, align 2
1849  %conv2 = sitofp i16 %5 to half
1850  %mul = fmul half %4, %conv2
1851  %conv3 = fpext half %mul to float
1852  %add = fadd float %res.011, %conv3
1853  %inc = or disjoint i32 %i.012, 1
1854  %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
1855  %6 = load half, ptr %arrayidx.1, align 2
1856  %arrayidx1.1 = getelementptr inbounds i16, ptr %b, i32 %inc
1857  %7 = load i16, ptr %arrayidx1.1, align 2
1858  %conv2.1 = sitofp i16 %7 to half
1859  %mul.1 = fmul half %6, %conv2.1
1860  %conv3.1 = fpext half %mul.1 to float
1861  %add.1 = fadd float %add, %conv3.1
1862  %inc.1 = or disjoint i32 %i.012, 2
1863  %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
1864  %8 = load half, ptr %arrayidx.2, align 2
1865  %arrayidx1.2 = getelementptr inbounds i16, ptr %b, i32 %inc.1
1866  %9 = load i16, ptr %arrayidx1.2, align 2
1867  %conv2.2 = sitofp i16 %9 to half
1868  %mul.2 = fmul half %8, %conv2.2
1869  %conv3.2 = fpext half %mul.2 to float
1870  %add.2 = fadd float %add.1, %conv3.2
1871  %inc.2 = or disjoint i32 %i.012, 3
1872  %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
1873  %10 = load half, ptr %arrayidx.3, align 2
1874  %arrayidx1.3 = getelementptr inbounds i16, ptr %b, i32 %inc.2
1875  %11 = load i16, ptr %arrayidx1.3, align 2
1876  %conv2.3 = sitofp i16 %11 to half
1877  %mul.3 = fmul half %10, %conv2.3
1878  %conv3.3 = fpext half %mul.3 to float
1879  %add.3 = fadd float %add.2, %conv3.3
1880  %inc.3 = add nuw i32 %i.012, 4
  ; %niter counts the remaining unrolled elements down by 4 until it hits 0.
1881  %niter.nsub.3 = add i32 %niter, -4
1882  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1883  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1884}
1885
1886