xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-fma-loops.ll (revision b5b663aac17415625340eb29c8010832bfc4c21c)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs -tail-predication=enabled %s -o - | FileCheck %s
3
; @fmas1 - computes fma(x[i], y[i], splat(a)) and stores it to z[i], using the
; llvm.fma intrinsic with the splat of %a as the addend operand.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect a dlstp.32/letp loop whose body uses vfmas.f32
; with the scalar held in r12.
4define arm_aapcs_vfpcc void @fmas1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
5; CHECK-LABEL: fmas1:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    .save {r4, lr}
8; CHECK-NEXT:    push {r4, lr}
9; CHECK-NEXT:    cmp r3, #1
10; CHECK-NEXT:    it lt
11; CHECK-NEXT:    poplt {r4, pc}
12; CHECK-NEXT:  .LBB0_1: @ %vector.ph
13; CHECK-NEXT:    vmov r12, s0
14; CHECK-NEXT:    dlstp.32 lr, r3
15; CHECK-NEXT:  .LBB0_2: @ %vector.body
16; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
17; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
18; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
19; CHECK-NEXT:    vfmas.f32 q1, q0, r12
20; CHECK-NEXT:    vstrw.32 q1, [r2], #16
21; CHECK-NEXT:    letp lr, .LBB0_2
22; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
23; CHECK-NEXT:    pop {r4, pc}
24entry:
  ; Only enter the vector loop when %n > 0.
25  %cmp8 = icmp sgt i32 %n, 0
26  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
27
28vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
29  %n.rnd.up = add i32 %n, 3
30  %n.vec = and i32 %n.rnd.up, -4
31  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
32  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
33  br label %vector.body
34
35vector.body:                                      ; preds = %vector.body, %vector.ph
36  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
37  %0 = getelementptr inbounds float, ptr %x, i32 %index
38  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
39  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
40  %2 = getelementptr inbounds float, ptr %y, i32 %index
41  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * y + splat(a), as a single fma call.
42  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
43  %4 = getelementptr inbounds float, ptr %z, i32 %index
44  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
45  %index.next = add i32 %index, 4
46  %5 = icmp eq i32 %index.next, %n.vec
47  br i1 %5, label %for.cond.cleanup, label %vector.body
48
49for.cond.cleanup:                                 ; preds = %vector.body, %entry
50  ret void
51}
52
; @fmas2 - same computation as a multiply-then-add with a splat addend, but
; written as separate fast fmul and fadd instructions: (y[i] * x[i]) + splat(a),
; stored to z[i].
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect the pair to be selected as one vfmas.f32 with the
; scalar held in r12, inside a dlstp.32/letp loop.
53define arm_aapcs_vfpcc void @fmas2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
54; CHECK-LABEL: fmas2:
55; CHECK:       @ %bb.0: @ %entry
56; CHECK-NEXT:    .save {r4, lr}
57; CHECK-NEXT:    push {r4, lr}
58; CHECK-NEXT:    cmp r3, #1
59; CHECK-NEXT:    it lt
60; CHECK-NEXT:    poplt {r4, pc}
61; CHECK-NEXT:  .LBB1_1: @ %vector.ph
62; CHECK-NEXT:    vmov r12, s0
63; CHECK-NEXT:    dlstp.32 lr, r3
64; CHECK-NEXT:  .LBB1_2: @ %vector.body
65; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
66; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
67; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
68; CHECK-NEXT:    vfmas.f32 q1, q0, r12
69; CHECK-NEXT:    vstrw.32 q1, [r2], #16
70; CHECK-NEXT:    letp lr, .LBB1_2
71; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
72; CHECK-NEXT:    pop {r4, pc}
73entry:
  ; Only enter the vector loop when %n > 0.
74  %cmp8 = icmp sgt i32 %n, 0
75  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
76
77vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
78  %n.rnd.up = add i32 %n, 3
79  %n.vec = and i32 %n.rnd.up, -4
80  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
81  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
82  br label %vector.body
83
84vector.body:                                      ; preds = %vector.body, %vector.ph
85  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
86  %0 = getelementptr inbounds float, ptr %x, i32 %index
87  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
88  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
89  %2 = getelementptr inbounds float, ptr %y, i32 %index
90  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; (y * x) + splat(a), as separate fast-math multiply and add.
91  %3 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
92  %4 = fadd fast <4 x float> %3, %broadcast.splat14
93  %5 = getelementptr inbounds float, ptr %z, i32 %index
94  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
95  %index.next = add i32 %index, 4
96  %6 = icmp eq i32 %index.next, %n.vec
97  br i1 %6, label %for.cond.cleanup, label %vector.body
98
99for.cond.cleanup:                                 ; preds = %vector.body, %entry
100  ret void
101}
102
; @fma1 - computes fma(x[i], splat(a), y[i]) and stores it to z[i], using the
; llvm.fma intrinsic with the splat of %a as a multiplicand and the vector
; loaded from %y as the addend.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect a dlstp.32/letp loop whose body uses vfma.f32
; with the scalar held in r12.
103define arm_aapcs_vfpcc void @fma1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
104; CHECK-LABEL: fma1:
105; CHECK:       @ %bb.0: @ %entry
106; CHECK-NEXT:    .save {r4, lr}
107; CHECK-NEXT:    push {r4, lr}
108; CHECK-NEXT:    cmp r3, #1
109; CHECK-NEXT:    it lt
110; CHECK-NEXT:    poplt {r4, pc}
111; CHECK-NEXT:  .LBB2_1: @ %vector.ph
112; CHECK-NEXT:    vmov r12, s0
113; CHECK-NEXT:    dlstp.32 lr, r3
114; CHECK-NEXT:  .LBB2_2: @ %vector.body
115; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
116; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
117; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
118; CHECK-NEXT:    vfma.f32 q1, q0, r12
119; CHECK-NEXT:    vstrw.32 q1, [r2], #16
120; CHECK-NEXT:    letp lr, .LBB2_2
121; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
122; CHECK-NEXT:    pop {r4, pc}
123entry:
  ; Only enter the vector loop when %n > 0.
124  %cmp8 = icmp sgt i32 %n, 0
125  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
126
127vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
128  %n.rnd.up = add i32 %n, 3
129  %n.vec = and i32 %n.rnd.up, -4
130  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
131  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
132  br label %vector.body
133
134vector.body:                                      ; preds = %vector.body, %vector.ph
135  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
136  %0 = getelementptr inbounds float, ptr %x, i32 %index
137  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
138  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
139  %2 = getelementptr inbounds float, ptr %y, i32 %index
140  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(a) + y, as a single fma call.
141  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
142  %4 = getelementptr inbounds float, ptr %z, i32 %index
143  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
144  %index.next = add i32 %index, 4
145  %5 = icmp eq i32 %index.next, %n.vec
146  br i1 %5, label %for.cond.cleanup, label %vector.body
147
148for.cond.cleanup:                                 ; preds = %vector.body, %entry
149  ret void
150}
151
; @fma2 - computes (x[i] * splat(a)) + y[i] and stores it to z[i], written as
; separate fast fmul and fadd instructions. The multiply is issued before the
; load from %y.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect the pair to be selected as one vfma.f32 with the
; scalar held in r12, inside a dlstp.32/letp loop.
152define arm_aapcs_vfpcc void @fma2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
153; CHECK-LABEL: fma2:
154; CHECK:       @ %bb.0: @ %entry
155; CHECK-NEXT:    .save {r4, lr}
156; CHECK-NEXT:    push {r4, lr}
157; CHECK-NEXT:    cmp r3, #1
158; CHECK-NEXT:    it lt
159; CHECK-NEXT:    poplt {r4, pc}
160; CHECK-NEXT:  .LBB3_1: @ %vector.ph
161; CHECK-NEXT:    vmov r12, s0
162; CHECK-NEXT:    dlstp.32 lr, r3
163; CHECK-NEXT:  .LBB3_2: @ %vector.body
164; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
165; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
166; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
167; CHECK-NEXT:    vfma.f32 q1, q0, r12
168; CHECK-NEXT:    vstrw.32 q1, [r2], #16
169; CHECK-NEXT:    letp lr, .LBB3_2
170; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
171; CHECK-NEXT:    pop {r4, pc}
172entry:
  ; Only enter the vector loop when %n > 0.
173  %cmp8 = icmp sgt i32 %n, 0
174  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
175
176vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
177  %n.rnd.up = add i32 %n, 3
178  %n.vec = and i32 %n.rnd.up, -4
179  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
180  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
181  br label %vector.body
182
183vector.body:                                      ; preds = %vector.body, %vector.ph
184  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
185  %0 = getelementptr inbounds float, ptr %x, i32 %index
186  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
187  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(a); the y operand is loaded afterwards and added below.
188  %2 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
189  %3 = getelementptr inbounds float, ptr %y, i32 %index
190  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %1, <4 x float> undef)
191  %4 = fadd fast <4 x float> %2, %wide.masked.load14
192  %5 = getelementptr inbounds float, ptr %z, i32 %index
193  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
194  %index.next = add i32 %index, 4
195  %6 = icmp eq i32 %index.next, %n.vec
196  br i1 %6, label %for.cond.cleanup, label %vector.body
197
198for.cond.cleanup:                                 ; preds = %vector.body, %entry
199  ret void
200}
201
; @fmss1 - computes fma(x[i], y[i], splat(-a)) and stores it to z[i]. The
; scalar %a is negated once in the preheader and the negated value is splatted
; and used as the addend of the llvm.fma intrinsic.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect the negation to appear as an eor of r12 with
; #-2147483648 (bit 31 only) ahead of the loop, and the loop body to use
; vfmas.f32 with that scalar in r12.
202define arm_aapcs_vfpcc void @fmss1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
203; CHECK-LABEL: fmss1:
204; CHECK:       @ %bb.0: @ %entry
205; CHECK-NEXT:    .save {r4, lr}
206; CHECK-NEXT:    push {r4, lr}
207; CHECK-NEXT:    cmp r3, #1
208; CHECK-NEXT:    it lt
209; CHECK-NEXT:    poplt {r4, pc}
210; CHECK-NEXT:  .LBB4_1: @ %vector.ph
211; CHECK-NEXT:    vmov r12, s0
212; CHECK-NEXT:    eor r12, r12, #-2147483648
213; CHECK-NEXT:    dlstp.32 lr, r3
214; CHECK-NEXT:  .LBB4_2: @ %vector.body
215; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
216; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
217; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
218; CHECK-NEXT:    vfmas.f32 q1, q0, r12
219; CHECK-NEXT:    vstrw.32 q1, [r2], #16
220; CHECK-NEXT:    letp lr, .LBB4_2
221; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
222; CHECK-NEXT:    pop {r4, pc}
223entry:
  ; Only enter the vector loop when %n > 0.
224  %cmp8 = icmp sgt i32 %n, 0
225  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
226
227vector.ph:                                        ; preds = %entry
  ; Negate the scalar once, outside the loop, before it is splatted.
228  %fneg = fneg fast float %a
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
229  %n.rnd.up = add i32 %n, 3
230  %n.vec = and i32 %n.rnd.up, -4
231  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
232  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
233  br label %vector.body
234
235vector.body:                                      ; preds = %vector.body, %vector.ph
236  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
237  %0 = getelementptr inbounds float, ptr %x, i32 %index
238  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
239  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
240  %2 = getelementptr inbounds float, ptr %y, i32 %index
241  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * y + splat(-a), as a single fma call.
242  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %wide.masked.load12, <4 x float> %broadcast.splat14)
243  %4 = getelementptr inbounds float, ptr %z, i32 %index
244  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
245  %index.next = add i32 %index, 4
246  %5 = icmp eq i32 %index.next, %n.vec
247  br i1 %5, label %for.cond.cleanup, label %vector.body
248
249for.cond.cleanup:                                 ; preds = %vector.body, %entry
250  ret void
251}
252
; @fmss2 - computes (y[i] * x[i]) - splat(a) and stores it to z[i], written as
; a fast fmul followed by a fast fsub with the splat as the subtrahend.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below do not expect a scalar-operand form here: the splat is
; built with vdup.32 and negated with vneg.f32 ahead of the loop, and each
; iteration copies it with vmov and accumulates with a vector vfma.f32.
253define arm_aapcs_vfpcc void @fmss2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
254; CHECK-LABEL: fmss2:
255; CHECK:       @ %bb.0: @ %entry
256; CHECK-NEXT:    .save {r4, lr}
257; CHECK-NEXT:    push {r4, lr}
258; CHECK-NEXT:    cmp r3, #1
259; CHECK-NEXT:    it lt
260; CHECK-NEXT:    poplt {r4, pc}
261; CHECK-NEXT:  .LBB5_1: @ %vector.ph
262; CHECK-NEXT:    vmov r12, s0
263; CHECK-NEXT:    vdup.32 q0, r12
264; CHECK-NEXT:    vneg.f32 q0, q0
265; CHECK-NEXT:    dlstp.32 lr, r3
266; CHECK-NEXT:  .LBB5_2: @ %vector.body
267; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
268; CHECK-NEXT:    vmov q3, q0
269; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
270; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
271; CHECK-NEXT:    vfma.f32 q3, q2, q1
272; CHECK-NEXT:    vstrw.32 q3, [r2], #16
273; CHECK-NEXT:    letp lr, .LBB5_2
274; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
275; CHECK-NEXT:    pop {r4, pc}
276entry:
  ; Only enter the vector loop when %n > 0.
277  %cmp8 = icmp sgt i32 %n, 0
278  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
279
280vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
281  %n.rnd.up = add i32 %n, 3
282  %n.vec = and i32 %n.rnd.up, -4
283  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
284  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
285  br label %vector.body
286
287vector.body:                                      ; preds = %vector.body, %vector.ph
288  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
289  %0 = getelementptr inbounds float, ptr %x, i32 %index
290  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
291  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
292  %2 = getelementptr inbounds float, ptr %y, i32 %index
293  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; (y * x) - splat(a), as separate fast-math multiply and subtract.
294  %3 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
295  %4 = fsub fast <4 x float> %3, %broadcast.splat14
296  %5 = getelementptr inbounds float, ptr %z, i32 %index
297  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
298  %index.next = add i32 %index, 4
299  %6 = icmp eq i32 %index.next, %n.vec
300  br i1 %6, label %for.cond.cleanup, label %vector.body
301
302for.cond.cleanup:                                 ; preds = %vector.body, %entry
303  ret void
304}
305
; @fmss3 - computes fma(x[i], -y[i], splat(a)) and stores it to z[i]. The
; vector loaded from %y is negated with fneg and passed as a multiplicand of
; the llvm.fma intrinsic; the splat of %a is the addend.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect the splat to be built with vdup.32 ahead of the
; loop, and each iteration to copy it with vmov and apply a vector vfms.f32.
306define arm_aapcs_vfpcc void @fmss3(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
307; CHECK-LABEL: fmss3:
308; CHECK:       @ %bb.0: @ %entry
309; CHECK-NEXT:    .save {r4, lr}
310; CHECK-NEXT:    push {r4, lr}
311; CHECK-NEXT:    cmp r3, #1
312; CHECK-NEXT:    it lt
313; CHECK-NEXT:    poplt {r4, pc}
314; CHECK-NEXT:  .LBB6_1: @ %vector.ph
315; CHECK-NEXT:    vmov r4, s0
316; CHECK-NEXT:    vdup.32 q0, r4
317; CHECK-NEXT:    dlstp.32 lr, r3
318; CHECK-NEXT:  .LBB6_2: @ %vector.body
319; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
320; CHECK-NEXT:    vmov q3, q0
321; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
322; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
323; CHECK-NEXT:    vfms.f32 q3, q2, q1
324; CHECK-NEXT:    vstrw.32 q3, [r2], #16
325; CHECK-NEXT:    letp lr, .LBB6_2
326; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
327; CHECK-NEXT:    pop {r4, pc}
328entry:
  ; Only enter the vector loop when %n > 0.
329  %cmp8 = icmp sgt i32 %n, 0
330  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
331
332vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
333  %n.rnd.up = add i32 %n, 3
334  %n.vec = and i32 %n.rnd.up, -4
335  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
336  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
337  br label %vector.body
338
339vector.body:                                      ; preds = %vector.body, %vector.ph
340  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
341  %0 = getelementptr inbounds float, ptr %x, i32 %index
342  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
343  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
344  %2 = getelementptr inbounds float, ptr %y, i32 %index
345  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * (-y) + splat(a): the negation is applied to the loaded y vector.
346  %3 = fneg fast <4 x float> %wide.masked.load12
347  %4 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %3, <4 x float> %broadcast.splat14)
348  %5 = getelementptr inbounds float, ptr %z, i32 %index
349  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
350  %index.next = add i32 %index, 4
351  %6 = icmp eq i32 %index.next, %n.vec
352  br i1 %6, label %for.cond.cleanup, label %vector.body
353
354for.cond.cleanup:                                 ; preds = %vector.body, %entry
355  ret void
356}
357
; @fmss4 - computes splat(a) - (y[i] * x[i]) and stores it to z[i], written as
; a fast fmul followed by a fast fsub with the splat as the minuend.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect the splat to be built with vdup.32 ahead of the
; loop, and each iteration to copy it with vmov and apply a vector vfms.f32.
358define arm_aapcs_vfpcc void @fmss4(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
359; CHECK-LABEL: fmss4:
360; CHECK:       @ %bb.0: @ %entry
361; CHECK-NEXT:    .save {r4, lr}
362; CHECK-NEXT:    push {r4, lr}
363; CHECK-NEXT:    cmp r3, #1
364; CHECK-NEXT:    it lt
365; CHECK-NEXT:    poplt {r4, pc}
366; CHECK-NEXT:  .LBB7_1: @ %vector.ph
367; CHECK-NEXT:    vmov r4, s0
368; CHECK-NEXT:    vdup.32 q0, r4
369; CHECK-NEXT:    dlstp.32 lr, r3
370; CHECK-NEXT:  .LBB7_2: @ %vector.body
371; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
372; CHECK-NEXT:    vmov q3, q0
373; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
374; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
375; CHECK-NEXT:    vfms.f32 q3, q2, q1
376; CHECK-NEXT:    vstrw.32 q3, [r2], #16
377; CHECK-NEXT:    letp lr, .LBB7_2
378; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
379; CHECK-NEXT:    pop {r4, pc}
380entry:
  ; Only enter the vector loop when %n > 0.
381  %cmp8 = icmp sgt i32 %n, 0
382  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
383
384vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
385  %n.rnd.up = add i32 %n, 3
386  %n.vec = and i32 %n.rnd.up, -4
387  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
388  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
389  br label %vector.body
390
391vector.body:                                      ; preds = %vector.body, %vector.ph
392  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
393  %0 = getelementptr inbounds float, ptr %x, i32 %index
394  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
395  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
396  %2 = getelementptr inbounds float, ptr %y, i32 %index
397  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; splat(a) - (y * x), as separate fast-math multiply and subtract.
398  %3 = fmul fast <4 x float> %wide.masked.load12, %wide.masked.load
399  %4 = fsub fast <4 x float> %broadcast.splat14, %3
400  %5 = getelementptr inbounds float, ptr %z, i32 %index
401  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
402  %index.next = add i32 %index, 4
403  %6 = icmp eq i32 %index.next, %n.vec
404  br i1 %6, label %for.cond.cleanup, label %vector.body
405
406for.cond.cleanup:                                 ; preds = %vector.body, %entry
407  ret void
408}
409
; @fms1 - computes fma(x[i], splat(-a), y[i]) and stores it to z[i]. The scalar
; %a is negated once in the preheader; the splat of the negated value is a
; multiplicand of the llvm.fma intrinsic and the vector loaded from %y is the
; addend.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect the negation to appear as an eor of r12 with
; #-2147483648 (bit 31 only) ahead of the loop, and the loop body to use
; vfma.f32 with that scalar in r12.
410define arm_aapcs_vfpcc void @fms1(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
411; CHECK-LABEL: fms1:
412; CHECK:       @ %bb.0: @ %entry
413; CHECK-NEXT:    .save {r4, lr}
414; CHECK-NEXT:    push {r4, lr}
415; CHECK-NEXT:    cmp r3, #1
416; CHECK-NEXT:    it lt
417; CHECK-NEXT:    poplt {r4, pc}
418; CHECK-NEXT:  .LBB8_1: @ %vector.ph
419; CHECK-NEXT:    vmov r12, s0
420; CHECK-NEXT:    eor r12, r12, #-2147483648
421; CHECK-NEXT:    dlstp.32 lr, r3
422; CHECK-NEXT:  .LBB8_2: @ %vector.body
423; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
424; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
425; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
426; CHECK-NEXT:    vfma.f32 q1, q0, r12
427; CHECK-NEXT:    vstrw.32 q1, [r2], #16
428; CHECK-NEXT:    letp lr, .LBB8_2
429; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
430; CHECK-NEXT:    pop {r4, pc}
431entry:
  ; Only enter the vector loop when %n > 0.
432  %cmp8 = icmp sgt i32 %n, 0
433  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
434
435vector.ph:                                        ; preds = %entry
  ; Negate the scalar once, outside the loop, before it is splatted.
436  %fneg = fneg fast float %a
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
437  %n.rnd.up = add i32 %n, 3
438  %n.vec = and i32 %n.rnd.up, -4
439  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %fneg, i32 0
440  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
441  br label %vector.body
442
443vector.body:                                      ; preds = %vector.body, %vector.ph
444  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
445  %0 = getelementptr inbounds float, ptr %x, i32 %index
446  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
447  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
448  %2 = getelementptr inbounds float, ptr %y, i32 %index
449  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(-a) + y, as a single fma call.
450  %3 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %wide.masked.load12)
451  %4 = getelementptr inbounds float, ptr %z, i32 %index
452  call void @llvm.masked.store.v4f32.p0(<4 x float> %3, ptr %4, i32 4, <4 x i1> %1)
453  %index.next = add i32 %index, 4
454  %5 = icmp eq i32 %index.next, %n.vec
455  br i1 %5, label %for.cond.cleanup, label %vector.body
456
457for.cond.cleanup:                                 ; preds = %vector.body, %entry
458  ret void
459}
460
; @fms2 - computes y[i] - (x[i] * splat(a)) and stores it to z[i], written as
; a fast fmul followed by a fast fsub with the product as the subtrahend.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect the splat to be built with vdup.32 ahead of the
; loop and the loop body to use a vector vfms.f32 that accumulates into the
; register loaded from %y.
461define arm_aapcs_vfpcc void @fms2(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
462; CHECK-LABEL: fms2:
463; CHECK:       @ %bb.0: @ %entry
464; CHECK-NEXT:    .save {r4, lr}
465; CHECK-NEXT:    push {r4, lr}
466; CHECK-NEXT:    cmp r3, #1
467; CHECK-NEXT:    it lt
468; CHECK-NEXT:    poplt {r4, pc}
469; CHECK-NEXT:  .LBB9_1: @ %vector.ph
470; CHECK-NEXT:    vmov r4, s0
471; CHECK-NEXT:    vdup.32 q0, r4
472; CHECK-NEXT:    dlstp.32 lr, r3
473; CHECK-NEXT:  .LBB9_2: @ %vector.body
474; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
475; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
476; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
477; CHECK-NEXT:    vfms.f32 q2, q1, q0
478; CHECK-NEXT:    vstrw.32 q2, [r2], #16
479; CHECK-NEXT:    letp lr, .LBB9_2
480; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
481; CHECK-NEXT:    pop {r4, pc}
482entry:
  ; Only enter the vector loop when %n > 0.
483  %cmp8 = icmp sgt i32 %n, 0
484  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
485
486vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
487  %n.rnd.up = add i32 %n, 3
488  %n.vec = and i32 %n.rnd.up, -4
489  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
490  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
491  br label %vector.body
492
493vector.body:                                      ; preds = %vector.body, %vector.ph
494  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
495  %0 = getelementptr inbounds float, ptr %x, i32 %index
496  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
497  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
498  %2 = getelementptr inbounds float, ptr %y, i32 %index
499  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; y - (x * splat(a)), as separate fast-math multiply and subtract.
500  %3 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat14
501  %4 = fsub fast <4 x float> %wide.masked.load12, %3
502  %5 = getelementptr inbounds float, ptr %z, i32 %index
503  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
504  %index.next = add i32 %index, 4
505  %6 = icmp eq i32 %index.next, %n.vec
506  br i1 %6, label %for.cond.cleanup, label %vector.body
507
508for.cond.cleanup:                                 ; preds = %vector.body, %entry
509  ret void
510}
511
; @fms3 - computes fma(x[i], splat(a), -y[i]) and stores it to z[i]. The vector
; loaded from %y is negated with fneg and used as the addend of the llvm.fma
; intrinsic; the splat of %a is a multiplicand.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect the negation to stay inside the loop as a
; vneg.f32 on the loaded y register, followed by vfma.f32 with the scalar held
; in r12.
512define arm_aapcs_vfpcc void @fms3(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
513; CHECK-LABEL: fms3:
514; CHECK:       @ %bb.0: @ %entry
515; CHECK-NEXT:    .save {r4, lr}
516; CHECK-NEXT:    push {r4, lr}
517; CHECK-NEXT:    cmp r3, #1
518; CHECK-NEXT:    it lt
519; CHECK-NEXT:    poplt {r4, pc}
520; CHECK-NEXT:  .LBB10_1: @ %vector.ph
521; CHECK-NEXT:    vmov r12, s0
522; CHECK-NEXT:    dlstp.32 lr, r3
523; CHECK-NEXT:  .LBB10_2: @ %vector.body
524; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
525; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
526; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
527; CHECK-NEXT:    vneg.f32 q0, q0
528; CHECK-NEXT:    vfma.f32 q0, q1, r12
529; CHECK-NEXT:    vstrw.32 q0, [r2], #16
530; CHECK-NEXT:    letp lr, .LBB10_2
531; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
532; CHECK-NEXT:    pop {r4, pc}
533entry:
  ; Only enter the vector loop when %n > 0.
534  %cmp8 = icmp sgt i32 %n, 0
535  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
536
537vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
538  %n.rnd.up = add i32 %n, 3
539  %n.vec = and i32 %n.rnd.up, -4
540  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %a, i32 0
541  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
542  br label %vector.body
543
544vector.body:                                      ; preds = %vector.body, %vector.ph
545  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
546  %0 = getelementptr inbounds float, ptr %x, i32 %index
547  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
548  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
549  %2 = getelementptr inbounds float, ptr %y, i32 %index
550  %wide.masked.load12 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %2, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(a) + (-y): the negation is applied to the loaded y vector.
551  %3 = fneg fast <4 x float> %wide.masked.load12
552  %4 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %wide.masked.load, <4 x float> %broadcast.splat14, <4 x float> %3)
553  %5 = getelementptr inbounds float, ptr %z, i32 %index
554  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
555  %index.next = add i32 %index, 4
556  %6 = icmp eq i32 %index.next, %n.vec
557  br i1 %6, label %for.cond.cleanup, label %vector.body
558
559for.cond.cleanup:                                 ; preds = %vector.body, %entry
560  ret void
561}
562
; @fms4 - computes (x[i] * splat(a)) - y[i] and stores it to z[i], written as
; a fast fmul followed by a fast fsub with the loaded y vector as the
; subtrahend. The multiply is issued before the load from %y.
; Four float lanes are processed per iteration; the loads and the store are
; predicated by llvm.get.active.lane.mask(%index, %n).
; The assertions below expect a vneg.f32 on the loaded y register inside the
; loop, followed by vfma.f32 with the scalar held in r12.
563define arm_aapcs_vfpcc void @fms4(ptr nocapture readonly %x, ptr nocapture readonly %y, ptr noalias nocapture %z, float %a, i32 %n) {
564; CHECK-LABEL: fms4:
565; CHECK:       @ %bb.0: @ %entry
566; CHECK-NEXT:    .save {r4, lr}
567; CHECK-NEXT:    push {r4, lr}
568; CHECK-NEXT:    cmp r3, #1
569; CHECK-NEXT:    it lt
570; CHECK-NEXT:    poplt {r4, pc}
571; CHECK-NEXT:  .LBB11_1: @ %vector.ph
572; CHECK-NEXT:    vmov r12, s0
573; CHECK-NEXT:    dlstp.32 lr, r3
574; CHECK-NEXT:  .LBB11_2: @ %vector.body
575; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
576; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
577; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
578; CHECK-NEXT:    vneg.f32 q0, q0
579; CHECK-NEXT:    vfma.f32 q0, q1, r12
580; CHECK-NEXT:    vstrw.32 q0, [r2], #16
581; CHECK-NEXT:    letp lr, .LBB11_2
582; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
583; CHECK-NEXT:    pop {r4, pc}
584entry:
  ; Only enter the vector loop when %n > 0.
585  %cmp8 = icmp sgt i32 %n, 0
586  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
587
588vector.ph:                                        ; preds = %entry
  ; %n.vec is %n rounded up to a multiple of 4; it is the loop exit bound.
589  %n.rnd.up = add i32 %n, 3
590  %n.vec = and i32 %n.rnd.up, -4
591  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %a, i32 0
592  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
593  br label %vector.body
594
595vector.body:                                      ; preds = %vector.body, %vector.ph
596  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
597  %0 = getelementptr inbounds float, ptr %x, i32 %index
598  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
599  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %0, i32 4, <4 x i1> %1, <4 x float> undef)
  ; x * splat(a); the y operand is loaded afterwards and subtracted below.
600  %2 = fmul fast <4 x float> %wide.masked.load, %broadcast.splat13
601  %3 = getelementptr inbounds float, ptr %y, i32 %index
602  %wide.masked.load14 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %3, i32 4, <4 x i1> %1, <4 x float> undef)
603  %4 = fsub fast <4 x float> %2, %wide.masked.load14
604  %5 = getelementptr inbounds float, ptr %z, i32 %index
605  call void @llvm.masked.store.v4f32.p0(<4 x float> %4, ptr %5, i32 4, <4 x i1> %1)
606  %index.next = add i32 %index, 4
607  %6 = icmp eq i32 %index.next, %n.vec
608  br i1 %6, label %for.cond.cleanup, label %vector.body
609
610for.cond.cleanup:                                 ; preds = %vector.body, %entry
611  ret void
612}
613
; Intrinsics called by the test functions in this file: the masked <4 x float>
; load and store, the <4 x float> fma, and the <4 x i1> active-lane mask that
; predicates each loop iteration.
614declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
615declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
616declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
617declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
618