xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll (revision e0ed0333f0fed2e73f805afd58b61176a87aa3ad)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
3
; test_fadd: for each group of four floats, store A[i..i+3] + splat(B) to
; C[i..i+3], stepping the index by 4 until it equals %n. The splat is the
; second fadd operand. The expected assembly moves the scalar into a GPR once
; (vmov r3, s0) and uses the vector-by-scalar form vadd.f32 q0, q0, r3, so no
; vdup is emitted.
4define arm_aapcs_vfpcc void @test_fadd(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
5; CHECK-LABEL: test_fadd:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    cmp r2, #1
8; CHECK-NEXT:    it lt
9; CHECK-NEXT:    bxlt lr
10; CHECK-NEXT:  .LBB0_1: @ %vector.ph
11; CHECK-NEXT:    vmov r3, s0
12; CHECK-NEXT:  .LBB0_2: @ %vector.body
13; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
14; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
15; CHECK-NEXT:    subs r2, #4
16; CHECK-NEXT:    vadd.f32 q0, q0, r3
17; CHECK-NEXT:    vstrb.8 q0, [r1], #16
18; CHECK-NEXT:    bne .LBB0_2
19; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
20; CHECK-NEXT:    bx lr
21entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
22  %i = and i32 %n, 7
23  %cmp = icmp eq i32 %i, 0
24  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
25  %cmp18 = icmp sgt i32 %n, 0
26  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
27
28vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %B to all four lanes.
29  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
30  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
31  br label %vector.body
32
33vector.body:                                      ; preds = %vector.body, %vector.ph
34  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
35  %i1 = getelementptr inbounds float, ptr %A, i32 %index
36  %wide.load = load <4 x float>, ptr %i1, align 4
37  %i3 = fadd fast <4 x float> %wide.load, %broadcast.splat11
38  %i4 = getelementptr inbounds float, ptr %C, i32 %index
39  store <4 x float> %i3, ptr %i4, align 4
40  %index.next = add i32 %index, 4
41  %i6 = icmp eq i32 %index.next, %n
42  br i1 %i6, label %for.cond.cleanup, label %vector.body
43
44for.cond.cleanup:                                 ; preds = %vector.body, %entry
45  ret void
46}
47
; test_fadd_r: same loop as a splat-plus-vector add, but with the fadd operands
; reversed: the splat of %B is the FIRST operand and the loaded vector the
; second. The expected assembly is still the vector-by-scalar form
; vadd.f32 q0, q0, r3 with no vdup.
48define arm_aapcs_vfpcc void @test_fadd_r(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
49; CHECK-LABEL: test_fadd_r:
50; CHECK:       @ %bb.0: @ %entry
51; CHECK-NEXT:    cmp r2, #1
52; CHECK-NEXT:    it lt
53; CHECK-NEXT:    bxlt lr
54; CHECK-NEXT:  .LBB1_1: @ %vector.ph
55; CHECK-NEXT:    vmov r3, s0
56; CHECK-NEXT:  .LBB1_2: @ %vector.body
57; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
58; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
59; CHECK-NEXT:    subs r2, #4
60; CHECK-NEXT:    vadd.f32 q0, q0, r3
61; CHECK-NEXT:    vstrb.8 q0, [r1], #16
62; CHECK-NEXT:    bne .LBB1_2
63; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
64; CHECK-NEXT:    bx lr
65entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
66  %i = and i32 %n, 7
67  %cmp = icmp eq i32 %i, 0
68  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
69  %cmp18 = icmp sgt i32 %n, 0
70  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
71
72vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %B to all four lanes.
73  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
74  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
75  br label %vector.body
76
77vector.body:                                      ; preds = %vector.body, %vector.ph
78  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
79  %i1 = getelementptr inbounds float, ptr %A, i32 %index
80  %wide.load = load <4 x float>, ptr %i1, align 4
81  %i3 = fadd fast <4 x float> %broadcast.splat11, %wide.load
82  %i4 = getelementptr inbounds float, ptr %C, i32 %index
83  store <4 x float> %i3, ptr %i4, align 4
84  %index.next = add i32 %index, 4
85  %i6 = icmp eq i32 %index.next, %n
86  br i1 %i6, label %for.cond.cleanup, label %vector.body
87
88for.cond.cleanup:                                 ; preds = %vector.body, %entry
89  ret void
90}
91
; test_fmul: for each group of four floats, store A[i..i+3] * splat(B) to
; C[i..i+3], stepping the index by 4 until it equals %n. The splat is the
; second fmul operand. The expected assembly uses the vector-by-scalar form
; vmul.f32 q0, q0, r3 after a single vmov r3, s0; no vdup is emitted.
92define arm_aapcs_vfpcc void @test_fmul(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
93; CHECK-LABEL: test_fmul:
94; CHECK:       @ %bb.0: @ %entry
95; CHECK-NEXT:    cmp r2, #1
96; CHECK-NEXT:    it lt
97; CHECK-NEXT:    bxlt lr
98; CHECK-NEXT:  .LBB2_1: @ %vector.ph
99; CHECK-NEXT:    vmov r3, s0
100; CHECK-NEXT:  .LBB2_2: @ %vector.body
101; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
102; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
103; CHECK-NEXT:    subs r2, #4
104; CHECK-NEXT:    vmul.f32 q0, q0, r3
105; CHECK-NEXT:    vstrb.8 q0, [r1], #16
106; CHECK-NEXT:    bne .LBB2_2
107; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
108; CHECK-NEXT:    bx lr
109entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
110  %i = and i32 %n, 7
111  %cmp = icmp eq i32 %i, 0
112  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
113  %cmp18 = icmp sgt i32 %n, 0
114  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
115
116vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %B to all four lanes.
117  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
118  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
119  br label %vector.body
120
121vector.body:                                      ; preds = %vector.body, %vector.ph
122  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
123  %i1 = getelementptr inbounds float, ptr %A, i32 %index
124  %wide.load = load <4 x float>, ptr %i1, align 4
125  %i3 = fmul fast <4 x float> %wide.load, %broadcast.splat11
126  %i4 = getelementptr inbounds float, ptr %C, i32 %index
127  store <4 x float> %i3, ptr %i4, align 4
128  %index.next = add i32 %index, 4
129  %i6 = icmp eq i32 %index.next, %n
130  br i1 %i6, label %for.cond.cleanup, label %vector.body
131
132for.cond.cleanup:                                 ; preds = %vector.body, %entry
133  ret void
134}
135
; test_fmul_r: splat-times-vector multiply with the fmul operands reversed:
; the splat of %B is the FIRST operand and the loaded vector the second. The
; expected assembly is still the vector-by-scalar form vmul.f32 q0, q0, r3
; with no vdup.
136define arm_aapcs_vfpcc void @test_fmul_r(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
137; CHECK-LABEL: test_fmul_r:
138; CHECK:       @ %bb.0: @ %entry
139; CHECK-NEXT:    cmp r2, #1
140; CHECK-NEXT:    it lt
141; CHECK-NEXT:    bxlt lr
142; CHECK-NEXT:  .LBB3_1: @ %vector.ph
143; CHECK-NEXT:    vmov r3, s0
144; CHECK-NEXT:  .LBB3_2: @ %vector.body
145; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
146; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
147; CHECK-NEXT:    subs r2, #4
148; CHECK-NEXT:    vmul.f32 q0, q0, r3
149; CHECK-NEXT:    vstrb.8 q0, [r1], #16
150; CHECK-NEXT:    bne .LBB3_2
151; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
152; CHECK-NEXT:    bx lr
153entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
154  %i = and i32 %n, 7
155  %cmp = icmp eq i32 %i, 0
156  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
157  %cmp18 = icmp sgt i32 %n, 0
158  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
159
160vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %B to all four lanes.
161  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
162  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
163  br label %vector.body
164
165vector.body:                                      ; preds = %vector.body, %vector.ph
166  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
167  %i1 = getelementptr inbounds float, ptr %A, i32 %index
168  %wide.load = load <4 x float>, ptr %i1, align 4
169  %i3 = fmul fast <4 x float> %broadcast.splat11, %wide.load
170  %i4 = getelementptr inbounds float, ptr %C, i32 %index
171  store <4 x float> %i3, ptr %i4, align 4
172  %index.next = add i32 %index, 4
173  %i6 = icmp eq i32 %index.next, %n
174  br i1 %i6, label %for.cond.cleanup, label %vector.body
175
176for.cond.cleanup:                                 ; preds = %vector.body, %entry
177  ret void
178}
179
; test_fsub: for each group of four floats, store A[i..i+3] - splat(B) to
; C[i..i+3], stepping the index by 4 until it equals %n. The splat is the
; subtrahend (second fsub operand). The expected assembly uses the
; vector-by-scalar form vsub.f32 q0, q0, r3 after a single vmov r3, s0.
180define arm_aapcs_vfpcc void @test_fsub(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
181; CHECK-LABEL: test_fsub:
182; CHECK:       @ %bb.0: @ %entry
183; CHECK-NEXT:    cmp r2, #1
184; CHECK-NEXT:    it lt
185; CHECK-NEXT:    bxlt lr
186; CHECK-NEXT:  .LBB4_1: @ %vector.ph
187; CHECK-NEXT:    vmov r3, s0
188; CHECK-NEXT:  .LBB4_2: @ %vector.body
189; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
190; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
191; CHECK-NEXT:    subs r2, #4
192; CHECK-NEXT:    vsub.f32 q0, q0, r3
193; CHECK-NEXT:    vstrb.8 q0, [r1], #16
194; CHECK-NEXT:    bne .LBB4_2
195; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
196; CHECK-NEXT:    bx lr
197entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
198  %i = and i32 %n, 7
199  %cmp = icmp eq i32 %i, 0
200  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
201  %cmp18 = icmp sgt i32 %n, 0
202  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
203
204vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %B to all four lanes.
205  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
206  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
207  br label %vector.body
208
209vector.body:                                      ; preds = %vector.body, %vector.ph
210  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
211  %i1 = getelementptr inbounds float, ptr %A, i32 %index
212  %wide.load = load <4 x float>, ptr %i1, align 4
213  %i3 = fsub fast <4 x float> %wide.load, %broadcast.splat11
214  %i4 = getelementptr inbounds float, ptr %C, i32 %index
215  store <4 x float> %i3, ptr %i4, align 4
216  %index.next = add i32 %index, 4
217  %i6 = icmp eq i32 %index.next, %n
218  br i1 %i6, label %for.cond.cleanup, label %vector.body
219
220for.cond.cleanup:                                 ; preds = %vector.body, %entry
221  ret void
222}
223
; test_fsub_r: reversed subtraction, splat(B) - A[i..i+3], stored to
; C[i..i+3]. Because the splat is the minuend (first fsub operand), the
; expected assembly does NOT use a scalar-operand instruction: the splat is
; built once before the loop (vmov r3, s0 ; vdup.32 q0, r3) and the loop uses
; the vector-vector form vsub.f32 q1, q0, q1.
224define arm_aapcs_vfpcc void @test_fsub_r(ptr noalias nocapture readonly %A, float %B, ptr noalias nocapture %C, i32 %n) {
225; CHECK-LABEL: test_fsub_r:
226; CHECK:       @ %bb.0: @ %entry
227; CHECK-NEXT:    cmp r2, #1
228; CHECK-NEXT:    it lt
229; CHECK-NEXT:    bxlt lr
230; CHECK-NEXT:  .LBB5_1: @ %vector.ph
231; CHECK-NEXT:    vmov r3, s0
232; CHECK-NEXT:    vdup.32 q0, r3
233; CHECK-NEXT:  .LBB5_2: @ %vector.body
234; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
235; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
236; CHECK-NEXT:    subs r2, #4
237; CHECK-NEXT:    vsub.f32 q1, q0, q1
238; CHECK-NEXT:    vstrb.8 q1, [r1], #16
239; CHECK-NEXT:    bne .LBB5_2
240; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
241; CHECK-NEXT:    bx lr
242entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
243  %i = and i32 %n, 7
244  %cmp = icmp eq i32 %i, 0
245  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
246  %cmp18 = icmp sgt i32 %n, 0
247  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
248
249vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %B to all four lanes.
250  %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
251  %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
252  br label %vector.body
253
254vector.body:                                      ; preds = %vector.body, %vector.ph
255  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
256  %i1 = getelementptr inbounds float, ptr %A, i32 %index
257  %wide.load = load <4 x float>, ptr %i1, align 4
258  %i3 = fsub fast <4 x float> %broadcast.splat11, %wide.load
259  %i4 = getelementptr inbounds float, ptr %C, i32 %index
260  store <4 x float> %i3, ptr %i4, align 4
261  %index.next = add i32 %index, 4
262  %i6 = icmp eq i32 %index.next, %n
263  br i1 %i6, label %for.cond.cleanup, label %vector.body
264
265for.cond.cleanup:                                 ; preds = %vector.body, %entry
266  ret void
267}
268
268
269
; test_fmas: for each group of four floats, store B[i..i+3] * A[i..i+3] +
; splat(C) to D[i..i+3]. The splat is the addend and appears as the second
; fadd operand. The expected assembly folds the multiply and the scalar add
; into vfmas.f32 q1, q0, r12, with the scalar moved to a GPR once
; (vmov r12, s0) and no vdup.
270define arm_aapcs_vfpcc void @test_fmas(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
271; CHECK-LABEL: test_fmas:
272; CHECK:       @ %bb.0: @ %entry
273; CHECK-NEXT:    cmp r3, #1
274; CHECK-NEXT:    it lt
275; CHECK-NEXT:    bxlt lr
276; CHECK-NEXT:  .LBB6_1: @ %vector.ph
277; CHECK-NEXT:    vmov r12, s0
278; CHECK-NEXT:  .LBB6_2: @ %vector.body
279; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
280; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
281; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
282; CHECK-NEXT:    subs r3, #4
283; CHECK-NEXT:    vfmas.f32 q1, q0, r12
284; CHECK-NEXT:    vstrb.8 q1, [r2], #16
285; CHECK-NEXT:    bne .LBB6_2
286; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
287; CHECK-NEXT:    bx lr
288entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
289  %i = and i32 %n, 7
290  %cmp = icmp eq i32 %i, 0
291  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
292  %cmp110 = icmp sgt i32 %n, 0
293  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
294
295vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %C to all four lanes.
296  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
297  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
298  br label %vector.body
299
300vector.body:                                      ; preds = %vector.body, %vector.ph
301  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
302  %i1 = getelementptr inbounds float, ptr %A, i32 %index
303  %wide.load = load <4 x float>, ptr %i1, align 4
304  %i3 = getelementptr inbounds float, ptr %B, i32 %index
305  %wide.load12 = load <4 x float>, ptr %i3, align 4
306  %i5 = fmul fast <4 x float> %wide.load12, %wide.load
307  %i6 = fadd fast <4 x float> %i5, %broadcast.splat14
308  %i7 = getelementptr inbounds float, ptr %D, i32 %index
309  store <4 x float> %i6, ptr %i7, align 4
310  %index.next = add i32 %index, 4
311  %i9 = icmp eq i32 %index.next, %n
312  br i1 %i9, label %for.cond.cleanup, label %vector.body
313
314for.cond.cleanup:                                 ; preds = %vector.body, %entry
315  ret void
316}
317
; test_fmas_r: multiply-then-add-scalar with the fadd operands reversed: the
; splat of %C is the FIRST fadd operand and the product B * A the second. The
; expected assembly is still vfmas.f32 q1, q0, r12 with no vdup.
318define arm_aapcs_vfpcc void @test_fmas_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
319; CHECK-LABEL: test_fmas_r:
320; CHECK:       @ %bb.0: @ %entry
321; CHECK-NEXT:    cmp r3, #1
322; CHECK-NEXT:    it lt
323; CHECK-NEXT:    bxlt lr
324; CHECK-NEXT:  .LBB7_1: @ %vector.ph
325; CHECK-NEXT:    vmov r12, s0
326; CHECK-NEXT:  .LBB7_2: @ %vector.body
327; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
328; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
329; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
330; CHECK-NEXT:    subs r3, #4
331; CHECK-NEXT:    vfmas.f32 q1, q0, r12
332; CHECK-NEXT:    vstrb.8 q1, [r2], #16
333; CHECK-NEXT:    bne .LBB7_2
334; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
335; CHECK-NEXT:    bx lr
336entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
337  %i = and i32 %n, 7
338  %cmp = icmp eq i32 %i, 0
339  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
340  %cmp110 = icmp sgt i32 %n, 0
341  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
342
343vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %C to all four lanes.
344  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
345  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
346  br label %vector.body
347
348vector.body:                                      ; preds = %vector.body, %vector.ph
349  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
350  %i1 = getelementptr inbounds float, ptr %A, i32 %index
351  %wide.load = load <4 x float>, ptr %i1, align 4
352  %i3 = getelementptr inbounds float, ptr %B, i32 %index
353  %wide.load12 = load <4 x float>, ptr %i3, align 4
354  %i5 = fmul fast <4 x float> %wide.load12, %wide.load
355  %i6 = fadd fast <4 x float> %broadcast.splat14, %i5
356  %i7 = getelementptr inbounds float, ptr %D, i32 %index
357  store <4 x float> %i6, ptr %i7, align 4
358  %index.next = add i32 %index, 4
359  %i9 = icmp eq i32 %index.next, %n
360  br i1 %i9, label %for.cond.cleanup, label %vector.body
361
362for.cond.cleanup:                                 ; preds = %vector.body, %entry
363  ret void
364}
365
; test_fma: for each group of four floats, store A[i..i+3] * splat(C) +
; B[i..i+3] to D[i..i+3]. Here the splat is a multiplicand (second fmul
; operand) and the loaded B vector is the addend. The expected assembly
; accumulates into the B vector with the scalar-operand form
; vfma.f32 q1, q0, r12; no vdup is emitted.
366define arm_aapcs_vfpcc void @test_fma(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
367; CHECK-LABEL: test_fma:
368; CHECK:       @ %bb.0: @ %entry
369; CHECK-NEXT:    cmp r3, #1
370; CHECK-NEXT:    it lt
371; CHECK-NEXT:    bxlt lr
372; CHECK-NEXT:  .LBB8_1: @ %vector.ph
373; CHECK-NEXT:    vmov r12, s0
374; CHECK-NEXT:  .LBB8_2: @ %vector.body
375; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
376; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
377; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
378; CHECK-NEXT:    subs r3, #4
379; CHECK-NEXT:    vfma.f32 q1, q0, r12
380; CHECK-NEXT:    vstrb.8 q1, [r2], #16
381; CHECK-NEXT:    bne .LBB8_2
382; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
383; CHECK-NEXT:    bx lr
384entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
385  %i = and i32 %n, 7
386  %cmp = icmp eq i32 %i, 0
387  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
388  %cmp110 = icmp sgt i32 %n, 0
389  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
390
391vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %C to all four lanes.
392  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
393  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
394  br label %vector.body
395
396vector.body:                                      ; preds = %vector.body, %vector.ph
397  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
398  %i1 = getelementptr inbounds float, ptr %A, i32 %index
399  %wide.load = load <4 x float>, ptr %i1, align 4
400  %i3 = fmul fast <4 x float> %wide.load, %broadcast.splat13
401  %i4 = getelementptr inbounds float, ptr %B, i32 %index
402  %wide.load14 = load <4 x float>, ptr %i4, align 4
403  %i6 = fadd fast <4 x float> %i3, %wide.load14
404  %i7 = getelementptr inbounds float, ptr %D, i32 %index
405  store <4 x float> %i6, ptr %i7, align 4
406  %index.next = add i32 %index, 4
407  %i9 = icmp eq i32 %index.next, %n
408  br i1 %i9, label %for.cond.cleanup, label %vector.body
409
410for.cond.cleanup:                                 ; preds = %vector.body, %entry
411  ret void
412}
413
; test_fma_r: multiply-by-scalar-then-accumulate with the fmul operands
; reversed: the splat of %C is the FIRST fmul operand and the loaded A vector
; the second. The expected assembly is still vfma.f32 q1, q0, r12 with no
; vdup.
414define arm_aapcs_vfpcc void @test_fma_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
415; CHECK-LABEL: test_fma_r:
416; CHECK:       @ %bb.0: @ %entry
417; CHECK-NEXT:    cmp r3, #1
418; CHECK-NEXT:    it lt
419; CHECK-NEXT:    bxlt lr
420; CHECK-NEXT:  .LBB9_1: @ %vector.ph
421; CHECK-NEXT:    vmov r12, s0
422; CHECK-NEXT:  .LBB9_2: @ %vector.body
423; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
424; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
425; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
426; CHECK-NEXT:    subs r3, #4
427; CHECK-NEXT:    vfma.f32 q1, q0, r12
428; CHECK-NEXT:    vstrb.8 q1, [r2], #16
429; CHECK-NEXT:    bne .LBB9_2
430; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
431; CHECK-NEXT:    bx lr
432entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
433  %i = and i32 %n, 7
434  %cmp = icmp eq i32 %i, 0
435  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
436  %cmp110 = icmp sgt i32 %n, 0
437  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
438
439vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %C to all four lanes.
440  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
441  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
442  br label %vector.body
443
444vector.body:                                      ; preds = %vector.body, %vector.ph
445  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
446  %i1 = getelementptr inbounds float, ptr %A, i32 %index
447  %wide.load = load <4 x float>, ptr %i1, align 4
448  %i3 = fmul fast <4 x float> %broadcast.splat13, %wide.load
449  %i4 = getelementptr inbounds float, ptr %B, i32 %index
450  %wide.load14 = load <4 x float>, ptr %i4, align 4
451  %i6 = fadd fast <4 x float> %i3, %wide.load14
452  %i7 = getelementptr inbounds float, ptr %D, i32 %index
453  store <4 x float> %i6, ptr %i7, align 4
454  %index.next = add i32 %index, 4
455  %i9 = icmp eq i32 %index.next, %n
456  br i1 %i9, label %for.cond.cleanup, label %vector.body
457
458for.cond.cleanup:                                 ; preds = %vector.body, %entry
459  ret void
460}
461
462
; test_fmss: for each group of four floats, store B[i..i+3] * A[i..i+3] -
; splat(C) to D[i..i+3]. The splat is the subtrahend. The expected assembly
; does not use a scalar-operand instruction here: the splat is built and
; negated once before the loop (vdup.32 q0, r12 ; vneg.f32 q0, q0), and each
; iteration copies it (vmov q3, q0) and accumulates the product with the
; vector-vector form vfma.f32 q3, q2, q1.
463define arm_aapcs_vfpcc void @test_fmss(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
464; CHECK-LABEL: test_fmss:
465; CHECK:       @ %bb.0: @ %entry
466; CHECK-NEXT:    cmp r3, #1
467; CHECK-NEXT:    it lt
468; CHECK-NEXT:    bxlt lr
469; CHECK-NEXT:  .LBB10_1: @ %vector.ph
470; CHECK-NEXT:    vmov r12, s0
471; CHECK-NEXT:    vdup.32 q0, r12
472; CHECK-NEXT:    vneg.f32 q0, q0
473; CHECK-NEXT:  .LBB10_2: @ %vector.body
474; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
475; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
476; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
477; CHECK-NEXT:    vmov q3, q0
478; CHECK-NEXT:    subs r3, #4
479; CHECK-NEXT:    vfma.f32 q3, q2, q1
480; CHECK-NEXT:    vstrb.8 q3, [r2], #16
481; CHECK-NEXT:    bne .LBB10_2
482; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
483; CHECK-NEXT:    bx lr
484entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
485  %i = and i32 %n, 7
486  %cmp = icmp eq i32 %i, 0
487  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
488  %cmp110 = icmp sgt i32 %n, 0
489  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
490
491vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %C to all four lanes.
492  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
493  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
494  br label %vector.body
495
496vector.body:                                      ; preds = %vector.body, %vector.ph
497  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
498  %i1 = getelementptr inbounds float, ptr %A, i32 %index
499  %wide.load = load <4 x float>, ptr %i1, align 4
500  %i3 = getelementptr inbounds float, ptr %B, i32 %index
501  %wide.load12 = load <4 x float>, ptr %i3, align 4
502  %i5 = fmul fast <4 x float> %wide.load12, %wide.load
503  %i6 = fsub fast <4 x float> %i5, %broadcast.splat14
504  %i7 = getelementptr inbounds float, ptr %D, i32 %index
505  store <4 x float> %i6, ptr %i7, align 4
506  %index.next = add i32 %index, 4
507  %i9 = icmp eq i32 %index.next, %n
508  br i1 %i9, label %for.cond.cleanup, label %vector.body
509
510for.cond.cleanup:                                 ; preds = %vector.body, %entry
511  ret void
512}
513
; test_fmss_r: for each group of four floats, store splat(C) - B[i..i+3] *
; A[i..i+3] to D[i..i+3]. The splat is the minuend. The expected assembly
; builds the splat once before the loop (vdup.32 q0, r12); each iteration
; copies it (vmov q3, q0) and subtracts the product with the vector-vector
; form vfms.f32 q3, q2, q1.
514define arm_aapcs_vfpcc void @test_fmss_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
515; CHECK-LABEL: test_fmss_r:
516; CHECK:       @ %bb.0: @ %entry
517; CHECK-NEXT:    cmp r3, #1
518; CHECK-NEXT:    it lt
519; CHECK-NEXT:    bxlt lr
520; CHECK-NEXT:  .LBB11_1: @ %vector.ph
521; CHECK-NEXT:    vmov r12, s0
522; CHECK-NEXT:    vdup.32 q0, r12
523; CHECK-NEXT:  .LBB11_2: @ %vector.body
524; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
525; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
526; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
527; CHECK-NEXT:    vmov q3, q0
528; CHECK-NEXT:    subs r3, #4
529; CHECK-NEXT:    vfms.f32 q3, q2, q1
530; CHECK-NEXT:    vstrb.8 q3, [r2], #16
531; CHECK-NEXT:    bne .LBB11_2
532; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
533; CHECK-NEXT:    bx lr
534entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
535  %i = and i32 %n, 7
536  %cmp = icmp eq i32 %i, 0
537  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
538  %cmp110 = icmp sgt i32 %n, 0
539  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
540
541vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %C to all four lanes.
542  %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
543  %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
544  br label %vector.body
545
546vector.body:                                      ; preds = %vector.body, %vector.ph
547  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
548  %i1 = getelementptr inbounds float, ptr %A, i32 %index
549  %wide.load = load <4 x float>, ptr %i1, align 4
550  %i3 = getelementptr inbounds float, ptr %B, i32 %index
551  %wide.load12 = load <4 x float>, ptr %i3, align 4
552  %i5 = fmul fast <4 x float> %wide.load12, %wide.load
553  %i6 = fsub fast <4 x float> %broadcast.splat14, %i5
554  %i7 = getelementptr inbounds float, ptr %D, i32 %index
555  store <4 x float> %i6, ptr %i7, align 4
556  %index.next = add i32 %index, 4
557  %i9 = icmp eq i32 %index.next, %n
558  br i1 %i9, label %for.cond.cleanup, label %vector.body
559
560for.cond.cleanup:                                 ; preds = %vector.body, %entry
561  ret void
562}
563
; test_fms: for each group of four floats, store A[i..i+3] * splat(C) -
; B[i..i+3] to D[i..i+3]. The splat is a multiplicand (second fmul operand)
; and the loaded B vector is subtracted. The expected assembly negates the
; loaded B vector inside the loop (vneg.f32 q0, q0) and then accumulates with
; the scalar-operand form vfma.f32 q0, q1, r12; no vdup is emitted.
564define arm_aapcs_vfpcc void @test_fms(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
565; CHECK-LABEL: test_fms:
566; CHECK:       @ %bb.0: @ %entry
567; CHECK-NEXT:    cmp r3, #1
568; CHECK-NEXT:    it lt
569; CHECK-NEXT:    bxlt lr
570; CHECK-NEXT:  .LBB12_1: @ %vector.ph
571; CHECK-NEXT:    vmov r12, s0
572; CHECK-NEXT:  .LBB12_2: @ %vector.body
573; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
574; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
575; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
576; CHECK-NEXT:    subs r3, #4
577; CHECK-NEXT:    vneg.f32 q0, q0
578; CHECK-NEXT:    vfma.f32 q0, q1, r12
579; CHECK-NEXT:    vstrb.8 q0, [r2], #16
580; CHECK-NEXT:    bne .LBB12_2
581; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
582; CHECK-NEXT:    bx lr
583entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
584  %i = and i32 %n, 7
585  %cmp = icmp eq i32 %i, 0
586  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
587  %cmp110 = icmp sgt i32 %n, 0
588  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
589
590vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %C to all four lanes.
591  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
592  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
593  br label %vector.body
594
595vector.body:                                      ; preds = %vector.body, %vector.ph
596  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
597  %i1 = getelementptr inbounds float, ptr %A, i32 %index
598  %wide.load = load <4 x float>, ptr %i1, align 4
599  %i3 = fmul fast <4 x float> %wide.load, %broadcast.splat13
600  %i4 = getelementptr inbounds float, ptr %B, i32 %index
601  %wide.load14 = load <4 x float>, ptr %i4, align 4
602  %i6 = fsub fast <4 x float> %i3, %wide.load14
603  %i7 = getelementptr inbounds float, ptr %D, i32 %index
604  store <4 x float> %i6, ptr %i7, align 4
605  %index.next = add i32 %index, 4
606  %i9 = icmp eq i32 %index.next, %n
607  br i1 %i9, label %for.cond.cleanup, label %vector.body
608
609for.cond.cleanup:                                 ; preds = %vector.body, %entry
610  ret void
611}
612
; test_fms_r: multiply-by-scalar-then-subtract-vector with the fmul operands
; reversed: the splat of %C is the FIRST fmul operand and the loaded A vector
; the second. The expected assembly is unchanged by the swap: vneg.f32 q0, q0
; on the loaded B vector followed by vfma.f32 q0, q1, r12.
613define arm_aapcs_vfpcc void @test_fms_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, float %C, ptr noalias nocapture %D, i32 %n) {
614; CHECK-LABEL: test_fms_r:
615; CHECK:       @ %bb.0: @ %entry
616; CHECK-NEXT:    cmp r3, #1
617; CHECK-NEXT:    it lt
618; CHECK-NEXT:    bxlt lr
619; CHECK-NEXT:  .LBB13_1: @ %vector.ph
620; CHECK-NEXT:    vmov r12, s0
621; CHECK-NEXT:  .LBB13_2: @ %vector.body
622; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
623; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
624; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
625; CHECK-NEXT:    subs r3, #4
626; CHECK-NEXT:    vneg.f32 q0, q0
627; CHECK-NEXT:    vfma.f32 q0, q1, r12
628; CHECK-NEXT:    vstrb.8 q0, [r2], #16
629; CHECK-NEXT:    bne .LBB13_2
630; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
631; CHECK-NEXT:    bx lr
632entry:
  ; Tell the optimiser that the low three bits of %n are zero (n % 8 == 0).
633  %i = and i32 %n, 7
634  %cmp = icmp eq i32 %i, 0
635  tail call void @llvm.assume(i1 %cmp)
  ; Skip the loop entirely unless %n > 0.
636  %cmp110 = icmp sgt i32 %n, 0
637  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
638
639vector.ph:                                        ; preds = %entry
  ; Broadcast the scalar %C to all four lanes.
640  %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
641  %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
642  br label %vector.body
643
644vector.body:                                      ; preds = %vector.body, %vector.ph
645  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
646  %i1 = getelementptr inbounds float, ptr %A, i32 %index
647  %wide.load = load <4 x float>, ptr %i1, align 4
648  %i3 = fmul fast <4 x float> %broadcast.splat13, %wide.load
649  %i4 = getelementptr inbounds float, ptr %B, i32 %index
650  %wide.load14 = load <4 x float>, ptr %i4, align 4
651  %i6 = fsub fast <4 x float> %i3, %wide.load14
652  %i7 = getelementptr inbounds float, ptr %D, i32 %index
653  store <4 x float> %i6, ptr %i7, align 4
654  %index.next = add i32 %index, 4
655  %i9 = icmp eq i32 %index.next, %n
656  br i1 %i9, label %for.cond.cleanup, label %vector.body
657
658for.cond.cleanup:                                 ; preds = %vector.body, %entry
659  ret void
660}
661
662
; test_nested: two-level loop nest. The outer loop runs %numRows times; each
; outer iteration loads one float through the current %pOutT1 pointer, splats
; it, and runs an inner vector loop over %numCols floats that updates the
; current %pInT1 row in place:
;   row[j..j+3] = row[j..j+3] - pPRT_in_row[j..j+3] * splat
; After the inner loop both row pointers advance by %numCols floats and the
; %pOutT1 pointer by one float. %pPRT_pDst is not referenced in the body, and
; %l is used only in an llvm.assume.
; The expected assembly performs the splat once per outer iteration
; (vdup.32 q0, r4), uses the vector-vector form vfms.f32 q2, q1, q0 in the
; inner loop, and closes the outer loop with `le lr, .LBB14_1`.
663define dso_local void @test_nested(ptr noalias nocapture %pInT1, ptr noalias nocapture readonly %pOutT1, ptr noalias nocapture readonly %pPRT_in, ptr noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l) local_unnamed_addr {
664; CHECK-LABEL: test_nested:
665; CHECK:       @ %bb.0: @ %for.body.us.preheader
666; CHECK-NEXT:    .save {r4, r5, r6, lr}
667; CHECK-NEXT:    push {r4, r5, r6, lr}
668; CHECK-NEXT:    ldrd lr, r12, [sp, #16]
669; CHECK-NEXT:    lsl.w r3, r12, #2
670; CHECK-NEXT:  .LBB14_1: @ %for.body.us
671; CHECK-NEXT:    @ =>This Loop Header: Depth=1
672; CHECK-NEXT:    @ Child Loop BB14_2 Depth 2
673; CHECK-NEXT:    ldr r4, [r1]
674; CHECK-NEXT:    mov r5, r2
675; CHECK-NEXT:    mov r6, r12
676; CHECK-NEXT:    vdup.32 q0, r4
677; CHECK-NEXT:    mov r4, r0
678; CHECK-NEXT:  .LBB14_2: @ %vector.body
679; CHECK-NEXT:    @ Parent Loop BB14_1 Depth=1
680; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
681; CHECK-NEXT:    vldrw.u32 q1, [r5], #16
682; CHECK-NEXT:    vldrw.u32 q2, [r4]
683; CHECK-NEXT:    subs r6, #4
684; CHECK-NEXT:    vfms.f32 q2, q1, q0
685; CHECK-NEXT:    vstrb.8 q2, [r4], #16
686; CHECK-NEXT:    bne .LBB14_2
687; CHECK-NEXT:  @ %bb.3: @ %for.cond6.for.end_crit_edge.us
688; CHECK-NEXT:    @ in Loop: Header=BB14_1 Depth=1
689; CHECK-NEXT:    add r0, r3
690; CHECK-NEXT:    add r2, r3
691; CHECK-NEXT:    adds r1, #4
692; CHECK-NEXT:    le lr, .LBB14_1
693; CHECK-NEXT:  @ %bb.4: @ %for.end14
694; CHECK-NEXT:    pop {r4, r5, r6, pc}
695for.body.us.preheader:
  ; Facts handed to the optimiser: numRows > 0, numCols > 0,
  ; numCols % 8 == 0, and l < numCols.
696  %cmp = icmp sgt i32 %numRows, 0
697  tail call void @llvm.assume(i1 %cmp)
698  %cmp1 = icmp sgt i32 %numCols, 0
699  tail call void @llvm.assume(i1 %cmp1)
700  %rem = and i32 %numCols, 7
701  %cmp2 = icmp eq i32 %rem, 0
702  tail call void @llvm.assume(i1 %cmp2)
703  %cmp3 = icmp slt i32 %l, %numCols
704  tail call void @llvm.assume(i1 %cmp3)
705  br label %for.body.us
706
707for.body.us:                                      ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
708  %pInT1.addr.038.us = phi ptr [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
709  %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
710  %pOutT1.addr.036.us = phi ptr [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
711  %pPRT_in.addr.035.us = phi ptr [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
712  %scevgep = getelementptr float, ptr %pPRT_in.addr.035.us, i32 %numCols
  ; Per-row scalar, broadcast to all four lanes for the inner loop.
713  %i = load float, ptr %pOutT1.addr.036.us, align 4
714  %broadcast.splatinsert47 = insertelement <4 x float> undef, float %i, i32 0
715  %broadcast.splat48 = shufflevector <4 x float> %broadcast.splatinsert47, <4 x float> undef, <4 x i32> zeroinitializer
716  br label %vector.body
717
718vector.body:                                      ; preds = %vector.body, %for.body.us
719  %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
720  %next.gep = getelementptr float, ptr %pInT1.addr.038.us, i32 %index
721  %next.gep45 = getelementptr float, ptr %pPRT_in.addr.035.us, i32 %index
722  %wide.load = load <4 x float>, ptr %next.gep, align 4
723  %wide.load46 = load <4 x float>, ptr %next.gep45, align 4
724  %i3 = fmul fast <4 x float> %wide.load46, %broadcast.splat48
725  %i4 = fsub fast <4 x float> %wide.load, %i3
  ; The result is written back to the same address %wide.load was read from.
726  store <4 x float> %i4, ptr %next.gep, align 4
727  %index.next = add i32 %index, 4
728  %i5 = icmp eq i32 %index.next, %numCols
729  br i1 %i5, label %for.cond6.for.end_crit_edge.us, label %vector.body
730
731for.cond6.for.end_crit_edge.us:                   ; preds = %vector.body
732  %incdec.ptr.us = getelementptr inbounds float, ptr %pOutT1.addr.036.us, i32 1
733  %scevgep40 = getelementptr float, ptr %pInT1.addr.038.us, i32 %numCols
734  %inc13.us = add nuw nsw i32 %i.037.us, 1
735  %exitcond41 = icmp eq i32 %inc13.us, %numRows
736  br i1 %exitcond41, label %for.end14, label %for.body.us
737
738for.end14:                                        ; preds = %for.cond6.for.end_crit_edge.us
739  ret void
740}
741
; Instance record read by the two FIR tests that follow: an i16 at field 0 and
; pointers at fields 1 and 2. Those tests name their GEPs into it
; %numTaps3 (field 0), %pState1 (field 1) and %pCoeffs2 (field 2).
742%struct.arm_fir_instance_f32 = type { i16, ptr, ptr }
; FIR kernel test for a tap count of 1 to 4. %S points at a
; %struct.arm_fir_instance_f32; the entry block reads the i16 at field 0
; (zero-extended into %conv, "numTaps" below) and the pointers at field 1
; (state buffer, %i) and field 2 (coefficients, %i1).
;  * if.then .. while.end run only when numTaps - 1 is unsigned-less-than 4.
;    Four coefficients are loaded and splatted, then every while.body
;    iteration copies four floats from %pSrc into the state buffer (starting
;    at index numTaps - 1) and stores four outputs to %pDst, computed from
;    four overlapping vector loads of the state buffer.
;  * if.end .. if.then59 always run. They copy from state + blockSize to the
;    start of the state buffer: numTaps >> 2 full vectors plus a predicated
;    store for numTaps & 3.
; The expected assembly uses wls/le low-overhead loops for both while loops
; and vctp.32 + vpst predicated stores for the two remainders.
743define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr %pDst, i32 %blockSize) {
744; CHECK-LABEL: arm_fir_f32_1_4_mve:
745; CHECK:       @ %bb.0: @ %entry
746; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
747; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
748; CHECK-NEXT:    .pad #8
749; CHECK-NEXT:    sub sp, #8
750; CHECK-NEXT:    ldrh.w r10, [r0]
751; CHECK-NEXT:    mov r11, r1
752; CHECK-NEXT:    ldr.w r12, [r0, #4]
753; CHECK-NEXT:    sub.w r1, r10, #1
754; CHECK-NEXT:    cmp r1, #3
755; CHECK-NEXT:    bhi .LBB15_6
756; CHECK-NEXT:  @ %bb.1: @ %if.then
757; CHECK-NEXT:    ldr r4, [r0, #8]
758; CHECK-NEXT:    ldrd r7, r6, [r4]
759; CHECK-NEXT:    ldrd r5, r8, [r4, #8]
760; CHECK-NEXT:    add.w r4, r12, r1, lsl #2
761; CHECK-NEXT:    lsrs r1, r3, #2
762; CHECK-NEXT:    wls lr, r1, .LBB15_5
763; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
764; CHECK-NEXT:    bic r1, r3, #3
765; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
766; CHECK-NEXT:    add.w r9, r12, #4
767; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
768; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
769; CHECK-NEXT:    mov r1, r11
770; CHECK-NEXT:  .LBB15_3: @ %while.body
771; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
772; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
773; CHECK-NEXT:    vstrb.8 q0, [r4], #16
774; CHECK-NEXT:    vldrw.u32 q0, [r9, #-4]
775; CHECK-NEXT:    vldrw.u32 q1, [r9], #16
776; CHECK-NEXT:    vmul.f32 q0, q0, r7
777; CHECK-NEXT:    vldrw.u32 q2, [r9, #-8]
778; CHECK-NEXT:    vfma.f32 q0, q1, r6
779; CHECK-NEXT:    vldrw.u32 q1, [r9, #-12]
780; CHECK-NEXT:    vfma.f32 q0, q1, r5
781; CHECK-NEXT:    vfma.f32 q0, q2, r8
782; CHECK-NEXT:    vstrb.8 q0, [r2], #16
783; CHECK-NEXT:    le lr, .LBB15_3
784; CHECK-NEXT:  @ %bb.4: @ %while.end.loopexit
785; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
786; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
787; CHECK-NEXT:    add.w r12, r12, r1, lsl #2
788; CHECK-NEXT:    add.w r11, r11, r1, lsl #2
789; CHECK-NEXT:  .LBB15_5: @ %while.end
790; CHECK-NEXT:    and r1, r3, #3
791; CHECK-NEXT:    vldrw.u32 q0, [r11]
792; CHECK-NEXT:    vctp.32 r1
793; CHECK-NEXT:    vpst
794; CHECK-NEXT:    vstrwt.32 q0, [r4]
795; CHECK-NEXT:    vldrw.u32 q0, [r12]
796; CHECK-NEXT:    vldrw.u32 q1, [r12, #4]
797; CHECK-NEXT:    vmul.f32 q0, q0, r7
798; CHECK-NEXT:    vfma.f32 q0, q1, r6
799; CHECK-NEXT:    vldrw.u32 q1, [r12, #8]
800; CHECK-NEXT:    vfma.f32 q0, q1, r5
801; CHECK-NEXT:    vldrw.u32 q1, [r12, #12]
802; CHECK-NEXT:    vfma.f32 q0, q1, r8
803; CHECK-NEXT:    vpst
804; CHECK-NEXT:    vstrwt.32 q0, [r2]
805; CHECK-NEXT:    ldr.w r12, [r0, #4]
806; CHECK-NEXT:  .LBB15_6: @ %if.end
807; CHECK-NEXT:    add.w r0, r12, r3, lsl #2
808; CHECK-NEXT:    lsr.w r1, r10, #2
809; CHECK-NEXT:    wls lr, r1, .LBB15_10
810; CHECK-NEXT:  @ %bb.7: @ %while.body51.preheader
811; CHECK-NEXT:    bic r2, r10, #3
812; CHECK-NEXT:    adds r1, r2, r3
813; CHECK-NEXT:    mov r3, r12
814; CHECK-NEXT:    add.w r1, r12, r1, lsl #2
815; CHECK-NEXT:  .LBB15_8: @ %while.body51
816; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
817; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
818; CHECK-NEXT:    vstrb.8 q0, [r3], #16
819; CHECK-NEXT:    le lr, .LBB15_8
820; CHECK-NEXT:  @ %bb.9: @ %while.end55.loopexit
821; CHECK-NEXT:    add.w r12, r12, r2, lsl #2
822; CHECK-NEXT:    mov r0, r1
823; CHECK-NEXT:  .LBB15_10: @ %while.end55
824; CHECK-NEXT:    ands r1, r10, #3
825; CHECK-NEXT:    beq .LBB15_12
826; CHECK-NEXT:  @ %bb.11: @ %if.then59
827; CHECK-NEXT:    vldrw.u32 q0, [r0]
828; CHECK-NEXT:    vctp.32 r1
829; CHECK-NEXT:    vpst
830; CHECK-NEXT:    vstrwt.32 q0, [r12]
831; CHECK-NEXT:  .LBB15_12: @ %if.end61
832; CHECK-NEXT:    add sp, #8
833; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; Read the three instance fields and take the filter path only when
; numTaps - 1 < 4 (unsigned).
834entry:
835  %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
836  %i = load ptr, ptr %pState1, align 4
837  %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
838  %i1 = load ptr, ptr %pCoeffs2, align 4
839  %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
840  %i2 = load i16, ptr %numTaps3, align 4
841  %conv = zext i16 %i2 to i32
842  %sub = add nsw i32 %conv, -1
843  %cmp = icmp ult i32 %sub, 4
844  br i1 %cmp, label %if.then, label %if.end
845
; Load coefficients 0..3 as scalars and splat each one; all four are loaded
; whatever the tap count is. %shr is the number of full four-sample groups.
846if.then:                                          ; preds = %entry
847  %arrayidx = getelementptr inbounds float, ptr %i, i32 %sub
848  %incdec.ptr = getelementptr inbounds float, ptr %i1, i32 1
849  %i3 = load float, ptr %i1, align 4
850  %incdec.ptr6 = getelementptr inbounds float, ptr %i1, i32 2
851  %i4 = load float, ptr %incdec.ptr, align 4
852  %incdec.ptr7 = getelementptr inbounds float, ptr %i1, i32 3
853  %i5 = load float, ptr %incdec.ptr6, align 4
854  %i6 = load float, ptr %incdec.ptr7, align 4
855  %shr = lshr i32 %blockSize, 2
856  %cmp9146 = icmp eq i32 %shr, 0
857  %.pre161 = insertelement <4 x float> undef, float %i3, i32 0
858  %.pre162 = shufflevector <4 x float> %.pre161, <4 x float> undef, <4 x i32> zeroinitializer
859  %.pre163 = insertelement <4 x float> undef, float %i4, i32 0
860  %.pre164 = shufflevector <4 x float> %.pre163, <4 x float> undef, <4 x i32> zeroinitializer
861  %.pre165 = insertelement <4 x float> undef, float %i5, i32 0
862  %.pre166 = shufflevector <4 x float> %.pre165, <4 x float> undef, <4 x i32> zeroinitializer
863  %.pre167 = insertelement <4 x float> undef, float %i6, i32 0
864  %.pre168 = shufflevector <4 x float> %.pre167, <4 x float> undef, <4 x i32> zeroinitializer
865  br i1 %cmp9146, label %while.end, label %while.body.lr.ph
866
; %i7 is blockSize rounded down to a multiple of four; it is used for the
; pointer values that leave the loop.
867while.body.lr.ph:                                 ; preds = %if.then
868  %i7 = and i32 %blockSize, -4
869  %scevgep158 = getelementptr float, ptr %pDst, i32 %i7
870  br label %while.body
871
; One iteration per group of four samples: copy the input into the state
; buffer, then multiply-accumulate the vector loads at sample offsets 0..3
; with the four splatted coefficients.
872while.body:                                       ; preds = %while.body, %while.body.lr.ph
873  %pStateCur.0151 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
874  %pSamples.0150 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
875  %pOutput.0149 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
876  %pTempSrc.0148 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
877  %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
878  %i9 = load <4 x float>, ptr %pTempSrc.0148, align 4
879  store <4 x float> %i9, ptr %pStateCur.0151, align 4
880  %add.ptr = getelementptr inbounds float, ptr %pStateCur.0151, i32 4
881  %add.ptr11 = getelementptr inbounds float, ptr %pTempSrc.0148, i32 4
882  %i12 = load <4 x float>, ptr %pSamples.0150, align 4
883  %i13 = fmul fast <4 x float> %i12, %.pre162
884  %arrayidx12 = getelementptr inbounds float, ptr %pSamples.0150, i32 1
885  %i15 = load <4 x float>, ptr %arrayidx12, align 4
886  %mul = fmul fast <4 x float> %i15, %.pre164
887  %add = fadd fast <4 x float> %mul, %i13
888  %arrayidx13 = getelementptr inbounds float, ptr %pSamples.0150, i32 2
889  %i17 = load <4 x float>, ptr %arrayidx13, align 4
890  %mul16 = fmul fast <4 x float> %i17, %.pre166
891  %add17 = fadd fast <4 x float> %add, %mul16
892  %arrayidx18 = getelementptr inbounds float, ptr %pSamples.0150, i32 3
893  %i19 = load <4 x float>, ptr %arrayidx18, align 4
894  %mul21 = fmul fast <4 x float> %i19, %.pre168
895  %add22 = fadd fast <4 x float> %add17, %mul21
896  store <4 x float> %add22, ptr %pOutput.0149, align 4
897  %add.ptr23 = getelementptr inbounds float, ptr %pOutput.0149, i32 4
898  %add.ptr24 = getelementptr inbounds float, ptr %pSamples.0150, i32 4
899  %dec = add nsw i32 %blkCnt.0147, -1
900  %cmp9 = icmp eq i32 %dec, 0
901  br i1 %cmp9, label %while.end.loopexit, label %while.body
902
903while.end.loopexit:                               ; preds = %while.body
904  %scevgep157 = getelementptr float, ptr %pSrc, i32 %i7
905  %scevgep159 = getelementptr float, ptr %i, i32 %i7
906  br label %while.end
907
; Remainder: the same computation once more, with both stores masked by
; vctp32(blockSize & 3). No branch skips this block when the remainder is
; zero. The state pointer is reloaded from %S at the end (%.pre).
908while.end:                                        ; preds = %while.end.loopexit, %if.then
909  %pTempSrc.0.lcssa = phi ptr [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ]
910  %pOutput.0.lcssa = phi ptr [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ]
911  %pSamples.0.lcssa = phi ptr [ %scevgep159, %while.end.loopexit ], [ %i, %if.then ]
912  %pStateCur.0.lcssa = phi ptr [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ]
913  %and = and i32 %blockSize, 3
914  %i21 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and)
915  %i23 = load <4 x float>, ptr %pTempSrc.0.lcssa, align 4
916  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %i23, ptr %pStateCur.0.lcssa, i32 4, <4 x i1> %i21)
917  %i26 = load <4 x float>, ptr %pSamples.0.lcssa, align 4
918  %i27 = fmul fast <4 x float> %i26, %.pre162
919  %arrayidx29 = getelementptr inbounds float, ptr %pSamples.0.lcssa, i32 1
920  %i29 = load <4 x float>, ptr %arrayidx29, align 4
921  %mul32 = fmul fast <4 x float> %i29, %.pre164
922  %add33 = fadd fast <4 x float> %mul32, %i27
923  %arrayidx34 = getelementptr inbounds float, ptr %pSamples.0.lcssa, i32 2
924  %i31 = load <4 x float>, ptr %arrayidx34, align 4
925  %mul37 = fmul fast <4 x float> %i31, %.pre166
926  %add38 = fadd fast <4 x float> %add33, %mul37
927  %arrayidx39 = getelementptr inbounds float, ptr %pSamples.0.lcssa, i32 3
928  %i33 = load <4 x float>, ptr %arrayidx39, align 4
929  %mul42 = fmul fast <4 x float> %i33, %.pre168
930  %add43 = fadd fast <4 x float> %add38, %mul42
931  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %add43, ptr %pOutput.0.lcssa, i32 4, <4 x i1> %i21)
932  %.pre = load ptr, ptr %pState1, align 4
933  br label %if.end
934
; State-buffer shift. The source starts at state + blockSize and the
; destination at the state start; %shr47 = numTaps >> 2 full vectors are
; copied by while.body51.
935if.end:                                           ; preds = %while.end, %entry
936  %i35 = phi ptr [ %.pre, %while.end ], [ %i, %entry ]
937  %arrayidx45 = getelementptr inbounds float, ptr %i35, i32 %blockSize
938  %shr47 = lshr i32 %conv, 2
939  %cmp49141 = icmp eq i32 %shr47, 0
940  br i1 %cmp49141, label %while.end55, label %while.body51.preheader
941
; %i36 is numTaps rounded down to a multiple of four (numTaps came from an
; i16, hence the 65532 mask).
942while.body51.preheader:                           ; preds = %if.end
943  %i36 = and i32 %conv, 65532
944  %i37 = add i32 %i36, %blockSize
945  %scevgep = getelementptr float, ptr %i35, i32 %i37
946  br label %while.body51
947
948while.body51:                                     ; preds = %while.body51, %while.body51.preheader
949  %pTempSrc.1144 = phi ptr [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ]
950  %pTempDest.0143 = phi ptr [ %add.ptr53, %while.body51 ], [ %i35, %while.body51.preheader ]
951  %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ]
952  %i39 = load <4 x float>, ptr %pTempSrc.1144, align 4
953  store <4 x float> %i39, ptr %pTempDest.0143, align 4
954  %add.ptr52 = getelementptr inbounds float, ptr %pTempSrc.1144, i32 4
955  %add.ptr53 = getelementptr inbounds float, ptr %pTempDest.0143, i32 4
956  %dec54 = add nsw i32 %blkCnt.1142, -1
957  %cmp49 = icmp eq i32 %dec54, 0
958  br i1 %cmp49, label %while.end55.loopexit, label %while.body51
959
960while.end55.loopexit:                             ; preds = %while.body51
961  %scevgep156 = getelementptr float, ptr %i35, i32 %i36
962  br label %while.end55
963
; Unlike the filter remainder above, the shift remainder is skipped entirely
; when numTaps & 3 is zero.
964while.end55:                                      ; preds = %while.end55.loopexit, %if.end
965  %pTempDest.0.lcssa = phi ptr [ %i35, %if.end ], [ %scevgep156, %while.end55.loopexit ]
966  %pTempSrc.1.lcssa = phi ptr [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ]
967  %and56 = and i32 %conv, 3
968  %cmp57 = icmp eq i32 %and56, 0
969  br i1 %cmp57, label %if.end61, label %if.then59
970
; Copy the last numTaps & 3 state elements with a vctp32-masked store.
971if.then59:                                        ; preds = %while.end55
972  %i41 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and56)
973  %i43 = load <4 x float>, ptr %pTempSrc.1.lcssa, align 4
974  tail call void @llvm.masked.store.v4f32.p0(<4 x float> %i43, ptr %pTempDest.0.lcssa, i32 4, <4 x i1> %i41)
975  br label %if.end61
976
977if.end61:                                         ; preds = %if.then59, %while.end55
978  ret void
979}
980
981
; FIR kernel test that handles the taps in groups of eight. %S points at a
; %struct.arm_fir_instance_f32; the entry block reads the i16 at field 0
; (zero-extended into %conv, "numTaps" below) and the pointers at field 1
; (state buffer, %i) and field 2 (coefficients, %i1).
; Nothing is done unless %blockSize is unsigned-greater-than 7. Otherwise the
; outer while.body loop runs %blockSize >> 2 times; each iteration copies four
; floats from %pSrc into the state buffer (starting at index numTaps - 1) and
; stores one <4 x float> result to %pDst, accumulated in three stages:
;  * while.body:   taps 0..7, always;
;  * for.body:     further groups of eight taps, only when numTaps > 15;
;  * while.body76: one tap per iteration for the (numTaps - 8) & 7 remainder.
; The expected assembly uses dls/le for for.body and wls/le for while.body76;
; the outer loop is closed with subs/beq instead.
982define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
983; CHECK-LABEL: fir:
984; CHECK:       @ %bb.0: @ %entry
985; CHECK-NEXT:    cmp r3, #8
986; CHECK-NEXT:    blo.w .LBB16_13
987; CHECK-NEXT:  @ %bb.1: @ %if.then
988; CHECK-NEXT:    lsrs.w r12, r3, #2
989; CHECK-NEXT:    it eq
990; CHECK-NEXT:    bxeq lr
991; CHECK-NEXT:  .LBB16_2: @ %while.body.lr.ph
992; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
993; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
994; CHECK-NEXT:    .pad #4
995; CHECK-NEXT:    sub sp, #4
996; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
997; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
998; CHECK-NEXT:    .pad #32
999; CHECK-NEXT:    sub sp, #32
1000; CHECK-NEXT:    ldrh r6, [r0]
1001; CHECK-NEXT:    movs r5, #1
1002; CHECK-NEXT:    ldrd r4, r10, [r0, #4]
1003; CHECK-NEXT:    sub.w r0, r6, #8
1004; CHECK-NEXT:    add.w r3, r0, r0, lsr #29
1005; CHECK-NEXT:    and r0, r0, #7
1006; CHECK-NEXT:    asrs r7, r3, #3
1007; CHECK-NEXT:    cmp r7, #1
1008; CHECK-NEXT:    it gt
1009; CHECK-NEXT:    asrgt r5, r3, #3
1010; CHECK-NEXT:    add.w r3, r4, r6, lsl #2
1011; CHECK-NEXT:    sub.w r9, r3, #4
1012; CHECK-NEXT:    rsbs r3, r6, #0
1013; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
1014; CHECK-NEXT:    add.w r3, r10, #32
1015; CHECK-NEXT:    str r5, [sp, #4] @ 4-byte Spill
1016; CHECK-NEXT:    str r6, [sp, #16] @ 4-byte Spill
1017; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
1018; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
1019; CHECK-NEXT:    b .LBB16_6
1020; CHECK-NEXT:  .LBB16_3: @ %while.end.loopexit
1021; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1022; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
1023; CHECK-NEXT:    add.w r4, r4, r0, lsl #2
1024; CHECK-NEXT:    b .LBB16_5
1025; CHECK-NEXT:  .LBB16_4: @ %for.end
1026; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1027; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
1028; CHECK-NEXT:    ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
1029; CHECK-NEXT:    wls lr, r0, .LBB16_5
1030; CHECK-NEXT:    b .LBB16_10
1031; CHECK-NEXT:  .LBB16_5: @ %while.end
1032; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1033; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
1034; CHECK-NEXT:    subs.w r12, r12, #1
1035; CHECK-NEXT:    vstrb.8 q0, [r2], #16
1036; CHECK-NEXT:    add.w r0, r4, r0, lsl #2
1037; CHECK-NEXT:    add.w r4, r0, #16
1038; CHECK-NEXT:    beq .LBB16_12
1039; CHECK-NEXT:  .LBB16_6: @ %while.body
1040; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1041; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
1042; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
1043; CHECK-NEXT:    add.w lr, r10, #8
1044; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
1045; CHECK-NEXT:    ldrd r3, r7, [r10]
1046; CHECK-NEXT:    ldm.w lr, {r0, r5, r6, lr}
1047; CHECK-NEXT:    ldrd r11, r8, [r10, #24]
1048; CHECK-NEXT:    vstrb.8 q0, [r9], #16
1049; CHECK-NEXT:    vldrw.u32 q0, [r4], #32
1050; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
1051; CHECK-NEXT:    str.w r9, [sp, #24] @ 4-byte Spill
1052; CHECK-NEXT:    vldrw.u32 q1, [r4, #-28]
1053; CHECK-NEXT:    vmul.f32 q0, q0, r3
1054; CHECK-NEXT:    vldrw.u32 q6, [r4, #-24]
1055; CHECK-NEXT:    vldrw.u32 q4, [r4, #-20]
1056; CHECK-NEXT:    vfma.f32 q0, q1, r7
1057; CHECK-NEXT:    vldrw.u32 q5, [r4, #-16]
1058; CHECK-NEXT:    vfma.f32 q0, q6, r0
1059; CHECK-NEXT:    vldrw.u32 q2, [r4, #-12]
1060; CHECK-NEXT:    vfma.f32 q0, q4, r5
1061; CHECK-NEXT:    vldrw.u32 q3, [r4, #-8]
1062; CHECK-NEXT:    vfma.f32 q0, q5, r6
1063; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
1064; CHECK-NEXT:    vfma.f32 q0, q2, lr
1065; CHECK-NEXT:    vldrw.u32 q1, [r4, #-4]
1066; CHECK-NEXT:    vfma.f32 q0, q3, r11
1067; CHECK-NEXT:    cmp r0, #16
1068; CHECK-NEXT:    vfma.f32 q0, q1, r8
1069; CHECK-NEXT:    blo .LBB16_9
1070; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
1071; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1072; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
1073; CHECK-NEXT:    dls lr, r0
1074; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
1075; CHECK-NEXT:  .LBB16_8: @ %for.body
1076; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
1077; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1078; CHECK-NEXT:    ldm.w r7, {r0, r3, r5, r6, r8, r11}
1079; CHECK-NEXT:    vldrw.u32 q1, [r4], #32
1080; CHECK-NEXT:    vldrw.u32 q6, [r4, #-24]
1081; CHECK-NEXT:    vldrw.u32 q4, [r4, #-20]
1082; CHECK-NEXT:    vfma.f32 q0, q1, r0
1083; CHECK-NEXT:    vldrw.u32 q1, [r4, #-28]
1084; CHECK-NEXT:    vldrw.u32 q5, [r4, #-16]
1085; CHECK-NEXT:    vldrw.u32 q2, [r4, #-12]
1086; CHECK-NEXT:    vfma.f32 q0, q1, r3
1087; CHECK-NEXT:    ldrd r9, r1, [r7, #24]
1088; CHECK-NEXT:    vfma.f32 q0, q6, r5
1089; CHECK-NEXT:    vldrw.u32 q3, [r4, #-8]
1090; CHECK-NEXT:    vfma.f32 q0, q4, r6
1091; CHECK-NEXT:    vldrw.u32 q1, [r4, #-4]
1092; CHECK-NEXT:    vfma.f32 q0, q5, r8
1093; CHECK-NEXT:    adds r7, #32
1094; CHECK-NEXT:    vfma.f32 q0, q2, r11
1095; CHECK-NEXT:    vfma.f32 q0, q3, r9
1096; CHECK-NEXT:    vfma.f32 q0, q1, r1
1097; CHECK-NEXT:    le lr, .LBB16_8
1098; CHECK-NEXT:    b .LBB16_4
1099; CHECK-NEXT:  .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
1100; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
1101; CHECK-NEXT:    b .LBB16_4
1102; CHECK-NEXT:  .LBB16_10: @ %while.body76.preheader
1103; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1104; CHECK-NEXT:    mov r3, r4
1105; CHECK-NEXT:  .LBB16_11: @ %while.body76
1106; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
1107; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1108; CHECK-NEXT:    ldr r0, [r7], #4
1109; CHECK-NEXT:    vldrw.u32 q1, [r3], #4
1110; CHECK-NEXT:    vfma.f32 q0, q1, r0
1111; CHECK-NEXT:    le lr, .LBB16_11
1112; CHECK-NEXT:    b .LBB16_3
1113; CHECK-NEXT:  .LBB16_12:
1114; CHECK-NEXT:    add sp, #32
1115; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
1116; CHECK-NEXT:    add sp, #4
1117; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1118; CHECK-NEXT:  .LBB16_13: @ %if.end
1119; CHECK-NEXT:    bx lr
; Read the three instance fields; return immediately unless %blockSize > 7
; (unsigned).
1120entry:
1121  %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
1122  %i = load ptr, ptr %pState1, align 4
1123  %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
1124  %i1 = load ptr, ptr %pCoeffs2, align 4
1125  %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
1126  %i2 = load i16, ptr %numTaps3, align 4
1127  %conv = zext i16 %i2 to i32
1128  %cmp = icmp ugt i32 %blockSize, 7
1129  br i1 %cmp, label %if.then, label %if.end
1130
; %blockSize is at least 8 here, so %shr is at least 2; the zero test is
; nevertheless present in the IR and in the expected assembly (lsrs/bxeq).
1131if.then:                                          ; preds = %entry
1132  %shr = lshr i32 %blockSize, 2
1133  %cmp5217 = icmp eq i32 %shr, 0
1134  br i1 %cmp5217, label %if.end, label %while.body.lr.ph
1135
; Loop-invariant setup:
;   %incdec.ptr .. %incdec.ptr12  pointers to coefficients 1..7
;   %div    (numTaps - 8) / 8, signed
;   %smax   max(%div, 1), the for.body trip count
;   %and    (numTaps - 8) & 7, the while.body76 trip count
;   %cmp38201  numTaps > 15, tested on the original i16; gates for.body
;   %idx.neg   -numTaps, used to rewind the sample pointer in while.end
1136while.body.lr.ph:                                 ; preds = %if.then
1137  %sub = add nsw i32 %conv, -1
1138  %arrayidx = getelementptr inbounds float, ptr %i, i32 %sub
1139  %incdec.ptr = getelementptr inbounds float, ptr %i1, i32 1
1140  %incdec.ptr7 = getelementptr inbounds float, ptr %i1, i32 2
1141  %incdec.ptr8 = getelementptr inbounds float, ptr %i1, i32 3
1142  %incdec.ptr9 = getelementptr inbounds float, ptr %i1, i32 4
1143  %incdec.ptr10 = getelementptr inbounds float, ptr %i1, i32 5
1144  %incdec.ptr11 = getelementptr inbounds float, ptr %i1, i32 6
1145  %incdec.ptr12 = getelementptr inbounds float, ptr %i1, i32 7
1146  %sub37 = add nsw i32 %conv, -8
1147  %div = sdiv i32 %sub37, 8
1148  %pCoeffsCur.0199 = getelementptr inbounds float, ptr %i1, i32 8
1149  %cmp38201 = icmp ugt i16 %i2, 15
1150  %and = and i32 %sub37, 7
1151  %cmp74210 = icmp eq i32 %and, 0
1152  %idx.neg = sub nsw i32 0, %conv
1153  %i3 = icmp sgt i32 %div, 1
1154  %smax = select i1 %i3, i32 %div, i32 1
1155  br label %while.body
1156
; Outer loop body: reload coefficients 0..7 as scalars, append four input
; samples to the state buffer, then accumulate taps 0..7 (one fmul followed by
; seven llvm.fma calls) over vector loads at sample offsets 0..7.
1157while.body:                                       ; preds = %while.end, %while.body.lr.ph
1158  %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
1159  %pStateCur.0221 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
1160  %pSamples.0220 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
1161  %pTempSrc.0219 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
1162  %pOutput.0218 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
1163  %i4 = load float, ptr %i1, align 4
1164  %i5 = load float, ptr %incdec.ptr, align 4
1165  %i6 = load float, ptr %incdec.ptr7, align 4
1166  %i7 = load float, ptr %incdec.ptr8, align 4
1167  %i8 = load float, ptr %incdec.ptr9, align 4
1168  %i9 = load float, ptr %incdec.ptr10, align 4
1169  %i10 = load float, ptr %incdec.ptr11, align 4
1170  %i11 = load float, ptr %incdec.ptr12, align 4
1171  %i13 = load <4 x float>, ptr %pTempSrc.0219, align 4
1172  store <4 x float> %i13, ptr %pStateCur.0221, align 4
1173  %add.ptr = getelementptr inbounds float, ptr %pStateCur.0221, i32 4
1174  %add.ptr14 = getelementptr inbounds float, ptr %pTempSrc.0219, i32 4
1175  %i16 = load <4 x float>, ptr %pSamples.0220, align 4
1176  %.splatinsert = insertelement <4 x float> undef, float %i4, i32 0
1177  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
1178  %i17 = fmul fast <4 x float> %i16, %.splat
1179  %arrayidx15 = getelementptr inbounds float, ptr %pSamples.0220, i32 1
1180  %i19 = load <4 x float>, ptr %arrayidx15, align 4
1181  %.splatinsert16 = insertelement <4 x float> undef, float %i5, i32 0
1182  %.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer
1183  %i20 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i19, <4 x float> %.splat17, <4 x float> %i17)
1184  %arrayidx18 = getelementptr inbounds float, ptr %pSamples.0220, i32 2
1185  %i22 = load <4 x float>, ptr %arrayidx18, align 4
1186  %.splatinsert19 = insertelement <4 x float> undef, float %i6, i32 0
1187  %.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
1188  %i23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i22, <4 x float> %.splat20, <4 x float> %i20)
1189  %arrayidx21 = getelementptr inbounds float, ptr %pSamples.0220, i32 3
1190  %i25 = load <4 x float>, ptr %arrayidx21, align 4
1191  %.splatinsert22 = insertelement <4 x float> undef, float %i7, i32 0
1192  %.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer
1193  %i26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i25, <4 x float> %.splat23, <4 x float> %i23)
1194  %arrayidx24 = getelementptr inbounds float, ptr %pSamples.0220, i32 4
1195  %i28 = load <4 x float>, ptr %arrayidx24, align 4
1196  %.splatinsert25 = insertelement <4 x float> undef, float %i8, i32 0
1197  %.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer
1198  %i29 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i28, <4 x float> %.splat26, <4 x float> %i26)
1199  %arrayidx27 = getelementptr inbounds float, ptr %pSamples.0220, i32 5
1200  %i31 = load <4 x float>, ptr %arrayidx27, align 4
1201  %.splatinsert28 = insertelement <4 x float> undef, float %i9, i32 0
1202  %.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer
1203  %i32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i31, <4 x float> %.splat29, <4 x float> %i29)
1204  %arrayidx30 = getelementptr inbounds float, ptr %pSamples.0220, i32 6
1205  %i34 = load <4 x float>, ptr %arrayidx30, align 4
1206  %.splatinsert31 = insertelement <4 x float> undef, float %i10, i32 0
1207  %.splat32 = shufflevector <4 x float> %.splatinsert31, <4 x float> undef, <4 x i32> zeroinitializer
1208  %i35 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i34, <4 x float> %.splat32, <4 x float> %i32)
1209  %arrayidx33 = getelementptr inbounds float, ptr %pSamples.0220, i32 7
1210  %i37 = load <4 x float>, ptr %arrayidx33, align 4
1211  %.splatinsert34 = insertelement <4 x float> undef, float %i11, i32 0
1212  %.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer
1213  %i38 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i37, <4 x float> %.splat35, <4 x float> %i35)
1214  %pSamples.1200 = getelementptr inbounds float, ptr %pSamples.0220, i32 8
1215  br i1 %cmp38201, label %for.body, label %for.end
1216
; Eight more taps per iteration, %smax iterations. %.pn205 and
; %pSamples.0.pn202 trail %pCoeffsCur.0206 and %pSamples.1207 by eight floats,
; so the offsets 9..15 taken from them address elements 1..7 of the current
; group; element 0 is loaded directly through the leading pointers.
1217for.body:                                         ; preds = %for.body, %while.body
1218  %pSamples.1207 = phi ptr [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
1219  %pCoeffsCur.0206 = phi ptr [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
1220  %.pn205 = phi ptr [ %pCoeffsCur.0206, %for.body ], [ %i1, %while.body ]
1221  %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
1222  %vecAcc0.0203 = phi <4 x float> [ %i70, %for.body ], [ %i38, %while.body ]
1223  %pSamples.0.pn202 = phi ptr [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
1224  %incdec.ptr40 = getelementptr inbounds float, ptr %.pn205, i32 9
1225  %i39 = load float, ptr %pCoeffsCur.0206, align 4
1226  %incdec.ptr41 = getelementptr inbounds float, ptr %.pn205, i32 10
1227  %i40 = load float, ptr %incdec.ptr40, align 4
1228  %incdec.ptr42 = getelementptr inbounds float, ptr %.pn205, i32 11
1229  %i41 = load float, ptr %incdec.ptr41, align 4
1230  %incdec.ptr43 = getelementptr inbounds float, ptr %.pn205, i32 12
1231  %i42 = load float, ptr %incdec.ptr42, align 4
1232  %incdec.ptr44 = getelementptr inbounds float, ptr %.pn205, i32 13
1233  %i43 = load float, ptr %incdec.ptr43, align 4
1234  %incdec.ptr45 = getelementptr inbounds float, ptr %.pn205, i32 14
1235  %i44 = load float, ptr %incdec.ptr44, align 4
1236  %incdec.ptr46 = getelementptr inbounds float, ptr %.pn205, i32 15
1237  %i45 = load float, ptr %incdec.ptr45, align 4
1238  %i46 = load float, ptr %incdec.ptr46, align 4
1239  %i48 = load <4 x float>, ptr %pSamples.1207, align 4
1240  %.splatinsert48 = insertelement <4 x float> undef, float %i39, i32 0
1241  %.splat49 = shufflevector <4 x float> %.splatinsert48, <4 x float> undef, <4 x i32> zeroinitializer
1242  %i49 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i48, <4 x float> %.splat49, <4 x float> %vecAcc0.0203)
1243  %arrayidx50 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 9
1244  %i51 = load <4 x float>, ptr %arrayidx50, align 4
1245  %.splatinsert51 = insertelement <4 x float> undef, float %i40, i32 0
1246  %.splat52 = shufflevector <4 x float> %.splatinsert51, <4 x float> undef, <4 x i32> zeroinitializer
1247  %i52 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i51, <4 x float> %.splat52, <4 x float> %i49)
1248  %arrayidx53 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 10
1249  %i54 = load <4 x float>, ptr %arrayidx53, align 4
1250  %.splatinsert54 = insertelement <4 x float> undef, float %i41, i32 0
1251  %.splat55 = shufflevector <4 x float> %.splatinsert54, <4 x float> undef, <4 x i32> zeroinitializer
1252  %i55 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i54, <4 x float> %.splat55, <4 x float> %i52)
1253  %arrayidx56 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 11
1254  %i57 = load <4 x float>, ptr %arrayidx56, align 4
1255  %.splatinsert57 = insertelement <4 x float> undef, float %i42, i32 0
1256  %.splat58 = shufflevector <4 x float> %.splatinsert57, <4 x float> undef, <4 x i32> zeroinitializer
1257  %i58 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i57, <4 x float> %.splat58, <4 x float> %i55)
1258  %arrayidx59 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 12
1259  %i60 = load <4 x float>, ptr %arrayidx59, align 4
1260  %.splatinsert60 = insertelement <4 x float> undef, float %i43, i32 0
1261  %.splat61 = shufflevector <4 x float> %.splatinsert60, <4 x float> undef, <4 x i32> zeroinitializer
1262  %i61 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i60, <4 x float> %.splat61, <4 x float> %i58)
1263  %arrayidx62 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 13
1264  %i63 = load <4 x float>, ptr %arrayidx62, align 4
1265  %.splatinsert63 = insertelement <4 x float> undef, float %i44, i32 0
1266  %.splat64 = shufflevector <4 x float> %.splatinsert63, <4 x float> undef, <4 x i32> zeroinitializer
1267  %i64 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i63, <4 x float> %.splat64, <4 x float> %i61)
1268  %arrayidx65 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 14
1269  %i66 = load <4 x float>, ptr %arrayidx65, align 4
1270  %.splatinsert66 = insertelement <4 x float> undef, float %i45, i32 0
1271  %.splat67 = shufflevector <4 x float> %.splatinsert66, <4 x float> undef, <4 x i32> zeroinitializer
1272  %i67 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i66, <4 x float> %.splat67, <4 x float> %i64)
1273  %arrayidx68 = getelementptr inbounds float, ptr %pSamples.0.pn202, i32 15
1274  %i69 = load <4 x float>, ptr %arrayidx68, align 4
1275  %.splatinsert69 = insertelement <4 x float> undef, float %i46, i32 0
1276  %.splat70 = shufflevector <4 x float> %.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer
1277  %i70 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i69, <4 x float> %.splat70, <4 x float> %i67)
1278  %inc = add nuw nsw i32 %i.0204, 1
1279  %pCoeffsCur.0 = getelementptr inbounds float, ptr %pCoeffsCur.0206, i32 8
1280  %pSamples.1 = getelementptr inbounds float, ptr %pSamples.1207, i32 8
1281  %exitcond = icmp eq i32 %inc, %smax
1282  br i1 %exitcond, label %for.end, label %for.body
1283
; Merge the accumulator and pointers from both paths; skip the remainder loop
; when (numTaps - 8) & 7 is zero.
1284for.end:                                          ; preds = %for.body, %while.body
1285  %vecAcc0.0.lcssa = phi <4 x float> [ %i38, %while.body ], [ %i70, %for.body ]
1286  %pCoeffsCur.0.lcssa = phi ptr [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
1287  %pSamples.1.lcssa = phi ptr [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
1288  br i1 %cmp74210, label %while.end, label %while.body76
1289
; Remainder taps, one per iteration: a scalar coefficient is splatted and
; fused into the accumulator, and both pointers advance by one float. The loop
; repeats while the counter is still greater than 1 before the decrement.
1290while.body76:                                     ; preds = %while.body76, %for.end
1291  %pCoeffsCur.1214 = phi ptr [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
1292  %vecAcc0.1213 = phi <4 x float> [ %i74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
1293  %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
1294  %pSamples.2211 = phi ptr [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ]
1295  %incdec.ptr77 = getelementptr inbounds float, ptr %pCoeffsCur.1214, i32 1
1296  %i71 = load float, ptr %pCoeffsCur.1214, align 4
1297  %i73 = load <4 x float>, ptr %pSamples.2211, align 4
1298  %.splatinsert78 = insertelement <4 x float> undef, float %i71, i32 0
1299  %.splat79 = shufflevector <4 x float> %.splatinsert78, <4 x float> undef, <4 x i32> zeroinitializer
1300  %i74 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i73, <4 x float> %.splat79, <4 x float> %vecAcc0.1213)
1301  %incdec.ptr80 = getelementptr inbounds float, ptr %pSamples.2211, i32 1
1302  %dec = add nsw i32 %numCnt.0212, -1
1303  %cmp74 = icmp sgt i32 %numCnt.0212, 1
1304  br i1 %cmp74, label %while.body76, label %while.end.loopexit
1305
1306while.end.loopexit:                               ; preds = %while.body76
1307  %scevgep = getelementptr float, ptr %pSamples.1.lcssa, i32 %and
1308  br label %while.end
1309
; Store the four results and rewind the sample pointer for the next outer
; iteration: %add.ptr83 = (pointer past the last tap) + 4 - numTaps.
1310while.end:                                        ; preds = %while.end.loopexit, %for.end
1311  %pSamples.2.lcssa = phi ptr [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ]
1312  %vecAcc0.1.lcssa = phi <4 x float> [ %vecAcc0.0.lcssa, %for.end ], [ %i74, %while.end.loopexit ]
1313  store <4 x float> %vecAcc0.1.lcssa, ptr %pOutput.0218, align 4
1314  %add.ptr81 = getelementptr inbounds float, ptr %pOutput.0218, i32 4
1315  %add.ptr82 = getelementptr inbounds float, ptr %pSamples.2.lcssa, i32 4
1316  %add.ptr83 = getelementptr inbounds float, ptr %add.ptr82, i32 %idx.neg
1317  %dec84 = add nsw i32 %blkCnt.0222, -1
1318  %cmp5 = icmp eq i32 %dec84, 0
1319  br i1 %cmp5, label %if.end, label %while.body
1320
1321if.end:                                           ; preds = %while.end, %if.then, %entry
1322  ret void
1323}
1324
; Codegen test: a two-level loop nest that mixes MVE gather loads, vector FMAs
; and a small stack scratch buffer.
;
; Instance struct as used by the entry block below:
;   field 0 (i8)  - loaded and zero-extended into %i10, the outer trip count
;   field 1 (ptr) - loaded into %i5; read and written as <4 x float> blocks
;   field 2 (ptr) - loaded into %i7; read five floats at a time
;
; The autogenerated assertions below expect both the inner %bb55 loop and the
; %bb21 loop to become low-overhead loops (dls/le), the viwdup result to be
; kept in q0 across the whole nest, and both gathers to use it as the offset
; register.
1325%struct.arm_biquad_cascade_stereo_df2T_instance_f32 = type { i8, ptr, ptr }
1326define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(ptr nocapture readonly %arg, ptr %arg1, ptr %arg2, i32 %arg3) {
1327; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32:
1328; CHECK:       @ %bb.0: @ %bb
1329; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
1330; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
1331; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
1332; CHECK-NEXT:    vpush {d8, d9, d10, d11}
1333; CHECK-NEXT:    .pad #24
1334; CHECK-NEXT:    sub sp, #24
1335; CHECK-NEXT:    mov r8, r3
1336; CHECK-NEXT:    ldrb.w r12, [r0]
1337; CHECK-NEXT:    ldrd r0, r3, [r0, #4]
1338; CHECK-NEXT:    movs r4, #0
1339; CHECK-NEXT:    cmp.w r8, #0
1340; CHECK-NEXT:    strd r4, r4, [sp, #16]
1341; CHECK-NEXT:    beq .LBB17_5
1342; CHECK-NEXT:  @ %bb.1:
1343; CHECK-NEXT:    movs r5, #2
1344; CHECK-NEXT:    viwdup.u32 q0, r4, r5, #1
1345; CHECK-NEXT:    mov r4, sp
1346; CHECK-NEXT:  .LBB17_2: @ %bb29
1347; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1348; CHECK-NEXT:    @ Child Loop BB17_3 Depth 2
1349; CHECK-NEXT:    ldrd r5, r7, [r3]
1350; CHECK-NEXT:    vldrw.u32 q1, [r0]
1351; CHECK-NEXT:    ldr r6, [r3, #12]
1352; CHECK-NEXT:    vldr s8, [r3, #8]
1353; CHECK-NEXT:    vstrw.32 q1, [r4]
1354; CHECK-NEXT:    vdup.32 q1, r7
1355; CHECK-NEXT:    vldr s12, [r3, #16]
1356; CHECK-NEXT:    vmov.f32 s6, s8
1357; CHECK-NEXT:    dls lr, r8
1358; CHECK-NEXT:    vmov.f32 s7, s8
1359; CHECK-NEXT:    vdup.32 q2, r6
1360; CHECK-NEXT:    vmov.f32 s10, s12
1361; CHECK-NEXT:    mov r7, r2
1362; CHECK-NEXT:    vmov.f32 s11, s12
1363; CHECK-NEXT:  .LBB17_3: @ %bb55
1364; CHECK-NEXT:    @ Parent Loop BB17_2 Depth=1
1365; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1366; CHECK-NEXT:    vldrw.u32 q4, [r1, q0, uxtw #2]
1367; CHECK-NEXT:    vldrw.u32 q5, [r4, q0, uxtw #2]
1368; CHECK-NEXT:    vldrw.u32 q3, [sp, #8]
1369; CHECK-NEXT:    adds r1, #8
1370; CHECK-NEXT:    vfma.f32 q5, q4, r5
1371; CHECK-NEXT:    vfma.f32 q3, q5, q2
1372; CHECK-NEXT:    vstmia r7!, {s20, s21}
1373; CHECK-NEXT:    vfma.f32 q3, q4, q1
1374; CHECK-NEXT:    vstrw.32 q3, [r4]
1375; CHECK-NEXT:    le lr, .LBB17_3
1376; CHECK-NEXT:  @ %bb.4: @ %bb75
1377; CHECK-NEXT:    @ in Loop: Header=BB17_2 Depth=1
1378; CHECK-NEXT:    adds r3, #20
1379; CHECK-NEXT:    subs.w r12, r12, #1
1380; CHECK-NEXT:    vstrb.8 q3, [r0], #16
1381; CHECK-NEXT:    mov r1, r2
1382; CHECK-NEXT:    bne .LBB17_2
1383; CHECK-NEXT:    b .LBB17_7
1384; CHECK-NEXT:  .LBB17_5: @ %bb21.preheader
1385; CHECK-NEXT:    dls lr, r12
1386; CHECK-NEXT:    mov r1, sp
1387; CHECK-NEXT:  .LBB17_6: @ %bb21
1388; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
1389; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
1390; CHECK-NEXT:    vstrw.32 q0, [r1]
1391; CHECK-NEXT:    le lr, .LBB17_6
1392; CHECK-NEXT:  .LBB17_7: @ %bb80
1393; CHECK-NEXT:    add sp, #24
1394; CHECK-NEXT:    vpop {d8, d9, d10, d11}
1395; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
; Entry: load the three instance fields, set up a 6-float (24-byte) stack
; scratch buffer %i whose last two elements are zeroed, and build the gather
; offset vector %i13 with the MVE viwdup intrinsic called with (0, 2, 1).
; NOTE(review): presumably this yields lane offsets {0, 1, 0, 1} - confirm
; against the VIWDUP definition.
; %i19 addresses element 2 of the scratch buffer. %arg3 == 0 selects %bb21,
; any other value selects the %bb29/%bb55 loop nest.
1396bb:
1397  %i = alloca [6 x float], align 4
1398  %i4 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, ptr %arg, i32 0, i32 1
1399  %i5 = load ptr, ptr %i4, align 4
1400  %i6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, ptr %arg, i32 0, i32 2
1401  %i7 = load ptr, ptr %i6, align 4
1402  %i8 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, ptr %arg, i32 0, i32 0
1403  %i9 = load i8, ptr %i8, align 4
1404  %i10 = zext i8 %i9 to i32
1405  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %i)
1406  %i12 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 0, i32 2, i32 1)
1407  %i13 = extractvalue { <4 x i32>, i32 } %i12, 0
1408  %i14 = getelementptr inbounds [6 x float], ptr %i, i32 0, i32 4
1409  store float 0.000000e+00, ptr %i14, align 4
1410  %i15 = getelementptr inbounds [6 x float], ptr %i, i32 0, i32 5
1411  store float 0.000000e+00, ptr %i15, align 4
1412  %i17 = icmp eq i32 %arg3, 0
1413  %i19 = getelementptr inbounds [6 x float], ptr %i, i32 0, i32 2
1414  br i1 %i17, label %bb21, label %bb29
1415
; %arg3 == 0 path: counted loop starting from %i10 that copies one
; <4 x float> per iteration from %i23 (advanced by 4 floats each time) into
; the start of the scratch buffer. It exits when the decremented counter hits 0.
1416bb21:                                             ; preds = %bb21, %bb
1417  %i22 = phi i32 [ %i27, %bb21 ], [ %i10, %bb ]
1418  %i23 = phi ptr [ %i26, %bb21 ], [ %i5, %bb ]
1419  %i25 = load <4 x float>, ptr %i23, align 8
1420  store <4 x float> %i25, ptr %i, align 4
1421  %i26 = getelementptr inbounds float, ptr %i23, i32 4
1422  %i27 = add i32 %i22, -1
1423  %i28 = icmp eq i32 %i27, 0
1424  br i1 %i28, label %bb80, label %bb21
1425
; Outer loop header, counted down from %i10.
;   %i31 - pointer into the field-2 data; five consecutive floats are loaded
;          (%i35, %i37, %i39, %i41, %i42)
;   %i32 - pointer into the field-1 data; four floats are copied into the
;          scratch buffer here and written back in %bb75
;   %i33 - gather base for the inner loop: %arg1 on the first outer iteration,
;          %arg2 on every later one
; Vectors built for the inner loop:
;   %i46 = <%i41, %i41, %i42, %i42>
;   %i50 = <%i37, %i37, %i39, %i39>
;   %i54 = splat(%i35)
1426bb29:                                             ; preds = %bb75, %bb
1427  %i30 = phi i32 [ %i78, %bb75 ], [ %i10, %bb ]
1428  %i31 = phi ptr [ %i76, %bb75 ], [ %i7, %bb ]
1429  %i32 = phi ptr [ %i77, %bb75 ], [ %i5, %bb ]
1430  %i33 = phi ptr [ %arg2, %bb75 ], [ %arg1, %bb ]
1431  %i34 = getelementptr inbounds float, ptr %i31, i32 1
1432  %i35 = load float, ptr %i31, align 4
1433  %i36 = getelementptr inbounds float, ptr %i31, i32 2
1434  %i37 = load float, ptr %i34, align 4
1435  %i38 = getelementptr inbounds float, ptr %i31, i32 3
1436  %i39 = load float, ptr %i36, align 4
1437  %i40 = getelementptr inbounds float, ptr %i31, i32 4
1438  %i41 = load float, ptr %i38, align 4
1439  %i42 = load float, ptr %i40, align 4
1440  %i43 = insertelement <4 x float> undef, float %i41, i32 0
1441  %i44 = shufflevector <4 x float> %i43, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
1442  %i45 = insertelement <4 x float> %i44, float %i42, i32 2
1443  %i46 = insertelement <4 x float> %i45, float %i42, i32 3
1444  %i47 = insertelement <4 x float> undef, float %i37, i32 0
1445  %i48 = shufflevector <4 x float> %i47, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
1446  %i49 = insertelement <4 x float> %i48, float %i39, i32 2
1447  %i50 = insertelement <4 x float> %i49, float %i39, i32 3
1448  %i52 = load <4 x float>, ptr %i32, align 8
1449  store <4 x float> %i52, ptr %i, align 4
1450  %i53 = insertelement <4 x float> undef, float %i35, i32 0
1451  %i54 = shufflevector <4 x float> %i53, <4 x float> undef, <4 x i32> zeroinitializer
1452  br label %bb55
1453
; Inner loop, counted down from %arg3. Each iteration:
;   1. gathers from the scratch buffer (%i60) and from the input pointer %i56
;      (%i63), both with the %i13 offsets
;   2. %i64 = fma(%i63, %i54, %i60); lanes 0 and 1 are stored to the output
;      pointer %i57, which advances by 2 floats
;   3. reloads a <4 x float> from scratch element 2 (%i19) and applies two
;      more FMAs with %i46 and %i50; the result %i71 is stored back to the
;      start of the scratch buffer
;   4. advances the input pointer by 2 floats
1454bb55:                                             ; preds = %bb55, %bb29
1455  %i56 = phi ptr [ %i33, %bb29 ], [ %i72, %bb55 ]
1456  %i57 = phi ptr [ %arg2, %bb29 ], [ %i68, %bb55 ]
1457  %i58 = phi i32 [ %arg3, %bb29 ], [ %i73, %bb55 ]
1458  %i59 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr nonnull %i, <4 x i32> %i13, i32 32, i32 2, i32 1)
1459  %i60 = bitcast <4 x i32> %i59 to <4 x float>
1460  %i62 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr %i56, <4 x i32> %i13, i32 32, i32 2, i32 1)
1461  %i63 = bitcast <4 x i32> %i62 to <4 x float>
1462  %i64 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i63, <4 x float> %i54, <4 x float> %i60)
1463  %i65 = extractelement <4 x float> %i64, i32 0
1464  %i66 = getelementptr inbounds float, ptr %i57, i32 1
1465  store float %i65, ptr %i57, align 4
1466  %i67 = extractelement <4 x float> %i64, i32 1
1467  %i68 = getelementptr inbounds float, ptr %i57, i32 2
1468  store float %i67, ptr %i66, align 4
1469  %i69 = load <4 x float>, ptr %i19, align 4
1470  %i70 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i64, <4 x float> %i46, <4 x float> %i69)
1471  %i71 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i63, <4 x float> %i50, <4 x float> %i70)
1472  store <4 x float> %i71, ptr %i, align 4
1473  %i72 = getelementptr inbounds float, ptr %i56, i32 2
1474  %i73 = add i32 %i58, -1
1475  %i74 = icmp eq i32 %i73, 0
1476  br i1 %i74, label %bb75, label %bb55
1477
; Outer loop latch: write the last inner-loop result back through %i32, step
; %i31 by 5 floats and %i32 by 4 floats, and decrement the outer counter.
1478bb75:                                             ; preds = %bb55
1479  %i76 = getelementptr inbounds float, ptr %i31, i32 5
1480  store <4 x float> %i71, ptr %i32, align 4
1481  %i77 = getelementptr inbounds float, ptr %i32, i32 4
1482  %i78 = add i32 %i30, -1
1483  %i79 = icmp eq i32 %i78, 0
1484  br i1 %i79, label %bb80, label %bb29
1485
; Common exit: end the scratch buffer's lifetime and return.
1486bb80:                                             ; preds = %bb75, %bb21
1487  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %i)
1488  ret void
1489}
1490
; Codegen test: fused multiply-subtract inside a nested loop.
;
; For each outer iteration a scalar is loaded from %pSrc3 and splatted; the
; inner loop then computes, four floats at a time,
;     pDst[i] = pSrc1[i] - splat * pSrc2[i]
; written in IR as fma(splat, fneg(pSrc2[i]), pSrc1[i]).
;
; Trip counts: the inner loop is counted down from N >> 2 and the outer loop
; from %M. If N >> 2 is zero the function returns without entering either loop.
;
; The autogenerated assertions below expect the fneg to be folded into a
; single vfms.f32 and the inner loop to become a low-overhead loop (dls/le).
1491define arm_aapcs_vfpcc void @fms(ptr nocapture readonly %pSrc1, ptr nocapture readonly %pSrc2, ptr nocapture readonly %pSrc3, ptr nocapture %pDst, i32 %N, i32 %M) {
1492; CHECK-LABEL: fms:
1493; CHECK:       @ %bb.0: @ %entry
1494; CHECK-NEXT:    .save {r4, r5, r7, lr}
1495; CHECK-NEXT:    push {r4, r5, r7, lr}
1496; CHECK-NEXT:    ldr r4, [sp, #16]
1497; CHECK-NEXT:    lsrs r5, r4, #2
1498; CHECK-NEXT:    beq .LBB18_5
1499; CHECK-NEXT:  @ %bb.1: @ %do.body.preheader
1500; CHECK-NEXT:    ldr.w r12, [sp, #20]
1501; CHECK-NEXT:  .LBB18_2: @ %do.body
1502; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1503; CHECK-NEXT:    @ Child Loop BB18_3 Depth 2
1504; CHECK-NEXT:    ldr r4, [r2]
1505; CHECK-NEXT:    dls lr, r5
1506; CHECK-NEXT:    vdup.32 q0, r4
1507; CHECK-NEXT:  .LBB18_3: @ %while.body
1508; CHECK-NEXT:    @ Parent Loop BB18_2 Depth=1
1509; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1510; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
1511; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
1512; CHECK-NEXT:    vfms.f32 q2, q1, q0
1513; CHECK-NEXT:    vstrb.8 q2, [r3], #16
1514; CHECK-NEXT:    le lr, .LBB18_3
1515; CHECK-NEXT:  @ %bb.4: @ %while.end
1516; CHECK-NEXT:    @ in Loop: Header=BB18_2 Depth=1
1517; CHECK-NEXT:    adds r2, #4
1518; CHECK-NEXT:    subs.w r12, r12, #1
1519; CHECK-NEXT:    bne .LBB18_2
1520; CHECK-NEXT:  .LBB18_5: @ %do.end
1521; CHECK-NEXT:    pop {r4, r5, r7, pc}
; %shr = N >> 2 is the number of 4-float vectors handled per outer iteration.
; A zero count skips straight to the return.
1522entry:
1523  %shr = lshr i32 %N, 2
1524  %cmp15 = icmp eq i32 %shr, 0
1525  br i1 %cmp15, label %do.end, label %do.body
1526
; Outer loop header. The pSrc1/pSrc2/pDst phis take the values the inner loop
; finished with, so those pointers are not rewound between outer iterations.
; %pSrc3 advances by one float per outer iteration (see %while.end).
1527do.body:                                          ; preds = %while.end, %entry
1528  %pDst.addr.0 = phi ptr [ %add.ptr2, %while.end ], [ %pDst, %entry ]
1529  %M.addr.0 = phi i32 [ %dec3, %while.end ], [ %M, %entry ]
1530  %pSrc3.addr.0 = phi ptr [ %incdec.ptr, %while.end ], [ %pSrc3, %entry ]
1531  %pSrc2.addr.0 = phi ptr [ %add.ptr1, %while.end ], [ %pSrc2, %entry ]
1532  %pSrc1.addr.0 = phi ptr [ %add.ptr, %while.end ], [ %pSrc1, %entry ]
1533  %i = load float, ptr %pSrc3.addr.0, align 4
1534  %.splatinsert = insertelement <4 x float> undef, float %i, i32 0
1535  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
1536  br label %while.body
1537
; Inner loop: one <4 x float> from each source per iteration, counted down
; from %shr. The negation is applied to the pSrc2 operand before the fma,
; which is the pattern the backend is expected to select as vfms.
1538while.body:                                       ; preds = %while.body, %do.body
1539  %pSrc1.addr.119 = phi ptr [ %pSrc1.addr.0, %do.body ], [ %add.ptr, %while.body ]
1540  %pSrc2.addr.118 = phi ptr [ %pSrc2.addr.0, %do.body ], [ %add.ptr1, %while.body ]
1541  %blkCnt.017 = phi i32 [ %shr, %do.body ], [ %dec, %while.body ]
1542  %pDst.addr.116 = phi ptr [ %pDst.addr.0, %do.body ], [ %add.ptr2, %while.body ]
1543  %i2 = load <4 x float>, ptr %pSrc1.addr.119, align 4
1544  %i4 = load <4 x float>, ptr %pSrc2.addr.118, align 4
1545  %i5 = fneg fast <4 x float> %i4
1546  %i6 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %.splat, <4 x float> %i5, <4 x float> %i2)
1547  store <4 x float> %i6, ptr %pDst.addr.116, align 4
1548  %add.ptr = getelementptr inbounds float, ptr %pSrc1.addr.119, i32 4
1549  %add.ptr1 = getelementptr inbounds float, ptr %pSrc2.addr.118, i32 4
1550  %add.ptr2 = getelementptr inbounds float, ptr %pDst.addr.116, i32 4
1551  %dec = add nsw i32 %blkCnt.017, -1
1552  %cmp = icmp eq i32 %dec, 0
1553  br i1 %cmp, label %while.end, label %while.body
1554
; Outer loop latch: move to the next scalar in pSrc3 and decrement the outer
; counter, leaving the loop when it reaches zero.
1555while.end:                                        ; preds = %while.body
1556  %incdec.ptr = getelementptr inbounds float, ptr %pSrc3.addr.0, i32 1
1557  %dec3 = add i32 %M.addr.0, -1
1558  %cmp4 = icmp eq i32 %dec3, 0
1559  br i1 %cmp4, label %do.end, label %do.body
1560
1561do.end:                                           ; preds = %while.end, %entry
1562  ret void
1563}
1564
1565
; Codegen test: a per-stage outer loop around a 4-samples-per-iteration vector
; loop, followed by a 1..3 sample tail.
;
; Instance struct as used by the entry block below:
;   field 0 (i32) - loaded into %i2, the outer (stage) trip count
;   field 1 (ptr) - loaded into %i, four floats of state per stage
;   field 2 (ptr) - loaded into %i1, 32 floats of coefficients per stage
;
; Per stage, each group of four input samples x0..x3 produces one
; <4 x float> output as the sum of eight coefficient-row * scalar products:
;     row0*x3 + row1*x2 + row2*x1 + row3*x0
;   + row4*Xn1 + row5*Xn2 + row6*Yn1 + row7*Yn2
; where rowK is the <4 x float> at float offset 4*K from the stage's
; coefficient pointer.
;
; The autogenerated assertions below expect the %while.body loop to become a
; wls/le low-overhead loop and the splatted scalars to be consumed directly as
; GPR operands of vmul.f32/vfma.f32.
1566%struct.arm_biquad_casd_df1_inst_f32 = type { i32, ptr, ptr }
1567define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
1568; CHECK-LABEL: arm_biquad_cascade_df1_f32:
1569; CHECK:       @ %bb.0: @ %entry
1570; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1571; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1572; CHECK-NEXT:    .pad #4
1573; CHECK-NEXT:    sub sp, #4
1574; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1575; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1576; CHECK-NEXT:    .pad #16
1577; CHECK-NEXT:    sub sp, #16
1578; CHECK-NEXT:    ldrd r7, r9, [r0]
1579; CHECK-NEXT:    and r6, r3, #3
1580; CHECK-NEXT:    ldr r0, [r0, #8]
1581; CHECK-NEXT:    lsrs r3, r3, #2
1582; CHECK-NEXT:    @ implicit-def: $r12
1583; CHECK-NEXT:    str r6, [sp, #4] @ 4-byte Spill
1584; CHECK-NEXT:    str r3, [sp] @ 4-byte Spill
1585; CHECK-NEXT:    str r2, [sp, #8] @ 4-byte Spill
1586; CHECK-NEXT:    b .LBB19_3
1587; CHECK-NEXT:  .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
1588; CHECK-NEXT:    mov r3, r8
1589; CHECK-NEXT:    mov r2, r5
1590; CHECK-NEXT:    mov r4, r11
1591; CHECK-NEXT:    mov r8, r10
1592; CHECK-NEXT:  .LBB19_2: @ %if.end69
1593; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
1594; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
1595; CHECK-NEXT:    adds r0, #128
1596; CHECK-NEXT:    strd r2, r4, [r9]
1597; CHECK-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
1598; CHECK-NEXT:    subs r7, #1
1599; CHECK-NEXT:    strd r3, r8, [r9, #8]
1600; CHECK-NEXT:    add.w r9, r9, #16
1601; CHECK-NEXT:    mov r1, r2
1602; CHECK-NEXT:    beq.w .LBB19_13
1603; CHECK-NEXT:  .LBB19_3: @ %do.body
1604; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1605; CHECK-NEXT:    @ Child Loop BB19_5 Depth 2
1606; CHECK-NEXT:    mov r6, r2
1607; CHECK-NEXT:    ldrd r5, r11, [r9]
1608; CHECK-NEXT:    ldrd r8, r10, [r9, #8]
1609; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
1610; CHECK-NEXT:    str r7, [sp, #12] @ 4-byte Spill
1611; CHECK-NEXT:    wls lr, r2, .LBB19_6
1612; CHECK-NEXT:  @ %bb.4: @ %while.body.lr.ph
1613; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
1614; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
1615; CHECK-NEXT:    mov r4, r11
1616; CHECK-NEXT:    mov r3, r5
1617; CHECK-NEXT:  .LBB19_5: @ %while.body
1618; CHECK-NEXT:    @ Parent Loop BB19_3 Depth=1
1619; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1620; CHECK-NEXT:    ldr r5, [r1, #12]
1621; CHECK-NEXT:    vldrw.u32 q2, [r0]
1622; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
1623; CHECK-NEXT:    ldm.w r1, {r2, r7, r11}
1624; CHECK-NEXT:    vmul.f32 q2, q2, r5
1625; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
1626; CHECK-NEXT:    vfma.f32 q2, q6, r11
1627; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
1628; CHECK-NEXT:    vfma.f32 q2, q7, r7
1629; CHECK-NEXT:    vldrw.u32 q5, [r0, #64]
1630; CHECK-NEXT:    vfma.f32 q2, q4, r2
1631; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
1632; CHECK-NEXT:    vfma.f32 q2, q5, r3
1633; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
1634; CHECK-NEXT:    vfma.f32 q2, q3, r4
1635; CHECK-NEXT:    vldrw.u32 q0, [r0, #112]
1636; CHECK-NEXT:    vfma.f32 q2, q1, r8
1637; CHECK-NEXT:    adds r1, #16
1638; CHECK-NEXT:    vfma.f32 q2, q0, r10
1639; CHECK-NEXT:    mov r4, r11
1640; CHECK-NEXT:    vmov r10, r8, d5
1641; CHECK-NEXT:    vstrb.8 q2, [r6], #16
1642; CHECK-NEXT:    mov r3, r5
1643; CHECK-NEXT:    mov r12, r5
1644; CHECK-NEXT:    le lr, .LBB19_5
1645; CHECK-NEXT:  .LBB19_6: @ %while.end
1646; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
1647; CHECK-NEXT:    ldr r3, [sp, #4] @ 4-byte Reload
1648; CHECK-NEXT:    cmp r3, #0
1649; CHECK-NEXT:    beq .LBB19_1
1650; CHECK-NEXT:  @ %bb.7: @ %if.then
1651; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
1652; CHECK-NEXT:    ldrd lr, r4, [r1]
1653; CHECK-NEXT:    vldrw.u32 q0, [r0]
1654; CHECK-NEXT:    ldrd r2, r1, [r1, #8]
1655; CHECK-NEXT:    vldrw.u32 q6, [r0, #16]
1656; CHECK-NEXT:    vldrw.u32 q7, [r0, #32]
1657; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
1658; CHECK-NEXT:    vmul.f32 q0, q0, r1
1659; CHECK-NEXT:    vldrw.u32 q5, [r0, #64]
1660; CHECK-NEXT:    vfma.f32 q0, q6, r2
1661; CHECK-NEXT:    vldrw.u32 q3, [r0, #80]
1662; CHECK-NEXT:    vfma.f32 q0, q7, r4
1663; CHECK-NEXT:    vldrw.u32 q2, [r0, #96]
1664; CHECK-NEXT:    vfma.f32 q0, q4, lr
1665; CHECK-NEXT:    vldrw.u32 q1, [r0, #112]
1666; CHECK-NEXT:    vfma.f32 q0, q5, r5
1667; CHECK-NEXT:    cmp r3, #1
1668; CHECK-NEXT:    vfma.f32 q0, q3, r11
1669; CHECK-NEXT:    vfma.f32 q0, q2, r8
1670; CHECK-NEXT:    vfma.f32 q0, q1, r10
1671; CHECK-NEXT:    vmov r5, s0
1672; CHECK-NEXT:    bne .LBB19_9
1673; CHECK-NEXT:  @ %bb.8: @ %if.then58
1674; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
1675; CHECK-NEXT:    str r5, [r6]
1676; CHECK-NEXT:    mov r2, lr
1677; CHECK-NEXT:    mov r4, r12
1678; CHECK-NEXT:    mov r3, r5
1679; CHECK-NEXT:    b .LBB19_12
1680; CHECK-NEXT:  .LBB19_9: @ %if.else
1681; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
1682; CHECK-NEXT:    vmov r8, s1
1683; CHECK-NEXT:    cmp r3, #2
1684; CHECK-NEXT:    vstr s1, [r6, #4]
1685; CHECK-NEXT:    str r5, [r6]
1686; CHECK-NEXT:    bne .LBB19_11
1687; CHECK-NEXT:  @ %bb.10: @ in Loop: Header=BB19_3 Depth=1
1688; CHECK-NEXT:    mov r2, r4
1689; CHECK-NEXT:    mov r3, r8
1690; CHECK-NEXT:    mov r4, lr
1691; CHECK-NEXT:    mov r8, r5
1692; CHECK-NEXT:    b .LBB19_12
1693; CHECK-NEXT:  .LBB19_11: @ %if.else64
1694; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
1695; CHECK-NEXT:    vmov r3, s2
1696; CHECK-NEXT:    vstr s2, [r6, #8]
1697; CHECK-NEXT:  .LBB19_12: @ %if.end69
1698; CHECK-NEXT:    @ in Loop: Header=BB19_3 Depth=1
1699; CHECK-NEXT:    mov r12, r1
1700; CHECK-NEXT:    b .LBB19_2
1701; CHECK-NEXT:  .LBB19_13: @ %do.end
1702; CHECK-NEXT:    add sp, #16
1703; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1704; CHECK-NEXT:    add sp, #4
1705; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; Entry: load the instance fields and precompute the loop-invariant
; conditions.
;   %shr = blockSize >> 2  - iterations of the 4-sample vector loop
;   %and = blockSize & 3   - leftover samples handled by the tail
1706entry:
1707  %pState1 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, ptr %S, i32 0, i32 1
1708  %i = load ptr, ptr %pState1, align 4
1709  %pCoeffs2 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, ptr %S, i32 0, i32 2
1710  %i1 = load ptr, ptr %pCoeffs2, align 4
1711  %numStages = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, ptr %S, i32 0, i32 0
1712  %i2 = load i32, ptr %numStages, align 4
1713  %shr = lshr i32 %blockSize, 2
1714  %cmp201 = icmp eq i32 %shr, 0
1715  %and = and i32 %blockSize, 3
1716  %tobool = icmp eq i32 %and, 0
1717  %cmp57 = icmp eq i32 %and, 1
1718  %cmp60 = icmp eq i32 %and, 2
1719  br label %do.body
1720
; Per-stage loop header. Reads the stage's four state floats (%i3..%i6).
; %pIn.0 is %pSrc for the first stage and %pDst for every later stage, so
; later stages read what the previous stage wrote.
; %X3.0 is undef on the first stage.
; NOTE(review): presumably this undef is the source of the implicit-def of
; r12 in the expected output - confirm.
1721do.body:                                          ; preds = %if.end69, %entry
1722  %pState.0 = phi ptr [ %i, %entry ], [ %incdec.ptr73, %if.end69 ]
1723  %pCoeffs.0 = phi ptr [ %i1, %entry ], [ %add.ptr74, %if.end69 ]
1724  %pIn.0 = phi ptr [ %pSrc, %entry ], [ %pDst, %if.end69 ]
1725  %X3.0 = phi float [ undef, %entry ], [ %X3.2, %if.end69 ]
1726  %stage.0 = phi i32 [ %i2, %entry ], [ %dec75, %if.end69 ]
1727  %i3 = load float, ptr %pState.0, align 4
1728  %arrayidx3 = getelementptr inbounds float, ptr %pState.0, i32 1
1729  %i4 = load float, ptr %arrayidx3, align 4
1730  %arrayidx4 = getelementptr inbounds float, ptr %pState.0, i32 2
1731  %i5 = load float, ptr %arrayidx4, align 4
1732  %arrayidx5 = getelementptr inbounds float, ptr %pState.0, i32 3
1733  %i6 = load float, ptr %arrayidx5, align 4
1734  br i1 %cmp201, label %while.end, label %while.body.lr.ph
1735
; Preheader of the vector loop: addresses of coefficient rows 1..7
; (float offsets 4, 8, ..., 28 from %pCoeffs.0). Row 0 is %pCoeffs.0 itself.
1736while.body.lr.ph:                                 ; preds = %do.body
1737  %arrayidx9 = getelementptr inbounds float, ptr %pCoeffs.0, i32 4
1738  %arrayidx12 = getelementptr inbounds float, ptr %pCoeffs.0, i32 8
1739  %arrayidx15 = getelementptr inbounds float, ptr %pCoeffs.0, i32 12
1740  %arrayidx18 = getelementptr inbounds float, ptr %pCoeffs.0, i32 16
1741  %arrayidx21 = getelementptr inbounds float, ptr %pCoeffs.0, i32 20
1742  %arrayidx24 = getelementptr inbounds float, ptr %pCoeffs.0, i32 24
1743  %arrayidx27 = getelementptr inbounds float, ptr %pCoeffs.0, i32 28
1744  br label %while.body
1745
; Vector loop, counted down from %shr. Each iteration loads four input floats
; (%i15..%i18 = x0..x3), builds the eight-term sum described above the
; function, and stores the <4 x float> result to %pOut.1206.
; Values carried to the next iteration:
;   Xn1 <- %i18 (x3)             Xn2 <- %i17 (x2)
;   Yn1 <- lane 3 of the result  Yn2 <- lane 2 of the result
1746while.body:                                       ; preds = %while.body, %while.body.lr.ph
1747  %sample.0208 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
1748  %pIn.1207 = phi ptr [ %pIn.0, %while.body.lr.ph ], [ %incdec.ptr8, %while.body ]
1749  %pOut.1206 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr, %while.body ]
1750  %Yn2.0205 = phi float [ %i6, %while.body.lr.ph ], [ %i37, %while.body ]
1751  %Yn1.0204 = phi float [ %i5, %while.body.lr.ph ], [ %i36, %while.body ]
1752  %Xn2.0203 = phi float [ %i4, %while.body.lr.ph ], [ %i17, %while.body ]
1753  %Xn1.0202 = phi float [ %i3, %while.body.lr.ph ], [ %i18, %while.body ]
1754  %incdec.ptr = getelementptr inbounds float, ptr %pIn.1207, i32 1
1755  %i15 = load float, ptr %pIn.1207, align 4
1756  %incdec.ptr6 = getelementptr inbounds float, ptr %pIn.1207, i32 2
1757  %i16 = load float, ptr %incdec.ptr, align 4
1758  %incdec.ptr7 = getelementptr inbounds float, ptr %pIn.1207, i32 3
1759  %i17 = load float, ptr %incdec.ptr6, align 4
1760  %incdec.ptr8 = getelementptr inbounds float, ptr %pIn.1207, i32 4
1761  %i18 = load float, ptr %incdec.ptr7, align 4
1762  %i19 = load <4 x float>, ptr %pCoeffs.0, align 4
1763  %.splatinsert = insertelement <4 x float> undef, float %i18, i32 0
1764  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
1765  %i20 = fmul fast <4 x float> %.splat, %i19
1766  %i21 = load <4 x float>, ptr %arrayidx9, align 4
1767  %.splatinsert10 = insertelement <4 x float> undef, float %i17, i32 0
1768  %.splat11 = shufflevector <4 x float> %.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
1769  %i22 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i21, <4 x float> %.splat11, <4 x float> %i20)
1770  %i23 = load <4 x float>, ptr %arrayidx12, align 4
1771  %.splatinsert13 = insertelement <4 x float> undef, float %i16, i32 0
1772  %.splat14 = shufflevector <4 x float> %.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
1773  %i24 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i23, <4 x float> %.splat14, <4 x float> %i22)
1774  %i25 = load <4 x float>, ptr %arrayidx15, align 4
1775  %.splatinsert16 = insertelement <4 x float> undef, float %i15, i32 0
1776  %.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer
1777  %i26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i25, <4 x float> %.splat17, <4 x float> %i24)
1778  %i27 = load <4 x float>, ptr %arrayidx18, align 4
1779  %.splatinsert19 = insertelement <4 x float> undef, float %Xn1.0202, i32 0
1780  %.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
1781  %i28 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i27, <4 x float> %.splat20, <4 x float> %i26)
1782  %i29 = load <4 x float>, ptr %arrayidx21, align 4
1783  %.splatinsert22 = insertelement <4 x float> undef, float %Xn2.0203, i32 0
1784  %.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer
1785  %i30 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i29, <4 x float> %.splat23, <4 x float> %i28)
1786  %i31 = load <4 x float>, ptr %arrayidx24, align 4
1787  %.splatinsert25 = insertelement <4 x float> undef, float %Yn1.0204, i32 0
1788  %.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer
1789  %i32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i31, <4 x float> %.splat26, <4 x float> %i30)
1790  %i33 = load <4 x float>, ptr %arrayidx27, align 4
1791  %.splatinsert28 = insertelement <4 x float> undef, float %Yn2.0205, i32 0
1792  %.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer
1793  %i34 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i33, <4 x float> %.splat29, <4 x float> %i32)
1794  store <4 x float> %i34, ptr %pOut.1206, align 4
1795  %add.ptr = getelementptr inbounds float, ptr %pOut.1206, i32 4
1796  %i36 = extractelement <4 x float> %i34, i32 3
1797  %i37 = extractelement <4 x float> %i34, i32 2
1798  %dec = add nsw i32 %sample.0208, -1
1799  %cmp = icmp eq i32 %dec, 0
1800  br i1 %cmp, label %while.end, label %while.body
1801
; Merge point after the vector loop (or directly from %do.body when
; %shr == 0): selects the current state values and in/out pointers.
; A zero remainder (%tobool) skips the tail.
1802while.end:                                        ; preds = %while.body, %do.body
1803  %Xn1.0.lcssa = phi float [ %i3, %do.body ], [ %i18, %while.body ]
1804  %Xn2.0.lcssa = phi float [ %i4, %do.body ], [ %i17, %while.body ]
1805  %Yn1.0.lcssa = phi float [ %i5, %do.body ], [ %i36, %while.body ]
1806  %Yn2.0.lcssa = phi float [ %i6, %do.body ], [ %i37, %while.body ]
1807  %pOut.1.lcssa = phi ptr [ %pDst, %do.body ], [ %add.ptr, %while.body ]
1808  %pIn.1.lcssa = phi ptr [ %pIn.0, %do.body ], [ %incdec.ptr8, %while.body ]
1809  %X3.1.lcssa = phi float [ %X3.0, %do.body ], [ %i18, %while.body ]
1810  br i1 %tobool, label %if.end69, label %if.then
1811
; Tail for a remainder of 1..3 samples. It repeats the eight-term sum once
; more; note that all four input floats (%i38..%i41) are loaded here whatever
; the remainder is. Only the number of result lanes stored differs, decided
; in the blocks that follow.
1812if.then:                                          ; preds = %while.end
1813  %incdec.ptr30 = getelementptr inbounds float, ptr %pIn.1.lcssa, i32 1
1814  %i38 = load float, ptr %pIn.1.lcssa, align 4
1815  %incdec.ptr31 = getelementptr inbounds float, ptr %pIn.1.lcssa, i32 2
1816  %i39 = load float, ptr %incdec.ptr30, align 4
1817  %incdec.ptr32 = getelementptr inbounds float, ptr %pIn.1.lcssa, i32 3
1818  %i40 = load float, ptr %incdec.ptr31, align 4
1819  %i41 = load float, ptr %incdec.ptr32, align 4
1820  %i43 = load <4 x float>, ptr %pCoeffs.0, align 4
1821  %.splatinsert34 = insertelement <4 x float> undef, float %i41, i32 0
1822  %.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer
1823  %i44 = fmul fast <4 x float> %.splat35, %i43
1824  %arrayidx36 = getelementptr inbounds float, ptr %pCoeffs.0, i32 4
1825  %i46 = load <4 x float>, ptr %arrayidx36, align 4
1826  %.splatinsert37 = insertelement <4 x float> undef, float %i40, i32 0
1827  %.splat38 = shufflevector <4 x float> %.splatinsert37, <4 x float> undef, <4 x i32> zeroinitializer
1828  %i47 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i46, <4 x float> %.splat38, <4 x float> %i44)
1829  %arrayidx39 = getelementptr inbounds float, ptr %pCoeffs.0, i32 8
1830  %i49 = load <4 x float>, ptr %arrayidx39, align 4
1831  %.splatinsert40 = insertelement <4 x float> undef, float %i39, i32 0
1832  %.splat41 = shufflevector <4 x float> %.splatinsert40, <4 x float> undef, <4 x i32> zeroinitializer
1833  %i50 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i49, <4 x float> %.splat41, <4 x float> %i47)
1834  %arrayidx42 = getelementptr inbounds float, ptr %pCoeffs.0, i32 12
1835  %i52 = load <4 x float>, ptr %arrayidx42, align 4
1836  %.splatinsert43 = insertelement <4 x float> undef, float %i38, i32 0
1837  %.splat44 = shufflevector <4 x float> %.splatinsert43, <4 x float> undef, <4 x i32> zeroinitializer
1838  %i53 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i52, <4 x float> %.splat44, <4 x float> %i50)
1839  %arrayidx45 = getelementptr inbounds float, ptr %pCoeffs.0, i32 16
1840  %i55 = load <4 x float>, ptr %arrayidx45, align 4
1841  %.splatinsert46 = insertelement <4 x float> undef, float %Xn1.0.lcssa, i32 0
1842  %.splat47 = shufflevector <4 x float> %.splatinsert46, <4 x float> undef, <4 x i32> zeroinitializer
1843  %i56 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i55, <4 x float> %.splat47, <4 x float> %i53)
1844  %arrayidx48 = getelementptr inbounds float, ptr %pCoeffs.0, i32 20
1845  %i58 = load <4 x float>, ptr %arrayidx48, align 4
1846  %.splatinsert49 = insertelement <4 x float> undef, float %Xn2.0.lcssa, i32 0
1847  %.splat50 = shufflevector <4 x float> %.splatinsert49, <4 x float> undef, <4 x i32> zeroinitializer
1848  %i59 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i58, <4 x float> %.splat50, <4 x float> %i56)
1849  %arrayidx51 = getelementptr inbounds float, ptr %pCoeffs.0, i32 24
1850  %i61 = load <4 x float>, ptr %arrayidx51, align 4
1851  %.splatinsert52 = insertelement <4 x float> undef, float %Yn1.0.lcssa, i32 0
1852  %.splat53 = shufflevector <4 x float> %.splatinsert52, <4 x float> undef, <4 x i32> zeroinitializer
1853  %i62 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i61, <4 x float> %.splat53, <4 x float> %i59)
1854  %arrayidx54 = getelementptr inbounds float, ptr %pCoeffs.0, i32 28
1855  %i64 = load <4 x float>, ptr %arrayidx54, align 4
1856  %.splatinsert55 = insertelement <4 x float> undef, float %Yn2.0.lcssa, i32 0
1857  %.splat56 = shufflevector <4 x float> %.splatinsert55, <4 x float> undef, <4 x i32> zeroinitializer
1858  %i65 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i64, <4 x float> %.splat56, <4 x float> %i62)
1859  %i66 = extractelement <4 x float> %i65, i32 0
1860  br i1 %cmp57, label %if.then58, label %if.else
1861
; Remainder == 1: store only lane 0 of the tail result.
1862if.then58:                                        ; preds = %if.then
1863  store float %i66, ptr %pOut.1.lcssa, align 4
1864  br label %if.end69
1865
; Remainder != 1: store lanes 0 and 1. A remainder of 2 is finished after this.
1866if.else:                                          ; preds = %if.then
1867  %incdec.ptr62 = getelementptr inbounds float, ptr %pOut.1.lcssa, i32 1
1868  store float %i66, ptr %pOut.1.lcssa, align 4
1869  %i67 = extractelement <4 x float> %i65, i32 1
1870  store float %i67, ptr %incdec.ptr62, align 4
1871  br i1 %cmp60, label %if.end69, label %if.else64
1872
; Remaining case (remainder is neither 0, 1 nor 2): also store lane 2.
1873if.else64:                                        ; preds = %if.else
1874  %incdec.ptr63 = getelementptr inbounds float, ptr %pOut.1.lcssa, i32 2
1875  %i68 = extractelement <4 x float> %i65, i32 2
1876  store float %i68, ptr %incdec.ptr63, align 4
1877  br label %if.end69
1878
; Stage latch. The phis pick the new Xn1/Xn2/Yn1/Yn2 state for whichever path
; was taken (no tail, or a tail of one, two or three samples). The state is
; written back to the four floats at %pState.0, then the state pointer
; advances by 4 floats and the coefficient pointer by 32 floats, and the
; stage counter is decremented.
1879if.end69:                                         ; preds = %if.else64, %if.else, %if.then58, %while.end
1880  %Xn1.1 = phi float [ %i38, %if.then58 ], [ %i40, %if.else64 ], [ %Xn1.0.lcssa, %while.end ], [ %i39, %if.else ]
1881  %Xn2.1 = phi float [ %X3.1.lcssa, %if.then58 ], [ %i39, %if.else64 ], [ %Xn2.0.lcssa, %while.end ], [ %i38, %if.else ]
1882  %Yn1.1 = phi float [ %i66, %if.then58 ], [ %i68, %if.else64 ], [ %Yn1.0.lcssa, %while.end ], [ %i67, %if.else ]
1883  %Yn2.1 = phi float [ %Yn1.0.lcssa, %if.then58 ], [ %i67, %if.else64 ], [ %Yn2.0.lcssa, %while.end ], [ %i66, %if.else ]
1884  %X3.2 = phi float [ %i41, %if.then58 ], [ %i41, %if.else64 ], [ %X3.1.lcssa, %while.end ], [ %i41, %if.else ]
1885  store float %Xn1.1, ptr %pState.0, align 4
1886  store float %Xn2.1, ptr %arrayidx3, align 4
1887  store float %Yn1.1, ptr %arrayidx4, align 4
1888  %incdec.ptr73 = getelementptr inbounds float, ptr %pState.0, i32 4
1889  store float %Yn2.1, ptr %arrayidx5, align 4
1890  %add.ptr74 = getelementptr inbounds float, ptr %pCoeffs.0, i32 32
1891  %dec75 = add i32 %stage.0, -1
1892  %cmp76 = icmp eq i32 %dec75, 0
1893  br i1 %cmp76, label %do.end, label %do.body
1894
1895do.end:                                           ; preds = %if.end69
1896  ret void
1897}
1898
1899
; Instance structure read by @arm_biquad_cascade_df2T_f32 below. That function
; loads field 0 (i8) as %numStages, field 1 (ptr) as %pState and field 2 (ptr)
; as %pCoeffs.
1900%struct.arm_biquad_cascade_df2T_instance_f32 = type { i8, ptr, ptr }
; arm_biquad_cascade_df2T_f32: codegen test for a two-level loop nest built from
; MVE float vector operations.
;
; The outer do-while loop (%do.body .. %if.end) runs once per stage, counting
; %stage.0 down from the zero-extended i8 held in field 0 of %S. The inner loop
; (%while.body) runs blockSize / 2 times and consumes two input floats per
; iteration; an odd trailing sample is handled in %if.then.
; The first stage reads its input from %pSrc, every later stage reads from
; %pDst (see the %pIn.0 phi), and every stage writes its output to %pDst.
;
; The assertions below expect the inner loop to be emitted as a low-overhead
; loop (wls/le on lr) while the outer loop stays a subs/beq counted loop.
1901define void @arm_biquad_cascade_df2T_f32(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
1902; CHECK-LABEL: arm_biquad_cascade_df2T_f32:
1903; CHECK:       @ %bb.0: @ %entry
1904; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, lr}
1905; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, lr}
1906; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
1907; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
1908; CHECK-NEXT:    ldrd r6, r12, [r0, #4]
1909; CHECK-NEXT:    lsr.w r8, r3, #1
1910; CHECK-NEXT:    ldrb r0, [r0]
1911; CHECK-NEXT:    vldr s0, .LCPI20_0
1912; CHECK-NEXT:    b .LBB20_3
1913; CHECK-NEXT:  .LBB20_1: @ %if.else
1914; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
1915; CHECK-NEXT:    vmov.f32 s6, s5
1916; CHECK-NEXT:    vstr s4, [r6]
1917; CHECK-NEXT:  .LBB20_2: @ %if.end
1918; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
1919; CHECK-NEXT:    vstr s6, [r6, #4]
1920; CHECK-NEXT:    add.w r12, r12, #20
1921; CHECK-NEXT:    adds r6, #8
1922; CHECK-NEXT:    subs r0, #1
1923; CHECK-NEXT:    mov r1, r2
1924; CHECK-NEXT:    beq .LBB20_8
1925; CHECK-NEXT:  .LBB20_3: @ %do.body
1926; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1927; CHECK-NEXT:    @ Child Loop BB20_5 Depth 2
1928; CHECK-NEXT:    vldrw.u32 q3, [r12]
1929; CHECK-NEXT:    movs r5, #0
1930; CHECK-NEXT:    vmov q4, q3
1931; CHECK-NEXT:    vshlc q4, r5, #32
1932; CHECK-NEXT:    vldrw.u32 q2, [r12, #8]
1933; CHECK-NEXT:    vmov q5, q2
1934; CHECK-NEXT:    vshlc q5, r5, #32
1935; CHECK-NEXT:    vldrw.u32 q1, [r6]
1936; CHECK-NEXT:    vmov.f32 s6, s0
1937; CHECK-NEXT:    mov r5, r2
1938; CHECK-NEXT:    vmov.f32 s7, s0
1939; CHECK-NEXT:    wls lr, r8, .LBB20_6
1940; CHECK-NEXT:  @ %bb.4: @ %while.body.preheader
1941; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
1942; CHECK-NEXT:    vmov q6, q1
1943; CHECK-NEXT:    mov r5, r2
1944; CHECK-NEXT:  .LBB20_5: @ %while.body
1945; CHECK-NEXT:    @ Parent Loop BB20_3 Depth=1
1946; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1947; CHECK-NEXT:    ldrd r7, r4, [r1], #8
1948; CHECK-NEXT:    vfma.f32 q6, q3, r7
1949; CHECK-NEXT:    vmov r7, s24
1950; CHECK-NEXT:    vmov q1, q6
1951; CHECK-NEXT:    vfma.f32 q1, q2, r7
1952; CHECK-NEXT:    vstr s24, [r5]
1953; CHECK-NEXT:    vmov.f32 s7, s0
1954; CHECK-NEXT:    vfma.f32 q1, q4, r4
1955; CHECK-NEXT:    vmov r4, s5
1956; CHECK-NEXT:    vstr s5, [r5, #4]
1957; CHECK-NEXT:    vfma.f32 q1, q5, r4
1958; CHECK-NEXT:    adds r5, #8
1959; CHECK-NEXT:    vmov.f32 s4, s6
1960; CHECK-NEXT:    vmov.f32 s5, s7
1961; CHECK-NEXT:    vmov.f32 s6, s0
1962; CHECK-NEXT:    vmov q6, q1
1963; CHECK-NEXT:    le lr, .LBB20_5
1964; CHECK-NEXT:  .LBB20_6: @ %while.end
1965; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
1966; CHECK-NEXT:    lsls r7, r3, #31
1967; CHECK-NEXT:    beq .LBB20_1
1968; CHECK-NEXT:  @ %bb.7: @ %if.then
1969; CHECK-NEXT:    @ in Loop: Header=BB20_3 Depth=1
1970; CHECK-NEXT:    ldr r1, [r1]
1971; CHECK-NEXT:    vfma.f32 q1, q3, r1
1972; CHECK-NEXT:    vmov r1, s4
1973; CHECK-NEXT:    vstr s4, [r5]
1974; CHECK-NEXT:    vfma.f32 q1, q2, r1
1975; CHECK-NEXT:    vstr s5, [r6]
1976; CHECK-NEXT:    b .LBB20_2
1977; CHECK-NEXT:  .LBB20_8: @ %do.end
1978; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
1979; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, pc}
1980; CHECK-NEXT:    .p2align 2
1981; CHECK-NEXT:  @ %bb.9:
1982; CHECK-NEXT:  .LCPI20_0:
1983; CHECK-NEXT:    .long 0x00000000 @ float 0
; Load the state pointer (field 1), the stage count (field 0, i8 zero-extended)
; and the coefficient pointer (field 2) from %S, then precompute the
; loop-invariant branch conditions: %cmp.not90 is true when blockSize / 2 == 0
; and %tobool.not is true when blockSize is even.
1984entry:
1985  %pState1 = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f32, ptr %S, i32 0, i32 1
1986  %i = load ptr, ptr %pState1, align 4
1987  %numStages = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f32, ptr %S, i32 0, i32 0
1988  %i1 = load i8, ptr %numStages, align 4
1989  %conv = zext i8 %i1 to i32
1990  %pCoeffs = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f32, ptr %S, i32 0, i32 2
1991  %i2 = load ptr, ptr %pCoeffs, align 4
1992  %div = lshr i32 %blockSize, 1
1993  %cmp.not90 = icmp eq i32 %div, 0
1994  %and = and i32 %blockSize, 1
1995  %tobool.not = icmp eq i32 %and, 0
1996  br label %do.body
1997
; Per-stage setup (outer loop header). %i4 is the <4 x float> at the current
; coefficient pointer and %i6 the <4 x float> two floats further on, so the two
; loads overlap; the coefficient pointer itself advances by five floats per
; stage. %i9 keeps lanes 0-1 of the loaded state and forces lanes 2-3 to 0.0.
; The two vshlc calls use a shift amount of 32 and chain the i32 result of the
; first into the second, producing %i14 and %i18.
; NOTE(review): per the MVE VSHLC definition a 32-bit whole-vector shift should
; move every element up one lane with the carry word entering lane 0 - confirm
; against the Arm documentation.
1998do.body:                                          ; preds = %if.end, %entry
1999  %stage.0 = phi i32 [ %conv, %entry ], [ %dec23, %if.end ]
2000  %pCurCoeffs.0 = phi ptr [ %i2, %entry ], [ %add.ptr2, %if.end ]
2001  %pState.0 = phi ptr [ %i, %entry ], [ %pState.1, %if.end ]
2002  %pIn.0 = phi ptr [ %pSrc, %entry ], [ %pDst, %if.end ]
2003  %i4 = load <4 x float>, ptr %pCurCoeffs.0, align 4
2004  %add.ptr = getelementptr inbounds float, ptr %pCurCoeffs.0, i32 2
2005  %i6 = load <4 x float>, ptr %add.ptr, align 4
2006  %add.ptr2 = getelementptr inbounds float, ptr %pCurCoeffs.0, i32 5
2007  %i8 = load <4 x float>, ptr %pState.0, align 8
2008  %i9 = shufflevector <4 x float> %i8, <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
2009  %i10 = bitcast <4 x float> %i4 to <4 x i32>
2010  %i11 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %i10, i32 0, i32 32)
2011  %i12 = extractvalue { i32, <4 x i32> } %i11, 0
2012  %i13 = extractvalue { i32, <4 x i32> } %i11, 1
2013  %i14 = bitcast <4 x i32> %i13 to <4 x float>
2014  %i15 = bitcast <4 x float> %i6 to <4 x i32>
2015  %i16 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %i15, i32 %i12, i32 32)
2016  %i17 = extractvalue { i32, <4 x i32> } %i16, 1
2017  %i18 = bitcast <4 x i32> %i17 to <4 x float>
2018  br i1 %cmp.not90, label %while.end, label %while.body
2019
; Inner loop: two samples per iteration, %div iterations in total.
;   %i21 = %i4 * splat(sample 0) + state; lane 0 (%i22) is the first output.
;   %i23 = %i6 * splat(%i21 lane 0) + %i21, then lane 3 is zeroed (%i24).
;   %i25 = %i14 * splat(sample 1) + %i24; lane 1 (%i26) is the second output.
;   %i27 = %i18 * splat(%i25 lane 1) + %i25.
; The next state %i30 takes lanes 2 and 3 of %i27 as its lanes 0 and 1 and has
; 0.0 in lane 2.
2020while.body:                                       ; preds = %while.body, %do.body
2021  %pIn.194 = phi ptr [ %incdec.ptr4, %while.body ], [ %pIn.0, %do.body ]
2022  %state.093 = phi <4 x float> [ %i30, %while.body ], [ %i9, %do.body ]
2023  %pOut.192 = phi ptr [ %incdec.ptr12, %while.body ], [ %pDst, %do.body ]
2024  %sample.091 = phi i32 [ %dec, %while.body ], [ %div, %do.body ]
2025  %incdec.ptr = getelementptr inbounds float, ptr %pIn.194, i32 1
2026  %i19 = load float, ptr %pIn.194, align 4
2027  %incdec.ptr4 = getelementptr inbounds float, ptr %pIn.194, i32 2
2028  %i20 = load float, ptr %incdec.ptr, align 4
2029  %.splatinsert = insertelement <4 x float> poison, float %i19, i32 0
2030  %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
2031  %i21 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i4, <4 x float> %.splat, <4 x float> %state.093)
2032  %i22 = extractelement <4 x float> %i21, i32 0
2033  %.splat6 = shufflevector <4 x float> %i21, <4 x float> poison, <4 x i32> zeroinitializer
2034  %i23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i6, <4 x float> %.splat6, <4 x float> %i21)
2035  %i24 = insertelement <4 x float> %i23, float 0.000000e+00, i32 3
2036  %.splatinsert7 = insertelement <4 x float> poison, float %i20, i32 0
2037  %.splat8 = shufflevector <4 x float> %.splatinsert7, <4 x float> poison, <4 x i32> zeroinitializer
2038  %i25 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i14, <4 x float> %.splat8, <4 x float> %i24)
2039  %i26 = extractelement <4 x float> %i25, i32 1
2040  %.splat10 = shufflevector <4 x float> %i25, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
2041  %i27 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i18, <4 x float> %.splat10, <4 x float> %i25)
2042  %i28 = shufflevector <4 x float> %i27, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 3>
2043  %i29 = insertelement <4 x float> %i28, float 0.000000e+00, i32 2
2044  %i30 = shufflevector <4 x float> %i29, <4 x float> %i27, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
2045  %incdec.ptr11 = getelementptr inbounds float, ptr %pOut.192, i32 1
2046  store float %i22, ptr %pOut.192, align 4
2047  %incdec.ptr12 = getelementptr inbounds float, ptr %pOut.192, i32 2
2048  store float %i26, ptr %incdec.ptr11, align 4
2049  %dec = add nsw i32 %sample.091, -1
2050  %cmp.not = icmp eq i32 %dec, 0
2051  br i1 %cmp.not, label %while.end, label %while.body
2052
; Merge point for the zero-trip and normal exits of the inner loop; branches on
; the parity of blockSize.
2053while.end:                                        ; preds = %while.body, %do.body
2054  %pOut.1.lcssa = phi ptr [ %pDst, %do.body ], [ %incdec.ptr12, %while.body ]
2055  %state.0.lcssa = phi <4 x float> [ %i9, %do.body ], [ %i30, %while.body ]
2056  %pIn.1.lcssa = phi ptr [ %pIn.0, %do.body ], [ %incdec.ptr4, %while.body ]
2057  br i1 %tobool.not, label %if.else, label %if.then
2058
; Odd blockSize: process the one remaining sample with the unshifted
; coefficients %i4 / %i6, store its output, and write lane 1 of %i34 to the
; first state slot. Lane 2 (%i36) reaches the second slot through the %.sink
; phi in %if.end.
2059if.then:                                          ; preds = %while.end
2060  %i31 = load float, ptr %pIn.1.lcssa, align 4
2061  %.splatinsert14 = insertelement <4 x float> poison, float %i31, i32 0
2062  %.splat15 = shufflevector <4 x float> %.splatinsert14, <4 x float> poison, <4 x i32> zeroinitializer
2063  %i32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i4, <4 x float> %.splat15, <4 x float> %state.0.lcssa)
2064  %i33 = extractelement <4 x float> %i32, i32 0
2065  %.splat17 = shufflevector <4 x float> %i32, <4 x float> poison, <4 x i32> zeroinitializer
2066  %i34 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %i6, <4 x float> %.splat17, <4 x float> %i32)
2067  store float %i33, ptr %pOut.1.lcssa, align 4
2068  %i35 = extractelement <4 x float> %i34, i32 1
2069  store float %i35, ptr %pState.0, align 4
2070  %i36 = extractelement <4 x float> %i34, i32 2
2071  br label %if.end
2072
; Even blockSize: write state lane 0 to the first state slot; lane 1 (%i38)
; reaches the second slot through the %.sink phi in %if.end.
2073if.else:                                          ; preds = %while.end
2074  %i37 = extractelement <4 x float> %state.0.lcssa, i32 0
2075  store float %i37, ptr %pState.0, align 4
2076  %i38 = extractelement <4 x float> %state.0.lcssa, i32 1
2077  br label %if.end
2078
; Outer loop latch: store the second state word, advance the state pointer by
; two floats, decrement the stage counter and loop until it reaches zero.
2079if.end:                                           ; preds = %if.else, %if.then
2080  %.sink = phi float [ %i38, %if.else ], [ %i36, %if.then ]
2081  %i39 = getelementptr inbounds float, ptr %pState.0, i32 1
2082  store float %.sink, ptr %i39, align 4
2083  %pState.1 = getelementptr inbounds float, ptr %pState.0, i32 2
2084  %dec23 = add i32 %stage.0, -1
2085  %cmp24.not = icmp eq i32 %dec23, 0
2086  br i1 %cmp24.not, label %do.end, label %do.body
2087
2088do.end:                                           ; preds = %if.end
2089  ret void
2090}
2091
; vecAddAcrossF32Mve: sums the four lanes of %in with a chain of three scalar
; `fadd fast` operations on the extracted elements (lane 0 + lane 1, then
; + lane 2, then + lane 3). The assertions expect exactly three scalar vadd.f32
; instructions accumulating s1, s2 and s3 into s0, followed by the return.
2092define arm_aapcs_vfpcc float @vecAddAcrossF32Mve(<4 x float> %in) {
2093; CHECK-LABEL: vecAddAcrossF32Mve:
2094; CHECK:       @ %bb.0: @ %entry
2095; CHECK-NEXT:    vadd.f32 s0, s0, s1
2096; CHECK-NEXT:    vadd.f32 s0, s0, s2
2097; CHECK-NEXT:    vadd.f32 s0, s0, s3
2098; CHECK-NEXT:    bx lr
2099entry:
2100  %i = extractelement <4 x float> %in, i32 0
2101  %i1 = extractelement <4 x float> %in, i32 1
2102  %add = fadd fast float %i, %i1
2103  %i2 = extractelement <4 x float> %in, i32 2
2104  %add1 = fadd fast float %add, %i2
2105  %i3 = extractelement <4 x float> %in, i32 3
2106  %add2 = fadd fast float %add1, %i3
2107  ret float %add2
2108}
2109
2110
; Declarations of the LLVM intrinsics called by the test functions in this file.
; NOTE(review): @llvm.arm.mve.vshlc.v4i32 references attribute group #1, which is
; not defined in the portion of the file visible here - confirm this is intended.
2111declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32>, i32, i32) #1
2112declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
2113declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
2114declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
2115declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0.v4i32(ptr, <4 x i32>, i32, i32, i32)
2116declare void @llvm.assume(i1)
2117declare <4 x i1> @llvm.arm.mve.vctp32(i32)
2118declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
2119declare void @llvm.masked.store.v4f32.p0(<4 x float>, ptr, i32 immarg, <4 x i1>)
2120