xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll (revision e0ed0333f0fed2e73f805afd58b61176a87aa3ad)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
3
; test_fadd: vector loop computing C[i..i+7] = A[i..i+7] + splat(%B) on
; <8 x half> chunks, stepping the index by 8 until it equals %n.
; The splat is the SECOND fadd operand. The CHECK lines expect the splat to be
; folded into the scalar-operand form `vadd.f16 q0, q0, r3`, with only a
; `vmov.f16 r3, s0` ahead of the loop.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
4define arm_aapcs_vfpcc void @test_fadd(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
5; CHECK-LABEL: test_fadd:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    cmp r2, #1
8; CHECK-NEXT:    it lt
9; CHECK-NEXT:    bxlt lr
10; CHECK-NEXT:  .LBB0_1: @ %vector.ph
11; CHECK-NEXT:    vmov.f16 r3, s0
12; CHECK-NEXT:  .LBB0_2: @ %vector.body
13; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
14; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
15; CHECK-NEXT:    subs r2, #8
16; CHECK-NEXT:    vadd.f16 q0, q0, r3
17; CHECK-NEXT:    vstrb.8 q0, [r1], #16
18; CHECK-NEXT:    bne .LBB0_2
19; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
20; CHECK-NEXT:    bx lr
21entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
22  %i = and i32 %n, 7
23  %cmp = icmp eq i32 %i, 0
24  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
25  %cmp18 = icmp sgt i32 %n, 0
26  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
27
28vector.ph:                                        ; preds = %entry
; Broadcast the scalar %B into all 8 lanes (insertelement + zero-mask shuffle).
29  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
30  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
31  br label %vector.body
32
33vector.body:                                      ; preds = %vector.body, %vector.ph
34  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
35  %i1 = getelementptr inbounds half, ptr %A, i32 %index
36  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: loaded vector + splat.
37  %i3 = fadd fast <8 x half> %wide.load, %broadcast.splat11
38  %i4 = getelementptr inbounds half, ptr %C, i32 %index
39  store <8 x half> %i3, ptr %i4, align 4
40  %index.next = add i32 %index, 8
41  %i6 = icmp eq i32 %index.next, %n
42  br i1 %i6, label %for.cond.cleanup, label %vector.body
43
44for.cond.cleanup:                                 ; preds = %vector.body, %entry
45  ret void
46}
47
; test_fadd_r: same loop as test_fadd but with the fadd operands reversed
; (splat(%B) is the FIRST operand). The CHECK lines expect the same
; scalar-operand form `vadd.f16 q0, q0, r3` as the non-reversed variant.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
48define arm_aapcs_vfpcc void @test_fadd_r(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
49; CHECK-LABEL: test_fadd_r:
50; CHECK:       @ %bb.0: @ %entry
51; CHECK-NEXT:    cmp r2, #1
52; CHECK-NEXT:    it lt
53; CHECK-NEXT:    bxlt lr
54; CHECK-NEXT:  .LBB1_1: @ %vector.ph
55; CHECK-NEXT:    vmov.f16 r3, s0
56; CHECK-NEXT:  .LBB1_2: @ %vector.body
57; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
58; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
59; CHECK-NEXT:    subs r2, #8
60; CHECK-NEXT:    vadd.f16 q0, q0, r3
61; CHECK-NEXT:    vstrb.8 q0, [r1], #16
62; CHECK-NEXT:    bne .LBB1_2
63; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
64; CHECK-NEXT:    bx lr
65entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
66  %i = and i32 %n, 7
67  %cmp = icmp eq i32 %i, 0
68  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
69  %cmp18 = icmp sgt i32 %n, 0
70  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
71
72vector.ph:                                        ; preds = %entry
; Broadcast the scalar %B into all 8 lanes.
73  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
74  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
75  br label %vector.body
76
77vector.body:                                      ; preds = %vector.body, %vector.ph
78  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
79  %i1 = getelementptr inbounds half, ptr %A, i32 %index
80  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: splat + loaded vector (reversed operand order).
81  %i3 = fadd fast <8 x half> %broadcast.splat11, %wide.load
82  %i4 = getelementptr inbounds half, ptr %C, i32 %index
83  store <8 x half> %i3, ptr %i4, align 4
84  %index.next = add i32 %index, 8
85  %i6 = icmp eq i32 %index.next, %n
86  br i1 %i6, label %for.cond.cleanup, label %vector.body
87
88for.cond.cleanup:                                 ; preds = %vector.body, %entry
89  ret void
90}
91
; test_fmul: vector loop computing C[i..i+7] = A[i..i+7] * splat(%B) on
; <8 x half> chunks, stepping the index by 8 until it equals %n.
; The splat is the SECOND fmul operand. The CHECK lines expect the
; scalar-operand form `vmul.f16 q0, q0, r3`.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
92define arm_aapcs_vfpcc void @test_fmul(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
93; CHECK-LABEL: test_fmul:
94; CHECK:       @ %bb.0: @ %entry
95; CHECK-NEXT:    cmp r2, #1
96; CHECK-NEXT:    it lt
97; CHECK-NEXT:    bxlt lr
98; CHECK-NEXT:  .LBB2_1: @ %vector.ph
99; CHECK-NEXT:    vmov.f16 r3, s0
100; CHECK-NEXT:  .LBB2_2: @ %vector.body
101; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
102; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
103; CHECK-NEXT:    subs r2, #8
104; CHECK-NEXT:    vmul.f16 q0, q0, r3
105; CHECK-NEXT:    vstrb.8 q0, [r1], #16
106; CHECK-NEXT:    bne .LBB2_2
107; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
108; CHECK-NEXT:    bx lr
109entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
110  %i = and i32 %n, 7
111  %cmp = icmp eq i32 %i, 0
112  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
113  %cmp18 = icmp sgt i32 %n, 0
114  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
115
116vector.ph:                                        ; preds = %entry
; Broadcast the scalar %B into all 8 lanes.
117  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
118  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
119  br label %vector.body
120
121vector.body:                                      ; preds = %vector.body, %vector.ph
122  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
123  %i1 = getelementptr inbounds half, ptr %A, i32 %index
124  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: loaded vector * splat.
125  %i3 = fmul fast <8 x half> %wide.load, %broadcast.splat11
126  %i4 = getelementptr inbounds half, ptr %C, i32 %index
127  store <8 x half> %i3, ptr %i4, align 4
128  %index.next = add i32 %index, 8
129  %i6 = icmp eq i32 %index.next, %n
130  br i1 %i6, label %for.cond.cleanup, label %vector.body
131
132for.cond.cleanup:                                 ; preds = %vector.body, %entry
133  ret void
134}
135
; test_fmul_r: same loop as test_fmul but with the fmul operands reversed
; (splat(%B) is the FIRST operand). The CHECK lines expect the same
; scalar-operand form `vmul.f16 q0, q0, r3` as the non-reversed variant.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
136define arm_aapcs_vfpcc void @test_fmul_r(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
137; CHECK-LABEL: test_fmul_r:
138; CHECK:       @ %bb.0: @ %entry
139; CHECK-NEXT:    cmp r2, #1
140; CHECK-NEXT:    it lt
141; CHECK-NEXT:    bxlt lr
142; CHECK-NEXT:  .LBB3_1: @ %vector.ph
143; CHECK-NEXT:    vmov.f16 r3, s0
144; CHECK-NEXT:  .LBB3_2: @ %vector.body
145; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
146; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
147; CHECK-NEXT:    subs r2, #8
148; CHECK-NEXT:    vmul.f16 q0, q0, r3
149; CHECK-NEXT:    vstrb.8 q0, [r1], #16
150; CHECK-NEXT:    bne .LBB3_2
151; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
152; CHECK-NEXT:    bx lr
153entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
154  %i = and i32 %n, 7
155  %cmp = icmp eq i32 %i, 0
156  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
157  %cmp18 = icmp sgt i32 %n, 0
158  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
159
160vector.ph:                                        ; preds = %entry
; Broadcast the scalar %B into all 8 lanes.
161  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
162  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
163  br label %vector.body
164
165vector.body:                                      ; preds = %vector.body, %vector.ph
166  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
167  %i1 = getelementptr inbounds half, ptr %A, i32 %index
168  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: splat * loaded vector (reversed operand order).
169  %i3 = fmul fast <8 x half> %broadcast.splat11, %wide.load
170  %i4 = getelementptr inbounds half, ptr %C, i32 %index
171  store <8 x half> %i3, ptr %i4, align 4
172  %index.next = add i32 %index, 8
173  %i6 = icmp eq i32 %index.next, %n
174  br i1 %i6, label %for.cond.cleanup, label %vector.body
175
176for.cond.cleanup:                                 ; preds = %vector.body, %entry
177  ret void
178}
179
; test_fsub: vector loop computing C[i..i+7] = A[i..i+7] - splat(%B) on
; <8 x half> chunks, stepping the index by 8 until it equals %n.
; The splat is the subtrahend (second operand). The CHECK lines expect the
; scalar-operand form `vsub.f16 q0, q0, r3`.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
180define arm_aapcs_vfpcc void @test_fsub(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
181; CHECK-LABEL: test_fsub:
182; CHECK:       @ %bb.0: @ %entry
183; CHECK-NEXT:    cmp r2, #1
184; CHECK-NEXT:    it lt
185; CHECK-NEXT:    bxlt lr
186; CHECK-NEXT:  .LBB4_1: @ %vector.ph
187; CHECK-NEXT:    vmov.f16 r3, s0
188; CHECK-NEXT:  .LBB4_2: @ %vector.body
189; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
190; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
191; CHECK-NEXT:    subs r2, #8
192; CHECK-NEXT:    vsub.f16 q0, q0, r3
193; CHECK-NEXT:    vstrb.8 q0, [r1], #16
194; CHECK-NEXT:    bne .LBB4_2
195; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
196; CHECK-NEXT:    bx lr
197entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
198  %i = and i32 %n, 7
199  %cmp = icmp eq i32 %i, 0
200  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
201  %cmp18 = icmp sgt i32 %n, 0
202  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
203
204vector.ph:                                        ; preds = %entry
; Broadcast the scalar %B into all 8 lanes.
205  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
206  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
207  br label %vector.body
208
209vector.body:                                      ; preds = %vector.body, %vector.ph
210  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
211  %i1 = getelementptr inbounds half, ptr %A, i32 %index
212  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: loaded vector - splat.
213  %i3 = fsub fast <8 x half> %wide.load, %broadcast.splat11
214  %i4 = getelementptr inbounds half, ptr %C, i32 %index
215  store <8 x half> %i3, ptr %i4, align 4
216  %index.next = add i32 %index, 8
217  %i6 = icmp eq i32 %index.next, %n
218  br i1 %i6, label %for.cond.cleanup, label %vector.body
219
220for.cond.cleanup:                                 ; preds = %vector.body, %entry
221  ret void
222}
223
; test_fsub_r: same loop as test_fsub but with the fsub operands reversed,
; i.e. C[i..i+7] = splat(%B) - A[i..i+7].
; Unlike test_fsub, the CHECK lines here do NOT use a scalar-operand vsub:
; they expect a `vdup.16 q0, r3` ahead of the loop and the vector-vector form
; `vsub.f16 q1, q0, q1` inside it.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
224define arm_aapcs_vfpcc void @test_fsub_r(ptr noalias nocapture readonly %A, half %B, ptr noalias nocapture %C, i32 %n) {
225; CHECK-LABEL: test_fsub_r:
226; CHECK:       @ %bb.0: @ %entry
227; CHECK-NEXT:    cmp r2, #1
228; CHECK-NEXT:    it lt
229; CHECK-NEXT:    bxlt lr
230; CHECK-NEXT:  .LBB5_1: @ %vector.ph
231; CHECK-NEXT:    vmov.f16 r3, s0
232; CHECK-NEXT:    vdup.16 q0, r3
233; CHECK-NEXT:  .LBB5_2: @ %vector.body
234; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
235; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
236; CHECK-NEXT:    subs r2, #8
237; CHECK-NEXT:    vsub.f16 q1, q0, q1
238; CHECK-NEXT:    vstrb.8 q1, [r1], #16
239; CHECK-NEXT:    bne .LBB5_2
240; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
241; CHECK-NEXT:    bx lr
242entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
243  %i = and i32 %n, 7
244  %cmp = icmp eq i32 %i, 0
245  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
246  %cmp18 = icmp sgt i32 %n, 0
247  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
248
249vector.ph:                                        ; preds = %entry
; Broadcast the scalar %B into all 8 lanes.
250  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
251  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
252  br label %vector.body
253
254vector.body:                                      ; preds = %vector.body, %vector.ph
255  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
256  %i1 = getelementptr inbounds half, ptr %A, i32 %index
257  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: splat - loaded vector (splat is the minuend).
258  %i3 = fsub fast <8 x half> %broadcast.splat11, %wide.load
259  %i4 = getelementptr inbounds half, ptr %C, i32 %index
260  store <8 x half> %i3, ptr %i4, align 4
261  %index.next = add i32 %index, 8
262  %i6 = icmp eq i32 %index.next, %n
263  br i1 %i6, label %for.cond.cleanup, label %vector.body
264
265for.cond.cleanup:                                 ; preds = %vector.body, %entry
266  ret void
267}
268
269
; test_fmas: vector loop computing
;   D[i..i+7] = (B[i..i+7] * A[i..i+7]) + splat(%C)
; on <8 x half> chunks, stepping the index by 8 until it equals %n.
; The splat is the addend and the SECOND fadd operand. The CHECK lines expect
; the fmul+fadd pair to become the scalar-operand `vfmas.f16 q1, q0, r12`.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
270define arm_aapcs_vfpcc void @test_fmas(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
271; CHECK-LABEL: test_fmas:
272; CHECK:       @ %bb.0: @ %entry
273; CHECK-NEXT:    cmp r3, #1
274; CHECK-NEXT:    it lt
275; CHECK-NEXT:    bxlt lr
276; CHECK-NEXT:  .LBB6_1: @ %vector.ph
277; CHECK-NEXT:    vmov.f16 r12, s0
278; CHECK-NEXT:  .LBB6_2: @ %vector.body
279; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
280; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
281; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
282; CHECK-NEXT:    subs r3, #8
283; CHECK-NEXT:    vfmas.f16 q1, q0, r12
284; CHECK-NEXT:    vstrb.8 q1, [r2], #16
285; CHECK-NEXT:    bne .LBB6_2
286; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
287; CHECK-NEXT:    bx lr
288entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
289  %i = and i32 %n, 7
290  %cmp = icmp eq i32 %i, 0
291  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
292  %cmp110 = icmp sgt i32 %n, 0
293  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
294
295vector.ph:                                        ; preds = %entry
; Broadcast the scalar %C into all 8 lanes.
296  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
297  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
298  br label %vector.body
299
300vector.body:                                      ; preds = %vector.body, %vector.ph
301  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
302  %i1 = getelementptr inbounds half, ptr %A, i32 %index
303  %wide.load = load <8 x half>, ptr %i1, align 4
304  %i3 = getelementptr inbounds half, ptr %B, i32 %index
305  %wide.load12 = load <8 x half>, ptr %i3, align 4
; Operation under test: (vector * vector) + splat.
306  %i5 = fmul fast <8 x half> %wide.load12, %wide.load
307  %i6 = fadd fast <8 x half> %i5, %broadcast.splat14
308  %i7 = getelementptr inbounds half, ptr %D, i32 %index
309  store <8 x half> %i6, ptr %i7, align 4
310  %index.next = add i32 %index, 8
311  %i9 = icmp eq i32 %index.next, %n
312  br i1 %i9, label %for.cond.cleanup, label %vector.body
313
314for.cond.cleanup:                                 ; preds = %vector.body, %entry
315  ret void
316}
317
; test_fmas_r: same loop as test_fmas but with the fadd operands reversed,
; i.e. D[i..i+7] = splat(%C) + (B[i..i+7] * A[i..i+7]).
; The CHECK lines expect the same scalar-operand `vfmas.f16 q1, q0, r12` as
; the non-reversed variant.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
318define arm_aapcs_vfpcc void @test_fmas_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
319; CHECK-LABEL: test_fmas_r:
320; CHECK:       @ %bb.0: @ %entry
321; CHECK-NEXT:    cmp r3, #1
322; CHECK-NEXT:    it lt
323; CHECK-NEXT:    bxlt lr
324; CHECK-NEXT:  .LBB7_1: @ %vector.ph
325; CHECK-NEXT:    vmov.f16 r12, s0
326; CHECK-NEXT:  .LBB7_2: @ %vector.body
327; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
328; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
329; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
330; CHECK-NEXT:    subs r3, #8
331; CHECK-NEXT:    vfmas.f16 q1, q0, r12
332; CHECK-NEXT:    vstrb.8 q1, [r2], #16
333; CHECK-NEXT:    bne .LBB7_2
334; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
335; CHECK-NEXT:    bx lr
336entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
337  %i = and i32 %n, 7
338  %cmp = icmp eq i32 %i, 0
339  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
340  %cmp110 = icmp sgt i32 %n, 0
341  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
342
343vector.ph:                                        ; preds = %entry
; Broadcast the scalar %C into all 8 lanes.
344  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
345  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
346  br label %vector.body
347
348vector.body:                                      ; preds = %vector.body, %vector.ph
349  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
350  %i1 = getelementptr inbounds half, ptr %A, i32 %index
351  %wide.load = load <8 x half>, ptr %i1, align 4
352  %i3 = getelementptr inbounds half, ptr %B, i32 %index
353  %wide.load12 = load <8 x half>, ptr %i3, align 4
; Operation under test: splat + (vector * vector), splat as first fadd operand.
354  %i5 = fmul fast <8 x half> %wide.load12, %wide.load
355  %i6 = fadd fast <8 x half> %broadcast.splat14, %i5
356  %i7 = getelementptr inbounds half, ptr %D, i32 %index
357  store <8 x half> %i6, ptr %i7, align 4
358  %index.next = add i32 %index, 8
359  %i9 = icmp eq i32 %index.next, %n
360  br i1 %i9, label %for.cond.cleanup, label %vector.body
361
362for.cond.cleanup:                                 ; preds = %vector.body, %entry
363  ret void
364}
365
; test_fma: vector loop computing
;   D[i..i+7] = (A[i..i+7] * splat(%C)) + B[i..i+7]
; on <8 x half> chunks, stepping the index by 8 until it equals %n.
; Here the splat is a MULTIPLICAND (second fmul operand) and the addend is a
; loaded vector. The CHECK lines expect the scalar-operand
; `vfma.f16 q1, q0, r12`.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
366define arm_aapcs_vfpcc void @test_fma(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
367; CHECK-LABEL: test_fma:
368; CHECK:       @ %bb.0: @ %entry
369; CHECK-NEXT:    cmp r3, #1
370; CHECK-NEXT:    it lt
371; CHECK-NEXT:    bxlt lr
372; CHECK-NEXT:  .LBB8_1: @ %vector.ph
373; CHECK-NEXT:    vmov.f16 r12, s0
374; CHECK-NEXT:  .LBB8_2: @ %vector.body
375; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
376; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
377; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
378; CHECK-NEXT:    subs r3, #8
379; CHECK-NEXT:    vfma.f16 q1, q0, r12
380; CHECK-NEXT:    vstrb.8 q1, [r2], #16
381; CHECK-NEXT:    bne .LBB8_2
382; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
383; CHECK-NEXT:    bx lr
384entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
385  %i = and i32 %n, 7
386  %cmp = icmp eq i32 %i, 0
387  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
388  %cmp110 = icmp sgt i32 %n, 0
389  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
390
391vector.ph:                                        ; preds = %entry
; Broadcast the scalar %C into all 8 lanes.
392  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
393  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
394  br label %vector.body
395
396vector.body:                                      ; preds = %vector.body, %vector.ph
397  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
398  %i1 = getelementptr inbounds half, ptr %A, i32 %index
399  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: (vector * splat) + vector.
400  %i3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
401  %i4 = getelementptr inbounds half, ptr %B, i32 %index
402  %wide.load14 = load <8 x half>, ptr %i4, align 4
403  %i6 = fadd fast <8 x half> %i3, %wide.load14
404  %i7 = getelementptr inbounds half, ptr %D, i32 %index
405  store <8 x half> %i6, ptr %i7, align 4
406  %index.next = add i32 %index, 8
407  %i9 = icmp eq i32 %index.next, %n
408  br i1 %i9, label %for.cond.cleanup, label %vector.body
409
410for.cond.cleanup:                                 ; preds = %vector.body, %entry
411  ret void
412}
413
; test_fma_r: same loop as test_fma but with the fmul operands reversed,
; i.e. D[i..i+7] = (splat(%C) * A[i..i+7]) + B[i..i+7].
; The CHECK lines expect the same scalar-operand `vfma.f16 q1, q0, r12` as
; the non-reversed variant.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
414define arm_aapcs_vfpcc void @test_fma_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
415; CHECK-LABEL: test_fma_r:
416; CHECK:       @ %bb.0: @ %entry
417; CHECK-NEXT:    cmp r3, #1
418; CHECK-NEXT:    it lt
419; CHECK-NEXT:    bxlt lr
420; CHECK-NEXT:  .LBB9_1: @ %vector.ph
421; CHECK-NEXT:    vmov.f16 r12, s0
422; CHECK-NEXT:  .LBB9_2: @ %vector.body
423; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
424; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
425; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
426; CHECK-NEXT:    subs r3, #8
427; CHECK-NEXT:    vfma.f16 q1, q0, r12
428; CHECK-NEXT:    vstrb.8 q1, [r2], #16
429; CHECK-NEXT:    bne .LBB9_2
430; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
431; CHECK-NEXT:    bx lr
432entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
433  %i = and i32 %n, 7
434  %cmp = icmp eq i32 %i, 0
435  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
436  %cmp110 = icmp sgt i32 %n, 0
437  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
438
439vector.ph:                                        ; preds = %entry
; Broadcast the scalar %C into all 8 lanes.
440  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
441  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
442  br label %vector.body
443
444vector.body:                                      ; preds = %vector.body, %vector.ph
445  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
446  %i1 = getelementptr inbounds half, ptr %A, i32 %index
447  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: (splat * vector) + vector, splat as first fmul operand.
448  %i3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
449  %i4 = getelementptr inbounds half, ptr %B, i32 %index
450  %wide.load14 = load <8 x half>, ptr %i4, align 4
451  %i6 = fadd fast <8 x half> %i3, %wide.load14
452  %i7 = getelementptr inbounds half, ptr %D, i32 %index
453  store <8 x half> %i6, ptr %i7, align 4
454  %index.next = add i32 %index, 8
455  %i9 = icmp eq i32 %index.next, %n
456  br i1 %i9, label %for.cond.cleanup, label %vector.body
457
458for.cond.cleanup:                                 ; preds = %vector.body, %entry
459  ret void
460}
461
462
; test_fmss: vector loop computing
;   D[i..i+7] = (B[i..i+7] * A[i..i+7]) - splat(%C)
; on <8 x half> chunks, stepping the index by 8 until it equals %n.
; The CHECK lines do not expect a scalar-operand instruction here: the splat
; is materialised and negated once ahead of the loop (`vdup.16` + `vneg.f16`),
; then copied into the accumulator each iteration (`vmov q3, q0`) and fed to
; a vector-vector `vfma.f16 q3, q2, q1`.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
463define arm_aapcs_vfpcc void @test_fmss(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
464; CHECK-LABEL: test_fmss:
465; CHECK:       @ %bb.0: @ %entry
466; CHECK-NEXT:    cmp r3, #1
467; CHECK-NEXT:    it lt
468; CHECK-NEXT:    bxlt lr
469; CHECK-NEXT:  .LBB10_1: @ %vector.ph
470; CHECK-NEXT:    vmov.f16 r12, s0
471; CHECK-NEXT:    vdup.16 q0, r12
472; CHECK-NEXT:    vneg.f16 q0, q0
473; CHECK-NEXT:  .LBB10_2: @ %vector.body
474; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
475; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
476; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
477; CHECK-NEXT:    vmov q3, q0
478; CHECK-NEXT:    subs r3, #8
479; CHECK-NEXT:    vfma.f16 q3, q2, q1
480; CHECK-NEXT:    vstrb.8 q3, [r2], #16
481; CHECK-NEXT:    bne .LBB10_2
482; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
483; CHECK-NEXT:    bx lr
484entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
485  %i = and i32 %n, 7
486  %cmp = icmp eq i32 %i, 0
487  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
488  %cmp110 = icmp sgt i32 %n, 0
489  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
490
491vector.ph:                                        ; preds = %entry
; Broadcast the scalar %C into all 8 lanes.
492  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
493  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
494  br label %vector.body
495
496vector.body:                                      ; preds = %vector.body, %vector.ph
497  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
498  %i1 = getelementptr inbounds half, ptr %A, i32 %index
499  %wide.load = load <8 x half>, ptr %i1, align 4
500  %i3 = getelementptr inbounds half, ptr %B, i32 %index
501  %wide.load12 = load <8 x half>, ptr %i3, align 4
; Operation under test: (vector * vector) - splat.
502  %i5 = fmul fast <8 x half> %wide.load12, %wide.load
503  %i6 = fsub fast <8 x half> %i5, %broadcast.splat14
504  %i7 = getelementptr inbounds half, ptr %D, i32 %index
505  store <8 x half> %i6, ptr %i7, align 4
506  %index.next = add i32 %index, 8
507  %i9 = icmp eq i32 %index.next, %n
508  br i1 %i9, label %for.cond.cleanup, label %vector.body
509
510for.cond.cleanup:                                 ; preds = %vector.body, %entry
511  ret void
512}
513
; test_fmss_r: same loop as test_fmss but with the fsub operands reversed,
; i.e. D[i..i+7] = splat(%C) - (B[i..i+7] * A[i..i+7]).
; The CHECK lines expect the splat to be materialised once ahead of the loop
; (`vdup.16 q0, r12`, no negation), copied into the accumulator each
; iteration (`vmov q3, q0`) and fed to a vector-vector `vfms.f16 q3, q2, q1`.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
514define arm_aapcs_vfpcc void @test_fmss_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
515; CHECK-LABEL: test_fmss_r:
516; CHECK:       @ %bb.0: @ %entry
517; CHECK-NEXT:    cmp r3, #1
518; CHECK-NEXT:    it lt
519; CHECK-NEXT:    bxlt lr
520; CHECK-NEXT:  .LBB11_1: @ %vector.ph
521; CHECK-NEXT:    vmov.f16 r12, s0
522; CHECK-NEXT:    vdup.16 q0, r12
523; CHECK-NEXT:  .LBB11_2: @ %vector.body
524; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
525; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
526; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
527; CHECK-NEXT:    vmov q3, q0
528; CHECK-NEXT:    subs r3, #8
529; CHECK-NEXT:    vfms.f16 q3, q2, q1
530; CHECK-NEXT:    vstrb.8 q3, [r2], #16
531; CHECK-NEXT:    bne .LBB11_2
532; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
533; CHECK-NEXT:    bx lr
534entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
535  %i = and i32 %n, 7
536  %cmp = icmp eq i32 %i, 0
537  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
538  %cmp110 = icmp sgt i32 %n, 0
539  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
540
541vector.ph:                                        ; preds = %entry
; Broadcast the scalar %C into all 8 lanes.
542  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
543  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
544  br label %vector.body
545
546vector.body:                                      ; preds = %vector.body, %vector.ph
547  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
548  %i1 = getelementptr inbounds half, ptr %A, i32 %index
549  %wide.load = load <8 x half>, ptr %i1, align 4
550  %i3 = getelementptr inbounds half, ptr %B, i32 %index
551  %wide.load12 = load <8 x half>, ptr %i3, align 4
; Operation under test: splat - (vector * vector).
552  %i5 = fmul fast <8 x half> %wide.load12, %wide.load
553  %i6 = fsub fast <8 x half> %broadcast.splat14, %i5
554  %i7 = getelementptr inbounds half, ptr %D, i32 %index
555  store <8 x half> %i6, ptr %i7, align 4
556  %index.next = add i32 %index, 8
557  %i9 = icmp eq i32 %index.next, %n
558  br i1 %i9, label %for.cond.cleanup, label %vector.body
559
560for.cond.cleanup:                                 ; preds = %vector.body, %entry
561  ret void
562}
563
; test_fms: vector loop computing
;   D[i..i+7] = (A[i..i+7] * splat(%C)) - B[i..i+7]
; on <8 x half> chunks, stepping the index by 8 until it equals %n.
; The splat is a multiplicand (second fmul operand); the subtrahend is a
; loaded vector. The CHECK lines expect the vector loaded through r1 to be
; negated in the loop (`vneg.f16 q0, q0`) and then used as the accumulator of
; the scalar-operand `vfma.f16 q0, q1, r12`.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
564define arm_aapcs_vfpcc void @test_fms(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
565; CHECK-LABEL: test_fms:
566; CHECK:       @ %bb.0: @ %entry
567; CHECK-NEXT:    cmp r3, #1
568; CHECK-NEXT:    it lt
569; CHECK-NEXT:    bxlt lr
570; CHECK-NEXT:  .LBB12_1: @ %vector.ph
571; CHECK-NEXT:    vmov.f16 r12, s0
572; CHECK-NEXT:  .LBB12_2: @ %vector.body
573; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
574; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
575; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
576; CHECK-NEXT:    subs r3, #8
577; CHECK-NEXT:    vneg.f16 q0, q0
578; CHECK-NEXT:    vfma.f16 q0, q1, r12
579; CHECK-NEXT:    vstrb.8 q0, [r2], #16
580; CHECK-NEXT:    bne .LBB12_2
581; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
582; CHECK-NEXT:    bx lr
583entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
584  %i = and i32 %n, 7
585  %cmp = icmp eq i32 %i, 0
586  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
587  %cmp110 = icmp sgt i32 %n, 0
588  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
589
590vector.ph:                                        ; preds = %entry
; Broadcast the scalar %C into all 8 lanes.
591  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
592  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
593  br label %vector.body
594
595vector.body:                                      ; preds = %vector.body, %vector.ph
596  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
597  %i1 = getelementptr inbounds half, ptr %A, i32 %index
598  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: (vector * splat) - vector.
599  %i3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
600  %i4 = getelementptr inbounds half, ptr %B, i32 %index
601  %wide.load14 = load <8 x half>, ptr %i4, align 4
602  %i6 = fsub fast <8 x half> %i3, %wide.load14
603  %i7 = getelementptr inbounds half, ptr %D, i32 %index
604  store <8 x half> %i6, ptr %i7, align 4
605  %index.next = add i32 %index, 8
606  %i9 = icmp eq i32 %index.next, %n
607  br i1 %i9, label %for.cond.cleanup, label %vector.body
608
609for.cond.cleanup:                                 ; preds = %vector.body, %entry
610  ret void
611}
612
; test_fms_r: same loop as test_fms but with the fmul operands reversed,
; i.e. D[i..i+7] = (splat(%C) * A[i..i+7]) - B[i..i+7].
; The CHECK lines expect the same sequence as the non-reversed variant:
; `vneg.f16 q0, q0` on the vector loaded through r1, followed by the
; scalar-operand `vfma.f16 q0, q1, r12`.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
613define arm_aapcs_vfpcc void @test_fms_r(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, half %C, ptr noalias nocapture %D, i32 %n) {
614; CHECK-LABEL: test_fms_r:
615; CHECK:       @ %bb.0: @ %entry
616; CHECK-NEXT:    cmp r3, #1
617; CHECK-NEXT:    it lt
618; CHECK-NEXT:    bxlt lr
619; CHECK-NEXT:  .LBB13_1: @ %vector.ph
620; CHECK-NEXT:    vmov.f16 r12, s0
621; CHECK-NEXT:  .LBB13_2: @ %vector.body
622; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
623; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
624; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
625; CHECK-NEXT:    subs r3, #8
626; CHECK-NEXT:    vneg.f16 q0, q0
627; CHECK-NEXT:    vfma.f16 q0, q1, r12
628; CHECK-NEXT:    vstrb.8 q0, [r2], #16
629; CHECK-NEXT:    bne .LBB13_2
630; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
631; CHECK-NEXT:    bx lr
632entry:
; The assume states that %n is a multiple of 8 (%n & 7 == 0).
633  %i = and i32 %n, 7
634  %cmp = icmp eq i32 %i, 0
635  tail call void @llvm.assume(i1 %cmp)
; Skip the loop entirely when %n <= 0.
636  %cmp110 = icmp sgt i32 %n, 0
637  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
638
639vector.ph:                                        ; preds = %entry
; Broadcast the scalar %C into all 8 lanes.
640  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
641  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
642  br label %vector.body
643
644vector.body:                                      ; preds = %vector.body, %vector.ph
645  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
646  %i1 = getelementptr inbounds half, ptr %A, i32 %index
647  %wide.load = load <8 x half>, ptr %i1, align 4
; Operation under test: (splat * vector) - vector, splat as first fmul operand.
648  %i3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
649  %i4 = getelementptr inbounds half, ptr %B, i32 %index
650  %wide.load14 = load <8 x half>, ptr %i4, align 4
651  %i6 = fsub fast <8 x half> %i3, %wide.load14
652  %i7 = getelementptr inbounds half, ptr %D, i32 %index
653  store <8 x half> %i6, ptr %i7, align 4
654  %index.next = add i32 %index, 8
655  %i9 = icmp eq i32 %index.next, %n
656  br i1 %i9, label %for.cond.cleanup, label %vector.body
657
658for.cond.cleanup:                                 ; preds = %vector.body, %entry
659  ret void
660}
661
662
; test_nested: two-level loop nest.
;   Outer loop (for.body.us): runs until the row counter equals %numRows. Each
;     iteration loads one half from the %pOutT1 cursor, splats it, and then
;     advances %pOutT1 by 1 half and both %pInT1 and %pPRT_in cursors by
;     %numCols halves.
;   Inner loop (vector.body): for each <8 x half> chunk up to %numCols,
;     pInT1[idx..] = pInT1[idx..] - (pPRT_in[idx..] * splat), stored in place.
; The assumes in the preheader state numRows > 0, numCols > 0,
; numCols % 8 == 0 and l < numCols. %pPRT_pDst is not referenced in the body,
; and %l is used only by its assume.
; The CHECK lines expect the inner multiply-subtract to become a
; vector-vector `vfms.f16 q2, q1, q0` (splat built by `vdup.16` in the outer
; loop) and the outer loop to close with `le lr, .LBB14_1`.
; CHECK lines are autogenerated (see the NOTE at the top of the file).
663define dso_local void @test_nested(ptr noalias nocapture %pInT1, ptr noalias nocapture readonly %pOutT1, ptr noalias nocapture readonly %pPRT_in, ptr noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l) local_unnamed_addr {
664; CHECK-LABEL: test_nested:
665; CHECK:       @ %bb.0: @ %for.body.us.preheader
666; CHECK-NEXT:    .save {r4, r5, r6, lr}
667; CHECK-NEXT:    push {r4, r5, r6, lr}
668; CHECK-NEXT:    ldrd lr, r12, [sp, #16]
669; CHECK-NEXT:    lsl.w r3, r12, #1
670; CHECK-NEXT:  .LBB14_1: @ %for.body.us
671; CHECK-NEXT:    @ =>This Loop Header: Depth=1
672; CHECK-NEXT:    @ Child Loop BB14_2 Depth 2
673; CHECK-NEXT:    ldrh r4, [r1]
674; CHECK-NEXT:    mov r5, r2
675; CHECK-NEXT:    mov r6, r12
676; CHECK-NEXT:    vdup.16 q0, r4
677; CHECK-NEXT:    mov r4, r0
678; CHECK-NEXT:  .LBB14_2: @ %vector.body
679; CHECK-NEXT:    @ Parent Loop BB14_1 Depth=1
680; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
681; CHECK-NEXT:    vldrw.u32 q1, [r5], #16
682; CHECK-NEXT:    vldrw.u32 q2, [r4]
683; CHECK-NEXT:    subs r6, #8
684; CHECK-NEXT:    vfms.f16 q2, q1, q0
685; CHECK-NEXT:    vstrb.8 q2, [r4], #16
686; CHECK-NEXT:    bne .LBB14_2
687; CHECK-NEXT:  @ %bb.3: @ %for.cond6.for.end_crit_edge.us
688; CHECK-NEXT:    @ in Loop: Header=BB14_1 Depth=1
689; CHECK-NEXT:    add r0, r3
690; CHECK-NEXT:    add r2, r3
691; CHECK-NEXT:    adds r1, #2
692; CHECK-NEXT:    le lr, .LBB14_1
693; CHECK-NEXT:  @ %bb.4: @ %for.end14
694; CHECK-NEXT:    pop {r4, r5, r6, pc}
695for.body.us.preheader:
; Preconditions handed to the optimizer: both trip counts are positive,
; %numCols is a multiple of 8, and %l < %numCols.
696  %cmp = icmp sgt i32 %numRows, 0
697  tail call void @llvm.assume(i1 %cmp)
698  %cmp1 = icmp sgt i32 %numCols, 0
699  tail call void @llvm.assume(i1 %cmp1)
700  %rem = and i32 %numCols, 7
701  %cmp2 = icmp eq i32 %rem, 0
702  tail call void @llvm.assume(i1 %cmp2)
703  %cmp3 = icmp slt i32 %l, %numCols
704  tail call void @llvm.assume(i1 %cmp3)
705  br label %for.body.us
706
707for.body.us:                                      ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
; Outer-loop state: three running pointers plus the row counter.
708  %pInT1.addr.038.us = phi ptr [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
709  %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
710  %pOutT1.addr.036.us = phi ptr [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
711  %pPRT_in.addr.035.us = phi ptr [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
712  %scevgep = getelementptr half, ptr %pPRT_in.addr.035.us, i32 %numCols
; Load this row's scalar and broadcast it into all 8 lanes.
713  %i = load half, ptr %pOutT1.addr.036.us, align 4
714  %broadcast.splatinsert47 = insertelement <8 x half> undef, half %i, i32 0
715  %broadcast.splat48 = shufflevector <8 x half> %broadcast.splatinsert47, <8 x half> undef, <8 x i32> zeroinitializer
716  br label %vector.body
717
718vector.body:                                      ; preds = %vector.body, %for.body.us
719  %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
720  %next.gep = getelementptr half, ptr %pInT1.addr.038.us, i32 %index
721  %next.gep45 = getelementptr half, ptr %pPRT_in.addr.035.us, i32 %index
722  %wide.load = load <8 x half>, ptr %next.gep, align 4
723  %wide.load46 = load <8 x half>, ptr %next.gep45, align 4
; Operation under test: in-place pInT1 chunk -= pPRT_in chunk * splat.
724  %i3 = fmul fast <8 x half> %wide.load46, %broadcast.splat48
725  %i4 = fsub fast <8 x half> %wide.load, %i3
726  store <8 x half> %i4, ptr %next.gep, align 4
727  %index.next = add i32 %index, 8
728  %i5 = icmp eq i32 %index.next, %numCols
729  br i1 %i5, label %for.cond6.for.end_crit_edge.us, label %vector.body
730
731for.cond6.for.end_crit_edge.us:                   ; preds = %vector.body
; Advance the outer-loop cursors and test the row counter against %numRows.
732  %incdec.ptr.us = getelementptr inbounds half, ptr %pOutT1.addr.036.us, i32 1
733  %scevgep40 = getelementptr half, ptr %pInT1.addr.038.us, i32 %numCols
734  %inc13.us = add nuw nsw i32 %i.037.us, 1
735  %exitcond41 = icmp eq i32 %inc13.us, %numRows
736  br i1 %exitcond41, label %for.end14, label %for.body.us
737
738for.end14:                                        ; preds = %for.cond6.for.end_crit_edge.us
739  ret void
740}
741
; Filter-instance layout shared by the FIR tests below. The functions in this
; file read field 0 (i16) as the tap count, field 1 as the state-buffer pointer
; and field 2 as the coefficient pointer. The type name says f32, but the
; tests only ever access `half` data through those two pointers.
742%struct.arm_fir_instance_f32 = type { i16, ptr, ptr }
; FIR-style kernel on half-precision vectors, specialised for 1 to 4 taps.
; (The name says f32, but every vector load/store/multiply in the body is
; <8 x half>.)
;   %S         - filter instance: tap count, state pointer, coefficient pointer
;   %pSrc      - input samples (only read)
;   %pDst      - output samples
;   %blockSize - sample count; blockSize / 4 main-loop iterations are run and
;                blockSize & 3 drives the predicated tail
; The assertion lines that follow are autogenerated (see the NOTE at the top of
; the file); regenerate them with the update script instead of editing by hand.
743define void @arm_fir_f32_1_4_mve(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr %pDst, i32 %blockSize) {
744; CHECK-LABEL: arm_fir_f32_1_4_mve:
745; CHECK:       @ %bb.0: @ %entry
746; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
747; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
748; CHECK-NEXT:    .pad #16
749; CHECK-NEXT:    sub sp, #16
750; CHECK-NEXT:    ldrh.w r9, [r0]
751; CHECK-NEXT:    ldr.w r10, [r0, #4]
752; CHECK-NEXT:    sub.w r6, r9, #1
753; CHECK-NEXT:    cmp r6, #3
754; CHECK-NEXT:    bhi .LBB15_6
755; CHECK-NEXT:  @ %bb.1: @ %if.then
756; CHECK-NEXT:    ldr r7, [r0, #8]
757; CHECK-NEXT:    add.w r4, r10, r6, lsl #1
758; CHECK-NEXT:    lsrs r5, r3, #2
759; CHECK-NEXT:    ldrh.w r8, [r7, #6]
760; CHECK-NEXT:    ldrh.w r12, [r7, #4]
761; CHECK-NEXT:    ldrh r6, [r7, #2]
762; CHECK-NEXT:    ldrh r7, [r7]
763; CHECK-NEXT:    wls lr, r5, .LBB15_5
764; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
765; CHECK-NEXT:    str.w r9, [sp, #12] @ 4-byte Spill
766; CHECK-NEXT:    bic r5, r3, #3
767; CHECK-NEXT:    add.w r9, r10, #2
768; CHECK-NEXT:    str r5, [sp] @ 4-byte Spill
769; CHECK-NEXT:    add.w r5, r2, r5, lsl #1
770; CHECK-NEXT:    str r5, [sp, #4] @ 4-byte Spill
771; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
772; CHECK-NEXT:  .LBB15_3: @ %while.body
773; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
774; CHECK-NEXT:    vldrw.u32 q0, [r1], #8
775; CHECK-NEXT:    sub.w r11, r9, #2
776; CHECK-NEXT:    add.w r5, r9, #2
777; CHECK-NEXT:    vstrb.8 q0, [r4], #8
778; CHECK-NEXT:    vldrw.u32 q0, [r11]
779; CHECK-NEXT:    vldrw.u32 q1, [r9]
780; CHECK-NEXT:    vmul.f16 q0, q0, r7
781; CHECK-NEXT:    vfma.f16 q0, q1, r6
782; CHECK-NEXT:    vldrw.u32 q1, [r5]
783; CHECK-NEXT:    vfma.f16 q0, q1, r12
784; CHECK-NEXT:    vldrw.u32 q1, [r9, #4]
785; CHECK-NEXT:    add.w r9, r9, #8
786; CHECK-NEXT:    vfma.f16 q0, q1, r8
787; CHECK-NEXT:    vstrb.8 q0, [r2], #8
788; CHECK-NEXT:    le lr, .LBB15_3
789; CHECK-NEXT:  @ %bb.4: @ %while.end.loopexit
790; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
791; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
792; CHECK-NEXT:    ldr.w r9, [sp, #12] @ 4-byte Reload
793; CHECK-NEXT:    add.w r10, r10, r2, lsl #1
794; CHECK-NEXT:    add.w r1, r1, r2, lsl #1
795; CHECK-NEXT:    ldr r2, [sp, #4] @ 4-byte Reload
796; CHECK-NEXT:  .LBB15_5: @ %while.end
797; CHECK-NEXT:    and r5, r3, #3
798; CHECK-NEXT:    vldrw.u32 q0, [r1]
799; CHECK-NEXT:    vctp.16 r5
800; CHECK-NEXT:    add.w r1, r10, #2
801; CHECK-NEXT:    vpst
802; CHECK-NEXT:    vstrht.16 q0, [r4]
803; CHECK-NEXT:    vldrw.u32 q0, [r10]
804; CHECK-NEXT:    vldrw.u32 q1, [r1]
805; CHECK-NEXT:    add.w r1, r10, #6
806; CHECK-NEXT:    vmul.f16 q0, q0, r7
807; CHECK-NEXT:    vfma.f16 q0, q1, r6
808; CHECK-NEXT:    vldrw.u32 q1, [r10, #4]
809; CHECK-NEXT:    vfma.f16 q0, q1, r12
810; CHECK-NEXT:    vldrw.u32 q1, [r1]
811; CHECK-NEXT:    vfma.f16 q0, q1, r8
812; CHECK-NEXT:    vpst
813; CHECK-NEXT:    vstrht.16 q0, [r2]
814; CHECK-NEXT:    ldr.w r10, [r0, #4]
815; CHECK-NEXT:  .LBB15_6: @ %if.end
816; CHECK-NEXT:    add.w r0, r10, r3, lsl #1
817; CHECK-NEXT:    lsr.w r1, r9, #2
818; CHECK-NEXT:    wls lr, r1, .LBB15_10
819; CHECK-NEXT:  @ %bb.7: @ %while.body51.preheader
820; CHECK-NEXT:    bic r2, r9, #3
821; CHECK-NEXT:    adds r1, r2, r3
822; CHECK-NEXT:    mov r3, r10
823; CHECK-NEXT:    add.w r1, r10, r1, lsl #1
824; CHECK-NEXT:  .LBB15_8: @ %while.body51
825; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
826; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
827; CHECK-NEXT:    vstrb.8 q0, [r3], #8
828; CHECK-NEXT:    le lr, .LBB15_8
829; CHECK-NEXT:  @ %bb.9: @ %while.end55.loopexit
830; CHECK-NEXT:    add.w r10, r10, r2, lsl #1
831; CHECK-NEXT:    mov r0, r1
832; CHECK-NEXT:  .LBB15_10: @ %while.end55
833; CHECK-NEXT:    ands r1, r9, #3
834; CHECK-NEXT:    beq .LBB15_12
835; CHECK-NEXT:  @ %bb.11: @ %if.then59
836; CHECK-NEXT:    vldrw.u32 q0, [r0]
837; CHECK-NEXT:    vctp.16 r1
838; CHECK-NEXT:    vpst
839; CHECK-NEXT:    vstrht.16 q0, [r10]
840; CHECK-NEXT:  .LBB15_12: @ %if.end61
841; CHECK-NEXT:    add sp, #16
842; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
843entry:
  ; Read the three instance fields: %i = state pointer, %i1 = coefficient
  ; pointer, %conv = tap count zero-extended to i32. The filtering path is taken
  ; only when (numTaps - 1) <u 4, i.e. numTaps is 1..4; any other value goes
  ; straight to the state copy in %if.end.
844  %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
845  %i = load ptr, ptr %pState1, align 4
846  %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
847  %i1 = load ptr, ptr %pCoeffs2, align 4
848  %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
849  %i2 = load i16, ptr %numTaps3, align 4
850  %conv = zext i16 %i2 to i32
851  %sub = add nsw i32 %conv, -1
852  %cmp = icmp ult i32 %sub, 4
853  br i1 %cmp, label %if.then, label %if.end
854
855if.then:                                          ; preds = %entry
  ; %arrayidx = state + (numTaps - 1): destination for incoming source samples.
  ; Four coefficients are loaded unconditionally (whatever numTaps is) and each
  ; is splatted across an <8 x half> for use in both the loop and the tail.
  ; %shr = blockSize / 4 is the main-loop trip count; zero skips to the tail.
856  %arrayidx = getelementptr inbounds half, ptr %i, i32 %sub
857  %incdec.ptr = getelementptr inbounds half, ptr %i1, i32 1
858  %i3 = load half, ptr %i1, align 4
859  %incdec.ptr6 = getelementptr inbounds half, ptr %i1, i32 2
860  %i4 = load half, ptr %incdec.ptr, align 4
861  %incdec.ptr7 = getelementptr inbounds half, ptr %i1, i32 3
862  %i5 = load half, ptr %incdec.ptr6, align 4
863  %i6 = load half, ptr %incdec.ptr7, align 4
864  %shr = lshr i32 %blockSize, 2
865  %cmp9146 = icmp eq i32 %shr, 0
866  %.pre161 = insertelement <8 x half> undef, half %i3, i32 0
867  %.pre162 = shufflevector <8 x half> %.pre161, <8 x half> undef, <8 x i32> zeroinitializer
868  %.pre163 = insertelement <8 x half> undef, half %i4, i32 0
869  %.pre164 = shufflevector <8 x half> %.pre163, <8 x half> undef, <8 x i32> zeroinitializer
870  %.pre165 = insertelement <8 x half> undef, half %i5, i32 0
871  %.pre166 = shufflevector <8 x half> %.pre165, <8 x half> undef, <8 x i32> zeroinitializer
872  %.pre167 = insertelement <8 x half> undef, half %i6, i32 0
873  %.pre168 = shufflevector <8 x half> %.pre167, <8 x half> undef, <8 x i32> zeroinitializer
874  br i1 %cmp9146, label %while.end, label %while.body.lr.ph
875
876while.body.lr.ph:                                 ; preds = %if.then
  ; %i7 = blockSize rounded down to a multiple of 4; used for the pointer
  ; values that are live after the loop.
877  %i7 = and i32 %blockSize, -4
878  %scevgep158 = getelementptr half, ptr %pDst, i32 %i7
879  br label %while.body
880
881while.body:                                       ; preds = %while.body, %while.body.lr.ph
  ; Main loop. Every vector access is 8 halfs wide, but all four pointers
  ; advance by only 4 halfs per iteration, so consecutive iterations overlap.
  ;   1. copy 8 halfs from the source into the state buffer;
  ;   2. accumulate state[k + j] * coeff[j] for j = 0..3 (fast-math fmul/fadd);
  ;   3. store the 8-lane result to the output.
882  %pStateCur.0151 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
883  %pSamples.0150 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
884  %pOutput.0149 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
885  %pTempSrc.0148 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
886  %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
887  %i9 = load <8 x half>, ptr %pTempSrc.0148, align 4
888  store <8 x half> %i9, ptr %pStateCur.0151, align 4
889  %add.ptr = getelementptr inbounds half, ptr %pStateCur.0151, i32 4
890  %add.ptr11 = getelementptr inbounds half, ptr %pTempSrc.0148, i32 4
891  %i12 = load <8 x half>, ptr %pSamples.0150, align 4
892  %i13 = fmul fast <8 x half> %i12, %.pre162
893  %arrayidx12 = getelementptr inbounds half, ptr %pSamples.0150, i32 1
894  %i15 = load <8 x half>, ptr %arrayidx12, align 4
895  %mul = fmul fast <8 x half> %i15, %.pre164
896  %add = fadd fast <8 x half> %mul, %i13
897  %arrayidx13 = getelementptr inbounds half, ptr %pSamples.0150, i32 2
898  %i17 = load <8 x half>, ptr %arrayidx13, align 4
899  %mul16 = fmul fast <8 x half> %i17, %.pre166
900  %add17 = fadd fast <8 x half> %add, %mul16
901  %arrayidx18 = getelementptr inbounds half, ptr %pSamples.0150, i32 3
902  %i19 = load <8 x half>, ptr %arrayidx18, align 4
903  %mul21 = fmul fast <8 x half> %i19, %.pre168
904  %add22 = fadd fast <8 x half> %add17, %mul21
905  store <8 x half> %add22, ptr %pOutput.0149, align 4
906  %add.ptr23 = getelementptr inbounds half, ptr %pOutput.0149, i32 4
907  %add.ptr24 = getelementptr inbounds half, ptr %pSamples.0150, i32 4
908  %dec = add nsw i32 %blkCnt.0147, -1
909  %cmp9 = icmp eq i32 %dec, 0
910  br i1 %cmp9, label %while.end.loopexit, label %while.body
911
912while.end.loopexit:                               ; preds = %while.body
  ; Post-loop source and sample pointers, recomputed from the original bases
  ; plus %i7 rather than taken from the loop-carried values.
913  %scevgep157 = getelementptr half, ptr %pSrc, i32 %i7
914  %scevgep159 = getelementptr half, ptr %i, i32 %i7
915  br label %while.end
916
917while.end:                                        ; preds = %while.end.loopexit, %if.then
  ; Tail: same computation as one loop iteration, but both stores are masked by
  ; vctp16(blockSize & 3). The loads stay unmasked and full width. This block
  ; runs even when blockSize & 3 is zero (there is no branch around it).
  ; %.pre re-reads the state pointer from %S after the stores.
918  %pTempSrc.0.lcssa = phi ptr [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ]
919  %pOutput.0.lcssa = phi ptr [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ]
920  %pSamples.0.lcssa = phi ptr [ %scevgep159, %while.end.loopexit ], [ %i, %if.then ]
921  %pStateCur.0.lcssa = phi ptr [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ]
922  %and = and i32 %blockSize, 3
923  %i21 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and)
924  %i23 = load <8 x half>, ptr %pTempSrc.0.lcssa, align 4
925  tail call void @llvm.masked.store.v8f16.p0(<8 x half> %i23, ptr %pStateCur.0.lcssa, i32 4, <8 x i1> %i21)
926  %i26 = load <8 x half>, ptr %pSamples.0.lcssa, align 4
927  %i27 = fmul fast <8 x half> %i26, %.pre162
928  %arrayidx29 = getelementptr inbounds half, ptr %pSamples.0.lcssa, i32 1
929  %i29 = load <8 x half>, ptr %arrayidx29, align 4
930  %mul32 = fmul fast <8 x half> %i29, %.pre164
931  %add33 = fadd fast <8 x half> %mul32, %i27
932  %arrayidx34 = getelementptr inbounds half, ptr %pSamples.0.lcssa, i32 2
933  %i31 = load <8 x half>, ptr %arrayidx34, align 4
934  %mul37 = fmul fast <8 x half> %i31, %.pre166
935  %add38 = fadd fast <8 x half> %add33, %mul37
936  %arrayidx39 = getelementptr inbounds half, ptr %pSamples.0.lcssa, i32 3
937  %i33 = load <8 x half>, ptr %arrayidx39, align 4
938  %mul42 = fmul fast <8 x half> %i33, %.pre168
939  %add43 = fadd fast <8 x half> %add38, %mul42
940  tail call void @llvm.masked.store.v8f16.p0(<8 x half> %add43, ptr %pOutput.0.lcssa, i32 4, <8 x i1> %i21)
941  %.pre = load ptr, ptr %pState1, align 4
942  br label %if.end
943
944if.end:                                           ; preds = %while.end, %entry
  ; State copy: move data from state + blockSize down to the start of the state
  ; buffer. %shr47 = numTaps / 4 full iterations, then a masked remainder.
945  %i35 = phi ptr [ %.pre, %while.end ], [ %i, %entry ]
946  %arrayidx45 = getelementptr inbounds half, ptr %i35, i32 %blockSize
947  %shr47 = lshr i32 %conv, 2
948  %cmp49141 = icmp eq i32 %shr47, 0
949  br i1 %cmp49141, label %while.end55, label %while.body51.preheader
950
951while.body51.preheader:                           ; preds = %if.end
  ; %i36 = numTaps rounded down to a multiple of 4 (mask 65532 = 0xFFFC, which
  ; is enough because %conv is a zero-extended i16).
952  %i36 = and i32 %conv, 65532
953  %i37 = add i32 %i36, %blockSize
954  %scevgep = getelementptr half, ptr %i35, i32 %i37
955  br label %while.body51
956
957while.body51:                                     ; preds = %while.body51, %while.body51.preheader
  ; Copy loop: 8-half load/store per iteration with both pointers stepping by
  ; 4 halfs.
958  %pTempSrc.1144 = phi ptr [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ]
959  %pTempDest.0143 = phi ptr [ %add.ptr53, %while.body51 ], [ %i35, %while.body51.preheader ]
960  %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ]
961  %i39 = load <8 x half>, ptr %pTempSrc.1144, align 4
962  store <8 x half> %i39, ptr %pTempDest.0143, align 4
963  %add.ptr52 = getelementptr inbounds half, ptr %pTempSrc.1144, i32 4
964  %add.ptr53 = getelementptr inbounds half, ptr %pTempDest.0143, i32 4
965  %dec54 = add nsw i32 %blkCnt.1142, -1
966  %cmp49 = icmp eq i32 %dec54, 0
967  br i1 %cmp49, label %while.end55.loopexit, label %while.body51
968
969while.end55.loopexit:                             ; preds = %while.body51
970  %scevgep156 = getelementptr half, ptr %i35, i32 %i36
971  br label %while.end55
972
973while.end55:                                      ; preds = %while.end55.loopexit, %if.end
  ; Remainder of the state copy: only taken when numTaps & 3 is non-zero.
974  %pTempDest.0.lcssa = phi ptr [ %i35, %if.end ], [ %scevgep156, %while.end55.loopexit ]
975  %pTempSrc.1.lcssa = phi ptr [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ]
976  %and56 = and i32 %conv, 3
977  %cmp57 = icmp eq i32 %and56, 0
978  br i1 %cmp57, label %if.end61, label %if.then59
979
980if.then59:                                        ; preds = %while.end55
  ; Full-width load, store masked by vctp16(numTaps & 3).
981  %i41 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and56)
982  %i43 = load <8 x half>, ptr %pTempSrc.1.lcssa, align 4
983  tail call void @llvm.masked.store.v8f16.p0(<8 x half> %i43, ptr %pTempDest.0.lcssa, i32 4, <8 x i1> %i41)
984  br label %if.end61
985
986if.end61:                                         ; preds = %if.then59, %while.end55
987  ret void
988}
989
990
991define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
992; CHECK-LABEL: fir:
993; CHECK:       @ %bb.0: @ %entry
994; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
995; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
996; CHECK-NEXT:    .pad #24
997; CHECK-NEXT:    sub sp, #24
998; CHECK-NEXT:    cmp r3, #8
999; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
1000; CHECK-NEXT:    blo.w .LBB16_12
1001; CHECK-NEXT:  @ %bb.1: @ %if.then
1002; CHECK-NEXT:    lsrs.w r12, r3, #2
1003; CHECK-NEXT:    beq.w .LBB16_12
1004; CHECK-NEXT:  @ %bb.2: @ %while.body.lr.ph
1005; CHECK-NEXT:    ldrh r4, [r0]
1006; CHECK-NEXT:    movs r1, #1
1007; CHECK-NEXT:    ldrd r5, r3, [r0, #4]
1008; CHECK-NEXT:    sub.w r0, r4, #8
1009; CHECK-NEXT:    add.w r7, r0, r0, lsr #29
1010; CHECK-NEXT:    and r0, r0, #7
1011; CHECK-NEXT:    asrs r6, r7, #3
1012; CHECK-NEXT:    cmp r6, #1
1013; CHECK-NEXT:    it gt
1014; CHECK-NEXT:    asrgt r1, r7, #3
1015; CHECK-NEXT:    add.w r7, r5, r4, lsl #1
1016; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
1017; CHECK-NEXT:    subs r1, r7, #2
1018; CHECK-NEXT:    rsbs r7, r4, #0
1019; CHECK-NEXT:    str r7, [sp, #8] @ 4-byte Spill
1020; CHECK-NEXT:    add.w r7, r3, #16
1021; CHECK-NEXT:    str r4, [sp, #12] @ 4-byte Spill
1022; CHECK-NEXT:    str r7, [sp, #4] @ 4-byte Spill
1023; CHECK-NEXT:    str r0, [sp, #16] @ 4-byte Spill
1024; CHECK-NEXT:    b .LBB16_6
1025; CHECK-NEXT:  .LBB16_3: @ %while.end.loopexit
1026; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1027; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
1028; CHECK-NEXT:    add.w r5, r5, r0, lsl #1
1029; CHECK-NEXT:    b .LBB16_5
1030; CHECK-NEXT:  .LBB16_4: @ %for.end
1031; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1032; CHECK-NEXT:    ldr r0, [sp, #16] @ 4-byte Reload
1033; CHECK-NEXT:    wls lr, r0, .LBB16_5
1034; CHECK-NEXT:    b .LBB16_10
1035; CHECK-NEXT:  .LBB16_5: @ %while.end
1036; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1037; CHECK-NEXT:    ldr r0, [sp, #8] @ 4-byte Reload
1038; CHECK-NEXT:    subs.w r12, r12, #1
1039; CHECK-NEXT:    vstrb.8 q0, [r2], #8
1040; CHECK-NEXT:    add.w r0, r5, r0, lsl #1
1041; CHECK-NEXT:    add.w r5, r0, #8
1042; CHECK-NEXT:    beq.w .LBB16_12
1043; CHECK-NEXT:  .LBB16_6: @ %while.body
1044; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1045; CHECK-NEXT:    @ Child Loop BB16_8 Depth 2
1046; CHECK-NEXT:    @ Child Loop BB16_11 Depth 2
1047; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
1048; CHECK-NEXT:    ldrh.w lr, [r3, #14]
1049; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
1050; CHECK-NEXT:    ldrh.w r8, [r3, #12]
1051; CHECK-NEXT:    ldrh r7, [r3, #10]
1052; CHECK-NEXT:    ldrh r4, [r3, #8]
1053; CHECK-NEXT:    ldrh r6, [r3, #6]
1054; CHECK-NEXT:    ldrh.w r9, [r3, #4]
1055; CHECK-NEXT:    ldrh.w r11, [r3, #2]
1056; CHECK-NEXT:    ldrh.w r10, [r3]
1057; CHECK-NEXT:    vstrb.8 q0, [r1], #8
1058; CHECK-NEXT:    vldrw.u32 q0, [r5]
1059; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
1060; CHECK-NEXT:    adds r0, r5, #2
1061; CHECK-NEXT:    vldrw.u32 q1, [r0]
1062; CHECK-NEXT:    vmul.f16 q0, q0, r10
1063; CHECK-NEXT:    adds r0, r5, #6
1064; CHECK-NEXT:    vfma.f16 q0, q1, r11
1065; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
1066; CHECK-NEXT:    vfma.f16 q0, q1, r9
1067; CHECK-NEXT:    vldrw.u32 q1, [r0]
1068; CHECK-NEXT:    add.w r0, r5, #10
1069; CHECK-NEXT:    vfma.f16 q0, q1, r6
1070; CHECK-NEXT:    vldrw.u32 q1, [r5, #8]
1071; CHECK-NEXT:    vfma.f16 q0, q1, r4
1072; CHECK-NEXT:    vldrw.u32 q1, [r0]
1073; CHECK-NEXT:    add.w r0, r5, #14
1074; CHECK-NEXT:    vfma.f16 q0, q1, r7
1075; CHECK-NEXT:    vldrw.u32 q1, [r5, #12]
1076; CHECK-NEXT:    adds r5, #16
1077; CHECK-NEXT:    vfma.f16 q0, q1, r8
1078; CHECK-NEXT:    vldrw.u32 q1, [r0]
1079; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
1080; CHECK-NEXT:    vfma.f16 q0, q1, lr
1081; CHECK-NEXT:    cmp r0, #16
1082; CHECK-NEXT:    blo .LBB16_9
1083; CHECK-NEXT:  @ %bb.7: @ %for.body.preheader
1084; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1085; CHECK-NEXT:    ldr r0, [sp] @ 4-byte Reload
1086; CHECK-NEXT:    dls lr, r0
1087; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
1088; CHECK-NEXT:  .LBB16_8: @ %for.body
1089; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
1090; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1091; CHECK-NEXT:    ldrh r0, [r6], #16
1092; CHECK-NEXT:    vldrw.u32 q1, [r5]
1093; CHECK-NEXT:    adds r4, r5, #2
1094; CHECK-NEXT:    vfma.f16 q0, q1, r0
1095; CHECK-NEXT:    vldrw.u32 q1, [r4]
1096; CHECK-NEXT:    ldrh r0, [r6, #-14]
1097; CHECK-NEXT:    adds r4, r5, #6
1098; CHECK-NEXT:    vfma.f16 q0, q1, r0
1099; CHECK-NEXT:    ldrh r0, [r6, #-12]
1100; CHECK-NEXT:    vldrw.u32 q1, [r5, #4]
1101; CHECK-NEXT:    vfma.f16 q0, q1, r0
1102; CHECK-NEXT:    vldrw.u32 q1, [r4]
1103; CHECK-NEXT:    ldrh r0, [r6, #-10]
1104; CHECK-NEXT:    add.w r4, r5, #10
1105; CHECK-NEXT:    vfma.f16 q0, q1, r0
1106; CHECK-NEXT:    ldrh r0, [r6, #-8]
1107; CHECK-NEXT:    vldrw.u32 q1, [r5, #8]
1108; CHECK-NEXT:    vfma.f16 q0, q1, r0
1109; CHECK-NEXT:    vldrw.u32 q1, [r4]
1110; CHECK-NEXT:    ldrh r0, [r6, #-6]
1111; CHECK-NEXT:    ldrh r4, [r6, #-2]
1112; CHECK-NEXT:    vfma.f16 q0, q1, r0
1113; CHECK-NEXT:    ldrh r0, [r6, #-4]
1114; CHECK-NEXT:    vldrw.u32 q1, [r5, #12]
1115; CHECK-NEXT:    vfma.f16 q0, q1, r0
1116; CHECK-NEXT:    add.w r0, r5, #14
1117; CHECK-NEXT:    vldrw.u32 q1, [r0]
1118; CHECK-NEXT:    adds r5, #16
1119; CHECK-NEXT:    vfma.f16 q0, q1, r4
1120; CHECK-NEXT:    le lr, .LBB16_8
1121; CHECK-NEXT:    b .LBB16_4
1122; CHECK-NEXT:  .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
1123; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
1124; CHECK-NEXT:    b .LBB16_4
1125; CHECK-NEXT:  .LBB16_10: @ %while.body76.preheader
1126; CHECK-NEXT:    @ in Loop: Header=BB16_6 Depth=1
1127; CHECK-NEXT:    mov r0, r5
1128; CHECK-NEXT:  .LBB16_11: @ %while.body76
1129; CHECK-NEXT:    @ Parent Loop BB16_6 Depth=1
1130; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1131; CHECK-NEXT:    ldrh r4, [r6], #2
1132; CHECK-NEXT:    vldrh.u16 q1, [r0], #2
1133; CHECK-NEXT:    vfma.f16 q0, q1, r4
1134; CHECK-NEXT:    le lr, .LBB16_11
1135; CHECK-NEXT:    b .LBB16_3
1136; CHECK-NEXT:  .LBB16_12: @ %if.end
1137; CHECK-NEXT:    add sp, #24
1138; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
1139entry:
1140  %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
1141  %i = load ptr, ptr %pState1, align 4
1142  %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 2
1143  %i1 = load ptr, ptr %pCoeffs2, align 4
1144  %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 0
1145  %i2 = load i16, ptr %numTaps3, align 4
1146  %conv = zext i16 %i2 to i32
1147  %cmp = icmp ugt i32 %blockSize, 7
1148  br i1 %cmp, label %if.then, label %if.end
1149
1150if.then:                                          ; preds = %entry
1151  %shr = lshr i32 %blockSize, 2
1152  %cmp5217 = icmp eq i32 %shr, 0
1153  br i1 %cmp5217, label %if.end, label %while.body.lr.ph
1154
1155while.body.lr.ph:                                 ; preds = %if.then
1156  %sub = add nsw i32 %conv, -1
1157  %arrayidx = getelementptr inbounds half, ptr %i, i32 %sub
1158  %incdec.ptr = getelementptr inbounds half, ptr %i1, i32 1
1159  %incdec.ptr7 = getelementptr inbounds half, ptr %i1, i32 2
1160  %incdec.ptr8 = getelementptr inbounds half, ptr %i1, i32 3
1161  %incdec.ptr9 = getelementptr inbounds half, ptr %i1, i32 4
1162  %incdec.ptr10 = getelementptr inbounds half, ptr %i1, i32 5
1163  %incdec.ptr11 = getelementptr inbounds half, ptr %i1, i32 6
1164  %incdec.ptr12 = getelementptr inbounds half, ptr %i1, i32 7
1165  %sub37 = add nsw i32 %conv, -8
1166  %div = sdiv i32 %sub37, 8
1167  %pCoeffsCur.0199 = getelementptr inbounds half, ptr %i1, i32 8
1168  %cmp38201 = icmp ugt i16 %i2, 15
1169  %and = and i32 %sub37, 7
1170  %cmp74210 = icmp eq i32 %and, 0
1171  %idx.neg = sub nsw i32 0, %conv
1172  %i3 = icmp sgt i32 %div, 1
1173  %smax = select i1 %i3, i32 %div, i32 1
1174  br label %while.body
1175
1176while.body:                                       ; preds = %while.end, %while.body.lr.ph
1177  %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
1178  %pStateCur.0221 = phi ptr [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
1179  %pSamples.0220 = phi ptr [ %i, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
1180  %pTempSrc.0219 = phi ptr [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
1181  %pOutput.0218 = phi ptr [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
1182  %i4 = load half, ptr %i1, align 4
1183  %i5 = load half, ptr %incdec.ptr, align 4
1184  %i6 = load half, ptr %incdec.ptr7, align 4
1185  %i7 = load half, ptr %incdec.ptr8, align 4
1186  %i8 = load half, ptr %incdec.ptr9, align 4
1187  %i9 = load half, ptr %incdec.ptr10, align 4
1188  %i10 = load half, ptr %incdec.ptr11, align 4
1189  %i11 = load half, ptr %incdec.ptr12, align 4
1190  %i13 = load <8 x half>, ptr %pTempSrc.0219, align 4
1191  store <8 x half> %i13, ptr %pStateCur.0221, align 4
1192  %add.ptr = getelementptr inbounds half, ptr %pStateCur.0221, i32 4
1193  %add.ptr14 = getelementptr inbounds half, ptr %pTempSrc.0219, i32 4
1194  %i16 = load <8 x half>, ptr %pSamples.0220, align 4
1195  %.splatinsert = insertelement <8 x half> undef, half %i4, i32 0
1196  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
1197  %i17 = fmul fast <8 x half> %i16, %.splat
1198  %arrayidx15 = getelementptr inbounds half, ptr %pSamples.0220, i32 1
1199  %i19 = load <8 x half>, ptr %arrayidx15, align 4
1200  %.splatinsert16 = insertelement <8 x half> undef, half %i5, i32 0
1201  %.splat17 = shufflevector <8 x half> %.splatinsert16, <8 x half> undef, <8 x i32> zeroinitializer
1202  %i20 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i19, <8 x half> %.splat17, <8 x half> %i17)
1203  %arrayidx18 = getelementptr inbounds half, ptr %pSamples.0220, i32 2
1204  %i22 = load <8 x half>, ptr %arrayidx18, align 4
1205  %.splatinsert19 = insertelement <8 x half> undef, half %i6, i32 0
1206  %.splat20 = shufflevector <8 x half> %.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
1207  %i23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i22, <8 x half> %.splat20, <8 x half> %i20)
1208  %arrayidx21 = getelementptr inbounds half, ptr %pSamples.0220, i32 3
1209  %i25 = load <8 x half>, ptr %arrayidx21, align 4
1210  %.splatinsert22 = insertelement <8 x half> undef, half %i7, i32 0
1211  %.splat23 = shufflevector <8 x half> %.splatinsert22, <8 x half> undef, <8 x i32> zeroinitializer
1212  %i26 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i25, <8 x half> %.splat23, <8 x half> %i23)
1213  %arrayidx24 = getelementptr inbounds half, ptr %pSamples.0220, i32 4
1214  %i28 = load <8 x half>, ptr %arrayidx24, align 4
1215  %.splatinsert25 = insertelement <8 x half> undef, half %i8, i32 0
1216  %.splat26 = shufflevector <8 x half> %.splatinsert25, <8 x half> undef, <8 x i32> zeroinitializer
1217  %i29 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i28, <8 x half> %.splat26, <8 x half> %i26)
1218  %arrayidx27 = getelementptr inbounds half, ptr %pSamples.0220, i32 5
1219  %i31 = load <8 x half>, ptr %arrayidx27, align 4
1220  %.splatinsert28 = insertelement <8 x half> undef, half %i9, i32 0
1221  %.splat29 = shufflevector <8 x half> %.splatinsert28, <8 x half> undef, <8 x i32> zeroinitializer
1222  %i32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i31, <8 x half> %.splat29, <8 x half> %i29)
1223  %arrayidx30 = getelementptr inbounds half, ptr %pSamples.0220, i32 6
1224  %i34 = load <8 x half>, ptr %arrayidx30, align 4
1225  %.splatinsert31 = insertelement <8 x half> undef, half %i10, i32 0
1226  %.splat32 = shufflevector <8 x half> %.splatinsert31, <8 x half> undef, <8 x i32> zeroinitializer
1227  %i35 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i34, <8 x half> %.splat32, <8 x half> %i32)
1228  %arrayidx33 = getelementptr inbounds half, ptr %pSamples.0220, i32 7
1229  %i37 = load <8 x half>, ptr %arrayidx33, align 4
1230  %.splatinsert34 = insertelement <8 x half> undef, half %i11, i32 0
1231  %.splat35 = shufflevector <8 x half> %.splatinsert34, <8 x half> undef, <8 x i32> zeroinitializer
1232  %i38 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i37, <8 x half> %.splat35, <8 x half> %i35)
1233  %pSamples.1200 = getelementptr inbounds half, ptr %pSamples.0220, i32 8
1234  br i1 %cmp38201, label %for.body, label %for.end
1235
1236for.body:                                         ; preds = %for.body, %while.body
1237  %pSamples.1207 = phi ptr [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
1238  %pCoeffsCur.0206 = phi ptr [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
1239  %.pn205 = phi ptr [ %pCoeffsCur.0206, %for.body ], [ %i1, %while.body ]
1240  %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
1241  %vecAcc0.0203 = phi <8 x half> [ %i70, %for.body ], [ %i38, %while.body ]
1242  %pSamples.0.pn202 = phi ptr [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
1243  %incdec.ptr40 = getelementptr inbounds half, ptr %.pn205, i32 9
1244  %i39 = load half, ptr %pCoeffsCur.0206, align 4
1245  %incdec.ptr41 = getelementptr inbounds half, ptr %.pn205, i32 10
1246  %i40 = load half, ptr %incdec.ptr40, align 4
1247  %incdec.ptr42 = getelementptr inbounds half, ptr %.pn205, i32 11
1248  %i41 = load half, ptr %incdec.ptr41, align 4
1249  %incdec.ptr43 = getelementptr inbounds half, ptr %.pn205, i32 12
1250  %i42 = load half, ptr %incdec.ptr42, align 4
1251  %incdec.ptr44 = getelementptr inbounds half, ptr %.pn205, i32 13
1252  %i43 = load half, ptr %incdec.ptr43, align 4
1253  %incdec.ptr45 = getelementptr inbounds half, ptr %.pn205, i32 14
1254  %i44 = load half, ptr %incdec.ptr44, align 4
1255  %incdec.ptr46 = getelementptr inbounds half, ptr %.pn205, i32 15
1256  %i45 = load half, ptr %incdec.ptr45, align 4
1257  %i46 = load half, ptr %incdec.ptr46, align 4
1258  %i48 = load <8 x half>, ptr %pSamples.1207, align 4
1259  %.splatinsert48 = insertelement <8 x half> undef, half %i39, i32 0
1260  %.splat49 = shufflevector <8 x half> %.splatinsert48, <8 x half> undef, <8 x i32> zeroinitializer
1261  %i49 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i48, <8 x half> %.splat49, <8 x half> %vecAcc0.0203)
1262  %arrayidx50 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 9
1263  %i51 = load <8 x half>, ptr %arrayidx50, align 4
1264  %.splatinsert51 = insertelement <8 x half> undef, half %i40, i32 0
1265  %.splat52 = shufflevector <8 x half> %.splatinsert51, <8 x half> undef, <8 x i32> zeroinitializer
1266  %i52 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i51, <8 x half> %.splat52, <8 x half> %i49)
1267  %arrayidx53 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 10
1268  %i54 = load <8 x half>, ptr %arrayidx53, align 4
1269  %.splatinsert54 = insertelement <8 x half> undef, half %i41, i32 0
1270  %.splat55 = shufflevector <8 x half> %.splatinsert54, <8 x half> undef, <8 x i32> zeroinitializer
1271  %i55 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i54, <8 x half> %.splat55, <8 x half> %i52)
1272  %arrayidx56 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 11
1273  %i57 = load <8 x half>, ptr %arrayidx56, align 4
1274  %.splatinsert57 = insertelement <8 x half> undef, half %i42, i32 0
1275  %.splat58 = shufflevector <8 x half> %.splatinsert57, <8 x half> undef, <8 x i32> zeroinitializer
1276  %i58 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i57, <8 x half> %.splat58, <8 x half> %i55)
1277  %arrayidx59 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 12
1278  %i60 = load <8 x half>, ptr %arrayidx59, align 4
1279  %.splatinsert60 = insertelement <8 x half> undef, half %i43, i32 0
1280  %.splat61 = shufflevector <8 x half> %.splatinsert60, <8 x half> undef, <8 x i32> zeroinitializer
1281  %i61 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i60, <8 x half> %.splat61, <8 x half> %i58)
1282  %arrayidx62 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 13
1283  %i63 = load <8 x half>, ptr %arrayidx62, align 4
1284  %.splatinsert63 = insertelement <8 x half> undef, half %i44, i32 0
1285  %.splat64 = shufflevector <8 x half> %.splatinsert63, <8 x half> undef, <8 x i32> zeroinitializer
1286  %i64 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i63, <8 x half> %.splat64, <8 x half> %i61)
1287  %arrayidx65 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 14
1288  %i66 = load <8 x half>, ptr %arrayidx65, align 4
1289  %.splatinsert66 = insertelement <8 x half> undef, half %i45, i32 0
1290  %.splat67 = shufflevector <8 x half> %.splatinsert66, <8 x half> undef, <8 x i32> zeroinitializer
1291  %i67 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i66, <8 x half> %.splat67, <8 x half> %i64)
1292  %arrayidx68 = getelementptr inbounds half, ptr %pSamples.0.pn202, i32 15
1293  %i69 = load <8 x half>, ptr %arrayidx68, align 4
1294  %.splatinsert69 = insertelement <8 x half> undef, half %i46, i32 0
1295  %.splat70 = shufflevector <8 x half> %.splatinsert69, <8 x half> undef, <8 x i32> zeroinitializer
1296  %i70 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i69, <8 x half> %.splat70, <8 x half> %i67)
1297  %inc = add nuw nsw i32 %i.0204, 1
1298  %pCoeffsCur.0 = getelementptr inbounds half, ptr %pCoeffsCur.0206, i32 8
1299  %pSamples.1 = getelementptr inbounds half, ptr %pSamples.1207, i32 8
1300  %exitcond = icmp eq i32 %inc, %smax
1301  br i1 %exitcond, label %for.end, label %for.body
1302
1303for.end:                                          ; preds = %for.body, %while.body
1304  %vecAcc0.0.lcssa = phi <8 x half> [ %i38, %while.body ], [ %i70, %for.body ]
1305  %pCoeffsCur.0.lcssa = phi ptr [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
1306  %pSamples.1.lcssa = phi ptr [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
1307  br i1 %cmp74210, label %while.end, label %while.body76
1308
1309while.body76:                                     ; preds = %while.body76, %for.end
1310  %pCoeffsCur.1214 = phi ptr [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
1311  %vecAcc0.1213 = phi <8 x half> [ %i74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
1312  %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
1313  %pSamples.2211 = phi ptr [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ]
1314  %incdec.ptr77 = getelementptr inbounds half, ptr %pCoeffsCur.1214, i32 1
1315  %i71 = load half, ptr %pCoeffsCur.1214, align 4
1316  %i73 = load <8 x half>, ptr %pSamples.2211, align 4
1317  %.splatinsert78 = insertelement <8 x half> undef, half %i71, i32 0
1318  %.splat79 = shufflevector <8 x half> %.splatinsert78, <8 x half> undef, <8 x i32> zeroinitializer
1319  %i74 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i73, <8 x half> %.splat79, <8 x half> %vecAcc0.1213)
1320  %incdec.ptr80 = getelementptr inbounds half, ptr %pSamples.2211, i32 1
1321  %dec = add nsw i32 %numCnt.0212, -1
1322  %cmp74 = icmp sgt i32 %numCnt.0212, 1
1323  br i1 %cmp74, label %while.body76, label %while.end.loopexit
1324
1325while.end.loopexit:                               ; preds = %while.body76
1326  %scevgep = getelementptr half, ptr %pSamples.1.lcssa, i32 %and
1327  br label %while.end
1328
1329while.end:                                        ; preds = %while.end.loopexit, %for.end
1330  %pSamples.2.lcssa = phi ptr [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ]
1331  %vecAcc0.1.lcssa = phi <8 x half> [ %vecAcc0.0.lcssa, %for.end ], [ %i74, %while.end.loopexit ]
1332  store <8 x half> %vecAcc0.1.lcssa, ptr %pOutput.0218, align 4
1333  %add.ptr81 = getelementptr inbounds half, ptr %pOutput.0218, i32 4
1334  %add.ptr82 = getelementptr inbounds half, ptr %pSamples.2.lcssa, i32 4
1335  %add.ptr83 = getelementptr inbounds half, ptr %add.ptr82, i32 %idx.neg
1336  %dec84 = add nsw i32 %blkCnt.0222, -1
1337  %cmp5 = icmp eq i32 %dec84, 0
1338  br i1 %cmp5, label %if.end, label %while.body
1339
1340if.end:                                           ; preds = %while.end, %if.then, %entry
1341  ret void
1342}
1343
; Instance record read by @arm_biquad_cascade_df2T_f16 below: field 0 (i8) is
; loaded as the stage count, field 1 (ptr) as the state pointer and field 2
; (ptr) as the coefficient pointer.
1344%struct.arm_biquad_cascade_df2T_instance_f16 = type { i8, ptr, ptr }
; Codegen test for a staged filter kernel operating on half-precision data.
; %S points to a %struct.arm_biquad_cascade_df2T_instance_f16; the entry block
; loads the stage count (i8, zero-extended), the state pointer and the
; coefficient pointer from it. The outer do.body loop runs once per stage and
; tests the stage count only at the bottom, so its body always executes at
; least once:
;   - two <8 x half> coefficient vectors are loaded (at +0 and +2 halves) and
;     the coefficient pointer advances by 5 halves per stage;
;   - while.body runs blockSize >> 1 times, consuming two input halves and
;     storing two output halves per iteration;
;   - when blockSize is odd, if.then handles one trailing sample;
;   - two halves of state are stored back and the state pointer advances by 2.
; The first stage reads its input from %pSrc, later stages read from %pDst;
; every stage writes its output starting at %pDst.
; The expected assembly below has while.body as a wls/le loop and the splatted
; FMA operands as the scalar-register form of vfma.f16.
1345define void @arm_biquad_cascade_df2T_f16(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %blockSize) {
1346; CHECK-LABEL: arm_biquad_cascade_df2T_f16:
1347; CHECK:       @ %bb.0: @ %entry
1348; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
1349; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
1350; CHECK-NEXT:    .pad #4
1351; CHECK-NEXT:    sub sp, #4
1352; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
1353; CHECK-NEXT:    vpush {d8, d9, d10, d11}
1354; CHECK-NEXT:    vmov.i32 q0, #0x0
1355; CHECK-NEXT:    ldrd r6, r12, [r0, #4]
1356; CHECK-NEXT:    ldrb.w r9, [r0]
1357; CHECK-NEXT:    vldr.16 s0, .LCPI17_0
1358; CHECK-NEXT:    lsr.w r8, r3, #1
1359; CHECK-NEXT:    b .LBB17_3
1360; CHECK-NEXT:  .LBB17_1: @ %if.else
1361; CHECK-NEXT:    @ in Loop: Header=BB17_3 Depth=1
1362; CHECK-NEXT:    vmovx.f16 s5, s4
1363; CHECK-NEXT:    vstr.16 s4, [r6]
1364; CHECK-NEXT:  .LBB17_2: @ %if.end
1365; CHECK-NEXT:    @ in Loop: Header=BB17_3 Depth=1
1366; CHECK-NEXT:    vstr.16 s5, [r6, #2]
1367; CHECK-NEXT:    add.w r12, r12, #10
1368; CHECK-NEXT:    adds r6, #4
1369; CHECK-NEXT:    subs.w r9, r9, #1
1370; CHECK-NEXT:    mov r1, r2
1371; CHECK-NEXT:    beq .LBB17_8
1372; CHECK-NEXT:  .LBB17_3: @ %do.body
1373; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1374; CHECK-NEXT:    @ Child Loop BB17_5 Depth 2
1375; CHECK-NEXT:    vldrh.u16 q2, [r12]
1376; CHECK-NEXT:    movs r5, #0
1377; CHECK-NEXT:    vmov q4, q2
1378; CHECK-NEXT:    vshlc q4, r5, #16
1379; CHECK-NEXT:    vldrh.u16 q3, [r12, #4]
1380; CHECK-NEXT:    vmov q5, q3
1381; CHECK-NEXT:    vshlc q5, r5, #16
1382; CHECK-NEXT:    vldrh.u16 q1, [r6]
1383; CHECK-NEXT:    vmov.f32 s5, s1
1384; CHECK-NEXT:    mov r5, r2
1385; CHECK-NEXT:    wls lr, r8, .LBB17_6
1386; CHECK-NEXT:  @ %bb.4: @ %while.body.preheader
1387; CHECK-NEXT:    @ in Loop: Header=BB17_3 Depth=1
1388; CHECK-NEXT:    mov r5, r2
1389; CHECK-NEXT:  .LBB17_5: @ %while.body
1390; CHECK-NEXT:    @ Parent Loop BB17_3 Depth=1
1391; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
1392; CHECK-NEXT:    ldrh r7, [r1], #4
1393; CHECK-NEXT:    vmov r0, s0
1394; CHECK-NEXT:    vfma.f16 q1, q2, r7
1395; CHECK-NEXT:    ldrh r4, [r1, #-2]
1396; CHECK-NEXT:    vmov.u16 r7, q1[0]
1397; CHECK-NEXT:    vfma.f16 q1, q3, r7
1398; CHECK-NEXT:    vins.f16 s5, s0
1399; CHECK-NEXT:    vfma.f16 q1, q4, r4
1400; CHECK-NEXT:    vmov.u16 r4, q1[1]
1401; CHECK-NEXT:    vfma.f16 q1, q5, r4
1402; CHECK-NEXT:    strh r4, [r5, #2]
1403; CHECK-NEXT:    vmov.f32 s4, s5
1404; CHECK-NEXT:    strh r7, [r5], #4
1405; CHECK-NEXT:    vmov.16 q1[2], r0
1406; CHECK-NEXT:    le lr, .LBB17_5
1407; CHECK-NEXT:  .LBB17_6: @ %while.end
1408; CHECK-NEXT:    @ in Loop: Header=BB17_3 Depth=1
1409; CHECK-NEXT:    lsls r0, r3, #31
1410; CHECK-NEXT:    beq .LBB17_1
1411; CHECK-NEXT:  @ %bb.7: @ %if.then
1412; CHECK-NEXT:    @ in Loop: Header=BB17_3 Depth=1
1413; CHECK-NEXT:    ldrh r0, [r1]
1414; CHECK-NEXT:    vfma.f16 q1, q2, r0
1415; CHECK-NEXT:    vmov.u16 r0, q1[0]
1416; CHECK-NEXT:    vfma.f16 q1, q3, r0
1417; CHECK-NEXT:    strh r0, [r5]
1418; CHECK-NEXT:    vmovx.f16 s2, s4
1419; CHECK-NEXT:    vstr.16 s2, [r6]
1420; CHECK-NEXT:    b .LBB17_2
1421; CHECK-NEXT:  .LBB17_8: @ %do.end
1422; CHECK-NEXT:    vpop {d8, d9, d10, d11}
1423; CHECK-NEXT:    add sp, #4
1424; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
1425; CHECK-NEXT:    .p2align 1
1426; CHECK-NEXT:  @ %bb.9:
1427; CHECK-NEXT:  .LCPI17_0:
1428; CHECK-NEXT:    .short 0x0000 @ half 0
1429entry:
  ; Read the three instance fields and precompute the loop-invariant
  ; conditions: %cmp.not90 (no sample pairs) and %tobool.not (even blockSize).
1430  %pState1 = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, ptr %S, i32 0, i32 1
1431  %i = load ptr, ptr %pState1, align 4
1432  %numStages = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, ptr %S, i32 0, i32 0
1433  %i1 = load i8, ptr %numStages, align 4
1434  %conv = zext i8 %i1 to i32
1435  %pCoeffs = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, ptr %S, i32 0, i32 2
1436  %i2 = load ptr, ptr %pCoeffs, align 4
1437  %div = lshr i32 %blockSize, 1
1438  %cmp.not90 = icmp eq i32 %div, 0
1439  %and = and i32 %blockSize, 1
1440  %tobool.not = icmp eq i32 %and, 0
1441  br label %do.body
1442
1443do.body:                                          ; preds = %if.end, %entry
1444  %stage.0 = phi i32 [ %conv, %entry ], [ %dec23, %if.end ]
1445  %pCurCoeffs.0 = phi ptr [ %i2, %entry ], [ %add.ptr2, %if.end ]
1446  %pState.0 = phi ptr [ %i, %entry ], [ %pState.1, %if.end ]
1447  %pIn.0 = phi ptr [ %pSrc, %entry ], [ %pDst, %if.end ]
1448  %i4 = load <8 x half>, ptr %pCurCoeffs.0, align 2
1449  %add.ptr = getelementptr inbounds half, ptr %pCurCoeffs.0, i32 2
1450  %i6 = load <8 x half>, ptr %add.ptr, align 2
1451  %add.ptr2 = getelementptr inbounds half, ptr %pCurCoeffs.0, i32 5
1452  %i8 = load <8 x half>, ptr %pState.0, align 2
  ; Keep lanes 0-1 and 4-7 of the loaded state; lanes 2 and 3 are replaced by
  ; the zero constants from the second shuffle operand (mask indices 10, 11).
1453  %i9 = shufflevector <8 x half> %i8, <8 x half> <half poison, half poison, half 0xH0000, half 0xH0000, half poison, half poison, half poison, half poison>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
  ; Build 16-bit-shifted copies of both coefficient vectors with the MVE vshlc
  ; intrinsic; the carry-out of the first call feeds the second call.
1454  %i10 = bitcast <8 x half> %i4 to <8 x i16>
1455  %i11 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %i10, i32 0, i32 16)
1456  %i12 = extractvalue { i32, <8 x i16> } %i11, 0
1457  %i13 = extractvalue { i32, <8 x i16> } %i11, 1
1458  %i14 = bitcast <8 x i16> %i13 to <8 x half>
1459  %i15 = bitcast <8 x half> %i6 to <8 x i16>
1460  %i16 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %i15, i32 %i12, i32 16)
1461  %i17 = extractvalue { i32, <8 x i16> } %i16, 1
1462  %i18 = bitcast <8 x i16> %i17 to <8 x half>
1463  br i1 %cmp.not90, label %while.end, label %while.body
1464
1465while.body:                                       ; preds = %while.body, %do.body
1466  %pIn.194 = phi ptr [ %incdec.ptr4, %while.body ], [ %pIn.0, %do.body ]
1467  %state.093 = phi <8 x half> [ %i30, %while.body ], [ %i9, %do.body ]
1468  %pOut.192 = phi ptr [ %incdec.ptr12, %while.body ], [ %pDst, %do.body ]
1469  %sample.091 = phi i32 [ %dec, %while.body ], [ %div, %do.body ]
1470  %incdec.ptr = getelementptr inbounds half, ptr %pIn.194, i32 1
1471  %i19 = load half, ptr %pIn.194, align 2
1472  %incdec.ptr4 = getelementptr inbounds half, ptr %pIn.194, i32 2
1473  %i20 = load half, ptr %incdec.ptr, align 2
  ; First sample of the pair: accumulate coeffs * splat(input) into the state,
  ; take lane 0 as the output, then accumulate coeffs2 * splat(output).
1474  %.splatinsert = insertelement <8 x half> poison, half %i19, i32 0
1475  %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> poison, <8 x i32> zeroinitializer
1476  %i21 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i4, <8 x half> %.splat, <8 x half> %state.093)
1477  %i22 = extractelement <8 x half> %i21, i32 0
1478  %.splat6 = shufflevector <8 x half> %i21, <8 x half> poison, <8 x i32> zeroinitializer
1479  %i23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i6, <8 x half> %.splat6, <8 x half> %i21)
1480  %i24 = insertelement <8 x half> %i23, half 0xH0000, i32 3
  ; Second sample of the pair: same two steps using the shifted coefficient
  ; vectors (%i14, %i18); the output is taken from lane 1.
1481  %.splatinsert7 = insertelement <8 x half> poison, half %i20, i32 0
1482  %.splat8 = shufflevector <8 x half> %.splatinsert7, <8 x half> poison, <8 x i32> zeroinitializer
1483  %i25 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i14, <8 x half> %.splat8, <8 x half> %i24)
1484  %i26 = extractelement <8 x half> %i25, i32 1
1485  %.splat10 = shufflevector <8 x half> %i25, <8 x half> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
1486  %i27 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i18, <8 x half> %.splat10, <8 x half> %i25)
  ; Next-iteration state: lanes 0 and 1 take lanes 2 and 3 of %i27, and lane 2
  ; is reset to zero.
1487  %i28 = shufflevector <8 x half> %i27, <8 x half> undef, <8 x i32> <i32 2, i32 undef, i32 undef, i32 3, i32 4, i32 5, i32 6, i32 7>
1488  %i29 = insertelement <8 x half> %i28, half 0xH0000, i32 2
1489  %i30 = shufflevector <8 x half> %i29, <8 x half> %i27, <8 x i32> <i32 0, i32 11, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
1490  %incdec.ptr11 = getelementptr inbounds half, ptr %pOut.192, i32 1
1491  store half %i22, ptr %pOut.192, align 2
1492  %incdec.ptr12 = getelementptr inbounds half, ptr %pOut.192, i32 2
1493  store half %i26, ptr %incdec.ptr11, align 2
1494  %dec = add nsw i32 %sample.091, -1
1495  %cmp.not = icmp eq i32 %dec, 0
1496  br i1 %cmp.not, label %while.end, label %while.body
1497
1498while.end:                                        ; preds = %while.body, %do.body
1499  %pOut.1.lcssa = phi ptr [ %pDst, %do.body ], [ %incdec.ptr12, %while.body ]
1500  %state.0.lcssa = phi <8 x half> [ %i9, %do.body ], [ %i30, %while.body ]
1501  %pIn.1.lcssa = phi ptr [ %pIn.0, %do.body ], [ %incdec.ptr4, %while.body ]
1502  br i1 %tobool.not, label %if.else, label %if.then
1503
1504if.then:                                          ; preds = %while.end
  ; Odd blockSize: process one trailing sample, then save lanes 1 and 2 of the
  ; result as the stage state (lane 2 is stored in if.end via %.sink).
1505  %i31 = load half, ptr %pIn.1.lcssa, align 2
1506  %.splatinsert14 = insertelement <8 x half> poison, half %i31, i32 0
1507  %.splat15 = shufflevector <8 x half> %.splatinsert14, <8 x half> poison, <8 x i32> zeroinitializer
1508  %i32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i4, <8 x half> %.splat15, <8 x half> %state.0.lcssa)
1509  %i33 = extractelement <8 x half> %i32, i32 0
1510  %.splat17 = shufflevector <8 x half> %i32, <8 x half> poison, <8 x i32> zeroinitializer
1511  %i34 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %i6, <8 x half> %.splat17, <8 x half> %i32)
1512  store half %i33, ptr %pOut.1.lcssa, align 2
1513  %i35 = extractelement <8 x half> %i34, i32 1
1514  store half %i35, ptr %pState.0, align 2
1515  %i36 = extractelement <8 x half> %i34, i32 2
1516  br label %if.end
1517
1518if.else:                                          ; preds = %while.end
  ; Even blockSize: save lanes 0 and 1 of the running state unchanged (lane 1
  ; is stored in if.end via %.sink).
1519  %i37 = extractelement <8 x half> %state.0.lcssa, i32 0
1520  store half %i37, ptr %pState.0, align 2
1521  %i38 = extractelement <8 x half> %state.0.lcssa, i32 1
1522  br label %if.end
1523
1524if.end:                                           ; preds = %if.else, %if.then
  ; Store the second state half, step the state pointer by two halves and
  ; loop until the decremented stage counter reaches zero.
1525  %.sink = phi half [ %i38, %if.else ], [ %i36, %if.then ]
1526  %i39 = getelementptr inbounds half, ptr %pState.0, i32 1
1527  store half %.sink, ptr %i39, align 2
1528  %pState.1 = getelementptr inbounds half, ptr %pState.0, i32 2
1529  %dec23 = add i32 %stage.0, -1
1530  %cmp24.not = icmp eq i32 %dec23, 0
1531  br i1 %cmp24.not, label %do.end, label %do.body
1532
1533do.end:                                           ; preds = %if.end
1534  ret void
1535}
1536
; Codegen test for a horizontal add across the eight half lanes of %in, using
; fast-math adds throughout:
;   1. swap the lanes of each adjacent pair and add, so every lane holds the
;      sum of its pair;
;   2. view the result as <4 x i32>, move 32-bit elements 1 and 3 down to
;      elements 0 and 2 and add, so half lanes 0 and 4 each hold the sum of
;      four input lanes;
;   3. add lanes 0 and 4 as scalars and return the result.
; The expected assembly is vrev32.16 + vadd.f16, vrev64.32 + vadd.f16 and a
; final scalar vadd.f16 of s0 and s2.
1537define arm_aapcs_vfpcc half @vecAddAcrossF16Mve(<8 x half> %in) {
1538; CHECK-LABEL: vecAddAcrossF16Mve:
1539; CHECK:       @ %bb.0: @ %entry
1540; CHECK-NEXT:    vrev32.16 q1, q0
1541; CHECK-NEXT:    vadd.f16 q0, q1, q0
1542; CHECK-NEXT:    vrev64.32 q1, q0
1543; CHECK-NEXT:    vadd.f16 q0, q0, q1
1544; CHECK-NEXT:    vadd.f16 s0, s0, s2
1545; CHECK-NEXT:    bx lr
1546entry:
  ; Step 1: pairwise swap (1,0,3,2,...) and add.
1547  %i = shufflevector <8 x half> %in, <8 x half> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
1548  %i1 = fadd fast <8 x half> %i, %in
  ; Step 2: shuffle at 32-bit granularity; the odd result elements are left
  ; undefined because only half lanes 0 and 4 are read afterwards.
1549  %i2 = bitcast <8 x half> %i1 to <4 x i32>
1550  %i3 = shufflevector <4 x i32> %i2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
1551  %i4 = bitcast <4 x i32> %i3 to <8 x half>
1552  %i5 = fadd fast <8 x half> %i1, %i4
  ; Step 3: combine the two partial sums.
1553  %i6 = extractelement <8 x half> %i5, i32 0
1554  %i7 = extractelement <8 x half> %i5, i32 4
1555  %add = fadd fast half %i6, %i7
1556  ret half %add
1557}
1558
; Declarations of the intrinsics called by the test functions in this file.
; NOTE(review): @llvm.arm.mve.vctp16 and @llvm.masked.store.v8f16.p0 are not
; referenced in this part of the file; presumably they are used by tests
; further up - confirm before removing either declaration.
1559declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16>, i32, i32)
1560declare void @llvm.assume(i1)
1561declare <8 x i1> @llvm.arm.mve.vctp16(i32)
1562declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
1563declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>)
1564