; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
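
; The DCT_mveN functions below each accumulate N rows of a DCT-style
; matrix-vector product per outer-loop iteration, using tail-folded vector
; loops (llvm.get.active.lane.mask) with fast-math reductions. The generated
; checks show how post-incrementing VLDRW loads and tail-predicated
; DLSTP/LETP loops are formed as the number of live accumulators grows;
; by DCT_mve7 register pressure forces spills and explicit VCTP/VPST
; predication.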

%struct.DCT_InstanceTypeDef = type { ptr, i32, i32 }

define void @DCT_mve1(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve1:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    ldr r3, [r0, #4]
; CHECK-NEXT:    sub.w r12, r3, #1
; CHECK-NEXT:    cmp.w r12, #2
; CHECK-NEXT:    it lo
; CHECK-NEXT:    bxlo lr
; CHECK-NEXT:  .LBB0_1: @ %for.body.preheader
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT:    ldr r5, [r0, #8]
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    add.w r3, r3, r5, lsl #2
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsl.w r9, r5, #2
; CHECK-NEXT:  .LBB0_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB0_3 Depth 2
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    mov r6, r1
; CHECK-NEXT:    mov r7, r3
; CHECK-NEXT:    dlstp.32 lr, r5
; CHECK-NEXT:  .LBB0_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q1, [r6], #16
; CHECK-NEXT:    vldrw.u32 q2, [r7], #16
; CHECK-NEXT:    vfma.f32 q0, q2, q1
; CHECK-NEXT:    letp lr, .LBB0_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    add.w r7, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    adds r0, #1
; CHECK-NEXT:    add r3, r9
; CHECK-NEXT:    cmp r0, r12
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    vstr s0, [r7]
; CHECK-NEXT:    bne .LBB0_2
; CHECK-NEXT:  @ %bb.5:
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT:    bx lr
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -1
  %cmp350 = icmp ugt i32 %sub, 1
  br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.051, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i10, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi
  %i10 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi
  %index.next = add i32 %index, 4
  %i11 = icmp eq i32 %index.next, %n.vec
  br i1 %i11, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i10)
  %arrayidx14 = getelementptr inbounds float, ptr %pOut, i32 %k2.051
  store float %i12, ptr %arrayidx14, align 4
  %add16 = add nuw i32 %k2.051, 1
  %exitcond52.not = icmp eq i32 %add16, %sub
  br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body
}

define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve2:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #2
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo .LBB1_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr.w r12, [r0, #8]
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    ldr r3, [r0]
; CHECK-NEXT:    add.w r11, r3, r12, lsl #2
; CHECK-NEXT:    add.w r7, r3, r12, lsl #3
; CHECK-NEXT:    lsl.w r9, r12, #3
; CHECK-NEXT:  .LBB1_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB1_3 Depth 2
; CHECK-NEXT:    ldr r5, [sp] @ 4-byte Reload
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    add.w r10, r4, #1
; CHECK-NEXT:    mov r3, r11
; CHECK-NEXT:    mov r0, r7
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    dlstp.32 lr, r12
; CHECK-NEXT:  .LBB1_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB1_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q2, [r5], #16
; CHECK-NEXT:    vldrw.u32 q3, [r3], #16
; CHECK-NEXT:    vfma.f32 q1, q3, q2
; CHECK-NEXT:    vldrw.u32 q3, [r0], #16
; CHECK-NEXT:    vfma.f32 q0, q3, q2
; CHECK-NEXT:    letp lr, .LBB1_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    add.w r0, r2, r10, lsl #2
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    add r11, r9
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    add r7, r9
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    vadd.f32 s2, s4, s6
; CHECK-NEXT:    vstr s0, [r0]
; CHECK-NEXT:    add.w r0, r2, r4, lsl #2
; CHECK-NEXT:    adds r4, #2
; CHECK-NEXT:    cmp r4, r1
; CHECK-NEXT:    vstr s2, [r0]
; CHECK-NEXT:    blo .LBB1_2
; CHECK-NEXT:  .LBB1_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -2
  %cmp371 = icmp ugt i32 %sub, 1
  br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.072, %i
  %add = add nuw i32 %k2.072, 1
  %mul5 = mul i32 %add, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i15, %vector.body ]
  %vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i16, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi73
  %i10 = add i32 %index, %mul5
  %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
  %wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load
  %i14 = fadd fast <4 x float> %i13, %vec.phi
  %i15 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi
  %i16 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi73
  %index.next = add i32 %index, 4
  %i17 = icmp eq i32 %index.next, %n.vec
  br i1 %i17, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i16)
  %i19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i15)
  %arrayidx21 = getelementptr inbounds float, ptr %pOut, i32 %k2.072
  store float %i18, ptr %arrayidx21, align 4
  %arrayidx23 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i19, ptr %arrayidx23, align 4
  %add25 = add i32 %k2.072, 2
  %cmp3 = icmp ult i32 %add25, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve3:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9}
; CHECK-NEXT:    vpush {d8, d9}
; CHECK-NEXT:    .pad #24
; CHECK-NEXT:    sub sp, #24
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    subs r1, #3
; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo .LBB2_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    add.w r0, r3, r3, lsl #1
; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
; CHECK-NEXT:    add.w r12, r1, r3, lsl #3
; CHECK-NEXT:    adds r3, #3
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    add.w r10, r1, r0, lsl #2
; CHECK-NEXT:    subs r3, #4
; CHECK-NEXT:    lsl.w r11, r0, #2
; CHECK-NEXT:    add.w r1, r5, r3, lsr #2
; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
; CHECK-NEXT:  .LBB2_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB2_3 Depth 2
; CHECK-NEXT:    ldr r6, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT:    adds r0, r5, #2
; CHECK-NEXT:    adds r2, r5, #1
; CHECK-NEXT:    str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    mov r3, r9
; CHECK-NEXT:    mov r0, r12
; CHECK-NEXT:    mov r4, r10
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    dlstp.32 lr, r7
; CHECK-NEXT:  .LBB2_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB2_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q3, [r6], #16
; CHECK-NEXT:    vldrw.u32 q4, [r3], #16
; CHECK-NEXT:    vfma.f32 q1, q4, q3
; CHECK-NEXT:    vldrw.u32 q4, [r0], #16
; CHECK-NEXT:    vfma.f32 q2, q4, q3
; CHECK-NEXT:    vldrw.u32 q4, [r4], #16
; CHECK-NEXT:    vfma.f32 q0, q4, q3
; CHECK-NEXT:    letp lr, .LBB2_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT:    vadd.f32 s10, s10, s11
; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    add r9, r11
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    add.w r0, r1, r2, lsl #2
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    add r12, r11
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    add r10, r11
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s8, s8, s10
; CHECK-NEXT:    vadd.f32 s4, s4, s6
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    vstr s8, [r0]
; CHECK-NEXT:    add.w r0, r1, r5, lsl #2
; CHECK-NEXT:    adds r5, #3
; CHECK-NEXT:    vstr s4, [r0]
; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
; CHECK-NEXT:    vstr s0, [r0]
; CHECK-NEXT:    ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    cmp r5, r0
; CHECK-NEXT:    blo .LBB2_2
; CHECK-NEXT:  .LBB2_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #24
; CHECK-NEXT:    vpop {d8, d9}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -3
  %cmp392 = icmp ugt i32 %sub, 1
  br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.093, %i
  %add = add nuw i32 %k2.093, 1
  %mul5 = mul i32 %add, %i
  %add6 = add i32 %k2.093, 2
  %mul7 = mul i32 %add6, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i20, %vector.body ]
  %vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i21, %vector.body ]
  %vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i22, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi95
  %i10 = add i32 %index, %mul5
  %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
  %wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load
  %i14 = fadd fast <4 x float> %i13, %vec.phi94
  %i15 = add i32 %index, %mul7
  %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
  %wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load
  %i19 = fadd fast <4 x float> %i18, %vec.phi
  %i20 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi
  %i21 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi94
  %i22 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi95
  %index.next = add i32 %index, 4
  %i23 = icmp eq i32 %index.next, %n.vec
  br i1 %i23, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i22)
  %i25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i21)
  %i26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i20)
  %arrayidx28 = getelementptr inbounds float, ptr %pOut, i32 %k2.093
  store float %i24, ptr %arrayidx28, align 4
  %arrayidx30 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i25, ptr %arrayidx30, align 4
  %arrayidx32 = getelementptr inbounds float, ptr %pOut, i32 %add6
  store float %i26, ptr %arrayidx32, align 4
  %add34 = add i32 %k2.093, 3
  %cmp3 = icmp ult i32 %add34, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve4:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    .pad #40
; CHECK-NEXT:    sub sp, #40
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    str r2, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    subs r1, #4
; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB3_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r2, [r0, #8]
; CHECK-NEXT:    movs r6, #1
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    add.w r0, r2, r2, lsl #1
; CHECK-NEXT:    add.w r12, r1, r2, lsl #2
; CHECK-NEXT:    add.w r8, r1, r2, lsl #3
; CHECK-NEXT:    add.w r9, r1, r2, lsl #4
; CHECK-NEXT:    add.w r11, r1, r0, lsl #2
; CHECK-NEXT:    adds r0, r2, #3
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    subs r0, #4
; CHECK-NEXT:    add.w r0, r6, r0, lsr #2
; CHECK-NEXT:    strd r0, r2, [sp, #8] @ 8-byte Folded Spill
; CHECK-NEXT:    lsls r0, r2, #4
; CHECK-NEXT:    ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
; CHECK-NEXT:    str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:  .LBB3_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
; CHECK-NEXT:    adds r0, r6, #3
; CHECK-NEXT:    str r0, [sp, #36] @ 4-byte Spill
; CHECK-NEXT:    adds r0, r6, #2
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    vmov.i32 q0, #0x0
; CHECK-NEXT:    str r0, [sp, #32] @ 4-byte Spill
; CHECK-NEXT:    adds r0, r6, #1
; CHECK-NEXT:    str r0, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:    mov r0, r8
; CHECK-NEXT:    mov r5, r11
; CHECK-NEXT:    mov r4, r9
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    vmov q2, q0
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    dlstp.32 lr, r7
; CHECK-NEXT:  .LBB3_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q4, [r1], #16
; CHECK-NEXT:    vldrw.u32 q5, [r0], #16
; CHECK-NEXT:    vfma.f32 q3, q5, q4
; CHECK-NEXT:    vldrw.u32 q5, [r3], #16
; CHECK-NEXT:    vfma.f32 q2, q5, q4
; CHECK-NEXT:    vldrw.u32 q5, [r5], #16
; CHECK-NEXT:    vfma.f32 q1, q5, q4
; CHECK-NEXT:    vldrw.u32 q5, [r4], #16
; CHECK-NEXT:    vfma.f32 q0, q5, q4
; CHECK-NEXT:    letp lr, .LBB3_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT:    vadd.f32 s14, s14, s15
; CHECK-NEXT:    ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s10, s10, s11
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s12, s12, s14
; CHECK-NEXT:    vadd.f32 s8, s8, s10
; CHECK-NEXT:    vadd.f32 s4, s4, s6
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    vstr s12, [r0]
; CHECK-NEXT:    add.w r0, r1, r6, lsl #2
; CHECK-NEXT:    adds r6, #4
; CHECK-NEXT:    vstr s8, [r0]
; CHECK-NEXT:    ldr r0, [sp, #32] @ 4-byte Reload
; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
; CHECK-NEXT:    vstr s4, [r0]
; CHECK-NEXT:    ldr r0, [sp, #36] @ 4-byte Reload
; CHECK-NEXT:    add.w r0, r1, r0, lsl #2
; CHECK-NEXT:    vstr s0, [r0]
; CHECK-NEXT:    ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    add r12, r0
; CHECK-NEXT:    add r8, r0
; CHECK-NEXT:    add r11, r0
; CHECK-NEXT:    add r9, r0
; CHECK-NEXT:    ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    cmp r6, r0
; CHECK-NEXT:    blo .LBB3_2
; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #40
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -4
  %cmp3113 = icmp ugt i32 %sub, 1
  br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0114, %i
  %add = add nuw nsw i32 %k2.0114, 1
  %mul5 = mul i32 %add, %i
  %add6 = add nuw nsw i32 %k2.0114, 2
  %mul7 = mul i32 %add6, %i
  %add8 = add i32 %k2.0114, 3
  %mul9 = mul i32 %add8, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i25, %vector.body ]
  %vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i26, %vector.body ]
  %vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i27, %vector.body ]
  %vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i28, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi116
  %i10 = add i32 %index, %mul5
  %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
  %wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load
  %i14 = fadd fast <4 x float> %i13, %vec.phi117
  %i15 = add i32 %index, %mul7
  %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
  %wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load
  %i19 = fadd fast <4 x float> %i18, %vec.phi115
  %i20 = add i32 %index, %mul9
  %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
  %wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load
  %i24 = fadd fast <4 x float> %i23, %vec.phi
  %i25 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi
  %i26 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi115
  %i27 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi116
  %i28 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi117
  %index.next = add i32 %index, 4
  %i29 = icmp eq i32 %index.next, %n.vec
  br i1 %i29, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i28)
  %i31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i27)
  %i32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i26)
  %i33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i25)
  %arrayidx35 = getelementptr inbounds float, ptr %pOut, i32 %k2.0114
  store float %i31, ptr %arrayidx35, align 4
  %arrayidx37 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i30, ptr %arrayidx37, align 4
  %arrayidx39 = getelementptr inbounds float, ptr %pOut, i32 %add6
  store float %i32, ptr %arrayidx39, align 4
  %arrayidx41 = getelementptr inbounds float, ptr %pOut, i32 %add8
  store float %i33, ptr %arrayidx41, align 4
  %add43 = add i32 %k2.0114, 4
  %cmp3 = icmp ult i32 %add43, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve5:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #5
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB4_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
; CHECK-NEXT:    subs r1, r0, #4
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsls r5, r3, #2
; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    add.w r1, r3, r3, lsl #2
; CHECK-NEXT:    lsls r1, r1, #2
; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:  .LBB4_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB4_3 Depth 2
; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    adds r1, r0, #4
; CHECK-NEXT:    ldr r4, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    add.w r10, r0, #2
; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    add.w r11, r0, #1
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    mov r3, r8
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vmov q2, q1
; CHECK-NEXT:    vmov q4, q1
; CHECK-NEXT:    dlstp.32 lr, r7
; CHECK-NEXT:  .LBB4_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB4_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    add.w r9, r3, r5
; CHECK-NEXT:    vldrw.u32 q5, [r4], #16
; CHECK-NEXT:    vldrw.u32 q6, [r3], #16
; CHECK-NEXT:    add.w r12, r9, r5
; CHECK-NEXT:    vfma.f32 q3, q6, q5
; CHECK-NEXT:    vldrw.u32 q6, [r9]
; CHECK-NEXT:    add.w r6, r12, r5
; CHECK-NEXT:    vfma.f32 q4, q6, q5
; CHECK-NEXT:    vldrw.u32 q6, [r12]
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vfma.f32 q2, q6, q5
; CHECK-NEXT:    vldrw.u32 q6, [r6]
; CHECK-NEXT:    vfma.f32 q0, q6, q5
; CHECK-NEXT:    vldrw.u32 q6, [r7]
; CHECK-NEXT:    vfma.f32 q1, q6, q5
; CHECK-NEXT:    letp lr, .LBB4_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT:    vadd.f32 s18, s18, s19
; CHECK-NEXT:    add.w r1, r2, r11, lsl #2
; CHECK-NEXT:    vadd.f32 s16, s16, s17
; CHECK-NEXT:    vadd.f32 s14, s14, s15
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s10, s10, s11
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s1, s16, s18
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    vadd.f32 s12, s12, s14
; CHECK-NEXT:    vadd.f32 s4, s4, s6
; CHECK-NEXT:    vadd.f32 s6, s8, s10
; CHECK-NEXT:    vstr s1, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    adds r0, #5
; CHECK-NEXT:    vstr s12, [r1]
; CHECK-NEXT:    add.w r1, r2, r10, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    add r8, r1
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB4_2
; CHECK-NEXT:  .LBB4_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -5
  %cmp3134 = icmp ugt i32 %sub, 1
  br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0135, %i
  %add = add nuw i32 %k2.0135, 1
  %mul5 = mul i32 %add, %i
  %add6 = add i32 %k2.0135, 2
  %mul7 = mul i32 %add6, %i
  %add8 = add i32 %k2.0135, 3
  %mul9 = mul i32 %add8, %i
  %add10 = add i32 %k2.0135, 4
  %mul11 = mul i32 %add10, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i30, %vector.body ]
  %vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i31, %vector.body ]
  %vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i32, %vector.body ]
  %vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i33, %vector.body ]
  %vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i34, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi137
  %i10 = add i32 %index, %mul5
  %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
  %wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load
  %i14 = fadd fast <4 x float> %i13, %vec.phi139
  %i15 = add i32 %index, %mul7
  %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
  %wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load
  %i19 = fadd fast <4 x float> %i18, %vec.phi138
  %i20 = add i32 %index, %mul9
  %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
  %wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load
  %i24 = fadd fast <4 x float> %i23, %vec.phi136
  %i25 = add i32 %index, %mul11
  %i26 = getelementptr inbounds float, ptr %i2, i32 %i25
  %wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load
  %i29 = fadd fast <4 x float> %i28, %vec.phi
  %i30 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi
  %i31 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi136
  %i32 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi137
  %i33 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi138
  %i34 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi139
  %index.next = add i32 %index, 4
  %i35 = icmp eq i32 %index.next, %n.vec
  br i1 %i35, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i34)
  %i37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i33)
  %i38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i32)
  %i39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i31)
  %i40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i30)
  %arrayidx42 = getelementptr inbounds float, ptr %pOut, i32 %k2.0135
  store float %i38, ptr %arrayidx42, align 4
  %arrayidx44 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i36, ptr %arrayidx44, align 4
  %arrayidx46 = getelementptr inbounds float, ptr %pOut, i32 %add6
  store float %i37, ptr %arrayidx46, align 4
  %arrayidx48 = getelementptr inbounds float, ptr %pOut, i32 %add8
  store float %i39, ptr %arrayidx48, align 4
  %arrayidx50 = getelementptr inbounds float, ptr %pOut, i32 %add10
  store float %i40, ptr %arrayidx50, align 4
  %add52 = add i32 %k2.0135, 5
  %cmp3 = icmp ult i32 %add52, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve6:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #6
; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB5_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
; CHECK-NEXT:    subs r1, r0, #4
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsls r5, r3, #2
; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    add.w r1, r3, r3, lsl #1
; CHECK-NEXT:    lsls r1, r1, #3
; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
; CHECK-NEXT:  .LBB5_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB5_3 Depth 2
; CHECK-NEXT:    adds r1, r0, #5
; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #4
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    add.w r11, r0, #2
; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    adds r4, r0, #1
; CHECK-NEXT:    mov r3, r8
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vmov q4, q1
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vmov q5, q1
; CHECK-NEXT:    vmov q2, q1
; CHECK-NEXT:    dlstp.32 lr, r7
; CHECK-NEXT:  .LBB5_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    add.w r12, r3, r5
; CHECK-NEXT:    vldrw.u32 q6, [r1], #16
; CHECK-NEXT:    vldrw.u32 q7, [r3], #16
; CHECK-NEXT:    add.w r10, r12, r5
; CHECK-NEXT:    vfma.f32 q4, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r12]
; CHECK-NEXT:    add.w r6, r10, r5
; CHECK-NEXT:    vfma.f32 q5, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r10]
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vfma.f32 q2, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r6]
; CHECK-NEXT:    adds r6, r7, r5
; CHECK-NEXT:    vfma.f32 q0, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r7]
; CHECK-NEXT:    vfma.f32 q3, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r6]
; CHECK-NEXT:    vfma.f32 q1, q7, q6
; CHECK-NEXT:    letp lr, .LBB5_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB5_2 Depth=1
; CHECK-NEXT:    vadd.f32 s22, s22, s23
; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
; CHECK-NEXT:    vadd.f32 s20, s20, s21
; CHECK-NEXT:    vadd.f32 s18, s18, s19
; CHECK-NEXT:    vadd.f32 s16, s16, s17
; CHECK-NEXT:    vadd.f32 s10, s10, s11
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    vadd.f32 s1, s20, s22
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    vadd.f32 s3, s16, s18
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s8, s8, s10
; CHECK-NEXT:    vadd.f32 s14, s14, s15
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vstr s1, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    adds r0, #6
; CHECK-NEXT:    vstr s3, [r1]
; CHECK-NEXT:    add.w r1, r2, r11, lsl #2
; CHECK-NEXT:    vadd.f32 s4, s4, s6
; CHECK-NEXT:    vstr s8, [r1]
; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s6, s12, s14
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT:    add r8, r1
; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB5_2
; CHECK-NEXT:  .LBB5_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -6
  %cmp3155 = icmp ugt i32 %sub, 1
  br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0156, %i
  %add = add nuw i32 %k2.0156, 1
  %mul5 = mul i32 %add, %i
  %add6 = add i32 %k2.0156, 2
  %mul7 = mul i32 %add6, %i
  %add8 = add i32 %k2.0156, 3
  %mul9 = mul i32 %add8, %i
  %add10 = add i32 %k2.0156, 4
  %mul11 = mul i32 %add10, %i
  %add12 = add i32 %k2.0156, 5
  %mul13 = mul i32 %add12, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i35, %vector.body ]
  %vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i36, %vector.body ]
  %vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i37, %vector.body ]
  %vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i38, %vector.body ]
  %vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i39, %vector.body ]
  %vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i40, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi158
  %i10 = add i32 %index, %mul5
  %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
  %wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load
  %i14 = fadd fast <4 x float> %i13, %vec.phi160
  %i15 = add i32 %index, %mul7
  %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
  %wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load
  %i19 = fadd fast <4 x float> %i18, %vec.phi161
  %i20 = add i32 %index, %mul9
  %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
  %wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load
  %i24 = fadd fast <4 x float> %i23, %vec.phi159
  %i25 = add i32 %index, %mul11
  %i26 = getelementptr inbounds float, ptr %i2, i32 %i25
  %wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load
  %i29 = fadd fast <4 x float> %i28, %vec.phi157
  %i30 = add i32 %index, %mul13
  %i31 = getelementptr inbounds float, ptr %i2, i32 %i30
  %wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i31, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load
  %i34 = fadd fast <4 x float> %i33, %vec.phi
  %i35 = select <4 x i1> %active.lane.mask, <4 x float> %i34, <4 x float> %vec.phi
  %i36 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi157
  %i37 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi158
  %i38 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi159
  %i39 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi160
  %i40 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi161
  %index.next = add i32 %index, 4
  %i41 = icmp eq i32 %index.next, %n.vec
  br i1 %i41, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i40)
  %i43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i39)
  %i44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i38)
  %i45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i37)
  %i46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i36)
  %i47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i35)
  %arrayidx49 = getelementptr inbounds float, ptr %pOut, i32 %k2.0156
  store float %i45, ptr %arrayidx49, align 4
  %arrayidx51 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i43, ptr %arrayidx51, align 4
  %arrayidx53 = getelementptr inbounds float, ptr %pOut, i32 %add6
  store float %i42, ptr %arrayidx53, align 4
  %arrayidx55 = getelementptr inbounds float, ptr %pOut, i32 %add8
  store float %i44, ptr %arrayidx55, align 4
  %arrayidx57 = getelementptr inbounds float, ptr %pOut, i32 %add10
  store float %i46, ptr %arrayidx57, align 4
  %arrayidx59 = getelementptr inbounds float, ptr %pOut, i32 %add12
  store float %i47, ptr %arrayidx59, align 4
  %add61 = add i32 %k2.0156, 6
  %cmp3 = icmp ult i32 %add61, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

1024define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
1025; CHECK-LABEL: DCT_mve7:
1026; CHECK:       @ %bb.0: @ %entry
1027; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1028; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
1029; CHECK-NEXT:    .pad #4
1030; CHECK-NEXT:    sub sp, #4
1031; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1032; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1033; CHECK-NEXT:    .pad #72
1034; CHECK-NEXT:    sub sp, #72
1035; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
1036; CHECK-NEXT:    ldr r1, [r0, #4]
1037; CHECK-NEXT:    subs r1, #7
1038; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
1039; CHECK-NEXT:    cmp r1, #2
1040; CHECK-NEXT:    blo.w .LBB6_5
1041; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
1042; CHECK-NEXT:    ldr r3, [r0, #8]
1043; CHECK-NEXT:    ldr r1, [r0]
1044; CHECK-NEXT:    adds r0, r3, #3
1045; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
1046; CHECK-NEXT:    bic r0, r0, #3
1047; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
1048; CHECK-NEXT:    subs r1, r0, #4
1049; CHECK-NEXT:    movs r0, #1
1050; CHECK-NEXT:    lsls r5, r3, #2
1051; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
1052; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
1053; CHECK-NEXT:    rsb r1, r3, r3, lsl #3
1054; CHECK-NEXT:    lsls r1, r1, #2
1055; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
1056; CHECK-NEXT:  .LBB6_2: @ %for.body
1057; CHECK-NEXT:    @ =>This Loop Header: Depth=1
1058; CHECK-NEXT:    @ Child Loop BB6_3 Depth 2
1059; CHECK-NEXT:    adds r1, r0, #6
1060; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
1061; CHECK-NEXT:    adds r1, r0, #5
1062; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
1063; CHECK-NEXT:    adds r1, r0, #4
1064; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
1065; CHECK-NEXT:    adds r1, r0, #3
1066; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
1067; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
1068; CHECK-NEXT:    vmov.i32 q2, #0x0
1069; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
1070; CHECK-NEXT:    adds r4, r0, #2
1071; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
1072; CHECK-NEXT:    add.w r8, r0, #1
1073; CHECK-NEXT:    mov r3, r9
1074; CHECK-NEXT:    vmov q4, q2
1075; CHECK-NEXT:    vmov q5, q2
1076; CHECK-NEXT:    vmov q3, q2
1077; CHECK-NEXT:    vmov q6, q2
1078; CHECK-NEXT:    vmov q1, q2
1079; CHECK-NEXT:    mov r12, r7
; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
; CHECK-NEXT:    dls lr, r6
; CHECK-NEXT:  .LBB6_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    add.w r10, r3, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q7, [r1], #16
; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16
; CHECK-NEXT:    add.w r11, r10, r5
; CHECK-NEXT:    sub.w r12, r12, #4
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q5, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r10]
; CHECK-NEXT:    add.w r6, r11, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q6, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r11]
; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
; CHECK-NEXT:    vmov q6, q5
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q1, q0, q7
; CHECK-NEXT:    vmov q5, q4
; CHECK-NEXT:    vmov q4, q3
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [r6]
; CHECK-NEXT:    vldrw.u32 q1, [sp, #56] @ 16-byte Reload
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q1, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r7]
; CHECK-NEXT:    adds r6, r7, r5
; CHECK-NEXT:    vstrw.32 q1, [sp, #56] @ 16-byte Spill
; CHECK-NEXT:    vmov q1, q3
; CHECK-NEXT:    vmov q3, q4
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q3, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r6]
; CHECK-NEXT:    vmov q4, q5
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q4, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r7]
; CHECK-NEXT:    vmov q5, q6
; CHECK-NEXT:    vldrw.u32 q6, [sp, #40] @ 16-byte Reload
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q2, q0, q7
; CHECK-NEXT:    le lr, .LBB6_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT:    vadd.f32 s0, s26, s27
; CHECK-NEXT:    add.w r1, r2, r8, lsl #2
; CHECK-NEXT:    vadd.f32 s2, s24, s25
; CHECK-NEXT:    vadd.f32 s1, s22, s23
; CHECK-NEXT:    vadd.f32 s3, s20, s21
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s10, s10, s11
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s0, s2, s0
; CHECK-NEXT:    vadd.f32 s9, s18, s19
; CHECK-NEXT:    vadd.f32 s11, s16, s17
; CHECK-NEXT:    vldrw.u32 q4, [sp, #56] @ 16-byte Reload
; CHECK-NEXT:    vadd.f32 s2, s3, s1
; CHECK-NEXT:    vadd.f32 s5, s18, s19
; CHECK-NEXT:    vadd.f32 s7, s16, s17
; CHECK-NEXT:    vadd.f32 s4, s4, s6
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s14, s14, s15
; CHECK-NEXT:    adds r0, #7
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vstr s2, [r1]
; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
; CHECK-NEXT:    vadd.f32 s8, s8, s10
; CHECK-NEXT:    vadd.f32 s6, s7, s5
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    vadd.f32 s10, s11, s9
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s12, s12, s14
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s12, [r1]
; CHECK-NEXT:    ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s10, [r1]
; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s8, [r1]
; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    add r9, r1
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB6_2
; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #72
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -7
  %cmp3176 = icmp ugt i32 %sub, 1
  br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0177, %i
  %add = add nuw i32 %k2.0177, 1
  %mul5 = mul i32 %add, %i
  %add6 = add i32 %k2.0177, 2
  %mul7 = mul i32 %add6, %i
  %add8 = add i32 %k2.0177, 3
  %mul9 = mul i32 %add8, %i
  %add10 = add i32 %k2.0177, 4
  %mul11 = mul i32 %add10, %i
  %add12 = add i32 %k2.0177, 5
  %mul13 = mul i32 %add12, %i
  %add14 = add i32 %k2.0177, 6
  %mul15 = mul i32 %add14, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i40, %vector.body ]
  %vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i41, %vector.body ]
  %vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i42, %vector.body ]
  %vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i43, %vector.body ]
  %vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i44, %vector.body ]
  %vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i45, %vector.body ]
  %vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i46, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi179
  %i10 = add i32 %index, %mul5
  %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
  %wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load
  %i14 = fadd fast <4 x float> %i13, %vec.phi181
  %i15 = add i32 %index, %mul7
  %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
  %wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load
  %i19 = fadd fast <4 x float> %i18, %vec.phi183
  %i20 = add i32 %index, %mul9
  %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
  %wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load
  %i24 = fadd fast <4 x float> %i23, %vec.phi182
  %i25 = add i32 %index, %mul11
  %i26 = getelementptr inbounds float, ptr %i2, i32 %i25
  %wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load
  %i29 = fadd fast <4 x float> %i28, %vec.phi180
  %i30 = add i32 %index, %mul13
  %i31 = getelementptr inbounds float, ptr %i2, i32 %i30
  %wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i31, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load
  %i34 = fadd fast <4 x float> %i33, %vec.phi178
  %i35 = add i32 %index, %mul15
  %i36 = getelementptr inbounds float, ptr %i2, i32 %i35
  %wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i36, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load
  %i39 = fadd fast <4 x float> %i38, %vec.phi
  %i40 = select <4 x i1> %active.lane.mask, <4 x float> %i39, <4 x float> %vec.phi
  %i41 = select <4 x i1> %active.lane.mask, <4 x float> %i34, <4 x float> %vec.phi178
  %i42 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi179
  %i43 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi180
  %i44 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi181
  %i45 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi182
  %i46 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi183
  %index.next = add i32 %index, 4
  %i47 = icmp eq i32 %index.next, %n.vec
  br i1 %i47, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i46)
  %i49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i45)
  %i50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i44)
  %i51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i43)
  %i52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i42)
  %i53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i41)
  %i54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i40)
  %arrayidx56 = getelementptr inbounds float, ptr %pOut, i32 %k2.0177
  store float %i52, ptr %arrayidx56, align 4
  %arrayidx58 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i50, ptr %arrayidx58, align 4
  %arrayidx60 = getelementptr inbounds float, ptr %pOut, i32 %add6
  store float %i48, ptr %arrayidx60, align 4
  %arrayidx62 = getelementptr inbounds float, ptr %pOut, i32 %add8
  store float %i49, ptr %arrayidx62, align 4
  %arrayidx64 = getelementptr inbounds float, ptr %pOut, i32 %add10
  store float %i51, ptr %arrayidx64, align 4
  %arrayidx66 = getelementptr inbounds float, ptr %pOut, i32 %add12
  store float %i53, ptr %arrayidx66, align 4
  %arrayidx68 = getelementptr inbounds float, ptr %pOut, i32 %add14
  store float %i54, ptr %arrayidx68, align 4
  %add70 = add i32 %k2.0177, 7
  %cmp3 = icmp ult i32 %add70, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #88
; CHECK-NEXT:    sub sp, #88
; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #8
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB7_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r12, r1, r3, lsl #2
; CHECK-NEXT:    subs r1, r0, #4
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsls r6, r3, #2
; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    lsls r1, r3, #5
; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:  .LBB7_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB7_3 Depth 2
; CHECK-NEXT:    adds r1, r0, #7
; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #6
; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #5
; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #4
; CHECK-NEXT:    ldr.w r9, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    vmov.i32 q3, #0x0
; CHECK-NEXT:    ldr r5, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    adds r4, r0, #3
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    add.w r8, r0, #2
; CHECK-NEXT:    adds r1, r0, #1
; CHECK-NEXT:    mov r3, r12
; CHECK-NEXT:    vmov q5, q3
; CHECK-NEXT:    vmov q6, q3
; CHECK-NEXT:    vmov q4, q3
; CHECK-NEXT:    vmov q7, q3
; CHECK-NEXT:    vmov q2, q3
; CHECK-NEXT:    mov r10, r7
; CHECK-NEXT:    vstrw.32 q3, [sp, #56] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q3, [sp, #72] @ 16-byte Spill
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB7_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB7_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vctp.32 r10
; CHECK-NEXT:    add.w r11, r3, r6
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q0, [r9], #16
; CHECK-NEXT:    vldrwt.u32 q1, [r3], #16
; CHECK-NEXT:    add.w r5, r11, r6
; CHECK-NEXT:    sub.w r10, r10, #4
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q6, q1, q0
; CHECK-NEXT:    vldrwt.u32 q1, [r11]
; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
; CHECK-NEXT:    vmov q6, q5
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q7, q1, q0
; CHECK-NEXT:    vmov q5, q3
; CHECK-NEXT:    vmov q3, q4
; CHECK-NEXT:    vmov q4, q2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q1, [r5]
; CHECK-NEXT:    vldrw.u32 q2, [sp, #56] @ 16-byte Reload
; CHECK-NEXT:    adds r7, r5, r6
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q2, q1, q0
; CHECK-NEXT:    vldrwt.u32 q1, [r7]
; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q2, [sp, #72] @ 16-byte Reload
; CHECK-NEXT:    adds r5, r7, r6
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q2, q1, q0
; CHECK-NEXT:    vldrwt.u32 q1, [r5]
; CHECK-NEXT:    adds r7, r5, r6
; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
; CHECK-NEXT:    vmov q2, q4
; CHECK-NEXT:    vmov q4, q3
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q2, q1, q0
; CHECK-NEXT:    vldrwt.u32 q1, [r7]
; CHECK-NEXT:    adds r5, r7, r6
; CHECK-NEXT:    vmov q3, q5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q4, q1, q0
; CHECK-NEXT:    vldrwt.u32 q1, [r5]
; CHECK-NEXT:    vmov q5, q6
; CHECK-NEXT:    add r5, r6
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q5, q1, q0
; CHECK-NEXT:    vldrwt.u32 q1, [r5]
; CHECK-NEXT:    vldrw.u32 q6, [sp, #40] @ 16-byte Reload
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q3, q1, q0
; CHECK-NEXT:    le lr, .LBB7_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB7_2 Depth=1
; CHECK-NEXT:    vadd.f32 s0, s30, s31
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vadd.f32 s2, s28, s29
; CHECK-NEXT:    vadd.f32 s4, s26, s27
; CHECK-NEXT:    vadd.f32 s6, s24, s25
; CHECK-NEXT:    vadd.f32 s5, s18, s19
; CHECK-NEXT:    vadd.f32 s7, s16, s17
; CHECK-NEXT:    vldrw.u32 q4, [sp, #56] @ 16-byte Reload
; CHECK-NEXT:    vadd.f32 s10, s10, s11
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s9, s18, s19
; CHECK-NEXT:    vadd.f32 s11, s16, s17
; CHECK-NEXT:    vldrw.u32 q4, [sp, #72] @ 16-byte Reload
; CHECK-NEXT:    vadd.f32 s14, s14, s15
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vadd.f32 s13, s18, s19
; CHECK-NEXT:    vadd.f32 s15, s16, s17
; CHECK-NEXT:    vadd.f32 s0, s2, s0
; CHECK-NEXT:    vadd.f32 s2, s6, s4
; CHECK-NEXT:    vadd.f32 s8, s8, s10
; CHECK-NEXT:    vadd.f32 s10, s11, s9
; CHECK-NEXT:    vadd.f32 s6, s12, s14
; CHECK-NEXT:    vadd.f32 s1, s22, s23
; CHECK-NEXT:    vadd.f32 s14, s15, s13
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s3, s20, s21
; CHECK-NEXT:    adds r0, #8
; CHECK-NEXT:    vstr s2, [r1]
; CHECK-NEXT:    add.w r1, r2, r8, lsl #2
; CHECK-NEXT:    vadd.f32 s12, s7, s5
; CHECK-NEXT:    vstr s10, [r1]
; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
; CHECK-NEXT:    vstr s14, [r1]
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s4, s3, s1
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s8, [r1]
; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s12, [r1]
; CHECK-NEXT:    ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    add r12, r1
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB7_2
; CHECK-NEXT:  .LBB7_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #88
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -8
  %cmp3197 = icmp ugt i32 %sub, 1
  br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0198, %i
  %add = add nuw nsw i32 %k2.0198, 1
  %mul5 = mul i32 %add, %i
  %add6 = add nuw nsw i32 %k2.0198, 2
  %mul7 = mul i32 %add6, %i
  %add8 = add nuw nsw i32 %k2.0198, 3
  %mul9 = mul i32 %add8, %i
  %add10 = add nuw nsw i32 %k2.0198, 4
  %mul11 = mul i32 %add10, %i
  %add12 = add nuw nsw i32 %k2.0198, 5
  %mul13 = mul i32 %add12, %i
  %add14 = add nuw nsw i32 %k2.0198, 6
  %mul15 = mul i32 %add14, %i
  %add16 = add i32 %k2.0198, 7
  %mul17 = mul i32 %add16, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i45, %vector.body ]
  %vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i46, %vector.body ]
  %vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i47, %vector.body ]
  %vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i48, %vector.body ]
  %vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i49, %vector.body ]
  %vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i50, %vector.body ]
  %vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i51, %vector.body ]
  %vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i52, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi200
  %i10 = add i32 %index, %mul5
  %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
  %wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load
  %i14 = fadd fast <4 x float> %i13, %vec.phi202
  %i15 = add i32 %index, %mul7
  %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
  %wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load
  %i19 = fadd fast <4 x float> %i18, %vec.phi204
  %i20 = add i32 %index, %mul9
  %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
  %wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load
  %i24 = fadd fast <4 x float> %i23, %vec.phi205
  %i25 = add i32 %index, %mul11
  %i26 = getelementptr inbounds float, ptr %i2, i32 %i25
  %wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load
  %i29 = fadd fast <4 x float> %i28, %vec.phi203
  %i30 = add i32 %index, %mul13
  %i31 = getelementptr inbounds float, ptr %i2, i32 %i30
  %wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i31, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load
  %i34 = fadd fast <4 x float> %i33, %vec.phi201
  %i35 = add i32 %index, %mul15
  %i36 = getelementptr inbounds float, ptr %i2, i32 %i35
  %wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i36, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load
  %i39 = fadd fast <4 x float> %i38, %vec.phi199
  %i40 = add i32 %index, %mul17
  %i41 = getelementptr inbounds float, ptr %i2, i32 %i40
  %wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i41, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load
  %i44 = fadd fast <4 x float> %i43, %vec.phi
  %i45 = select <4 x i1> %active.lane.mask, <4 x float> %i44, <4 x float> %vec.phi
  %i46 = select <4 x i1> %active.lane.mask, <4 x float> %i39, <4 x float> %vec.phi199
  %i47 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi200
  %i48 = select <4 x i1> %active.lane.mask, <4 x float> %i34, <4 x float> %vec.phi201
  %i49 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi202
  %i50 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi203
  %i51 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi204
  %i52 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi205
  %index.next = add i32 %index, 4
  %i53 = icmp eq i32 %index.next, %n.vec
  br i1 %i53, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i52)
  %i55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i51)
  %i56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i50)
  %i57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i49)
  %i58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i48)
  %i59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i47)
  %i60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i46)
  %i61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i45)
  %arrayidx63 = getelementptr inbounds float, ptr %pOut, i32 %k2.0198
  store float %i59, ptr %arrayidx63, align 4
  %arrayidx65 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i57, ptr %arrayidx65, align 4
  %arrayidx67 = getelementptr inbounds float, ptr %pOut, i32 %add6
  store float %i55, ptr %arrayidx67, align 4
  %arrayidx69 = getelementptr inbounds float, ptr %pOut, i32 %add8
  store float %i54, ptr %arrayidx69, align 4
  %arrayidx71 = getelementptr inbounds float, ptr %pOut, i32 %add10
  store float %i56, ptr %arrayidx71, align 4
  %arrayidx73 = getelementptr inbounds float, ptr %pOut, i32 %add12
  store float %i58, ptr %arrayidx73, align 4
  %arrayidx75 = getelementptr inbounds float, ptr %pOut, i32 %add14
  store float %i60, ptr %arrayidx75, align 4
  %arrayidx77 = getelementptr inbounds float, ptr %pOut, i32 %add16
  store float %i61, ptr %arrayidx77, align 4
  %add79 = add i32 %k2.0198, 8
  %cmp3 = icmp ult i32 %add79, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

declare void @llvm.assume(i1 noundef)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)