1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s 3 4%struct.DCT_InstanceTypeDef = type { ptr, i32, i32 } 5 6define void @DCT_mve1(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) { 7; CHECK-LABEL: DCT_mve1: 8; CHECK: @ %bb.0: @ %entry 9; CHECK-NEXT: ldr r3, [r0, #4] 10; CHECK-NEXT: sub.w r12, r3, #1 11; CHECK-NEXT: cmp.w r12, #2 12; CHECK-NEXT: it lo 13; CHECK-NEXT: bxlo lr 14; CHECK-NEXT: .LBB0_1: @ %for.body.preheader 15; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr} 16; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} 17; CHECK-NEXT: ldr r5, [r0, #8] 18; CHECK-NEXT: ldr r3, [r0] 19; CHECK-NEXT: add.w r3, r3, r5, lsl #2 20; CHECK-NEXT: movs r0, #1 21; CHECK-NEXT: lsl.w r9, r5, #2 22; CHECK-NEXT: .LBB0_2: @ %for.body 23; CHECK-NEXT: @ =>This Loop Header: Depth=1 24; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 25; CHECK-NEXT: vmov.i32 q0, #0x0 26; CHECK-NEXT: mov r6, r1 27; CHECK-NEXT: mov r7, r3 28; CHECK-NEXT: dlstp.32 lr, r5 29; CHECK-NEXT: .LBB0_3: @ %vector.body 30; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 31; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 32; CHECK-NEXT: vldrw.u32 q1, [r6], #16 33; CHECK-NEXT: vldrw.u32 q2, [r7], #16 34; CHECK-NEXT: vfma.f32 q0, q2, q1 35; CHECK-NEXT: letp lr, .LBB0_3 36; CHECK-NEXT: @ %bb.4: @ %middle.block 37; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 38; CHECK-NEXT: vadd.f32 s2, s2, s3 39; CHECK-NEXT: add.w r7, r2, r0, lsl #2 40; CHECK-NEXT: vadd.f32 s0, s0, s1 41; CHECK-NEXT: adds r0, #1 42; CHECK-NEXT: add r3, r9 43; CHECK-NEXT: cmp r0, r12 44; CHECK-NEXT: vadd.f32 s0, s0, s2 45; CHECK-NEXT: vstr s0, [r7] 46; CHECK-NEXT: bne .LBB0_2 47; CHECK-NEXT: @ %bb.5: 48; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, lr} 49; CHECK-NEXT: bx lr 50entry: 51 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2 52 %i = load i32, ptr %NumInputs, align 4 53 
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1 54 %i1 = load i32, ptr %NumFilters, align 4 55 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0 56 %i2 = load ptr, ptr %pDCTCoefs, align 4 57 %cmp = icmp ugt i32 %i, 1 58 tail call void @llvm.assume(i1 %cmp) 59 %sub = add i32 %i1, -1 60 %cmp350 = icmp ugt i32 %sub, 1 61 br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup 62 63for.body.preheader: ; preds = %entry 64 %n.rnd.up = add i32 %i, 3 65 %n.vec = and i32 %n.rnd.up, -4 66 br label %for.body 67 68for.cond.cleanup: ; preds = %middle.block, %entry 69 ret void 70 71for.body: ; preds = %middle.block, %for.body.preheader 72 %k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ] 73 %mul4 = mul i32 %k2.051, %i 74 br label %vector.body 75 76vector.body: ; preds = %vector.body, %for.body 77 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 78 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i10, %vector.body ] 79 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i) 80 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index 81 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 82 %i5 = add i32 %index, %mul4 83 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5 84 %wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 85 %i8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load 86 %i9 = fadd fast <4 x float> %i8, %vec.phi 87 %i10 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi 88 %index.next = add i32 %index, 4 89 %i11 = icmp eq i32 %index.next, %n.vec 90 br i1 %i11, label %middle.block, label %vector.body 91 92middle.block: ; preds = %vector.body 93 %i12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 
0.000000e+00, <4 x float> %i10) 94 %arrayidx14 = getelementptr inbounds float, ptr %pOut, i32 %k2.051 95 store float %i12, ptr %arrayidx14, align 4 96 %add16 = add nuw i32 %k2.051, 1 97 %exitcond52.not = icmp eq i32 %add16, %sub 98 br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body 99} 100 101define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) { 102; CHECK-LABEL: DCT_mve2: 103; CHECK: @ %bb.0: @ %entry 104; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 105; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 106; CHECK-NEXT: .pad #4 107; CHECK-NEXT: sub sp, #4 108; CHECK-NEXT: str r1, [sp] @ 4-byte Spill 109; CHECK-NEXT: ldr r1, [r0, #4] 110; CHECK-NEXT: subs r1, #2 111; CHECK-NEXT: cmp r1, #2 112; CHECK-NEXT: blo .LBB1_5 113; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 114; CHECK-NEXT: ldr.w r12, [r0, #8] 115; CHECK-NEXT: movs r4, #1 116; CHECK-NEXT: ldr r3, [r0] 117; CHECK-NEXT: add.w r11, r3, r12, lsl #2 118; CHECK-NEXT: add.w r7, r3, r12, lsl #3 119; CHECK-NEXT: lsl.w r9, r12, #3 120; CHECK-NEXT: .LBB1_2: @ %for.body 121; CHECK-NEXT: @ =>This Loop Header: Depth=1 122; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 123; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload 124; CHECK-NEXT: vmov.i32 q0, #0x0 125; CHECK-NEXT: add.w r10, r4, #1 126; CHECK-NEXT: mov r3, r11 127; CHECK-NEXT: mov r0, r7 128; CHECK-NEXT: vmov q1, q0 129; CHECK-NEXT: dlstp.32 lr, r12 130; CHECK-NEXT: .LBB1_3: @ %vector.body 131; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1 132; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 133; CHECK-NEXT: vldrw.u32 q2, [r5], #16 134; CHECK-NEXT: vldrw.u32 q3, [r3], #16 135; CHECK-NEXT: vfma.f32 q1, q3, q2 136; CHECK-NEXT: vldrw.u32 q3, [r0], #16 137; CHECK-NEXT: vfma.f32 q0, q3, q2 138; CHECK-NEXT: letp lr, .LBB1_3 139; CHECK-NEXT: @ %bb.4: @ %middle.block 140; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 141; CHECK-NEXT: vadd.f32 s2, s2, s3 142; CHECK-NEXT: add.w r0, r2, r10, lsl #2 143; CHECK-NEXT: 
vadd.f32 s0, s0, s1 144; CHECK-NEXT: add r11, r9 145; CHECK-NEXT: vadd.f32 s6, s6, s7 146; CHECK-NEXT: add r7, r9 147; CHECK-NEXT: vadd.f32 s4, s4, s5 148; CHECK-NEXT: vadd.f32 s0, s0, s2 149; CHECK-NEXT: vadd.f32 s2, s4, s6 150; CHECK-NEXT: vstr s0, [r0] 151; CHECK-NEXT: add.w r0, r2, r4, lsl #2 152; CHECK-NEXT: adds r4, #2 153; CHECK-NEXT: cmp r4, r1 154; CHECK-NEXT: vstr s2, [r0] 155; CHECK-NEXT: blo .LBB1_2 156; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup 157; CHECK-NEXT: add sp, #4 158; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 159entry: 160 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2 161 %i = load i32, ptr %NumInputs, align 4 162 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1 163 %i1 = load i32, ptr %NumFilters, align 4 164 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0 165 %i2 = load ptr, ptr %pDCTCoefs, align 4 166 %cmp = icmp ugt i32 %i, 1 167 tail call void @llvm.assume(i1 %cmp) 168 %sub = add i32 %i1, -2 169 %cmp371 = icmp ugt i32 %sub, 1 170 br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup 171 172for.body.preheader: ; preds = %entry 173 %n.rnd.up = add i32 %i, 3 174 %n.vec = and i32 %n.rnd.up, -4 175 br label %for.body 176 177for.cond.cleanup: ; preds = %middle.block, %entry 178 ret void 179 180for.body: ; preds = %middle.block, %for.body.preheader 181 %k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ] 182 %mul4 = mul i32 %k2.072, %i 183 %add = add nuw i32 %k2.072, 1 184 %mul5 = mul i32 %add, %i 185 br label %vector.body 186 187vector.body: ; preds = %vector.body, %for.body 188 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 189 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i15, %vector.body ] 190 %vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i16, %vector.body ] 191 %active.lane.mask = call <4 x i1> 
@llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i) 192 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index 193 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 194 %i5 = add i32 %index, %mul4 195 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5 196 %wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 197 %i8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load 198 %i9 = fadd fast <4 x float> %i8, %vec.phi73 199 %i10 = add i32 %index, %mul5 200 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10 201 %wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 202 %i13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load 203 %i14 = fadd fast <4 x float> %i13, %vec.phi 204 %i15 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi 205 %i16 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi73 206 %index.next = add i32 %index, 4 207 %i17 = icmp eq i32 %index.next, %n.vec 208 br i1 %i17, label %middle.block, label %vector.body 209 210middle.block: ; preds = %vector.body 211 %i18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i16) 212 %i19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i15) 213 %arrayidx21 = getelementptr inbounds float, ptr %pOut, i32 %k2.072 214 store float %i18, ptr %arrayidx21, align 4 215 %arrayidx23 = getelementptr inbounds float, ptr %pOut, i32 %add 216 store float %i19, ptr %arrayidx23, align 4 217 %add25 = add i32 %k2.072, 2 218 %cmp3 = icmp ult i32 %add25, %sub 219 br i1 %cmp3, label %for.body, label %for.cond.cleanup 220} 221 222define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) { 223; CHECK-LABEL: DCT_mve3: 224; CHECK: @ %bb.0: 
@ %entry 225; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 226; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 227; CHECK-NEXT: .pad #4 228; CHECK-NEXT: sub sp, #4 229; CHECK-NEXT: .vsave {d8, d9} 230; CHECK-NEXT: vpush {d8, d9} 231; CHECK-NEXT: .pad #24 232; CHECK-NEXT: sub sp, #24 233; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill 234; CHECK-NEXT: ldr r1, [r0, #4] 235; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill 236; CHECK-NEXT: subs r1, #3 237; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill 238; CHECK-NEXT: cmp r1, #2 239; CHECK-NEXT: blo .LBB2_5 240; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 241; CHECK-NEXT: ldr r3, [r0, #8] 242; CHECK-NEXT: movs r5, #1 243; CHECK-NEXT: ldr r1, [r0] 244; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill 245; CHECK-NEXT: add.w r0, r3, r3, lsl #1 246; CHECK-NEXT: add.w r9, r1, r3, lsl #2 247; CHECK-NEXT: add.w r12, r1, r3, lsl #3 248; CHECK-NEXT: adds r3, #3 249; CHECK-NEXT: bic r3, r3, #3 250; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload 251; CHECK-NEXT: add.w r10, r1, r0, lsl #2 252; CHECK-NEXT: subs r3, #4 253; CHECK-NEXT: lsl.w r11, r0, #2 254; CHECK-NEXT: add.w r1, r5, r3, lsr #2 255; CHECK-NEXT: str r1, [sp] @ 4-byte Spill 256; CHECK-NEXT: .LBB2_2: @ %for.body 257; CHECK-NEXT: @ =>This Loop Header: Depth=1 258; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 259; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload 260; CHECK-NEXT: vmov.i32 q0, #0x0 261; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload 262; CHECK-NEXT: adds r0, r5, #2 263; CHECK-NEXT: adds r2, r5, #1 264; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill 265; CHECK-NEXT: mov r3, r9 266; CHECK-NEXT: mov r0, r12 267; CHECK-NEXT: mov r4, r10 268; CHECK-NEXT: vmov q2, q0 269; CHECK-NEXT: vmov q1, q0 270; CHECK-NEXT: dlstp.32 lr, r7 271; CHECK-NEXT: .LBB2_3: @ %vector.body 272; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 273; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 274; CHECK-NEXT: vldrw.u32 q3, [r6], #16 275; CHECK-NEXT: vldrw.u32 q4, [r3], #16 276; CHECK-NEXT: vfma.f32 
q1, q4, q3 277; CHECK-NEXT: vldrw.u32 q4, [r0], #16 278; CHECK-NEXT: vfma.f32 q2, q4, q3 279; CHECK-NEXT: vldrw.u32 q4, [r4], #16 280; CHECK-NEXT: vfma.f32 q0, q4, q3 281; CHECK-NEXT: letp lr, .LBB2_3 282; CHECK-NEXT: @ %bb.4: @ %middle.block 283; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 284; CHECK-NEXT: vadd.f32 s10, s10, s11 285; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload 286; CHECK-NEXT: vadd.f32 s8, s8, s9 287; CHECK-NEXT: add r9, r11 288; CHECK-NEXT: vadd.f32 s6, s6, s7 289; CHECK-NEXT: add.w r0, r1, r2, lsl #2 290; CHECK-NEXT: vadd.f32 s4, s4, s5 291; CHECK-NEXT: add r12, r11 292; CHECK-NEXT: vadd.f32 s2, s2, s3 293; CHECK-NEXT: add r10, r11 294; CHECK-NEXT: vadd.f32 s0, s0, s1 295; CHECK-NEXT: vadd.f32 s8, s8, s10 296; CHECK-NEXT: vadd.f32 s4, s4, s6 297; CHECK-NEXT: vadd.f32 s0, s0, s2 298; CHECK-NEXT: vstr s8, [r0] 299; CHECK-NEXT: add.w r0, r1, r5, lsl #2 300; CHECK-NEXT: adds r5, #3 301; CHECK-NEXT: vstr s4, [r0] 302; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload 303; CHECK-NEXT: add.w r0, r1, r0, lsl #2 304; CHECK-NEXT: vstr s0, [r0] 305; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload 306; CHECK-NEXT: cmp r5, r0 307; CHECK-NEXT: blo .LBB2_2 308; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup 309; CHECK-NEXT: add sp, #24 310; CHECK-NEXT: vpop {d8, d9} 311; CHECK-NEXT: add sp, #4 312; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 313entry: 314 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2 315 %i = load i32, ptr %NumInputs, align 4 316 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1 317 %i1 = load i32, ptr %NumFilters, align 4 318 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0 319 %i2 = load ptr, ptr %pDCTCoefs, align 4 320 %cmp = icmp ugt i32 %i, 1 321 tail call void @llvm.assume(i1 %cmp) 322 %sub = add i32 %i1, -3 323 %cmp392 = icmp ugt i32 %sub, 1 324 br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup 325 
326for.body.preheader: ; preds = %entry 327 %n.rnd.up = add i32 %i, 3 328 %n.vec = and i32 %n.rnd.up, -4 329 br label %for.body 330 331for.cond.cleanup: ; preds = %middle.block, %entry 332 ret void 333 334for.body: ; preds = %middle.block, %for.body.preheader 335 %k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ] 336 %mul4 = mul i32 %k2.093, %i 337 %add = add nuw i32 %k2.093, 1 338 %mul5 = mul i32 %add, %i 339 %add6 = add i32 %k2.093, 2 340 %mul7 = mul i32 %add6, %i 341 br label %vector.body 342 343vector.body: ; preds = %vector.body, %for.body 344 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 345 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i20, %vector.body ] 346 %vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i21, %vector.body ] 347 %vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i22, %vector.body ] 348 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i) 349 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index 350 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 351 %i5 = add i32 %index, %mul4 352 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5 353 %wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 354 %i8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load 355 %i9 = fadd fast <4 x float> %i8, %vec.phi95 356 %i10 = add i32 %index, %mul5 357 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10 358 %wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 359 %i13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load 360 %i14 = fadd fast <4 x float> %i13, %vec.phi94 361 %i15 = add i32 %index, %mul7 362 %i16 = getelementptr inbounds float, ptr %i2, i32 %i15 363 %wide.masked.load98 = 
call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 364 %i18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load 365 %i19 = fadd fast <4 x float> %i18, %vec.phi 366 %i20 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi 367 %i21 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi94 368 %i22 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi95 369 %index.next = add i32 %index, 4 370 %i23 = icmp eq i32 %index.next, %n.vec 371 br i1 %i23, label %middle.block, label %vector.body 372 373middle.block: ; preds = %vector.body 374 %i24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i22) 375 %i25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i21) 376 %i26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i20) 377 %arrayidx28 = getelementptr inbounds float, ptr %pOut, i32 %k2.093 378 store float %i24, ptr %arrayidx28, align 4 379 %arrayidx30 = getelementptr inbounds float, ptr %pOut, i32 %add 380 store float %i25, ptr %arrayidx30, align 4 381 %arrayidx32 = getelementptr inbounds float, ptr %pOut, i32 %add6 382 store float %i26, ptr %arrayidx32, align 4 383 %add34 = add i32 %k2.093, 3 384 %cmp3 = icmp ult i32 %add34, %sub 385 br i1 %cmp3, label %for.body, label %for.cond.cleanup 386} 387 388define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) { 389; CHECK-LABEL: DCT_mve4: 390; CHECK: @ %bb.0: @ %entry 391; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 392; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 393; CHECK-NEXT: .pad #4 394; CHECK-NEXT: sub sp, #4 395; CHECK-NEXT: .vsave {d8, d9, d10, d11} 396; CHECK-NEXT: vpush {d8, d9, d10, d11} 397; CHECK-NEXT: .pad #40 398; CHECK-NEXT: sub sp, #40 399; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill 400; CHECK-NEXT: ldr r1, [r0, 
#4] 401; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill 402; CHECK-NEXT: subs r1, #4 403; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill 404; CHECK-NEXT: cmp r1, #2 405; CHECK-NEXT: blo.w .LBB3_5 406; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 407; CHECK-NEXT: ldr r2, [r0, #8] 408; CHECK-NEXT: movs r6, #1 409; CHECK-NEXT: ldr r1, [r0] 410; CHECK-NEXT: add.w r0, r2, r2, lsl #1 411; CHECK-NEXT: add.w r12, r1, r2, lsl #2 412; CHECK-NEXT: add.w r8, r1, r2, lsl #3 413; CHECK-NEXT: add.w r9, r1, r2, lsl #4 414; CHECK-NEXT: add.w r11, r1, r0, lsl #2 415; CHECK-NEXT: adds r0, r2, #3 416; CHECK-NEXT: bic r0, r0, #3 417; CHECK-NEXT: subs r0, #4 418; CHECK-NEXT: add.w r0, r6, r0, lsr #2 419; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill 420; CHECK-NEXT: lsls r0, r2, #4 421; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload 422; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill 423; CHECK-NEXT: .LBB3_2: @ %for.body 424; CHECK-NEXT: @ =>This Loop Header: Depth=1 425; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 426; CHECK-NEXT: adds r0, r6, #3 427; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill 428; CHECK-NEXT: adds r0, r6, #2 429; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload 430; CHECK-NEXT: vmov.i32 q0, #0x0 431; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill 432; CHECK-NEXT: adds r0, r6, #1 433; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill 434; CHECK-NEXT: mov r3, r12 435; CHECK-NEXT: mov r0, r8 436; CHECK-NEXT: mov r5, r11 437; CHECK-NEXT: mov r4, r9 438; CHECK-NEXT: vmov q1, q0 439; CHECK-NEXT: vmov q2, q0 440; CHECK-NEXT: vmov q3, q0 441; CHECK-NEXT: dlstp.32 lr, r7 442; CHECK-NEXT: .LBB3_3: @ %vector.body 443; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1 444; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 445; CHECK-NEXT: vldrw.u32 q4, [r1], #16 446; CHECK-NEXT: vldrw.u32 q5, [r0], #16 447; CHECK-NEXT: vfma.f32 q3, q5, q4 448; CHECK-NEXT: vldrw.u32 q5, [r3], #16 449; CHECK-NEXT: vfma.f32 q2, q5, q4 450; CHECK-NEXT: vldrw.u32 q5, [r5], #16 451; CHECK-NEXT: vfma.f32 q1, q5, q4 
452; CHECK-NEXT: vldrw.u32 q5, [r4], #16 453; CHECK-NEXT: vfma.f32 q0, q5, q4 454; CHECK-NEXT: letp lr, .LBB3_3 455; CHECK-NEXT: @ %bb.4: @ %middle.block 456; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 457; CHECK-NEXT: vadd.f32 s14, s14, s15 458; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload 459; CHECK-NEXT: vadd.f32 s12, s12, s13 460; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload 461; CHECK-NEXT: vadd.f32 s10, s10, s11 462; CHECK-NEXT: vadd.f32 s8, s8, s9 463; CHECK-NEXT: add.w r0, r1, r0, lsl #2 464; CHECK-NEXT: vadd.f32 s6, s6, s7 465; CHECK-NEXT: vadd.f32 s4, s4, s5 466; CHECK-NEXT: vadd.f32 s2, s2, s3 467; CHECK-NEXT: vadd.f32 s0, s0, s1 468; CHECK-NEXT: vadd.f32 s12, s12, s14 469; CHECK-NEXT: vadd.f32 s8, s8, s10 470; CHECK-NEXT: vadd.f32 s4, s4, s6 471; CHECK-NEXT: vadd.f32 s0, s0, s2 472; CHECK-NEXT: vstr s12, [r0] 473; CHECK-NEXT: add.w r0, r1, r6, lsl #2 474; CHECK-NEXT: adds r6, #4 475; CHECK-NEXT: vstr s8, [r0] 476; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload 477; CHECK-NEXT: add.w r0, r1, r0, lsl #2 478; CHECK-NEXT: vstr s4, [r0] 479; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload 480; CHECK-NEXT: add.w r0, r1, r0, lsl #2 481; CHECK-NEXT: vstr s0, [r0] 482; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload 483; CHECK-NEXT: add r12, r0 484; CHECK-NEXT: add r8, r0 485; CHECK-NEXT: add r11, r0 486; CHECK-NEXT: add r9, r0 487; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload 488; CHECK-NEXT: cmp r6, r0 489; CHECK-NEXT: blo .LBB3_2 490; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup 491; CHECK-NEXT: add sp, #40 492; CHECK-NEXT: vpop {d8, d9, d10, d11} 493; CHECK-NEXT: add sp, #4 494; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 495entry: 496 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2 497 %i = load i32, ptr %NumInputs, align 4 498 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1 499 %i1 = load i32, ptr %NumFilters, align 4 500 %pDCTCoefs = getelementptr inbounds 
%struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0 501 %i2 = load ptr, ptr %pDCTCoefs, align 4 502 %cmp = icmp ugt i32 %i, 1 503 tail call void @llvm.assume(i1 %cmp) 504 %sub = add i32 %i1, -4 505 %cmp3113 = icmp ugt i32 %sub, 1 506 br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup 507 508for.body.preheader: ; preds = %entry 509 %n.rnd.up = add i32 %i, 3 510 %n.vec = and i32 %n.rnd.up, -4 511 br label %for.body 512 513for.cond.cleanup: ; preds = %middle.block, %entry 514 ret void 515 516for.body: ; preds = %middle.block, %for.body.preheader 517 %k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ] 518 %mul4 = mul i32 %k2.0114, %i 519 %add = add nuw nsw i32 %k2.0114, 1 520 %mul5 = mul i32 %add, %i 521 %add6 = add nuw nsw i32 %k2.0114, 2 522 %mul7 = mul i32 %add6, %i 523 %add8 = add i32 %k2.0114, 3 524 %mul9 = mul i32 %add8, %i 525 br label %vector.body 526 527vector.body: ; preds = %vector.body, %for.body 528 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 529 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i25, %vector.body ] 530 %vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i26, %vector.body ] 531 %vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i27, %vector.body ] 532 %vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i28, %vector.body ] 533 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i) 534 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index 535 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 536 %i5 = add i32 %index, %mul4 537 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5 538 %wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 539 %i8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load 540 %i9 = fadd fast <4 x float> %i8, 
%vec.phi116 541 %i10 = add i32 %index, %mul5 542 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10 543 %wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 544 %i13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load 545 %i14 = fadd fast <4 x float> %i13, %vec.phi117 546 %i15 = add i32 %index, %mul7 547 %i16 = getelementptr inbounds float, ptr %i2, i32 %i15 548 %wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 549 %i18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load 550 %i19 = fadd fast <4 x float> %i18, %vec.phi115 551 %i20 = add i32 %index, %mul9 552 %i21 = getelementptr inbounds float, ptr %i2, i32 %i20 553 %wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 554 %i23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load 555 %i24 = fadd fast <4 x float> %i23, %vec.phi 556 %i25 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi 557 %i26 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi115 558 %i27 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi116 559 %i28 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi117 560 %index.next = add i32 %index, 4 561 %i29 = icmp eq i32 %index.next, %n.vec 562 br i1 %i29, label %middle.block, label %vector.body 563 564middle.block: ; preds = %vector.body 565 %i30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i28) 566 %i31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i27) 567 %i32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i26) 568 %i33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i25) 569 %arrayidx35 = 
getelementptr inbounds float, ptr %pOut, i32 %k2.0114 570 store float %i31, ptr %arrayidx35, align 4 571 %arrayidx37 = getelementptr inbounds float, ptr %pOut, i32 %add 572 store float %i30, ptr %arrayidx37, align 4 573 %arrayidx39 = getelementptr inbounds float, ptr %pOut, i32 %add6 574 store float %i32, ptr %arrayidx39, align 4 575 %arrayidx41 = getelementptr inbounds float, ptr %pOut, i32 %add8 576 store float %i33, ptr %arrayidx41, align 4 577 %add43 = add i32 %k2.0114, 4 578 %cmp3 = icmp ult i32 %add43, %sub 579 br i1 %cmp3, label %for.body, label %for.cond.cleanup 580} 581 582define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) { 583; CHECK-LABEL: DCT_mve5: 584; CHECK: @ %bb.0: @ %entry 585; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 586; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 587; CHECK-NEXT: .pad #4 588; CHECK-NEXT: sub sp, #4 589; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 590; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 591; CHECK-NEXT: .pad #32 592; CHECK-NEXT: sub sp, #32 593; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill 594; CHECK-NEXT: ldr r1, [r0, #4] 595; CHECK-NEXT: subs r1, #5 596; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill 597; CHECK-NEXT: cmp r1, #2 598; CHECK-NEXT: blo.w .LBB4_5 599; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 600; CHECK-NEXT: ldr r3, [r0, #8] 601; CHECK-NEXT: ldr r1, [r0] 602; CHECK-NEXT: adds r0, r3, #3 603; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill 604; CHECK-NEXT: bic r0, r0, #3 605; CHECK-NEXT: add.w r8, r1, r3, lsl #2 606; CHECK-NEXT: subs r1, r0, #4 607; CHECK-NEXT: movs r0, #1 608; CHECK-NEXT: lsls r5, r3, #2 609; CHECK-NEXT: add.w r1, r0, r1, lsr #2 610; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill 611; CHECK-NEXT: add.w r1, r3, r3, lsl #2 612; CHECK-NEXT: lsls r1, r1, #2 613; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill 614; CHECK-NEXT: .LBB4_2: @ %for.body 615; CHECK-NEXT: @ =>This Loop Header: Depth=1 616; CHECK-NEXT: @ Child 
Loop BB4_3 Depth 2 617; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload 618; CHECK-NEXT: adds r1, r0, #4 619; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload 620; CHECK-NEXT: vmov.i32 q1, #0x0 621; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload 622; CHECK-NEXT: add.w r10, r0, #2 623; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill 624; CHECK-NEXT: adds r1, r0, #3 625; CHECK-NEXT: add.w r11, r0, #1 626; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill 627; CHECK-NEXT: mov r3, r8 628; CHECK-NEXT: vmov q0, q1 629; CHECK-NEXT: vmov q3, q1 630; CHECK-NEXT: vmov q2, q1 631; CHECK-NEXT: vmov q4, q1 632; CHECK-NEXT: dlstp.32 lr, r7 633; CHECK-NEXT: .LBB4_3: @ %vector.body 634; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 635; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 636; CHECK-NEXT: add.w r9, r3, r5 637; CHECK-NEXT: vldrw.u32 q5, [r4], #16 638; CHECK-NEXT: vldrw.u32 q6, [r3], #16 639; CHECK-NEXT: add.w r12, r9, r5 640; CHECK-NEXT: vfma.f32 q3, q6, q5 641; CHECK-NEXT: vldrw.u32 q6, [r9] 642; CHECK-NEXT: add.w r6, r12, r5 643; CHECK-NEXT: vfma.f32 q4, q6, q5 644; CHECK-NEXT: vldrw.u32 q6, [r12] 645; CHECK-NEXT: adds r7, r6, r5 646; CHECK-NEXT: vfma.f32 q2, q6, q5 647; CHECK-NEXT: vldrw.u32 q6, [r6] 648; CHECK-NEXT: vfma.f32 q0, q6, q5 649; CHECK-NEXT: vldrw.u32 q6, [r7] 650; CHECK-NEXT: vfma.f32 q1, q6, q5 651; CHECK-NEXT: letp lr, .LBB4_3 652; CHECK-NEXT: @ %bb.4: @ %middle.block 653; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1 654; CHECK-NEXT: vadd.f32 s18, s18, s19 655; CHECK-NEXT: add.w r1, r2, r11, lsl #2 656; CHECK-NEXT: vadd.f32 s16, s16, s17 657; CHECK-NEXT: vadd.f32 s14, s14, s15 658; CHECK-NEXT: vadd.f32 s12, s12, s13 659; CHECK-NEXT: vadd.f32 s6, s6, s7 660; CHECK-NEXT: vadd.f32 s4, s4, s5 661; CHECK-NEXT: vadd.f32 s10, s10, s11 662; CHECK-NEXT: vadd.f32 s8, s8, s9 663; CHECK-NEXT: vadd.f32 s0, s0, s1 664; CHECK-NEXT: vadd.f32 s1, s16, s18 665; CHECK-NEXT: vadd.f32 s2, s2, s3 666; CHECK-NEXT: vadd.f32 s12, s12, s14 667; CHECK-NEXT: vadd.f32 s4, s4, s6 668; CHECK-NEXT: 
vadd.f32 s6, s8, s10 669; CHECK-NEXT: vstr s1, [r1] 670; CHECK-NEXT: add.w r1, r2, r0, lsl #2 671; CHECK-NEXT: vadd.f32 s0, s0, s2 672; CHECK-NEXT: adds r0, #5 673; CHECK-NEXT: vstr s12, [r1] 674; CHECK-NEXT: add.w r1, r2, r10, lsl #2 675; CHECK-NEXT: vstr s6, [r1] 676; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload 677; CHECK-NEXT: add.w r1, r2, r1, lsl #2 678; CHECK-NEXT: vstr s0, [r1] 679; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload 680; CHECK-NEXT: add.w r1, r2, r1, lsl #2 681; CHECK-NEXT: vstr s4, [r1] 682; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload 683; CHECK-NEXT: add r8, r1 684; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload 685; CHECK-NEXT: cmp r0, r1 686; CHECK-NEXT: blo.w .LBB4_2 687; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup 688; CHECK-NEXT: add sp, #32 689; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 690; CHECK-NEXT: add sp, #4 691; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 692entry: 693 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2 694 %i = load i32, ptr %NumInputs, align 4 695 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1 696 %i1 = load i32, ptr %NumFilters, align 4 697 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0 698 %i2 = load ptr, ptr %pDCTCoefs, align 4 699 %cmp = icmp ugt i32 %i, 1 700 tail call void @llvm.assume(i1 %cmp) 701 %sub = add i32 %i1, -5 702 %cmp3134 = icmp ugt i32 %sub, 1 703 br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup 704 705for.body.preheader: ; preds = %entry 706 %n.rnd.up = add i32 %i, 3 707 %n.vec = and i32 %n.rnd.up, -4 708 br label %for.body 709 710for.cond.cleanup: ; preds = %middle.block, %entry 711 ret void 712 713for.body: ; preds = %middle.block, %for.body.preheader 714 %k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ] 715 %mul4 = mul i32 %k2.0135, %i 716 %add = add nuw i32 %k2.0135, 1 717 %mul5 = mul i32 %add, %i 718 %add6 = add i32 
%k2.0135, 2 719 %mul7 = mul i32 %add6, %i 720 %add8 = add i32 %k2.0135, 3 721 %mul9 = mul i32 %add8, %i 722 %add10 = add i32 %k2.0135, 4 723 %mul11 = mul i32 %add10, %i 724 br label %vector.body 725 726vector.body: ; preds = %vector.body, %for.body 727 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 728 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i30, %vector.body ] 729 %vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i31, %vector.body ] 730 %vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i32, %vector.body ] 731 %vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i33, %vector.body ] 732 %vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i34, %vector.body ] 733 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i) 734 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index 735 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 736 %i5 = add i32 %index, %mul4 737 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5 738 %wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 739 %i8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load 740 %i9 = fadd fast <4 x float> %i8, %vec.phi137 741 %i10 = add i32 %index, %mul5 742 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10 743 %wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 744 %i13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load 745 %i14 = fadd fast <4 x float> %i13, %vec.phi139 746 %i15 = add i32 %index, %mul7 747 %i16 = getelementptr inbounds float, ptr %i2, i32 %i15 748 %wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 749 %i18 = fmul fast <4 x 
float> %wide.masked.load142, %wide.masked.load 750 %i19 = fadd fast <4 x float> %i18, %vec.phi138 751 %i20 = add i32 %index, %mul9 752 %i21 = getelementptr inbounds float, ptr %i2, i32 %i20 753 %wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 754 %i23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load 755 %i24 = fadd fast <4 x float> %i23, %vec.phi136 756 %i25 = add i32 %index, %mul11 757 %i26 = getelementptr inbounds float, ptr %i2, i32 %i25 758 %wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 759 %i28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load 760 %i29 = fadd fast <4 x float> %i28, %vec.phi 761 %i30 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi 762 %i31 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi136 763 %i32 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi137 764 %i33 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi138 765 %i34 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi139 766 %index.next = add i32 %index, 4 767 %i35 = icmp eq i32 %index.next, %n.vec 768 br i1 %i35, label %middle.block, label %vector.body 769 770middle.block: ; preds = %vector.body 771 %i36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i34) 772 %i37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i33) 773 %i38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i32) 774 %i39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i31) 775 %i40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i30) 776 %arrayidx42 = getelementptr inbounds float, ptr %pOut, i32 %k2.0135 777 store float %i38, ptr 
%arrayidx42, align 4
  %arrayidx44 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i36, ptr %arrayidx44, align 4
  %arrayidx46 = getelementptr inbounds float, ptr %pOut, i32 %add6
  store float %i37, ptr %arrayidx46, align 4
  %arrayidx48 = getelementptr inbounds float, ptr %pOut, i32 %add8
  store float %i39, ptr %arrayidx48, align 4
  %arrayidx50 = getelementptr inbounds float, ptr %pOut, i32 %add10
  store float %i40, ptr %arrayidx50, align 4
  %add52 = add i32 %k2.0135, 5
  %cmp3 = icmp ult i32 %add52, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

; DCT_mve6: same tail-predicated DCT dot-product kernel as the preceding
; DCT_mve* tests, with the outer filter loop unrolled by 6 (six <4 x float>
; reduction accumulators per outer iteration; k2 advances by 6).
; NOTE(review): CHECK lines below are autogenerated by
; utils/update_llc_test_checks.py -- regenerate, do not hand-edit.
define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve6:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #6
; CHECK-NEXT:    str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB5_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r8, r1, r3, lsl #2
; CHECK-NEXT:    subs r1, r0, #4
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsls r5, r3, #2
; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    add.w r1, r3, r3, lsl #1
; CHECK-NEXT:    lsls r1, r1, #3
; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
; CHECK-NEXT:  .LBB5_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB5_3 Depth 2
; CHECK-NEXT:    adds r1, r0, #5
; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #4
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    vmov.i32 q1, #0x0
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    add.w r11, r0, #2
; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    adds r4, r0, #1
; CHECK-NEXT:    mov r3, r8
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vmov q4, q1
; CHECK-NEXT:    vmov q0, q1
; CHECK-NEXT:    vmov q5, q1
; CHECK-NEXT:    vmov q2, q1
; CHECK-NEXT:    dlstp.32 lr, r7
; CHECK-NEXT:  .LBB5_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    add.w r12, r3, r5
; CHECK-NEXT:    vldrw.u32 q6, [r1], #16
; CHECK-NEXT:    vldrw.u32 q7, [r3], #16
; CHECK-NEXT:    add.w r10, r12, r5
; CHECK-NEXT:    vfma.f32 q4, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r12]
; CHECK-NEXT:    add.w r6, r10, r5
; CHECK-NEXT:    vfma.f32 q5, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r10]
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vfma.f32 q2, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r6]
; CHECK-NEXT:    adds r6, r7, r5
; CHECK-NEXT:    vfma.f32 q0, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r7]
; CHECK-NEXT:    vfma.f32 q3, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [r6]
; CHECK-NEXT:    vfma.f32 q1, q7, q6
; CHECK-NEXT:    letp lr, .LBB5_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB5_2 Depth=1
; CHECK-NEXT:    vadd.f32 s22, s22, s23
; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
; CHECK-NEXT:    vadd.f32 s20, s20, s21
; CHECK-NEXT:    vadd.f32 s18, s18, s19
; CHECK-NEXT:    vadd.f32 s16, s16, s17
; CHECK-NEXT:    vadd.f32 s10, s10, s11
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s0, s0, s1
; CHECK-NEXT:    vadd.f32 s2, s2, s3
; CHECK-NEXT:    vadd.f32 s1, s20, s22
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    vadd.f32 s3, s16, s18
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s8, s8, s10
; CHECK-NEXT:    vadd.f32 s14, s14, s15
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vstr s1, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s0, s0, s2
; CHECK-NEXT:    adds r0, #6
; CHECK-NEXT:    vstr s3, [r1]
; CHECK-NEXT:    add.w r1, r2, r11, lsl #2
; CHECK-NEXT:    vadd.f32 s4, s4, s6
; CHECK-NEXT:    vstr s8, [r1]
; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s6, s12, s14
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT:    add r8, r1
; CHECK-NEXT:    ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB5_2
; CHECK-NEXT:  .LBB5_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -6
  %cmp3155 = icmp ugt i32 %sub, 1
  br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0156, %i
  %add = add nuw i32 %k2.0156, 1
  %mul5 = mul i32 %add, %i
  %add6 = add i32 %k2.0156, 2
  %mul7 = mul i32 %add6, %i
  %add8 = add i32 %k2.0156, 3
  %mul9 = mul i32 %add8, %i
  %add10 = add i32 %k2.0156, 4
  %mul11 = mul i32 %add10, %i
  %add12 = add i32 %k2.0156, 5
  %mul13 = mul i32 %add12, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i35, %vector.body ]
  %vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i36, %vector.body ]
  %vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i37, %vector.body ]
  %vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i38, %vector.body ]
  %vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i39, %vector.body ]
  %vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i40, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi158
  %i10 = add i32 %index, %mul5
  %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
  %wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load
  %i14 = fadd fast <4 x float> %i13, %vec.phi160
  %i15 = add i32 %index, %mul7
  %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
  %wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load
  %i19 = fadd fast <4 x float> %i18, %vec.phi161
  %i20 = add i32 %index, %mul9
  %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
  %wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load
  %i24 = fadd fast <4 x float> %i23, %vec.phi159
  %i25 = add i32 %index, %mul11
  %i26 = getelementptr inbounds float, ptr %i2, i32 %i25
  %wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load
  %i29 = fadd fast <4 x float> %i28, %vec.phi157
  %i30 = add i32 %index, %mul13
  %i31 = getelementptr inbounds float, ptr %i2, i32 %i30
  %wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i31, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load
  %i34 = fadd fast <4 x float> %i33, %vec.phi
  %i35 = select <4 x i1> %active.lane.mask, <4 x float> %i34, <4 x float> %vec.phi
  %i36 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi157
  %i37 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi158
  %i38 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi159
  %i39 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi160
  %i40 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi161
  %index.next = add i32 %index, 4
  %i41 = icmp eq i32 %index.next, %n.vec
  br i1 %i41, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i40)
  %i43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i39)
  %i44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i38)
  %i45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i37)
  %i46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i36)
  %i47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i35)
  %arrayidx49 = getelementptr inbounds float, ptr %pOut, i32 %k2.0156
  store float %i45, ptr %arrayidx49, align 4
  %arrayidx51 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i43, ptr %arrayidx51, align 4
  %arrayidx53 = getelementptr inbounds float, ptr %pOut, i32 %add6
  store float %i42, ptr %arrayidx53, align 4
  %arrayidx55 = getelementptr inbounds float, ptr %pOut, i32 %add8
  store float %i44, ptr %arrayidx55, align 4
  %arrayidx57 = getelementptr inbounds float, ptr %pOut, i32 %add10
  store float %i46, ptr %arrayidx57, align 4
  %arrayidx59 = getelementptr inbounds float, ptr %pOut, i32 %add12
  store float %i47, ptr %arrayidx59, align 4
  %add61 = add i32 %k2.0156, 6
  %cmp3 = icmp ult i32 %add61, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve7(ptr
nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve7:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #72
; CHECK-NEXT:    sub sp, #72
; CHECK-NEXT:    str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    ldr r1, [r0, #4]
; CHECK-NEXT:    subs r1, #7
; CHECK-NEXT:    str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT:    cmp r1, #2
; CHECK-NEXT:    blo.w .LBB6_5
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    ldr r3, [r0, #8]
; CHECK-NEXT:    ldr r1, [r0]
; CHECK-NEXT:    adds r0, r3, #3
; CHECK-NEXT:    str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    bic r0, r0, #3
; CHECK-NEXT:    add.w r9, r1, r3, lsl #2
; CHECK-NEXT:    subs r1, r0, #4
; CHECK-NEXT:    movs r0, #1
; CHECK-NEXT:    lsls r5, r3, #2
; CHECK-NEXT:    add.w r1, r0, r1, lsr #2
; CHECK-NEXT:    str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    rsb r1, r3, r3, lsl #3
; CHECK-NEXT:    lsls r1, r1, #2
; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:  .LBB6_2: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB6_3 Depth 2
; CHECK-NEXT:    adds r1, r0, #6
; CHECK-NEXT:    str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #5
; CHECK-NEXT:    str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #4
; CHECK-NEXT:    str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT:    adds r1, r0, #3
; CHECK-NEXT:    ldr r7, [sp, #12] @ 4-byte Reload
; CHECK-NEXT:    str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT:    vmov.i32 q2, #0x0
; CHECK-NEXT:    ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    adds r4, r0, #2
; CHECK-NEXT:    ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    add.w r8, r0, #1
; CHECK-NEXT:    mov r3, r9
; CHECK-NEXT:    vmov q4, q2
; CHECK-NEXT:    vmov q5, q2
; CHECK-NEXT:    vmov q3, q2
; CHECK-NEXT:    vmov q6, q2
; CHECK-NEXT:    vmov q1, q2
; CHECK-NEXT:    mov r12, r7
; CHECK-NEXT:    vstrw.32 q2, [sp, #56] @ 16-byte Spill
; CHECK-NEXT:    dls lr, r6
; CHECK-NEXT:  .LBB6_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vctp.32 r12
; CHECK-NEXT:    add.w r10, r3, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q7, [r1], #16
; CHECK-NEXT:    vldrwt.u32 q0, [r3], #16
; CHECK-NEXT:    add.w r11, r10, r5
; CHECK-NEXT:    sub.w r12, r12, #4
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q5, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r10]
; CHECK-NEXT:    add.w r6, r11, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q6, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r11]
; CHECK-NEXT:    vstrw.32 q6, [sp, #40] @ 16-byte Spill
; CHECK-NEXT:    vmov q6, q5
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q1, q0, q7
; CHECK-NEXT:    vmov q5, q4
; CHECK-NEXT:    vmov q4, q3
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [r6]
; CHECK-NEXT:    vldrw.u32 q1, [sp, #56] @ 16-byte Reload
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q1, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r7]
; CHECK-NEXT:    adds r6, r7, r5
; CHECK-NEXT:    vstrw.32 q1, [sp, #56] @ 16-byte Spill
; CHECK-NEXT:    vmov q1, q3
; CHECK-NEXT:    vmov q3, q4
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q3, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r6]
; CHECK-NEXT:    vmov q4, q5
; CHECK-NEXT:    adds r7, r6, r5
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vfmat.f32 q4, q0, q7
; CHECK-NEXT:    vldrwt.u32 q0, [r7]
; CHECK-NEXT:    vmov q5, q6
; CHECK-NEXT:    vldrw.u32 q6, [sp, #40] @ 16-byte Reload
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vfmat.f32 q2, q0, q7
; CHECK-NEXT:    le lr, .LBB6_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT:    vadd.f32 s0, s26, s27
; CHECK-NEXT:    add.w r1, r2, r8, lsl #2
; CHECK-NEXT:    vadd.f32 s2, s24, s25
; CHECK-NEXT:    vadd.f32 s1, s22, s23
; CHECK-NEXT:    vadd.f32 s3, s20, s21
; CHECK-NEXT:    vadd.f32 s6, s6, s7
; CHECK-NEXT:    vadd.f32 s4, s4, s5
; CHECK-NEXT:    vadd.f32 s10, s10, s11
; CHECK-NEXT:    vadd.f32 s8, s8, s9
; CHECK-NEXT:    vadd.f32 s0, s2, s0
; CHECK-NEXT:    vadd.f32 s9, s18, s19
; CHECK-NEXT:    vadd.f32 s11, s16, s17
; CHECK-NEXT:    vldrw.u32 q4, [sp, #56] @ 16-byte Reload
; CHECK-NEXT:    vadd.f32 s2, s3, s1
; CHECK-NEXT:    vadd.f32 s5, s18, s19
; CHECK-NEXT:    vadd.f32 s7, s16, s17
; CHECK-NEXT:    vadd.f32 s4, s4, s6
; CHECK-NEXT:    vstr s0, [r1]
; CHECK-NEXT:    add.w r1, r2, r0, lsl #2
; CHECK-NEXT:    vadd.f32 s14, s14, s15
; CHECK-NEXT:    adds r0, #7
; CHECK-NEXT:    vadd.f32 s12, s12, s13
; CHECK-NEXT:    vstr s2, [r1]
; CHECK-NEXT:    add.w r1, r2, r4, lsl #2
; CHECK-NEXT:    vadd.f32 s8, s8, s10
; CHECK-NEXT:    vadd.f32 s6, s7, s5
; CHECK-NEXT:    vstr s4, [r1]
; CHECK-NEXT:    vadd.f32 s10, s11, s9
; CHECK-NEXT:    ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT:    vadd.f32 s12, s12, s14
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s6, [r1]
; CHECK-NEXT:    ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s12, [r1]
; CHECK-NEXT:    ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s10, [r1]
; CHECK-NEXT:    ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT:    add.w r1, r2, r1, lsl #2
; CHECK-NEXT:    vstr s8, [r1]
; CHECK-NEXT:    ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    add r9, r1
; CHECK-NEXT:    ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT:    cmp r0, r1
; CHECK-NEXT:    blo.w .LBB6_2
; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #72
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2
  %i = load i32, ptr %NumInputs, align 4
  %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1
  %i1 = load i32, ptr %NumFilters, align 4
  %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0
  %i2 = load ptr, ptr %pDCTCoefs, align 4
  %cmp = icmp ugt i32 %i, 1
  tail call void @llvm.assume(i1 %cmp)
  %sub = add i32 %i1, -7
  %cmp3176 = icmp ugt i32 %sub, 1
  br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %n.rnd.up = add i32 %i, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void

for.body:                                         ; preds = %middle.block, %for.body.preheader
  %k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ]
  %mul4 = mul i32 %k2.0177, %i
  %add = add nuw i32 %k2.0177, 1
  %mul5 = mul i32 %add, %i
  %add6 = add i32 %k2.0177, 2
  %mul7 = mul i32 %add6, %i
  %add8 = add i32 %k2.0177, 3
  %mul9 = mul i32 %add8, %i
  %add10 = add i32 %k2.0177, 4
  %mul11 = mul i32 %add10, %i
  %add12 = add i32 %k2.0177, 5
  %mul13 = mul i32 %add12, %i
  %add14 = add i32 %k2.0177, 6
  %mul15 = mul i32 %add14, %i
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
  %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i40, %vector.body ]
  %vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i41, %vector.body ]
  %vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i42, %vector.body ]
  %vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i43, %vector.body ]
  %vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i44, %vector.body ]
  %vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i45, %vector.body ]
  %vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i46, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i)
  %i3 = getelementptr inbounds float, ptr %pIn, i32 %index
  %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i5 = add i32 %index, %mul4
  %i6 = getelementptr inbounds float, ptr %i2, i32 %i5
  %wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load
  %i9 = fadd fast <4 x float> %i8, %vec.phi179
  %i10 = add i32 %index, %mul5
  %i11 = getelementptr inbounds float, ptr %i2, i32 %i10
  %wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load
  %i14 = fadd fast <4 x float> %i13, %vec.phi181
  %i15 = add i32 %index, %mul7
  %i16 = getelementptr inbounds float, ptr %i2, i32 %i15
  %wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load
  %i19 = fadd fast <4 x float> %i18, %vec.phi183
  %i20 = add i32 %index, %mul9
  %i21 = getelementptr inbounds float, ptr %i2, i32 %i20
  %wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load
  %i24 = fadd fast <4 x float> %i23, %vec.phi182
  %i25 = add i32 %index, %mul11
  %i26 = getelementptr inbounds float, ptr %i2, i32 %i25
  %wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load
  %i29 = fadd fast <4 x float> %i28, %vec.phi180
  %i30 = add i32 %index, %mul13
  %i31 = getelementptr inbounds float, ptr %i2, i32 %i30
  %wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i31, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load
  %i34 = fadd fast <4 x float> %i33, %vec.phi178
  %i35 = add i32 %index, %mul15
  %i36 = getelementptr inbounds float, ptr %i2, i32 %i35
  %wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i36, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
  %i38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load
  %i39 = fadd fast <4 x float> %i38, %vec.phi
  %i40 = select <4 x i1> %active.lane.mask, <4 x float> %i39, <4 x float> %vec.phi
  %i41 = select <4 x i1> %active.lane.mask, <4 x float> %i34, <4 x float> %vec.phi178
  %i42 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi179
  %i43 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi180
  %i44 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi181
  %i45 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi182
  %i46 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi183
  %index.next = add i32 %index, 4
  %i47 = icmp eq i32 %index.next, %n.vec
  br i1 %i47, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %i48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i46)
  %i49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i45)
  %i50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i44)
  %i51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i43)
  %i52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i42)
  %i53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i41)
  %i54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i40)
  %arrayidx56 = getelementptr inbounds float, ptr %pOut, i32 %k2.0177
  store float %i52, ptr %arrayidx56, align 4
  %arrayidx58 = getelementptr inbounds float, ptr %pOut, i32 %add
  store float %i50, ptr %arrayidx58, align 4
  %arrayidx60 = getelementptr inbounds float, ptr %pOut, i32 %add6
  store float %i48, ptr %arrayidx60, align 4
  %arrayidx62 = getelementptr inbounds float, ptr %pOut, i32 %add8
  store float %i49, ptr %arrayidx62, align 4
  %arrayidx64 = getelementptr inbounds float, ptr %pOut, i32 %add10
  store float %i51, ptr %arrayidx64, align 4
  %arrayidx66 = getelementptr inbounds float, ptr %pOut, i32 %add12
  store float %i53, ptr %arrayidx66, align 4
  %arrayidx68 = getelementptr inbounds float, ptr %pOut, i32 %add14
  store float %i54, ptr %arrayidx68, align 4
  %add70 = add i32 %k2.0177, 7
  %cmp3 = icmp ult i32 %add70, %sub
  br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, ptr nocapture %pOut) {
; CHECK-LABEL: DCT_mve8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
;
CHECK-NEXT: .pad #4 1312; CHECK-NEXT: sub sp, #4 1313; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1314; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1315; CHECK-NEXT: .pad #88 1316; CHECK-NEXT: sub sp, #88 1317; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill 1318; CHECK-NEXT: ldr r1, [r0, #4] 1319; CHECK-NEXT: subs r1, #8 1320; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill 1321; CHECK-NEXT: cmp r1, #2 1322; CHECK-NEXT: blo.w .LBB7_5 1323; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1324; CHECK-NEXT: ldr r3, [r0, #8] 1325; CHECK-NEXT: ldr r1, [r0] 1326; CHECK-NEXT: adds r0, r3, #3 1327; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill 1328; CHECK-NEXT: bic r0, r0, #3 1329; CHECK-NEXT: add.w r12, r1, r3, lsl #2 1330; CHECK-NEXT: subs r1, r0, #4 1331; CHECK-NEXT: movs r0, #1 1332; CHECK-NEXT: lsls r6, r3, #2 1333; CHECK-NEXT: add.w r1, r0, r1, lsr #2 1334; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill 1335; CHECK-NEXT: lsls r1, r3, #5 1336; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill 1337; CHECK-NEXT: .LBB7_2: @ %for.body 1338; CHECK-NEXT: @ =>This Loop Header: Depth=1 1339; CHECK-NEXT: @ Child Loop BB7_3 Depth 2 1340; CHECK-NEXT: adds r1, r0, #7 1341; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill 1342; CHECK-NEXT: adds r1, r0, #6 1343; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill 1344; CHECK-NEXT: adds r1, r0, #5 1345; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload 1346; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill 1347; CHECK-NEXT: adds r1, r0, #4 1348; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload 1349; CHECK-NEXT: vmov.i32 q3, #0x0 1350; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload 1351; CHECK-NEXT: adds r4, r0, #3 1352; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill 1353; CHECK-NEXT: add.w r8, r0, #2 1354; CHECK-NEXT: adds r1, r0, #1 1355; CHECK-NEXT: mov r3, r12 1356; CHECK-NEXT: vmov q5, q3 1357; CHECK-NEXT: vmov q6, q3 1358; CHECK-NEXT: vmov q4, q3 1359; CHECK-NEXT: vmov q7, q3 1360; CHECK-NEXT: vmov q2, q3 1361; CHECK-NEXT: mov r10, r7 1362; 
CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill 1363; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill 1364; CHECK-NEXT: dls lr, r5 1365; CHECK-NEXT: .LBB7_3: @ %vector.body 1366; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 1367; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 1368; CHECK-NEXT: vctp.32 r10 1369; CHECK-NEXT: add.w r11, r3, r6 1370; CHECK-NEXT: vpstt 1371; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 1372; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 1373; CHECK-NEXT: add.w r5, r11, r6 1374; CHECK-NEXT: sub.w r10, r10, #4 1375; CHECK-NEXT: vpstt 1376; CHECK-NEXT: vfmat.f32 q6, q1, q0 1377; CHECK-NEXT: vldrwt.u32 q1, [r11] 1378; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill 1379; CHECK-NEXT: vmov q6, q5 1380; CHECK-NEXT: vpst 1381; CHECK-NEXT: vfmat.f32 q7, q1, q0 1382; CHECK-NEXT: vmov q5, q3 1383; CHECK-NEXT: vmov q3, q4 1384; CHECK-NEXT: vmov q4, q2 1385; CHECK-NEXT: vpst 1386; CHECK-NEXT: vldrwt.u32 q1, [r5] 1387; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload 1388; CHECK-NEXT: adds r7, r5, r6 1389; CHECK-NEXT: vpstt 1390; CHECK-NEXT: vfmat.f32 q2, q1, q0 1391; CHECK-NEXT: vldrwt.u32 q1, [r7] 1392; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill 1393; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload 1394; CHECK-NEXT: adds r5, r7, r6 1395; CHECK-NEXT: vpstt 1396; CHECK-NEXT: vfmat.f32 q2, q1, q0 1397; CHECK-NEXT: vldrwt.u32 q1, [r5] 1398; CHECK-NEXT: adds r7, r5, r6 1399; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill 1400; CHECK-NEXT: vmov q2, q4 1401; CHECK-NEXT: vmov q4, q3 1402; CHECK-NEXT: vpstt 1403; CHECK-NEXT: vfmat.f32 q2, q1, q0 1404; CHECK-NEXT: vldrwt.u32 q1, [r7] 1405; CHECK-NEXT: adds r5, r7, r6 1406; CHECK-NEXT: vmov q3, q5 1407; CHECK-NEXT: vpstt 1408; CHECK-NEXT: vfmat.f32 q4, q1, q0 1409; CHECK-NEXT: vldrwt.u32 q1, [r5] 1410; CHECK-NEXT: vmov q5, q6 1411; CHECK-NEXT: add r5, r6 1412; CHECK-NEXT: vpstt 1413; CHECK-NEXT: vfmat.f32 q5, q1, q0 1414; CHECK-NEXT: vldrwt.u32 q1, [r5] 1415; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 
16-byte Reload 1416; CHECK-NEXT: vpst 1417; CHECK-NEXT: vfmat.f32 q3, q1, q0 1418; CHECK-NEXT: le lr, .LBB7_3 1419; CHECK-NEXT: @ %bb.4: @ %middle.block 1420; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1 1421; CHECK-NEXT: vadd.f32 s0, s30, s31 1422; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1423; CHECK-NEXT: vadd.f32 s2, s28, s29 1424; CHECK-NEXT: vadd.f32 s4, s26, s27 1425; CHECK-NEXT: vadd.f32 s6, s24, s25 1426; CHECK-NEXT: vadd.f32 s5, s18, s19 1427; CHECK-NEXT: vadd.f32 s7, s16, s17 1428; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload 1429; CHECK-NEXT: vadd.f32 s10, s10, s11 1430; CHECK-NEXT: vadd.f32 s8, s8, s9 1431; CHECK-NEXT: vadd.f32 s9, s18, s19 1432; CHECK-NEXT: vadd.f32 s11, s16, s17 1433; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload 1434; CHECK-NEXT: vadd.f32 s14, s14, s15 1435; CHECK-NEXT: vadd.f32 s12, s12, s13 1436; CHECK-NEXT: vadd.f32 s13, s18, s19 1437; CHECK-NEXT: vadd.f32 s15, s16, s17 1438; CHECK-NEXT: vadd.f32 s0, s2, s0 1439; CHECK-NEXT: vadd.f32 s2, s6, s4 1440; CHECK-NEXT: vadd.f32 s8, s8, s10 1441; CHECK-NEXT: vadd.f32 s10, s11, s9 1442; CHECK-NEXT: vadd.f32 s6, s12, s14 1443; CHECK-NEXT: vadd.f32 s1, s22, s23 1444; CHECK-NEXT: vadd.f32 s14, s15, s13 1445; CHECK-NEXT: vstr s0, [r1] 1446; CHECK-NEXT: add.w r1, r2, r0, lsl #2 1447; CHECK-NEXT: vadd.f32 s3, s20, s21 1448; CHECK-NEXT: adds r0, #8 1449; CHECK-NEXT: vstr s2, [r1] 1450; CHECK-NEXT: add.w r1, r2, r8, lsl #2 1451; CHECK-NEXT: vadd.f32 s12, s7, s5 1452; CHECK-NEXT: vstr s10, [r1] 1453; CHECK-NEXT: add.w r1, r2, r4, lsl #2 1454; CHECK-NEXT: vstr s14, [r1] 1455; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload 1456; CHECK-NEXT: vadd.f32 s4, s3, s1 1457; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1458; CHECK-NEXT: vstr s8, [r1] 1459; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload 1460; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1461; CHECK-NEXT: vstr s12, [r1] 1462; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload 1463; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1464; CHECK-NEXT: vstr s4, [r1] 1465; 
CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload 1466; CHECK-NEXT: add.w r1, r2, r1, lsl #2 1467; CHECK-NEXT: vstr s6, [r1] 1468; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload 1469; CHECK-NEXT: add r12, r1 1470; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload 1471; CHECK-NEXT: cmp r0, r1 1472; CHECK-NEXT: blo.w .LBB7_2 1473; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup 1474; CHECK-NEXT: add sp, #88 1475; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1476; CHECK-NEXT: add sp, #4 1477; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 1478entry: 1479 %NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 2 1480 %i = load i32, ptr %NumInputs, align 4 1481 %NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 1 1482 %i1 = load i32, ptr %NumFilters, align 4 1483 %pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, ptr %S, i32 0, i32 0 1484 %i2 = load ptr, ptr %pDCTCoefs, align 4 1485 %cmp = icmp ugt i32 %i, 1 1486 tail call void @llvm.assume(i1 %cmp) 1487 %sub = add i32 %i1, -8 1488 %cmp3197 = icmp ugt i32 %sub, 1 1489 br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup 1490 1491for.body.preheader: ; preds = %entry 1492 %n.rnd.up = add i32 %i, 3 1493 %n.vec = and i32 %n.rnd.up, -4 1494 br label %for.body 1495 1496for.cond.cleanup: ; preds = %middle.block, %entry 1497 ret void 1498 1499for.body: ; preds = %middle.block, %for.body.preheader 1500 %k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ] 1501 %mul4 = mul i32 %k2.0198, %i 1502 %add = add nuw nsw i32 %k2.0198, 1 1503 %mul5 = mul i32 %add, %i 1504 %add6 = add nuw nsw i32 %k2.0198, 2 1505 %mul7 = mul i32 %add6, %i 1506 %add8 = add nuw nsw i32 %k2.0198, 3 1507 %mul9 = mul i32 %add8, %i 1508 %add10 = add nuw nsw i32 %k2.0198, 4 1509 %mul11 = mul i32 %add10, %i 1510 %add12 = add nuw nsw i32 %k2.0198, 5 1511 %mul13 = mul i32 %add12, %i 1512 %add14 = add nuw nsw i32 %k2.0198, 6 1513 %mul15 = mul i32 %add14, 
%i 1514 %add16 = add i32 %k2.0198, 7 1515 %mul17 = mul i32 %add16, %i 1516 br label %vector.body 1517 1518vector.body: ; preds = %vector.body, %for.body 1519 %index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ] 1520 %vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %i45, %vector.body ] 1521 %vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i46, %vector.body ] 1522 %vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i47, %vector.body ] 1523 %vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i48, %vector.body ] 1524 %vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i49, %vector.body ] 1525 %vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i50, %vector.body ] 1526 %vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i51, %vector.body ] 1527 %vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %i52, %vector.body ] 1528 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %i) 1529 %i3 = getelementptr inbounds float, ptr %pIn, i32 %index 1530 %wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i3, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1531 %i5 = add i32 %index, %mul4 1532 %i6 = getelementptr inbounds float, ptr %i2, i32 %i5 1533 %wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i6, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1534 %i8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load 1535 %i9 = fadd fast <4 x float> %i8, %vec.phi200 1536 %i10 = add i32 %index, %mul5 1537 %i11 = getelementptr inbounds float, ptr %i2, i32 %i10 1538 %wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i11, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1539 %i13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load 1540 %i14 = fadd fast <4 x float> %i13, %vec.phi202 1541 %i15 = add i32 %index, %mul7 1542 %i16 = getelementptr 
inbounds float, ptr %i2, i32 %i15 1543 %wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i16, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1544 %i18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load 1545 %i19 = fadd fast <4 x float> %i18, %vec.phi204 1546 %i20 = add i32 %index, %mul9 1547 %i21 = getelementptr inbounds float, ptr %i2, i32 %i20 1548 %wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i21, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1549 %i23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load 1550 %i24 = fadd fast <4 x float> %i23, %vec.phi205 1551 %i25 = add i32 %index, %mul11 1552 %i26 = getelementptr inbounds float, ptr %i2, i32 %i25 1553 %wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i26, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1554 %i28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load 1555 %i29 = fadd fast <4 x float> %i28, %vec.phi203 1556 %i30 = add i32 %index, %mul13 1557 %i31 = getelementptr inbounds float, ptr %i2, i32 %i30 1558 %wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i31, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1559 %i33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load 1560 %i34 = fadd fast <4 x float> %i33, %vec.phi201 1561 %i35 = add i32 %index, %mul15 1562 %i36 = getelementptr inbounds float, ptr %i2, i32 %i35 1563 %wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i36, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1564 %i38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load 1565 %i39 = fadd fast <4 x float> %i38, %vec.phi199 1566 %i40 = add i32 %index, %mul17 1567 %i41 = getelementptr inbounds float, ptr %i2, i32 %i40 1568 %wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr %i41, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef) 1569 %i43 = fmul fast <4 x float> 
%wide.masked.load213, %wide.masked.load 1570 %i44 = fadd fast <4 x float> %i43, %vec.phi 1571 %i45 = select <4 x i1> %active.lane.mask, <4 x float> %i44, <4 x float> %vec.phi 1572 %i46 = select <4 x i1> %active.lane.mask, <4 x float> %i39, <4 x float> %vec.phi199 1573 %i47 = select <4 x i1> %active.lane.mask, <4 x float> %i9, <4 x float> %vec.phi200 1574 %i48 = select <4 x i1> %active.lane.mask, <4 x float> %i34, <4 x float> %vec.phi201 1575 %i49 = select <4 x i1> %active.lane.mask, <4 x float> %i14, <4 x float> %vec.phi202 1576 %i50 = select <4 x i1> %active.lane.mask, <4 x float> %i29, <4 x float> %vec.phi203 1577 %i51 = select <4 x i1> %active.lane.mask, <4 x float> %i19, <4 x float> %vec.phi204 1578 %i52 = select <4 x i1> %active.lane.mask, <4 x float> %i24, <4 x float> %vec.phi205 1579 %index.next = add i32 %index, 4 1580 %i53 = icmp eq i32 %index.next, %n.vec 1581 br i1 %i53, label %middle.block, label %vector.body 1582 1583middle.block: ; preds = %vector.body 1584 %i54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i52) 1585 %i55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i51) 1586 %i56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i50) 1587 %i57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i49) 1588 %i58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i48) 1589 %i59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i47) 1590 %i60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i46) 1591 %i61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %i45) 1592 %arrayidx63 = getelementptr inbounds float, ptr %pOut, i32 %k2.0198 1593 store float %i59, ptr %arrayidx63, align 4 1594 %arrayidx65 = getelementptr inbounds float, ptr %pOut, i32 %add 1595 store float %i57, ptr %arrayidx65, align 4 1596 
%arrayidx67 = getelementptr inbounds float, ptr %pOut, i32 %add6 1597 store float %i55, ptr %arrayidx67, align 4 1598 %arrayidx69 = getelementptr inbounds float, ptr %pOut, i32 %add8 1599 store float %i54, ptr %arrayidx69, align 4 1600 %arrayidx71 = getelementptr inbounds float, ptr %pOut, i32 %add10 1601 store float %i56, ptr %arrayidx71, align 4 1602 %arrayidx73 = getelementptr inbounds float, ptr %pOut, i32 %add12 1603 store float %i58, ptr %arrayidx73, align 4 1604 %arrayidx75 = getelementptr inbounds float, ptr %pOut, i32 %add14 1605 store float %i60, ptr %arrayidx75, align 4 1606 %arrayidx77 = getelementptr inbounds float, ptr %pOut, i32 %add16 1607 store float %i61, ptr %arrayidx77, align 4 1608 %add79 = add i32 %k2.0198, 8 1609 %cmp3 = icmp ult i32 %add79, %sub 1610 br i1 %cmp3, label %for.body, label %for.cond.cleanup 1611} 1612 1613declare void @llvm.assume(i1 noundef) 1614declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) 1615declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>) 1616declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) 1617