; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp,+fp64 -verify-machineinstrs %s -o - | FileCheck %s

; Each function below walks an array of interleaved pairs in %pSrc and writes
; one value per pair to %pDst: dst[i] = src[2*i]^2 + src[2*i+1]^2. The IR has
; the shape of an already-vectorized loop: a vector body that loads 2*VF
; elements at once, squares them and splits the even/odd lanes with
; shufflevector, followed by a scalar remainder loop.
;
; In every vector body the fmul sits between the wide load and the
; de-interleaving shuffles. The FileCheck assertions expect llc to still emit a
; vld20/vld21 de-interleaving load pair followed by vmul + vfma.
;
; The two plain functions compute the square twice (two identical fmul
; instructions, one per shuffle); the two "_cse" functions feed a single fmul
; into both shuffles. The expected assembly is the same for both forms.
;
; The second shufflevector operand is never read (every mask index refers to a
; lane of the first operand), so it is written as poison rather than undef.

; Half-precision variant, VF = 8, with the square duplicated per shuffle.
define void @arm_cmplx_mag_squared_f16(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    beq .LBB0_8
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r2, #8
; CHECK-NEXT:    blo .LBB0_9
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r3, r0, r2, lsl #2
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    itt hi
; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #1
; CHECK-NEXT:    cmphi r3, r0
; CHECK-NEXT:    bhi .LBB0_9
; CHECK-NEXT:  @ %bb.3: @ %vector.ph
; CHECK-NEXT:    bic r4, r2, #7
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    sub.w r3, r4, #8
; CHECK-NEXT:    add.w r12, r1, r4, lsl #1
; CHECK-NEXT:    add.w lr, r5, r3, lsr #3
; CHECK-NEXT:    add.w r3, r0, r4, lsl #2
; CHECK-NEXT:    and r5, r2, #7
; CHECK-NEXT:  .LBB0_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT:    vmul.f16 q0, q0, q0
; CHECK-NEXT:    vfma.f16 q0, q1, q1
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB0_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB0_6: @ %while.body.preheader26
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB0_7: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr.16 s0, [r3]
; CHECK-NEXT:    vldr.16 s2, [r3, #2]
; CHECK-NEXT:    adds r3, #4
; CHECK-NEXT:    vmul.f16 s0, s0, s0
; CHECK-NEXT:    vfma.f16 s0, s2, s2
; CHECK-NEXT:    vstr.16 s0, [r12]
; CHECK-NEXT:    add.w r12, r12, #2
; CHECK-NEXT:    le lr, .LBB0_7
; CHECK-NEXT:  .LBB0_8: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB0_9:
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    mov r5, r2
; CHECK-NEXT:    b .LBB0_6
entry:
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  %min.iters.check = icmp ult i32 %numSamples, 8
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

; Take the scalar loop when [pSrc, pSrc + 2*n) and [pDst, pDst + n) overlap.
vector.memcheck:                                  ; preds = %while.body.preheader
  %scevgep = getelementptr half, ptr %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr half, ptr %pSrc, i32 %0
  %bound0 = icmp ugt ptr %scevgep18, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  %n.vec = and i32 %numSamples, -8
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr half, ptr %pSrc, i32 %1
  %ind.end21 = getelementptr half, ptr %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 7
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr half, ptr %pSrc, i32 %2
  %next.gep24 = getelementptr half, ptr %pDst, i32 %index
  %wide.vec = load <16 x half>, ptr %next.gep, align 2
  ; %3 and %5 are deliberately identical; each feeds one de-interleaving shuffle.
  %3 = fmul fast <16 x half> %wide.vec, %wide.vec
  %4 = shufflevector <16 x half> %3, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %5 = fmul fast <16 x half> %wide.vec, %wide.vec
  %6 = shufflevector <16 x half> %5, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %7 = fadd fast <8 x half> %6, %4
  store <8 x half> %7, ptr %next.gep24, align 2
  %index.next = add i32 %index, 8
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

; Scalar loop: one pair in, one result out per iteration.
while.body:                                       ; preds = %while.body.preheader26, %while.body
  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds half, ptr %pSrc.addr.014, i32 1
  %9 = load half, ptr %pSrc.addr.014, align 2
  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrc.addr.014, i32 2
  %10 = load half, ptr %incdec.ptr, align 2
  %mul = fmul fast half %9, %9
  %mul2 = fmul fast half %10, %10
  %add = fadd fast half %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.013, i32 1
  store half %add, ptr %pDst.addr.013, align 2
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}

; Single-precision variant, VF = 4, with the square duplicated per shuffle.
define void @arm_cmplx_mag_squared_f32(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    cbz r2, .LBB1_8
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r2, #4
; CHECK-NEXT:    blo .LBB1_9
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r3, r0, r2, lsl #3
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    itt hi
; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #2
; CHECK-NEXT:    cmphi r3, r0
; CHECK-NEXT:    bhi .LBB1_9
; CHECK-NEXT:  @ %bb.3: @ %vector.ph
; CHECK-NEXT:    bic r4, r2, #3
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    subs r3, r4, #4
; CHECK-NEXT:    add.w r12, r1, r4, lsl #2
; CHECK-NEXT:    add.w lr, r5, r3, lsr #2
; CHECK-NEXT:    add.w r3, r0, r4, lsl #3
; CHECK-NEXT:    and r5, r2, #3
; CHECK-NEXT:  .LBB1_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT:    vmul.f32 q0, q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, q1
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB1_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB1_6: @ %while.body.preheader26
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB1_7: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr s0, [r3]
; CHECK-NEXT:    vldr s2, [r3, #4]
; CHECK-NEXT:    adds r3, #8
; CHECK-NEXT:    vmul.f32 s0, s0, s0
; CHECK-NEXT:    vfma.f32 s0, s2, s2
; CHECK-NEXT:    vstmia r12!, {s0}
; CHECK-NEXT:    le lr, .LBB1_7
; CHECK-NEXT:  .LBB1_8: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB1_9:
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    mov r5, r2
; CHECK-NEXT:    b .LBB1_6
entry:
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  %min.iters.check = icmp ult i32 %numSamples, 4
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

; Take the scalar loop when [pSrc, pSrc + 2*n) and [pDst, pDst + n) overlap.
vector.memcheck:                                  ; preds = %while.body.preheader
  %scevgep = getelementptr float, ptr %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr float, ptr %pSrc, i32 %0
  %bound0 = icmp ugt ptr %scevgep18, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  %n.vec = and i32 %numSamples, -4
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr float, ptr %pSrc, i32 %1
  %ind.end21 = getelementptr float, ptr %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 3
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i32 %2
  %next.gep24 = getelementptr float, ptr %pDst, i32 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  ; %3 and %5 are deliberately identical; each feeds one de-interleaving shuffle.
  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
  %4 = shufflevector <8 x float> %3, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %5 = fmul fast <8 x float> %wide.vec, %wide.vec
  %6 = shufflevector <8 x float> %5, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %7 = fadd fast <4 x float> %6, %4
  store <4 x float> %7, ptr %next.gep24, align 4
  %index.next = add i32 %index, 4
  %8 = icmp eq i32 %index.next, %n.vec
  br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

; Scalar loop: one pair in, one result out per iteration.
while.body:                                       ; preds = %while.body.preheader26, %while.body
  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds float, ptr %pSrc.addr.014, i32 1
  %9 = load float, ptr %pSrc.addr.014, align 4
  %incdec.ptr1 = getelementptr inbounds float, ptr %pSrc.addr.014, i32 2
  %10 = load float, ptr %incdec.ptr, align 4
  %mul = fmul fast float %9, %9
  %mul2 = fmul fast float %10, %10
  %add = fadd fast float %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds float, ptr %pDst.addr.013, i32 1
  store float %add, ptr %pDst.addr.013, align 4
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}

; Half-precision variant, VF = 8, with one shared square feeding both shuffles.
define void @arm_cmplx_mag_squared_f16_cse(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f16_cse:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    cmp r2, #0
; CHECK-NEXT:    beq .LBB2_8
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r2, #8
; CHECK-NEXT:    blo .LBB2_9
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r3, r0, r2, lsl #2
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    itt hi
; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #1
; CHECK-NEXT:    cmphi r3, r0
; CHECK-NEXT:    bhi .LBB2_9
; CHECK-NEXT:  @ %bb.3: @ %vector.ph
; CHECK-NEXT:    bic r4, r2, #7
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    sub.w r3, r4, #8
; CHECK-NEXT:    add.w r12, r1, r4, lsl #1
; CHECK-NEXT:    add.w lr, r5, r3, lsr #3
; CHECK-NEXT:    add.w r3, r0, r4, lsl #2
; CHECK-NEXT:    and r5, r2, #7
; CHECK-NEXT:  .LBB2_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld20.16 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.16 {q0, q1}, [r0]!
; CHECK-NEXT:    vmul.f16 q0, q0, q0
; CHECK-NEXT:    vfma.f16 q0, q1, q1
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB2_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB2_6: @ %while.body.preheader26
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB2_7: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr.16 s0, [r3]
; CHECK-NEXT:    vldr.16 s2, [r3, #2]
; CHECK-NEXT:    adds r3, #4
; CHECK-NEXT:    vmul.f16 s0, s0, s0
; CHECK-NEXT:    vfma.f16 s0, s2, s2
; CHECK-NEXT:    vstr.16 s0, [r12]
; CHECK-NEXT:    add.w r12, r12, #2
; CHECK-NEXT:    le lr, .LBB2_7
; CHECK-NEXT:  .LBB2_8: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB2_9:
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    mov r5, r2
; CHECK-NEXT:    b .LBB2_6
entry:
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  %min.iters.check = icmp ult i32 %numSamples, 8
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

; Take the scalar loop when [pSrc, pSrc + 2*n) and [pDst, pDst + n) overlap.
vector.memcheck:                                  ; preds = %while.body.preheader
  %scevgep = getelementptr half, ptr %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr half, ptr %pSrc, i32 %0
  %bound0 = icmp ugt ptr %scevgep18, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  %n.vec = and i32 %numSamples, -8
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr half, ptr %pSrc, i32 %1
  %ind.end21 = getelementptr half, ptr %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 7
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr half, ptr %pSrc, i32 %2
  %next.gep24 = getelementptr half, ptr %pDst, i32 %index
  %wide.vec = load <16 x half>, ptr %next.gep, align 2
  ; A single square (%3) is shared by both de-interleaving shuffles.
  %3 = fmul fast <16 x half> %wide.vec, %wide.vec
  %4 = shufflevector <16 x half> %3, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  %5 = shufflevector <16 x half> %3, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  %6 = fadd fast <8 x half> %5, %4
  store <8 x half> %6, ptr %next.gep24, align 2
  %index.next = add i32 %index, 8
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

; Scalar loop: one pair in, one result out per iteration.
while.body:                                       ; preds = %while.body, %while.body.preheader26
  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds half, ptr %pSrc.addr.014, i32 1
  %8 = load half, ptr %pSrc.addr.014, align 2
  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrc.addr.014, i32 2
  %9 = load half, ptr %incdec.ptr, align 2
  %mul = fmul fast half %8, %8
  %mul2 = fmul fast half %9, %9
  %add = fadd fast half %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.013, i32 1
  store half %add, ptr %pDst.addr.013, align 2
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}

; Single-precision variant, VF = 4, with one shared square feeding both shuffles.
define void @arm_cmplx_mag_squared_f32_cse(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 %numSamples) {
; CHECK-LABEL: arm_cmplx_mag_squared_f32_cse:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    cbz r2, .LBB3_8
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r2, #4
; CHECK-NEXT:    blo .LBB3_9
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r3, r0, r2, lsl #3
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    itt hi
; CHECK-NEXT:    addhi.w r3, r1, r2, lsl #2
; CHECK-NEXT:    cmphi r3, r0
; CHECK-NEXT:    bhi .LBB3_9
; CHECK-NEXT:  @ %bb.3: @ %vector.ph
; CHECK-NEXT:    bic r4, r2, #3
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    subs r3, r4, #4
; CHECK-NEXT:    add.w r12, r1, r4, lsl #2
; CHECK-NEXT:    add.w lr, r5, r3, lsr #2
; CHECK-NEXT:    add.w r3, r0, r4, lsl #3
; CHECK-NEXT:    and r5, r2, #3
; CHECK-NEXT:  .LBB3_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vld20.32 {q0, q1}, [r0]
; CHECK-NEXT:    vld21.32 {q0, q1}, [r0]!
; CHECK-NEXT:    vmul.f32 q0, q0, q0
; CHECK-NEXT:    vfma.f32 q0, q1, q1
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB3_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB3_6: @ %while.body.preheader26
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB3_7: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr s0, [r3]
; CHECK-NEXT:    vldr s2, [r3, #4]
; CHECK-NEXT:    adds r3, #8
; CHECK-NEXT:    vmul.f32 s0, s0, s0
; CHECK-NEXT:    vfma.f32 s0, s2, s2
; CHECK-NEXT:    vstmia r12!, {s0}
; CHECK-NEXT:    le lr, .LBB3_7
; CHECK-NEXT:  .LBB3_8: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r7, pc}
; CHECK-NEXT:  .LBB3_9:
; CHECK-NEXT:    mov r3, r0
; CHECK-NEXT:    mov r12, r1
; CHECK-NEXT:    mov r5, r2
; CHECK-NEXT:    b .LBB3_6
entry:
  %cmp.not11 = icmp eq i32 %numSamples, 0
  br i1 %cmp.not11, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  %min.iters.check = icmp ult i32 %numSamples, 4
  br i1 %min.iters.check, label %while.body.preheader26, label %vector.memcheck

; Take the scalar loop when [pSrc, pSrc + 2*n) and [pDst, pDst + n) overlap.
vector.memcheck:                                  ; preds = %while.body.preheader
  %scevgep = getelementptr float, ptr %pDst, i32 %numSamples
  %0 = shl i32 %numSamples, 1
  %scevgep18 = getelementptr float, ptr %pSrc, i32 %0
  %bound0 = icmp ugt ptr %scevgep18, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrc
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %while.body.preheader26, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  %n.vec = and i32 %numSamples, -4
  %1 = shl i32 %n.vec, 1
  %ind.end = getelementptr float, ptr %pSrc, i32 %1
  %ind.end21 = getelementptr float, ptr %pDst, i32 %n.vec
  %ind.end23 = and i32 %numSamples, 3
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %2 = shl i32 %index, 1
  %next.gep = getelementptr float, ptr %pSrc, i32 %2
  %next.gep24 = getelementptr float, ptr %pDst, i32 %index
  %wide.vec = load <8 x float>, ptr %next.gep, align 4
  ; A single square (%3) is shared by both de-interleaving shuffles.
  %3 = fmul fast <8 x float> %wide.vec, %wide.vec
  %4 = shufflevector <8 x float> %3, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %5 = shufflevector <8 x float> %3, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  %6 = fadd fast <4 x float> %5, %4
  store <4 x float> %6, ptr %next.gep24, align 4
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %numSamples
  br i1 %cmp.n, label %while.end, label %while.body.preheader26

while.body.preheader26:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  %pSrc.addr.014.ph = phi ptr [ %pSrc, %vector.memcheck ], [ %pSrc, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pDst.addr.013.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end21, %middle.block ]
  %blkCnt.012.ph = phi i32 [ %numSamples, %vector.memcheck ], [ %numSamples, %while.body.preheader ], [ %ind.end23, %middle.block ]
  br label %while.body

; Scalar loop: one pair in, one result out per iteration.
while.body:                                       ; preds = %while.body, %while.body.preheader26
  %pSrc.addr.014 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrc.addr.014.ph, %while.body.preheader26 ]
  %pDst.addr.013 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.013.ph, %while.body.preheader26 ]
  %blkCnt.012 = phi i32 [ %dec, %while.body ], [ %blkCnt.012.ph, %while.body.preheader26 ]
  %incdec.ptr = getelementptr inbounds float, ptr %pSrc.addr.014, i32 1
  %8 = load float, ptr %pSrc.addr.014, align 4
  %incdec.ptr1 = getelementptr inbounds float, ptr %pSrc.addr.014, i32 2
  %9 = load float, ptr %incdec.ptr, align 4
  %mul = fmul fast float %8, %8
  %mul2 = fmul fast float %9, %9
  %add = fadd fast float %mul2, %mul
  %incdec.ptr3 = getelementptr inbounds float, ptr %pDst.addr.013, i32 1
  store float %add, ptr %pDst.addr.013, align 4
  %dec = add i32 %blkCnt.012, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}