; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-eabi -mattr=+mve.fp -o - %s | FileCheck --check-prefix=CHECK %s

; Codegen test for two half-precision element-wise add loops, compiled by the
; llc invocation above for thumbv8.1m.main with +mve.fp. Each function holds a
; scalar loop and an <8 x half> vector loop selected at run time by a
; pointer-overlap check; the assertions pin the exact instruction sequence
; expected for both paths.

; tailpred: pDst[i] = pSrcB[i] + pSrcA[i] for i in [0, blockSize), half elements.
; The vector loop masks every load and store with
; llvm.get.active.lane.mask(index, blockSize) and iterates over blockSize
; rounded up to a multiple of 8, so it has no scalar remainder loop. The
; assertions expect that loop to be emitted as dlstp.16 / letp and the scalar
; fallback as dls / le.
define void @tailpred(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) {
; CHECK-LABEL: tailpred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.memcheck
; CHECK-NEXT:    add.w r12, r1, r3, lsl #1
; CHECK-NEXT:    add.w lr, r2, r3, lsl #1
; CHECK-NEXT:    cmp r12, r2
; CHECK-NEXT:    add.w r4, r0, r3, lsl #1
; CHECK-NEXT:    cset r12, hi
; CHECK-NEXT:    cmp lr, r1
; CHECK-NEXT:    csel r12, zr, r12, ls
; CHECK-NEXT:    cmp lr, r0
; CHECK-NEXT:    cset lr, hi
; CHECK-NEXT:    cmp r4, r2
; CHECK-NEXT:    cset r4, hi
; CHECK-NEXT:    tst.w r4, lr
; CHECK-NEXT:    it eq
; CHECK-NEXT:    cmpeq.w r12, #0
; CHECK-NEXT:    beq .LBB0_4
; CHECK-NEXT:  @ %bb.2: @ %while.body.preheader
; CHECK-NEXT:    dls lr, r3
; CHECK-NEXT:  .LBB0_3: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr.16 s0, [r0]
; CHECK-NEXT:    vldr.16 s2, [r1]
; CHECK-NEXT:    adds r1, #2
; CHECK-NEXT:    adds r0, #2
; CHECK-NEXT:    vadd.f16 s0, s2, s0
; CHECK-NEXT:    vstr.16 s0, [r2]
; CHECK-NEXT:    adds r2, #2
; CHECK-NEXT:    le lr, .LBB0_3
; CHECK-NEXT:    b .LBB0_6
; CHECK-NEXT:  .LBB0_4: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r3
; CHECK-NEXT:  .LBB0_5: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vadd.f16 q0, q1, q0
; CHECK-NEXT:    vstrh.16 q0, [r2], #16
; CHECK-NEXT:    letp lr, .LBB0_5
; CHECK-NEXT:  .LBB0_6: @ %while.end
; CHECK-NEXT:    pop {r4, pc}
entry:
  ; An empty block returns immediately.
  %cmp.not6 = icmp eq i32 %blockSize, 0
  br i1 %cmp.not6, label %while.end, label %vector.memcheck

vector.memcheck:                                  ; preds = %entry
  ; One-past-the-end pointers of the three blockSize-element ranges.
  %scevgep = getelementptr half, ptr %pDst, i32 %blockSize
  %scevgep14 = getelementptr half, ptr %pSrcA, i32 %blockSize
  %scevgep17 = getelementptr half, ptr %pSrcB, i32 %blockSize
  ; pSrcA range intersects pDst range.
  %bound0 = icmp ugt ptr %scevgep14, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrcA
  %found.conflict = and i1 %bound0, %bound1
  ; pSrcB range intersects pDst range.
  %bound019 = icmp ugt ptr %scevgep17, %pDst
  %bound120 = icmp ugt ptr %scevgep, %pSrcB
  %found.conflict21 = and i1 %bound019, %bound120
  ; Either overlap selects the scalar loop; otherwise take the vector loop.
  %conflict.rdx = or i1 %found.conflict, %found.conflict21
  br i1 %conflict.rdx, label %while.body, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; Vector trip count in elements: blockSize rounded up to a multiple of 8.
  %n.rnd.up = add i32 %blockSize, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr half, ptr %pSrcA, i32 %index
  %next.gep28 = getelementptr half, ptr %pDst, i32 %index
  %next.gep29 = getelementptr half, ptr %pSrcB, i32 %index
  ; The same lane mask guards both loads and the store, so the final,
  ; partially filled vector is handled inside this loop.
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %blockSize)
  %wide.masked.load = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %next.gep, i32 2, <8 x i1> %active.lane.mask, <8 x half> undef)
  %wide.masked.load32 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr %next.gep29, i32 2, <8 x i1> %active.lane.mask, <8 x half> undef)
  %0 = fadd fast <8 x half> %wide.masked.load32, %wide.masked.load
  call void @llvm.masked.store.v8f16.p0(<8 x half> %0, ptr %next.gep28, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %while.end, label %vector.body

while.body:                                       ; preds = %vector.memcheck, %while.body
  ; Scalar fallback: one half element per iteration, counting %blkCnt.010
  ; down from blockSize to zero.
  %blkCnt.010 = phi i32 [ %dec, %while.body ], [ %blockSize, %vector.memcheck ]
  %pSrcA.addr.09 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA, %vector.memcheck ]
  %pDst.addr.08 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst, %vector.memcheck ]
  %pSrcB.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrcB, %vector.memcheck ]
  %incdec.ptr = getelementptr inbounds half, ptr %pSrcA.addr.09, i32 1
  %2 = load half, ptr %pSrcA.addr.09, align 2
  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrcB.addr.07, i32 1
  %3 = load half, ptr %pSrcB.addr.07, align 2
  %4 = fadd fast half %3, %2
  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.08, i32 1
  store half %4, ptr %pDst.addr.08, align 2
  %dec = add i32 %blkCnt.010, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %vector.body, %while.body, %entry
  ret void
}

; notailpred: same element-wise half add as @tailpred, but the vector loop uses
; plain (unmasked) <8 x half> loads and stores over blockSize & -8 elements.
; Blocks shorter than 8 elements bypass the vector path, and %middle.block
; passes the remaining blockSize & 7 elements to the scalar loop. The
; assertions expect the vector loop to end in a plain `le` (no dlstp / letp)
; and the scalar loop to be emitted as dls / le.
define void @notailpred(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr nocapture %pDst, i32 %blockSize) {
; CHECK-LABEL: notailpred:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    cbz r3, .LBB1_6
; CHECK-NEXT:  @ %bb.1: @ %while.body.preheader
; CHECK-NEXT:    cmp r3, #8
; CHECK-NEXT:    blo .LBB1_3
; CHECK-NEXT:  @ %bb.2: @ %vector.memcheck
; CHECK-NEXT:    add.w r7, r1, r3, lsl #1
; CHECK-NEXT:    add.w r6, r2, r3, lsl #1
; CHECK-NEXT:    cmp r7, r2
; CHECK-NEXT:    add.w r5, r0, r3, lsl #1
; CHECK-NEXT:    cset r7, hi
; CHECK-NEXT:    cmp r6, r1
; CHECK-NEXT:    csel r7, zr, r7, ls
; CHECK-NEXT:    cmp r6, r0
; CHECK-NEXT:    cset r6, hi
; CHECK-NEXT:    cmp r5, r2
; CHECK-NEXT:    cset r5, hi
; CHECK-NEXT:    tst r5, r6
; CHECK-NEXT:    it eq
; CHECK-NEXT:    cmpeq r7, #0
; CHECK-NEXT:    beq .LBB1_7
; CHECK-NEXT:  .LBB1_3:
; CHECK-NEXT:    mov r5, r3
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    mov r7, r2
; CHECK-NEXT:    mov r4, r1
; CHECK-NEXT:  .LBB1_4: @ %while.body.preheader31
; CHECK-NEXT:    dls lr, r5
; CHECK-NEXT:  .LBB1_5: @ %while.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldr.16 s0, [r12]
; CHECK-NEXT:    vldr.16 s2, [r4]
; CHECK-NEXT:    adds r4, #2
; CHECK-NEXT:    add.w r12, r12, #2
; CHECK-NEXT:    vadd.f16 s0, s2, s0
; CHECK-NEXT:    vstr.16 s0, [r7]
; CHECK-NEXT:    adds r7, #2
; CHECK-NEXT:    le lr, .LBB1_5
; CHECK-NEXT:  .LBB1_6: @ %while.end
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
; CHECK-NEXT:  .LBB1_7: @ %vector.ph
; CHECK-NEXT:    bic r6, r3, #7
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    sub.w r7, r6, #8
; CHECK-NEXT:    add.w r4, r1, r6, lsl #1
; CHECK-NEXT:    add.w r12, r0, r6, lsl #1
; CHECK-NEXT:    add.w lr, r5, r7, lsr #3
; CHECK-NEXT:    add.w r7, r2, r6, lsl #1
; CHECK-NEXT:    and r5, r3, #7
; CHECK-NEXT:  .LBB1_8: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vadd.f16 q0, q1, q0
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB1_8
; CHECK-NEXT:  @ %bb.9: @ %middle.block
; CHECK-NEXT:    cmp r6, r3
; CHECK-NEXT:    bne .LBB1_4
; CHECK-NEXT:    b .LBB1_6
entry:
  ; An empty block returns immediately.
  %cmp.not6 = icmp eq i32 %blockSize, 0
  br i1 %cmp.not6, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  ; Fewer than 8 elements: go straight to the scalar loop.
  %min.iters.check = icmp ult i32 %blockSize, 8
  br i1 %min.iters.check, label %while.body.preheader31, label %vector.memcheck

vector.memcheck:                                  ; preds = %while.body.preheader
  ; One-past-the-end pointers of the three blockSize-element ranges.
  %scevgep = getelementptr half, ptr %pDst, i32 %blockSize
  %scevgep14 = getelementptr half, ptr %pSrcA, i32 %blockSize
  %scevgep17 = getelementptr half, ptr %pSrcB, i32 %blockSize
  ; pSrcA range intersects pDst range.
  %bound0 = icmp ugt ptr %scevgep14, %pDst
  %bound1 = icmp ugt ptr %scevgep, %pSrcA
  %found.conflict = and i1 %bound0, %bound1
  ; pSrcB range intersects pDst range.
  %bound019 = icmp ugt ptr %scevgep17, %pDst
  %bound120 = icmp ugt ptr %scevgep, %pSrcB
  %found.conflict21 = and i1 %bound019, %bound120
  ; Either overlap selects the scalar loop; otherwise take the vector loop.
  %conflict.rdx = or i1 %found.conflict, %found.conflict21
  br i1 %conflict.rdx, label %while.body.preheader31, label %vector.ph

vector.ph:                                        ; preds = %vector.memcheck
  ; %n.vec = elements covered by the vector loop, %ind.end = elements left
  ; over for the scalar loop; the three pointers below are where that scalar
  ; remainder starts.
  %n.vec = and i32 %blockSize, -8
  %ind.end = and i32 %blockSize, 7
  %ind.end23 = getelementptr half, ptr %pSrcA, i32 %n.vec
  %ind.end25 = getelementptr half, ptr %pDst, i32 %n.vec
  %ind.end27 = getelementptr half, ptr %pSrcB, i32 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr half, ptr %pSrcA, i32 %index
  %next.gep28 = getelementptr half, ptr %pDst, i32 %index
  %next.gep29 = getelementptr half, ptr %pSrcB, i32 %index
  ; Unmasked full-width accesses, 8 halves per iteration.
  %wide.load = load <8 x half>, ptr %next.gep, align 2
  %wide.load30 = load <8 x half>, ptr %next.gep29, align 2
  %0 = fadd fast <8 x half> %wide.load30, %wide.load
  store <8 x half> %0, ptr %next.gep28, align 2
  %index.next = add i32 %index, 8
  %1 = icmp eq i32 %index.next, %n.vec
  br i1 %1, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Done when blockSize was a multiple of 8; otherwise finish in the scalar loop.
  %cmp.n = icmp eq i32 %n.vec, %blockSize
  br i1 %cmp.n, label %while.end, label %while.body.preheader31

while.body.preheader31:                           ; preds = %middle.block, %vector.memcheck, %while.body.preheader
  ; Scalar loop inputs: the whole block when the vector loop was skipped, or
  ; only the remainder when arriving from %middle.block.
  %blkCnt.010.ph = phi i32 [ %blockSize, %vector.memcheck ], [ %blockSize, %while.body.preheader ], [ %ind.end, %middle.block ]
  %pSrcA.addr.09.ph = phi ptr [ %pSrcA, %vector.memcheck ], [ %pSrcA, %while.body.preheader ], [ %ind.end23, %middle.block ]
  %pDst.addr.08.ph = phi ptr [ %pDst, %vector.memcheck ], [ %pDst, %while.body.preheader ], [ %ind.end25, %middle.block ]
  %pSrcB.addr.07.ph = phi ptr [ %pSrcB, %vector.memcheck ], [ %pSrcB, %while.body.preheader ], [ %ind.end27, %middle.block ]
  br label %while.body

while.body:                                       ; preds = %while.body.preheader31, %while.body
  ; One half element per iteration, counting %blkCnt.010 down to zero.
  %blkCnt.010 = phi i32 [ %dec, %while.body ], [ %blkCnt.010.ph, %while.body.preheader31 ]
  %pSrcA.addr.09 = phi ptr [ %incdec.ptr, %while.body ], [ %pSrcA.addr.09.ph, %while.body.preheader31 ]
  %pDst.addr.08 = phi ptr [ %incdec.ptr3, %while.body ], [ %pDst.addr.08.ph, %while.body.preheader31 ]
  %pSrcB.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %pSrcB.addr.07.ph, %while.body.preheader31 ]
  %incdec.ptr = getelementptr inbounds half, ptr %pSrcA.addr.09, i32 1
  %2 = load half, ptr %pSrcA.addr.09, align 2
  %incdec.ptr1 = getelementptr inbounds half, ptr %pSrcB.addr.07, i32 1
  %3 = load half, ptr %pSrcB.addr.07, align 2
  %4 = fadd fast half %3, %2
  %incdec.ptr3 = getelementptr inbounds half, ptr %pDst.addr.08, i32 1
  store half %4, ptr %pDst.addr.08, align 2
  %dec = add i32 %blkCnt.010, -1
  %cmp.not = icmp eq i32 %dec, 0
  br i1 %cmp.not, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %middle.block, %entry
  ret void
}

; Intrinsics called from @tailpred's vector loop.
; NOTE(review): attribute groups #1-#3 are referenced below but no
; `attributes #N = { ... }` definitions are visible in this part of the file;
; confirm whether they are intentionally omitted.
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32 immarg, <8 x i1>, <8 x half>) #2
declare void @llvm.masked.store.v8f16.p0(<8 x half>, ptr, i32 immarg, <8 x i1>) #3