; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

; Negative test: a 16 x i8 multiply loop whose vector body has been unrolled by
; two, followed by a single-iteration epilogue. The lane predicates are built
; from "icmp ule (splat(index) + <0..15>), splat(N - 1)".
;
; TODO: The unrolled pattern is preventing the transform
;
; The negative check matches the common prefix of the MVE VCTP intrinsics
; (@llvm.arm.mve.vctp8/16/32/64, which return a predicate vector) so that any
; element width is caught. The earlier spelling "call i32 @llvm.arm.vcpt" named
; no existing intrinsic and therefore could never fire.
; CHECK-LABEL: mul_v16i8_unroll
; CHECK-NOT: @llvm.arm.mve.vctp
define void @mul_v16i8_unroll(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp13 = (((((N + 15) >> 4) << 4) - 16) >> 4) + 1, i.e. the number of
  ; 16-element chunks needed to cover N as long as N + 15 does not wrap.
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; N == 0: nothing to do.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of N - 1: the loop-invariant right-hand side of every lane compare.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
  ; %xtraiter = %tmp13 mod 2: whether one iteration is left for the epilogue.
  %xtraiter = and i32 %tmp13, 1
  ; %tmp12 == 0 means %tmp13 == 1, so the two-way unrolled body is skipped.
  %0 = icmp ult i32 %tmp12, 1
  br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %vector.ph.new

vector.ph.new:                                    ; preds = %vector.ph
  ; Note: %start has no users in this function; the counter phi below is
  ; seeded from %unroll_iter instead.
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  %unroll_iter = sub i32 %tmp13, %xtraiter
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph.new
  %index = phi i32 [ 0, %vector.ph.new ], [ %index.next.1, %vector.body ]
  %niter = phi i32 [ %unroll_iter, %vector.ph.new ], [ %niter.nsub.1, %vector.body ]

  ; --- First unrolled copy: elements [%index, %index + 15] ---
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp = getelementptr inbounds i8, ptr %a, i32 %index
  ; Lane i is active when %index + i <= N - 1.
  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, ptr %b, i32 %index
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, ptr %c, i32 %index
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul, ptr %tmp6, i32 4, <16 x i1> %tmp1)
  %index.next = add nuw nsw i32 %index, 16
  ; The first decrement is a plain sub; only the second one uses the
  ; loop.decrement.reg intrinsic.
  %niter.nsub = sub i32 %niter, 1

  ; --- Second unrolled copy: elements [%index + 16, %index + 31] ---
  %broadcast.splatinsert.1 = insertelement <16 x i32> undef, i32 %index.next, i32 0
  %broadcast.splat.1 = shufflevector <16 x i32> %broadcast.splatinsert.1, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction.1 = add <16 x i32> %broadcast.splat.1, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp.1 = getelementptr inbounds i8, ptr %a, i32 %index.next
  %tmp1.1 = icmp ule <16 x i32> %induction.1, %broadcast.splat11
  %wide.masked.load.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
  %tmp3.1 = getelementptr inbounds i8, ptr %b, i32 %index.next
  %wide.masked.load2.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
  %mul.1 = mul nsw <16 x i8> %wide.masked.load2.1, %wide.masked.load.1
  %tmp6.1 = getelementptr inbounds i8, ptr %c, i32 %index.next
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul.1, ptr %tmp6.1, i32 4, <16 x i1> %tmp1.1)
  %index.next.1 = add i32 %index.next, 16
  %niter.nsub.1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %niter.nsub, i32 1)
  %niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0
  br i1 %niter.ncmp.1, label %vector.body, label %for.cond.cleanup.loopexit.unr-lcssa.loopexit

for.cond.cleanup.loopexit.unr-lcssa.loopexit:     ; preds = %vector.body
  %index.unr.ph = phi i32 [ %index.next.1, %vector.body ]
  %tmp14.unr.ph = phi i32 [ -2, %vector.body ]
  br label %for.cond.cleanup.loopexit.unr-lcssa

for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.cond.cleanup.loopexit.unr-lcssa.loopexit, %vector.ph
  %index.unr = phi i32 [ 0, %vector.ph ], [ %index.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
  %tmp14.unr = phi i32 [ %tmp13, %vector.ph ], [ %tmp14.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
  ; Run the epilogue only when an odd iteration remains.
  %lcmp.mod = icmp ne i32 %xtraiter, 0
  br i1 %lcmp.mod, label %vector.body.epil.preheader, label %for.cond.cleanup.loopexit

vector.body.epil.preheader:                       ; preds = %for.cond.cleanup.loopexit.unr-lcssa
  br label %vector.body.epil

vector.body.epil:                                 ; preds = %vector.body.epil.preheader
  ; Straight-line remainder: same masked load/mul/store sequence as one copy
  ; of the unrolled body, executed once (this block has no back edge).
  %index.epil = phi i32 [ %index.unr, %vector.body.epil.preheader ]
  %tmp14.epil = phi i32 [ %tmp14.unr, %vector.body.epil.preheader ]
  %broadcast.splatinsert.epil = insertelement <16 x i32> undef, i32 %index.epil, i32 0
  %broadcast.splat.epil = shufflevector <16 x i32> %broadcast.splatinsert.epil, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction.epil = add <16 x i32> %broadcast.splat.epil, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp.epil = getelementptr inbounds i8, ptr %a, i32 %index.epil
  %tmp1.epil = icmp ule <16 x i32> %induction.epil, %broadcast.splat11
  %wide.masked.load.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
  %tmp3.epil = getelementptr inbounds i8, ptr %b, i32 %index.epil
  %wide.masked.load2.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
  %mul.epil = mul nsw <16 x i8> %wide.masked.load2.epil, %wide.masked.load.epil
  %tmp6.epil = getelementptr inbounds i8, ptr %c, i32 %index.epil
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul.epil, ptr %tmp6.epil, i32 4, <16 x i1> %tmp1.epil)
  ; The three values below have no users; the block exits unconditionally.
  %index.next.epil = add i32 %index.epil, 16
  %tmp15.epil = add nuw nsw i32 %tmp14.epil, -1
  %tmp16.epil = icmp ne i32 %tmp15.epil, 0
  br label %for.cond.cleanup.loopexit.epilog-lcssa

for.cond.cleanup.loopexit.epilog-lcssa:           ; preds = %vector.body.epil
  br label %for.cond.cleanup.loopexit

for.cond.cleanup.loopexit:                        ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.cond.cleanup.loopexit.epilog-lcssa
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>) #1
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3