; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s

; TODO: The unrolled pattern is preventing the transform
; CHECK-LABEL: mul_v16i8_unroll
; CHECK-NOT: call i32 @llvm.arm.vcpt
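; Note: the vector.body below has been unrolled by two, so it contains two
; predicated load/mul/store sequences, each guarded by its own mask
; (%tmp1 and %tmp1.1); this is the unrolled shape the TODO above refers to.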
define void @mul_v16i8_unroll(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
  %xtraiter = and i32 %tmp13, 1
  %0 = icmp ult i32 %tmp12, 1
  br i1 %0, label %for.cond.cleanup.loopexit.unr-lcssa, label %vector.ph.new

vector.ph.new:                                    ; preds = %vector.ph
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  %unroll_iter = sub i32 %tmp13, %xtraiter
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph.new
  %index = phi i32 [ 0, %vector.ph.new ], [ %index.next.1, %vector.body ]
  %niter = phi i32 [ %unroll_iter, %vector.ph.new ], [ %niter.nsub.1, %vector.body ]
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp = getelementptr inbounds i8, ptr %a, i32 %index
  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, ptr %b, i32 %index
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, ptr %c, i32 %index
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul, ptr %tmp6, i32 4, <16 x i1> %tmp1)
  %index.next = add nuw nsw i32 %index, 16
  %niter.nsub = sub i32 %niter, 1
  %broadcast.splatinsert.1 = insertelement <16 x i32> undef, i32 %index.next, i32 0
  %broadcast.splat.1 = shufflevector <16 x i32> %broadcast.splatinsert.1, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction.1 = add <16 x i32> %broadcast.splat.1, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp.1 = getelementptr inbounds i8, ptr %a, i32 %index.next
  %tmp1.1 = icmp ule <16 x i32> %induction.1, %broadcast.splat11
  %wide.masked.load.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
  %tmp3.1 = getelementptr inbounds i8, ptr %b, i32 %index.next
  %wide.masked.load2.1 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3.1, i32 4, <16 x i1> %tmp1.1, <16 x i8> undef)
  %mul.1 = mul nsw <16 x i8> %wide.masked.load2.1, %wide.masked.load.1
  %tmp6.1 = getelementptr inbounds i8, ptr %c, i32 %index.next
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul.1, ptr %tmp6.1, i32 4, <16 x i1> %tmp1.1)
  %index.next.1 = add i32 %index.next, 16
  %niter.nsub.1 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %niter.nsub, i32 1)
  %niter.ncmp.1 = icmp ne i32 %niter.nsub.1, 0
  br i1 %niter.ncmp.1, label %vector.body, label %for.cond.cleanup.loopexit.unr-lcssa.loopexit

for.cond.cleanup.loopexit.unr-lcssa.loopexit:     ; preds = %vector.body
  %index.unr.ph = phi i32 [ %index.next.1, %vector.body ]
  %tmp14.unr.ph = phi i32 [ -2, %vector.body ]
  br label %for.cond.cleanup.loopexit.unr-lcssa

for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.cond.cleanup.loopexit.unr-lcssa.loopexit, %vector.ph
  %index.unr = phi i32 [ 0, %vector.ph ], [ %index.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
  %tmp14.unr = phi i32 [ %tmp13, %vector.ph ], [ %tmp14.unr.ph, %for.cond.cleanup.loopexit.unr-lcssa.loopexit ]
  %lcmp.mod = icmp ne i32 %xtraiter, 0
  br i1 %lcmp.mod, label %vector.body.epil.preheader, label %for.cond.cleanup.loopexit

vector.body.epil.preheader:                       ; preds = %for.cond.cleanup.loopexit.unr-lcssa
  br label %vector.body.epil

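; The epilogue block below runs the single leftover vector iteration when the
; unrolled iteration count is odd (%xtraiter != 0).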
vector.body.epil:                                 ; preds = %vector.body.epil.preheader
  %index.epil = phi i32 [ %index.unr, %vector.body.epil.preheader ]
  %tmp14.epil = phi i32 [ %tmp14.unr, %vector.body.epil.preheader ]
  %broadcast.splatinsert.epil = insertelement <16 x i32> undef, i32 %index.epil, i32 0
  %broadcast.splat.epil = shufflevector <16 x i32> %broadcast.splatinsert.epil, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction.epil = add <16 x i32> %broadcast.splat.epil, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp.epil = getelementptr inbounds i8, ptr %a, i32 %index.epil
  %tmp1.epil = icmp ule <16 x i32> %induction.epil, %broadcast.splat11
  %wide.masked.load.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
  %tmp3.epil = getelementptr inbounds i8, ptr %b, i32 %index.epil
  %wide.masked.load2.epil = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3.epil, i32 4, <16 x i1> %tmp1.epil, <16 x i8> undef)
  %mul.epil = mul nsw <16 x i8> %wide.masked.load2.epil, %wide.masked.load.epil
  %tmp6.epil = getelementptr inbounds i8, ptr %c, i32 %index.epil
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul.epil, ptr %tmp6.epil, i32 4, <16 x i1> %tmp1.epil)
  %index.next.epil = add i32 %index.epil, 16
  %tmp15.epil = add nuw nsw i32 %tmp14.epil, -1
  %tmp16.epil = icmp ne i32 %tmp15.epil, 0
  br label %for.cond.cleanup.loopexit.epilog-lcssa

for.cond.cleanup.loopexit.epilog-lcssa:           ; preds = %vector.body.epil
  br label %for.cond.cleanup.loopexit

for.cond.cleanup.loopexit:                        ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.cond.cleanup.loopexit.epilog-lcssa
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>) #1
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>) #2
declare i32 @llvm.start.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3