; xref: /llvm-project/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-prefer-flag.ll (revision 82821254f532c1dbdfd5d985ef7130511efaaa83)
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -passes=loop-vectorize -tail-predication=enabled -S < %s | \
; RUN:  FileCheck %s -check-prefix=CHECK

; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -passes=loop-vectorize -tail-predication=enabled \
; RUN:     -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | \
; RUN:     FileCheck -check-prefix=PREDFLAG %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

; The first loop below carries the hint "llvm.loop.vectorize.predicate.enable"
; set to false (metadata !10/!11), so it shouldn't get tail-folded — except
; when -prefer-predicate-over-epilogue=predicate-dont-vectorize is passed,
; which overrules the hint.
;
define dso_local void @flag_overrules_hint(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) local_unnamed_addr #0 {
; Default run: hint !11 disables predication, so expect an unpredicated
; (non-masked) vector body.
; CHECK-LABEL: flag_overrules_hint(
; CHECK:       vector.body:
; CHECK-NOT:   @llvm.masked.load.v8i32.p0(
; CHECK-NOT:   @llvm.masked.store.v8i32.p0(
; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body

; PREDFLAG run: the flag overrules the hint, so expect a tail-folded body
; using an active-lane mask for the trip count of 430 (rounded up to 432).
; PREDFLAG-LABEL: flag_overrules_hint(
; PREDFLAG:  vector.body:
; PREDFLAG:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREDFLAG:  %[[ELEM0:.*]] = add i64 %index, 0
; PREDFLAG:  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 430)
; PREDFLAG:  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %active.lane.mask
; PREDFLAG:  %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %active.lane.mask
; PREDFLAG:  %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
; PREDFLAG:  call void @llvm.masked.store.v4i32.p0({{.*}}, <4 x i1> %active.lane.mask
; PREDFLAG:  %index.next = add nuw i64 %index, 4
; PREDFLAG:  %[[CMP:.*]] = icmp eq i64 %index.next, 432
; PREDFLAG:  br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
entry:
  br label %for.body

for.cond.cleanup:
  ret void

for.body:
  ; A[i] = B[i] + C[i] for i in [0, 430)
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %indvars.iv
  %0 = load i32, ptr %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv
  %1 = load i32, ptr %arrayidx2, align 4
  %add = add nsw i32 %1, %0
  %arrayidx4 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  store i32 %add, ptr %arrayidx4, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 430
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}
; Same A[i] = B[i] + C[i] loop, but with a runtime trip count %N and an
; interleave count of 4 (metadata !14/!15): with the PREDFLAG run we expect
; four lane masks per vector iteration, one per interleaved part.
define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
; PREDFLAG-LABEL: interleave4(
; PREDFLAG:  %[[ADD1:.*]] = add i32 %index, 0
; PREDFLAG:  %[[ADD2:.*]] = add i32 %index, 4
; PREDFLAG:  %[[ADD3:.*]] = add i32 %index, 8
; PREDFLAG:  %[[ADD4:.*]] = add i32 %index, 12
; PREDFLAG:  %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %N)
; PREDFLAG:  %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %N)
; PREDFLAG:  %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %N)
; PREDFLAG:  %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %N)
;
; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
;
; PREDFLAG:  call void @llvm.masked.store.v4i32.p0({{.*}}, <4 x i1> %[[ALM1]])
; PREDFLAG:  call void @llvm.masked.store.v4i32.p0({{.*}}, <4 x i1> %[[ALM2]])
; PREDFLAG:  call void @llvm.masked.store.v4i32.p0({{.*}}, <4 x i1> %[[ALM3]])
; PREDFLAG:  call void @llvm.masked.store.v4i32.p0({{.*}}, <4 x i1> %[[ALM4]])
;
entry:
  %cmp8 = icmp sgt i32 %N, 0
  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09
  %0 = load i32, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09
  %1 = load i32, ptr %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09
  store i32 %add, ptr %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.09, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
}
; !10 is attached to @flag_overrules_hint's loop: vectorization is enabled
; but the predicate (tail-folding) hint is explicitly set to false.
!10 = distinct !{!10, !11, !12}
!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
!12 = !{!"llvm.loop.vectorize.enable", i1 true}

; !14 is attached to @interleave4's loop: request an interleave count of 4.
!14 = distinct !{!14, !15}
!15 = !{!"llvm.loop.interleave.count", i32 4}