; RUN: opt -mcpu=skx -S -passes=loop-vectorize,instcombine -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"

; Case1: With pragma predicate to force tail-folding.
; All memory operations are masked.
;void fold_tail(int * restrict p, int * restrict q1, int * restrict q2, int guard) {
;   #pragma clang loop vectorize_predicate(enable)
;   for(int ix=0; ix < 1021; ++ix) {
;     if (ix > guard) {
;       p[ix] = q1[ix] + q2[ix];
;     }
;   }
;}

;CHECK-LABEL: @fold_tail
;CHECK: vector.body:
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call void @llvm.masked.store

; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @fold_tail(ptr noalias nocapture %p, ptr noalias nocapture readonly %q1, ptr noalias nocapture readonly %q2, i32 %guard) local_unnamed_addr #0 {
entry:
  ; Widen the guard once so it can be compared against the 64-bit induction variable.
  %guard.ext = sext i32 %guard to i64
  br label %loop

exit:
  ret void

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  ; Source-level condition "ix > guard".
  %above.guard = icmp sgt i64 %iv, %guard.ext
  br i1 %above.guard, label %guarded, label %latch

guarded:
  ; p[ix] = q1[ix] + q2[ix], executed only when the guard condition holds.
  %q1.addr = getelementptr inbounds i32, ptr %q1, i64 %iv
  %q1.val = load i32, ptr %q1.addr, align 4, !tbaa !2
  %q2.addr = getelementptr inbounds i32, ptr %q2, i64 %iv
  %q2.val = load i32, ptr %q2.addr, align 4, !tbaa !2
  %sum = add nsw i32 %q2.val, %q1.val
  %p.addr = getelementptr inbounds i32, ptr %p, i64 %iv
  store i32 %sum, ptr %p.addr, align 4, !tbaa !2
  br label %latch

latch:
  ; 1021 iterations: not a multiple of the forced vector width of 8.
  %iv.next = add nuw nsw i64 %iv, 1
  %done = icmp eq i64 %iv.next, 1021
  br i1 %done, label %exit, label %loop, !llvm.loop !8
}

; Case2: With pragma assume_safety both, load and store are masked.
; void assume_safety(int * p, int * q1, int * q2, int guard) {
;   #pragma clang loop vectorize(assume_safety)
;   for(int ix=0; ix < 1021; ++ix) {
;     if (ix > guard) {
;       p[ix] = q1[ix] + q2[ix];
;     }
;   }
;}
;
; The loop body contains two guarded loads (q1[ix] and q2[ix]) and one guarded
; store. Each load is matched on its own line below, mirroring the checks used
; for the other two cases in this file, so that a regression which leaves only
; one of the loads masked is still detected.

;CHECK-LABEL: @assume_safety
;CHECK: vector.body:
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call void @llvm.masked.store

; Function Attrs: norecurse nounwind uwtable
; The arguments are unnamed and therefore numbered in order:
;   %0 = p (store base), %1 = q1, %2 = q2 (load bases), %3 = guard.
; The entry block is the implicit unnamed value %4; the remaining blocks are
; numbered implicitly as well, and the "<label>" comments record those numbers.
define void @assume_safety(ptr nocapture, ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
  ; Widen the guard once so it can be compared against the 64-bit induction variable.
  %5 = sext i32 %3 to i64
  br label %7

; <label>:6:
  ; Loop exit.
  ret void

; <label>:7:
  ; Loop header: %8 is the induction variable, %9 is "ix > guard".
  %8 = phi i64 [ 0, %4 ], [ %18, %17 ]
  %9 = icmp sgt i64 %8, %5
  br i1 %9, label %10, label %17

; <label>:10:
  ; Guarded body: p[ix] = q1[ix] + q2[ix]. Every access carries the
  ; parallel-loop annotation that refers to this loop's ID (!6).
  %11 = getelementptr inbounds i32, ptr %1, i64 %8
  %12 = load i32, ptr %11, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
  %13 = getelementptr inbounds i32, ptr %2, i64 %8
  %14 = load i32, ptr %13, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
  %15 = add nsw i32 %14, %12
  %16 = getelementptr inbounds i32, ptr %0, i64 %8
  store i32 %15, ptr %16, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
  br label %17

; <label>:17:
  ; Latch: 1021 iterations, not a multiple of the forced vector width of 8.
  %18 = add nuw nsw i64 %8, 1
  %19 = icmp eq i64 %18, 1021
  br i1 %19, label %6, label %7, !llvm.loop !6
}

; Case3: With pragma assume_safety and pragma predicate both the store and the
; load are masked.
; void fold_tail_and_assume_safety(int * p, int * q1, int * q2, int guard) {
;   #pragma clang loop vectorize(assume_safety) vectorize_predicate(enable)
;   for(int ix=0; ix < 1021; ++ix) {
;     if (ix > guard) {
;       p[ix] = q1[ix] + q2[ix];
;     }
;   }
;}

;CHECK-LABEL: @fold_tail_and_assume_safety
;CHECK: vector.body:
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call <8 x i32> @llvm.masked.load
;CHECK: call void @llvm.masked.store

; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @fold_tail_and_assume_safety(ptr noalias nocapture %p, ptr noalias nocapture readonly %q1, ptr noalias nocapture readonly %q2, i32 %guard) local_unnamed_addr #0 {
entry:
  ; Widen the guard once so it can be compared against the 64-bit induction variable.
  %guard.ext = sext i32 %guard to i64
  br label %loop

exit:
  ret void

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  ; Source-level condition "ix > guard".
  %above.guard = icmp sgt i64 %iv, %guard.ext
  br i1 %above.guard, label %guarded, label %latch

guarded:
  ; p[ix] = q1[ix] + q2[ix]; every access belongs to access group !10.
  %q1.addr = getelementptr inbounds i32, ptr %q1, i64 %iv
  %q1.val = load i32, ptr %q1.addr, align 4, !tbaa !2, !llvm.access.group !10
  %q2.addr = getelementptr inbounds i32, ptr %q2, i64 %iv
  %q2.val = load i32, ptr %q2.addr, align 4, !tbaa !2, !llvm.access.group !10
  %sum = add nsw i32 %q2.val, %q1.val
  %p.addr = getelementptr inbounds i32, ptr %p, i64 %iv
  store i32 %sum, ptr %p.addr, align 4, !tbaa !2, !llvm.access.group !10
  br label %latch

latch:
  ; 1021 iterations: not a multiple of the forced vector width of 8.
  %iv.next = add nuw nsw i64 %iv, 1
  %done = icmp eq i64 %iv.next, 1021
  br i1 %done, label %exit, label %loop, !llvm.loop !11
}

; Attribute group shared by all three functions in this file.
attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }

!llvm.module.flags = !{!0}
!llvm.ident = !{!1}

!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final)"}
; TBAA description of "int", attached to every load and store in this file.
!2 = !{!3, !3, i64 0}
!3 = !{!"int", !4, i64 0}
!4 = !{!"omnipotent char", !5, i64 0}
!5 = !{!"Simple C/C++ TBAA"}
; Loop ID of @assume_safety; its memory accesses refer to it through
; llvm.mem.parallel_loop_access.
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.vectorize.enable", i1 true}

; Loop ID of @fold_tail: predication enabled.
!8 = distinct !{!8, !9}
!9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}

; Access group and loop ID of @fold_tail_and_assume_safety: the accesses in
; group !10 are declared parallel, and predication is enabled.
!10 = distinct !{}
!11 = distinct !{!11, !12, !13}
!12 = !{!"llvm.loop.parallel_accesses", !10}
!13 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}