; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s --check-prefixes=COMMON,DEFAULT
; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-ENABLE-TP

; Loop-vectorizer tests for counting-down loops on an MVE target
; (thumbv8.1m.main, +mve.fp). Four configurations are exercised:
;   * the plain vectorizer run (DEFAULT prefix),
;   * two runs with tail-predication enabled plus an explicit
;     -prefer-predicate-over-epilogue setting (CHECK-PREFER prefix),
;   * one run with only tail-predication enabled (CHECK-ENABLE-TP prefix).
; All three tail-predication runs also share the CHECK-TF prefix, and every
; run shares the COMMON prefix.

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-unknown-eabihf"

; This IR corresponds to this type of C-code:
;
;  void f(char *a, char *b, char *c, int N) {
;    while (N-- > 0)
;      *c++ = *a++ + *b++;
;  }
;
define dso_local void @sgt_loopguard(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_loopguard(
; COMMON:       vector.body:

; CHECK-TF:     %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
; CHECK-TF:     %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %N)
; CHECK-TF:     llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %active.lane.mask
; CHECK-TF:     llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %active.lane.mask
; CHECK-TF:     llvm.masked.store.v16i8.p0(<16 x i8> %{{.*}}, ptr %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %while.body.preheader, label %while.end

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
  %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
  %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
  %0 = load i8, ptr %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
  %1 = load i8, ptr %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
  store i8 %add, ptr %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; No loop-guard: we need one for this to be valid.
;
; NOTE(review): the checks below nevertheless expect a vector body (and
; masked memory operations in the tail-predication runs); confirm whether
; the sentence above is stale.
;
define dso_local void @sgt_no_loopguard(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_no_loopguard(
; COMMON:       vector.body:
; CHECK-TF:     masked.load
; CHECK-TF:     masked.load
; CHECK-TF:     masked.store
entry:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %entry ]
  %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %entry ]
  %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %entry ]
  %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %entry ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
  %0 = load i8, ptr %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
  %1 = load i8, ptr %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
  store i8 %add, ptr %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; Same unguarded counting-down loop, but the loop-exit compare %cmp has an
; additional use: it also feeds a select whose result is added to the stored
; value.
;
define dso_local void @sgt_extra_use_cmp(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_extra_use_cmp(
; COMMON:       vector.body:
; CHECK-TF:     masked.load
; CHECK-TF:     masked.load
; CHECK-TF:     masked.store
entry:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %entry ]
  %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %entry ]
  %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %entry ]
  %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %entry ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
  %0 = load i8, ptr %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
  %1 = load i8, ptr %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  %select = select i1 %cmp, i8 %0, i8 %1
  %add2 = add i8 %add, %select
  store i8 %add2, ptr %c.addr.08, align 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; Guarded loop whose counter starts from the constant 2049 instead of %N.
;
define dso_local void @sgt_const_tripcount(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_const_tripcount(
; COMMON:       vector.body:
; CHECK-TF:     masked.load
; CHECK-TF:     masked.load
; CHECK-TF:     masked.store
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %while.body.preheader, label %while.end

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ 2049, %while.body.preheader ]
  %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
  %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
  %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
  %0 = load i8, ptr %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
  %1 = load i8, ptr %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
  store i8 %add, ptr %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; Unguarded loop whose counter starts from the constant 0; no vector body is
; expected in any configuration.
;
define dso_local void @sgt_no_guard_0_startval(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_no_guard_0_startval(
; COMMON-NOT:   vector.body:
entry:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ 0, %entry ]
  %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %entry ]
  %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %entry ]
  %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %entry ]
  %dec = add nsw i32 %N.addr.09, -1
  %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
  %0 = load i8, ptr %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
  %1 = load i8, ptr %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
  store i8 %add, ptr %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; Guarded loop whose counter is decremented by 2 per iteration instead of 1.
;
define dso_local void @sgt_step_minus_two(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_step_minus_two(
; COMMON:       vector.body:
; CHECK-TF:     masked.load
; CHECK-TF:     masked.load
; CHECK-TF:     masked.store
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %while.body.preheader, label %while.end

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
  %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
  %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
  %dec = add nsw i32 %N.addr.09, -2
  %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
  %0 = load i8, ptr %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
  %1 = load i8, ptr %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
  store i8 %add, ptr %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; Guarded loop whose counter step is the run-time value %S rather than a
; constant; no vector body is expected in any configuration.
;
define dso_local void @sgt_step_not_constant(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N, i32 %S) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_step_not_constant(
; COMMON-NOT:   vector.body:
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %while.body.preheader, label %while.end

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
  %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
  %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
  %dec = add nsw i32 %N.addr.09, %S
  %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
  %0 = load i8, ptr %a.addr.06, align 1
  %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
  %1 = load i8, ptr %b.addr.07, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
  store i8 %add, ptr %c.addr.08, align 1
  %cmp = icmp sgt i32 %N.addr.09, 1
  br i1 %cmp, label %while.body, label %while.end.loopexit

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; Counting-down loop guarded by and exiting on an equality compare with zero
; (the exit test is applied to the already-decremented counter). Only the
; presence of a vector body is checked.
;
define dso_local void @icmp_eq(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) #0 {
; COMMON-LABEL: @icmp_eq(
; COMMON:       vector.body:
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %while.end, label %while.body.preheader

while.body.preheader:
  br label %while.body

while.body:
  %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %C.addr.09 = phi ptr [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ]
  %B.addr.08 = phi ptr [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ]
  %A.addr.07 = phi ptr [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ]
  %incdec.ptr = getelementptr inbounds i8, ptr %A.addr.07, i32 1
  %0 = load i8, ptr %A.addr.07, align 1
  %incdec.ptr1 = getelementptr inbounds i8, ptr %B.addr.08, i32 1
  %1 = load i8, ptr %B.addr.08, align 1
  %add = add i8 %1, %0
  %incdec.ptr4 = getelementptr inbounds i8, ptr %C.addr.09, i32 1
  store i8 %add, ptr %C.addr.09, align 1
  %dec = add i32 %N.addr.010, -1
  %cmp = icmp eq i32 %dec, 0
  br i1 %cmp, label %while.end.loopexit, label %while.body

while.end.loopexit:
  br label %while.end

while.end:
  ret void
}

; This IR corresponds to this type of C-code:
;
;  void f(char *a, char *b, char * __restrict c, int N) {
;    #pragma clang loop vectorize_width(16)
;    for (int i = N; i>0; i--)
;      c[i] = a[i] + b[i];
;  }
;
define dso_local void @sgt_for_loop(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_for_loop(
; COMMON:       vector.body:
; CHECK-PREFER: masked.load
; CHECK-PREFER: masked.load
; CHECK-PREFER: masked.store
;
; TODO: if tail-predication is requested, tail-folding isn't triggered because
; the profitability check returns "Different strides found, can't tail-predicate",
; investigate this.
;
; CHECK-ENABLE-TP-NOT: masked.load
; CHECK-ENABLE-TP-NOT: masked.load
; CHECK-ENABLE-TP-NOT: masked.store
;
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %for.body.preheader, label %for.end

for.body.preheader:
  br label %for.body

for.body:
  %i.011 = phi i32 [ %dec, %for.body ], [ %N, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.011
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.011
  %1 = load i8, ptr %arrayidx1, align 1
  %add = add i8 %1, %0
  %arrayidx4 = getelementptr inbounds i8, ptr %c, i32 %i.011
  store i8 %add, ptr %arrayidx4, align 1
  %dec = add nsw i32 %i.011, -1
  %cmp = icmp sgt i32 %i.011, 1
  br i1 %cmp, label %for.body, label %for.end, !llvm.loop !1

for.end:
  ret void
}

; Variant of the for-loop above with an i64 counter (zero-extended from %N)
; that is truncated back to i32 to index the arrays.
;
define dso_local void @sgt_for_loop_i64(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_for_loop_i64(
; COMMON:       vector.body:
;
; CHECK-PREFER: masked.load
; CHECK-PREFER: masked.load
; CHECK-PREFER: masked.store
;
; With -tail-predication=enabled and no explicit
; -prefer-predicate-over-epilogue setting, the target hook returns
; "preferPredicateOverEpilogue: hardware-loop is not profitable."
; so here we don't expect the tail-folding. TODO: look into this.
;
; CHECK-ENABLE-TP-NOT: masked.load
; CHECK-ENABLE-TP-NOT: masked.load
; CHECK-ENABLE-TP-NOT: masked.store
;
entry:
  %cmp14 = icmp sgt i32 %N, 0
  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  %conv16 = zext i32 %N to i64
  br label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  ret void

for.body:
  %i.015 = phi i64 [ %dec, %for.body ], [ %conv16, %for.body.preheader ]
  %idxprom = trunc i64 %i.015 to i32
  %arrayidx = getelementptr inbounds i8, ptr %a, i32 %idxprom
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx4 = getelementptr inbounds i8, ptr %b, i32 %idxprom
  %1 = load i8, ptr %arrayidx4, align 1
  %add = add i8 %1, %0
  %arrayidx8 = getelementptr inbounds i8, ptr %c, i32 %idxprom
  store i8 %add, ptr %arrayidx8, align 1
  %dec = add nsw i64 %i.015, -1
  %cmp = icmp sgt i64 %i.015, 1
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !1
}

; This IR corresponds to this nested-loop:
;
;   for (int i = 0; i<N; i++)
;     for (int j = i+1; j>0; j--)
;       c[j] = a[j] + b[j];
;
; While the inner-loop looks similar to previous examples, we can't
; transform it because isGuarded returns false for the inner-loop.
;
define dso_local void @sgt_nested_loop(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_nested_loop(
; DEFAULT-NOT:  vector.body:
; CHECK-TF-NOT: masked.load
; CHECK-TF-NOT: masked.load
; CHECK-TF-NOT: masked.store
; COMMON:       }
;
entry:
  %cmp21 = icmp sgt i32 %N, 0
  br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:
  br label %for.body

for.cond.loopexit:
  %exitcond = icmp eq i32 %add, %N
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  ret void

for.body:
  %i.022 = phi i32 [ %add, %for.cond.loopexit ], [ 0, %for.body.preheader ]
  %add = add nuw nsw i32 %i.022, 1
  br label %for.body4

for.body4:                                        ; preds = %for.body, %for.body4
  %j.020 = phi i32 [ %add, %for.body ], [ %dec, %for.body4 ]
  %arrayidx = getelementptr inbounds i8, ptr %a, i32 %j.020
  %0 = load i8, ptr %arrayidx, align 1
  %arrayidx5 = getelementptr inbounds i8, ptr %b, i32 %j.020
  %1 = load i8, ptr %arrayidx5, align 1
  %add7 = add i8 %1, %0
  %arrayidx9 = getelementptr inbounds i8, ptr %c, i32 %j.020
  store i8 %add7, ptr %arrayidx9, align 1
  %dec = add nsw i32 %j.020, -1
  %cmp2 = icmp sgt i32 %j.020, 1
  br i1 %cmp2, label %for.body4, label %for.cond.loopexit
}

attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

; Loop metadata attached to the two for-loop tests: requests a vectorization
; width of 16.
!1 = distinct !{!1, !2}
!2 = !{!"llvm.loop.vectorize.width", i32 16}