1; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf \ 2; RUN: -enable-arm-maskedgatscat=false \ 3; RUN: -tail-predication=enabled -passes=loop-vectorize -S < %s | \ 4; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING 5 6; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=-mve \ 7; RUN: -tail-predication=enabled -passes=loop-vectorize \ 8; RUN: -enable-arm-maskedgatscat=false \ 9; RUN: -enable-arm-maskedldst=true -S < %s | \ 10; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING 11 12; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \ 13; RUN: -tail-predication=enabled -passes=loop-vectorize \ 14; RUN: -enable-arm-maskedgatscat=false \ 15; RUN: -enable-arm-maskedldst=false -S < %s | \ 16; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING 17 18; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve \ 19; RUN: -tail-predication=disabled -passes=loop-vectorize \ 20; RUN: -enable-arm-maskedgatscat=false \ 21; RUN: -enable-arm-maskedldst=true -S < %s | \ 22; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING 23 24; Disabling the low-overhead branch extension will make 25; 'isHardwareLoopProfitable' return false, so that we test avoiding folding for 26; these cases. 27; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve,-lob \ 28; RUN: -tail-predication=enabled -passes=loop-vectorize \ 29; RUN: -enable-arm-maskedgatscat=false \ 30; RUN: -enable-arm-maskedldst=true -S < %s | \ 31; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING 32 33; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \ 34; RUN: -tail-predication=enabled -passes=loop-vectorize \ 35; RUN: -enable-arm-maskedgatscat=false \ 36; RUN: -enable-arm-maskedldst=true -S < %s | \ 37; RUN: FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING 38 39; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \ 40; RUN: -prefer-predicate-over-epilogue=scalar-epilogue \ 41; RUN: -tail-predication=enabled -passes=loop-vectorize \ 42; RUN: -enable-arm-maskedgatscat=false \ 43; RUN: -enable-arm-maskedldst=true -S < %s | \ 44; RUN: FileCheck %s -check-prefixes=CHECK,NO-FOLDING 45 46; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp \ 47; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ 48; RUN: -tail-predication=enabled -passes=loop-vectorize \ 49; RUN: -enable-arm-maskedgatscat=false \ 50; RUN: -enable-arm-maskedldst=true -S < %s | \ 51; RUN: FileCheck %s -check-prefixes=CHECK 52 53target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" 54 55define void @prefer_folding(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 56; CHECK-LABEL: prefer_folding( 57; PREFER-FOLDING: vector.body: 58; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 59; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0 60; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 431) 61; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %active.lane.mask, 62; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0({{.*}}, <4 x i1> %active.lane.mask, 63; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0({{.*}}, <4 x i1> %active.lane.mask 64; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 65; 66; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0( 67; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0( 68; NO-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %for.body 69entry: 70 br label %for.body 71 72for.cond.cleanup: 73 ret void 74 75for.body: 76 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 77 %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09 78 %0 = load i32, ptr %arrayidx, align 4 79 %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09 80 %1 = load i32, ptr %arrayidx1, align 4 81 %add = add nsw i32 %1, %0 82 %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09 83 store i32 %add, ptr %arrayidx2, align 4 84 %add3 = add nuw nsw i32 %i.09, 1 85 %exitcond = icmp eq i32 %add3, 431 86 br i1 %exitcond, label %for.cond.cleanup, label %for.body 87} 88 89define void @mixed_types(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture %D, ptr noalias nocapture readonly %E, ptr noalias nocapture readonly %F) #0 { 90; CHECK-LABEL: mixed_types( 91; PREFER-FOLDING: vector.body: 92; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0 93; PREFER-FOLDING: call <4 x i16> @llvm.masked.load.v4i16.p0 94; PREFER-FOLDING: call void @llvm.masked.store.v4i16.p0 95; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0 96; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0 97; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0 98; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 99entry: 100 br label %for.body 101 102for.cond.cleanup: 103 ret void 104 105for.body: 106 %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ] 107 %arrayidx = getelementptr inbounds i16, ptr %B, i32 %i.018 108 %0 = load i16, ptr %arrayidx, align 2 109 %arrayidx1 = getelementptr inbounds i16, ptr %C, i32 %i.018 110 %1 = load i16, ptr %arrayidx1, align 2 111 %add = add i16 %1, %0 112 %arrayidx4 = getelementptr inbounds i16, ptr %A, i32 %i.018 113 store i16 %add, ptr %arrayidx4, align 2 114 %arrayidx5 = getelementptr inbounds i32, ptr %E, i32 %i.018 115 %2 = load i32, ptr %arrayidx5, align 4 116 %arrayidx6 = getelementptr inbounds i32, ptr %F, i32 %i.018 117 %3 = load i32, ptr %arrayidx6, align 4 118 %add7 = add nsw i32 %3, %2 119 %arrayidx8 = getelementptr inbounds i32, ptr %D, i32 %i.018 120 store i32 %add7, ptr %arrayidx8, align 4 121 %add9 = add nuw nsw i32 %i.018, 1 122 %exitcond = icmp eq i32 %add9, 431 123 br i1 %exitcond, label %for.cond.cleanup, label %for.body 124} 125 126define void @zero_extending_load_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 127; CHECK-LABEL: zero_extending_load_allowed( 128; PREFER-FOLDING: vector.body: 129; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0 130; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0 131; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0 132; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 133entry: 134 br label %for.body 135 136for.cond.cleanup: 137 ret void 138 139for.body: 140 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 141 %arrayidx = getelementptr inbounds i8, ptr %B, i32 %i.09 142 %0 = load i8, ptr %arrayidx, align 1 143 %conv = zext i8 %0 to i32 144 %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09 145 %1 = load i32, ptr %arrayidx1, align 4 146 %add = add nsw i32 %1, %conv 147 %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09 148 store i32 %add, ptr %arrayidx2, align 4 149 %add3 = add nuw nsw i32 %i.09, 1 150 %exitcond = icmp eq i32 %add3, 431 151 br i1 %exitcond, label %for.cond.cleanup, label %for.body 152} 153 154define void @sign_extending_load_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 155; CHECK-LABEL: sign_extending_load_allowed( 156; PREFER-FOLDING: vector.body: 157; PREFER-FOLDING: call <4 x i8> @llvm.masked.load.v4i8.p0 158; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0 159; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0 160; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 161entry: 162 br label %for.body 163 164for.cond.cleanup: 165 ret void 166 167for.body: 168 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 169 %arrayidx = getelementptr inbounds i8, ptr %B, i32 %i.09 170 %0 = load i8, ptr %arrayidx, align 1 171 %conv = sext i8 %0 to i32 172 %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09 173 %1 = load i32, ptr %arrayidx1, align 4 174 %add = add nsw i32 %1, %conv 175 %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09 176 store i32 %add, ptr %arrayidx2, align 4 177 %add3 = add nuw nsw i32 %i.09, 1 178 %exitcond = icmp eq i32 %add3, 431 179 br i1 %exitcond, label %for.cond.cleanup, label %for.body 180} 181 182define void @narrowing_store_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 183; CHECK-LABEL: narrowing_store_allowed( 184; PREFER-FOLDING: call void @llvm.masked.store.v4i8.p0 185; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 186entry: 187 br label %for.body 188 189for.cond.cleanup: 190 ret void 191 192for.body: 193 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 194 %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09 195 %0 = load i32, ptr %arrayidx, align 4 196 %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09 197 %1 = load i32, ptr %arrayidx1, align 4 198 %add = add nsw i32 %1, %0 199 %conv = trunc i32 %add to i8 200 %arrayidx2 = getelementptr inbounds i8, ptr %A, i32 %i.09 201 store i8 %conv, ptr %arrayidx2, align 1 202 %add3 = add nuw nsw i32 %i.09, 1 203 %exitcond = icmp eq i32 %add3, 431 204 br i1 %exitcond, label %for.cond.cleanup, label %for.body 205} 206 207@tab = common global [32 x i8] zeroinitializer, align 1 208 209define i32 @icmp_not_allowed() #0 { 210; CHECK-LABEL: icmp_not_allowed( 211; PREFER-FOLDING: vector.body: 212; PREFER-FOLDING-NOT: llvm.masked.load 213; PREFER-FOLDING-NOT: llvm.masked.store 214; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 215entry: 216 br label %for.body 217 218for.body: 219 %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] 220 %arrayidx = getelementptr inbounds [32 x i8], ptr @tab, i32 0, i32 %i.08 221 %0 = load i8, ptr %arrayidx, align 1 222 %cmp1 = icmp eq i8 %0, 0 223 %. = select i1 %cmp1, i8 2, i8 1 224 store i8 %., ptr %arrayidx, align 1 225 %inc = add nsw i32 %i.08, 1 226 %exitcond = icmp slt i32 %inc, 1000 227 br i1 %exitcond, label %for.body, label %for.end 228 229for.end: 230 ret i32 0 231} 232 233@ftab = common global [32 x float] zeroinitializer, align 1 234 235define float @fcmp_not_allowed() #0 { 236; CHECK-LABEL: fcmp_not_allowed( 237; PREFER-FOLDING: vector.body: 238; PREFER-FOLDING-NOT: llvm.masked.load 239; PREFER-FOLDING-NOT: llvm.masked.store 240; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 241entry: 242 br label %for.body 243 244for.body: 245 %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ] 246 %arrayidx = getelementptr inbounds [32 x float], ptr @ftab, i32 0, i32 %i.08 247 %0 = load float, ptr %arrayidx, align 4 248 %cmp1 = fcmp oeq float %0, 0.000000e+00 249 %. = select i1 %cmp1, float 2.000000e+00, float 1.000000e+00 250 store float %., ptr %arrayidx, align 4 251 %inc = add nsw i32 %i.08, 1 252 %exitcond = icmp slt i32 %inc, 999 253 br i1 %exitcond, label %for.body, label %for.end 254 255for.end: 256 ret float 0.000000e+00 257} 258 259define void @pragma_vect_predicate_disable(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 260; CHECK-LABEL: pragma_vect_predicate_disable( 261; PREFER-FOLDING: vector.body: 262; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0 263; PREFER-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0 264; PREFER-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0 265; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 266entry: 267 br label %for.body 268 269for.cond.cleanup: 270 ret void 271 272for.body: 273 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 274 %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09 275 %0 = load i32, ptr %arrayidx, align 4 276 %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09 277 %1 = load i32, ptr %arrayidx1, align 4 278 %add = add nsw i32 %1, %0 279 %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09 280 store i32 %add, ptr %arrayidx2, align 4 281 %add3 = add nuw nsw i32 %i.09, 1 282 %exitcond = icmp eq i32 %add3, 431 283 br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !7 284} 285 286define void @stride_4(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 287; CHECK-LABEL: stride_4( 288; PREFER-FOLDING: vector.body: 289; PREFER-FOLDING-NOT: llvm.masked.load 290; PREFER-FOLDING-NOT: llvm.masked.store 291; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 292entry: 293 br label %for.body 294 295for.cond.cleanup: 296 ret void 297 298for.body: 299 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 300 %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09 301 %0 = load i32, ptr %arrayidx, align 4 302 %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09 303 %1 = load i32, ptr %arrayidx1, align 4 304 %add = add nsw i32 %1, %0 305 %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09 306 store i32 %add, ptr %arrayidx2, align 4 307 %add3 = add nuw nsw i32 %i.09, 4 308 %cmp = icmp ult i32 %add3, 731 309 br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !5 310} 311 312define dso_local void @half(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 313; CHECK-LABEL: half( 314; PREFER-FOLDING: vector.body: 315; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0 316; PREFER-FOLDING: call <8 x half> @llvm.masked.load.v8f16.p0 317; PREFER-FOLDING: call void @llvm.masked.store.v8f16.p0 318; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 319entry: 320 br label %for.body 321 322for.cond.cleanup: 323 ret void 324 325for.body: 326 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 327 %arrayidx = getelementptr inbounds half, ptr %B, i32 %i.09 328 %0 = load half, ptr %arrayidx, align 2 329 %arrayidx1 = getelementptr inbounds half, ptr %C, i32 %i.09 330 %1 = load half, ptr %arrayidx1, align 2 331 %add = fadd fast half %1, %0 332 %arrayidx2 = getelementptr inbounds half, ptr %A, i32 %i.09 333 store half %add, ptr %arrayidx2, align 2 334 %add3 = add nuw nsw i32 %i.09, 1 335 %exitcond = icmp eq i32 %add3, 431 336 br i1 %exitcond, label %for.cond.cleanup, label %for.body 337} 338 339define void @float(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 340; CHECK-LABEL: float( 341; PREFER-FOLDING: vector.body: 342; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 343; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0 344; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 431) 345; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0({{.*}}%active.lane.mask 346; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0({{.*}}%active.lane.mask 347; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0({{.*}}%active.lane.mask 348; PREFER-FOLDING: %index.next = add nuw i32 %index, 4 349; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 350entry: 351 br label %for.body 352 353for.cond.cleanup: 354 ret void 355 356for.body: 357 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 358 %arrayidx = getelementptr inbounds float, ptr %B, i32 %i.09 359 %0 = load float, ptr %arrayidx, align 4 360 %arrayidx1 = getelementptr inbounds float, ptr %C, i32 %i.09 361 %1 = load float, ptr %arrayidx1, align 4 362 %add = fadd fast float %1, %0 363 %arrayidx2 = getelementptr inbounds float, ptr %A, i32 %i.09 364 store float %add, ptr %arrayidx2, align 4 365 %add3 = add nuw nsw i32 %i.09, 1 366 %exitcond = icmp eq i32 %add3, 431 367 br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10 368} 369 370define void @fpext_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 371; CHECK-LABEL: fpext_allowed( 372; PREFER-FOLDING: vector.body: 373; PREFER-FOLDING-NOT: llvm.masked.load 374; PREFER-FOLDING-NOT: llvm.masked.store 375; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 376entry: 377 br label %for.body 378 379for.cond.cleanup: 380 ret void 381 382for.body: 383 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 384 %arrayidx = getelementptr inbounds half, ptr %B, i32 %i.09 385 %0 = load half, ptr %arrayidx, align 2 386 %conv = fpext half %0 to float 387 %arrayidx1 = getelementptr inbounds float, ptr %C, i32 %i.09 388 %1 = load float, ptr %arrayidx1, align 4 389 %add = fadd fast float %1, %conv 390 %arrayidx2 = getelementptr inbounds float, ptr %A, i32 %i.09 391 store float %add, ptr %arrayidx2, align 4 392 %add3 = add nuw nsw i32 %i.09, 1 393 %exitcond = icmp eq i32 %add3, 431 394 br i1 %exitcond, label %for.cond.cleanup, label %for.body 395} 396 397define void @fptrunc_allowed(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C) #0 { 398; CHECK-LABEL: fptrunc_allowed( 399; PREFER-FOLDING: vector.body: 400; PREFER-FOLDING-NOT: llvm.masked.load 401; PREFER-FOLDING-NOT: llvm.masked.store 402; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body 403entry: 404 br label %for.body 405 406for.cond.cleanup: 407 ret void 408 409for.body: 410 %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ] 411 %arrayidx = getelementptr inbounds float, ptr %B, i32 %i.09 412 %0 = load float, ptr %arrayidx, align 4 413 %arrayidx1 = getelementptr inbounds float, ptr %C, i32 %i.09 414 %1 = load float, ptr %arrayidx1, align 4 415 %add = fadd fast float %1, %0 416 %conv = fptrunc float %add to half 417 %arrayidx2 = getelementptr inbounds half, ptr %A, i32 %i.09 418 store half %conv, ptr %arrayidx2, align 2 419 %add3 = add nuw nsw i32 %i.09, 1 420 %exitcond = icmp eq i32 %add3, 431 421 br i1 %exitcond, label %for.cond.cleanup, label %for.body 422} 423 424attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" } 425 426!5 = distinct !{!5, !6} 427!6 = !{!"llvm.loop.vectorize.enable", i1 true} 428 429!7 = distinct !{!7, !8} 430!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} 431 432!10 = distinct !{!10, !11} 433!11 = !{!"llvm.loop.vectorize.width", i32 4} 434