; RUN: opt < %s -passes="loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; This test checks that we can vectorize a loop with a reduction variable
; stored to an invariant address.
;
; int sum = 0;
; for(i=0..N) {
;   sum += src[i];
;   dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: store i32 [[TMP6]], ptr [[GEP_DST:%.*]], align 4
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
define void @reduc_store(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  store i32 0, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  store i32 %add, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Same as above, but with floating-point numbers. Note that the fadd must be
; 'fast' (reassociation is required) for the FP reduction to be vectorized.
;
; float sum = 0;
; for(i=0..N) {
;   sum += src[i];
;   dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_fadd_fast
; CHECK: vector.body:
; CHECK: phi <4 x float>
; CHECK: load <4 x float>
; CHECK: fadd fast <4 x float>
; CHECK-NOT: store float %{{[0-9]+}}, ptr %gep.dst
; CHECK: middle.block:
; CHECK-NEXT: [[TMP:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32
; CHECK-NEXT: store float %{{[0-9]+}}, ptr %gep.dst
define void @reduc_store_fadd_fast(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds float, ptr %dst, i64 42
  store float 0.000000e+00, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds float, ptr %src, i64 %iv
  %0 = load float, ptr %gep.src, align 4
  %add = fadd fast float %sum, %0
  store float %add, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Check that if we have a read from an invariant address, we do not vectorize.
;
; int sum = 0;
; for(i=0..N) {
;   sum += src[i];
;   dst.2[i] = dst[42];
;   dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_load
; CHECK-NOT: vector.body
define void @reduc_store_load(ptr %dst, ptr readonly %src, ptr noalias %dst.2) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  store i32 0, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  %lv = load i32, ptr %gep.dst
  %gep.dst.2 = getelementptr inbounds i32, ptr %dst.2, i64 %iv
  store i32 %lv, ptr %gep.dst.2, align 4
  store i32 %add, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Check that if we have a read from an invariant address, we do not vectorize,
; even when runtime checks would otherwise allow vectorization. The test below
; is a variant of @reduc_store_load with a non-constant dependence distance,
; which would normally be vectorized with runtime checks.
;
; CHECK-LABEL: @reduc_store_load_with_non_constant_distance_dependence
; CHECK-NOT: vector.body:
define void @reduc_store_load_with_non_constant_distance_dependence(ptr %dst, ptr noalias %dst.2, i64 %off) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  %dst.2.off = getelementptr inbounds i32, ptr %dst.2, i64 %off
  store i32 0, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %dst.2, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %iv.off = mul i64 %iv, 2
  %add = add nsw i32 %sum, %0
  %lv = load i32, ptr %gep.dst
  store i32 %add, ptr %gep.dst, align 4
  %gep.src.2 = getelementptr inbounds i32, ptr %dst.2.off, i64 %iv
  store i32 %lv, ptr %gep.src.2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; The final value is not guaranteed to be stored to an invariant address.
; We don't vectorize in that case.
;
; int sum = 0;
; for(i=0..N) {
;   int diff = y[i] - x[i];
;   if (diff > 0) {
;     sum += diff;
;     *t = sum;
;   }
; }
; CHECK-LABEL: @reduc_cond_store
; CHECK-NOT: vector.body
define void @reduc_cond_store(ptr %t, ptr readonly %x, ptr readonly %y) {
entry:
  store i32 0, ptr %t, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.2, %if.end ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ]
  %gep.y = getelementptr inbounds i32, ptr %y, i64 %iv
  %0 = load i32, ptr %gep.y, align 4
  %gep.x = getelementptr inbounds i32, ptr %x, i64 %iv
  %1 = load i32, ptr %gep.x, align 4
  %diff = sub nsw i32 %0, %1
  %cmp2 = icmp sgt i32 %diff, 0
  br i1 %cmp2, label %if.then, label %if.end

if.then:
  %sum.1 = add nsw i32 %diff, %sum
  store i32 %sum.1, ptr %t, align 4
  br label %if.end

if.end:
  %sum.2 = phi i32 [ %sum.1, %if.then ], [ %0, %for.body ]
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Check that we can vectorize code with several stores to an invariant address,
; provided the final reduction value is the last value stored.
;
; int sum = 0;
; for(int i=0; i < 1000; i+=2) {
;   sum += src[i];
;   dst[42] = sum;
;   sum += src[i+1];
;   dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_inside_unrolled
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 2, i64 4, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP17:%.*]] = or disjoint <4 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4
; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4
; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP23]], align 4
; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4
; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1
; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP28]], i32 2
; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 3
; CHECK-NEXT: [[TMP34]] = add <4 x i32> [[TMP33]], [[TMP16]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP34]])
; CHECK-NEXT: store i32 [[TMP36]], ptr [[GEP_DST:%.*]], align 4
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
define void @reduc_store_inside_unrolled(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %sum.1 = add nsw i32 %0, %sum
  store i32 %sum.1, ptr %gep.dst, align 4
  %1 = or disjoint i64 %iv, 1
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i64 %1
  %2 = load i32, ptr %gep.src.1, align 4
  %sum.2 = add nsw i32 %2, %sum.1
  store i32 %sum.2, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 2
  %cmp = icmp slt i64 %iv.next, 1000
  br i1 %cmp, label %for.body, label %exit

exit:
  ret void
}

; Check that we cannot vectorize code if the stored value is not the final
; reduction value.
;
; int sum = 0;
; for(int i=0; i < 1000; i++) {
;   sum += src[i];
;   dst[42] = sum + 1;
; }
; CHECK-LABEL: @reduc_store_not_final_value
; CHECK-NOT: vector.body:
define void @reduc_store_not_final_value(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  store i32 0, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  %sum_plus_one = add i32 %add, 1
  store i32 %sum_plus_one, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; We cannot vectorize if two (or more) invariant stores exist in a loop.
;
; int sum = 0;
; for(int i=0; i < 1000; i+=2) {
;   sum += src[i];
;   dst[42] = sum;
;   sum += src[i+1];
;   other_dst[42] = sum;
; }
; CHECK-LABEL: @reduc_double_invariant_store
; CHECK-NOT: vector.body:
define void @reduc_double_invariant_store(ptr %dst, ptr %other_dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  %gep.other_dst = getelementptr inbounds i32, ptr %other_dst, i64 42
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %sum.1 = add nsw i32 %0, %sum
  store i32 %sum.1, ptr %gep.dst, align 4
  %1 = or disjoint i64 %iv, 1
  %arrayidx4 = getelementptr inbounds i32, ptr %src, i64 %1
  %2 = load i32, ptr %arrayidx4, align 4
  %sum.2 = add nsw i32 %2, %sum.1
  store i32 %sum.2, ptr %gep.other_dst, align 4
  %iv.next = add nuw nsw i64 %iv, 2
  %cmp = icmp slt i64 %iv.next, 1000
  br i1 %cmp, label %for.body, label %exit

exit:
  ret void
}

; int sum = 0;
; for(int i=0; i < 1000; i+=2) {
;   sum += src[i];
;   if (src[i+1] > 0)
;     dst[42] = sum;
;   sum += src[i+1];
;   dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_middle_store_predicated
; CHECK: vector.body:
; CHECK-NOT: store i32 %{{[0-9]+}}, ptr %gep.dst
; CHECK: middle.block:
; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32
; CHECK-NEXT: store i32 [[TMP]], ptr %gep.dst
; CHECK: ret void
define void @reduc_store_middle_store_predicated(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:                                         ; preds = %latch, %entry
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %sum = phi i32 [ 0, %entry ], [ %sum.2, %latch ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %sum.1 = add nsw i32 %0, %sum
  %cmp = icmp sgt i32 %0, 0
  br i1 %cmp, label %predicated, label %latch

predicated:                                       ; preds = %for.body
  store i32 %sum.1, ptr %gep.dst, align 4
  br label %latch

latch:                                            ; preds = %predicated, %for.body
  %1 = or disjoint i64 %iv, 1
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i64 %1
  %2 = load i32, ptr %gep.src.1, align 4
  %sum.2 = add nsw i32 %2, %sum.1
  store i32 %sum.2, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 2
  %cmp.1 = icmp slt i64 %iv.next, 1000
  br i1 %cmp.1, label %for.body, label %exit

exit:                                             ; preds = %latch
  ret void
}

; int sum = 0;
; for(int i=0; i < 1000; i+=2) {
;   sum += src[i];
;   dst[42] = sum;
;   sum += src[i+1];
;   if (src[i+1] > 0)
;     dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_final_store_predicated
; CHECK-NOT: vector.body:
define void @reduc_store_final_store_predicated(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:                                         ; preds = %latch, %entry
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %sum = phi i32 [ 0, %entry ], [ %sum.1, %latch ]
  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %sum.1 = add nsw i32 %0, %sum
  store i32 %sum.1, ptr %gep.dst, align 4
  %1 = or disjoint i64 %iv, 1
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i64 %1
  %2 = load i32, ptr %gep.src.1, align 4
  %sum.2 = add nsw i32 %2, %sum.1
  %cmp1 = icmp sgt i32 %2, 0
  br i1 %cmp1, label %predicated, label %latch

predicated:                                       ; preds = %for.body
  store i32 %sum.2, ptr %gep.dst, align 4
  br label %latch

latch:                                            ; preds = %predicated, %for.body
  %iv.next = add nuw nsw i64 %iv, 2
  %cmp = icmp slt i64 %iv.next, 1000
  br i1 %cmp, label %for.body, label %exit

exit:                                             ; preds = %latch
  ret void
}

; The final reduction value is overwritten inside the loop.
;
; int sum = 0;
; for(int i=0; i < 1000; i++) {
;   sum += src[i];
;   dst[42] = sum;
;   dst[42] = 0;
; }
; CHECK-LABEL: @reduc_store_final_store_overwritten
; CHECK-NOT: vector.body:
define void @reduc_store_final_store_overwritten(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  store i32 %add, ptr %gep.dst, align 4
  store i32 0, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; A final value used outside of the loop does not prevent vectorization.
;
; int sum = 0;
; for(int i=0; i < 1000; i++) {
;   sum += src[i];
;   dst[42] = sum;
; }
; dst[43] = sum;
; CHECK-LABEL: @reduc_store_inoutside
; CHECK: vector.body:
; CHECK-NOT: store i32 %{{[0-9]+}}, ptr %gep.dst
; CHECK: middle.block:
; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32
; CHECK-NEXT: store i32 [[TMP]], ptr %gep.dst
; CHECK: exit:
; CHECK: [[PHI:%.*]] = phi i32 [ [[TMP1:%.*]], %for.body ], [ [[TMP2:%.*]], %middle.block ]
; CHECK: [[ADDR:%.*]] = getelementptr inbounds i32, ptr %dst, i64 43
; CHECK: store i32 [[PHI]], ptr [[ADDR]]
; CHECK: ret void
define void @reduc_store_inoutside(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %sum.1 = add nsw i32 %0, %sum
  store i32 %sum.1, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  %sum.lcssa = phi i32 [ %sum.1, %for.body ]
  %gep.dst.1 = getelementptr inbounds i32, ptr %dst, i64 43
  store i32 %sum.lcssa, ptr %gep.dst.1, align 4
  ret void
}

; Test for PR55540.
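; The first store to %dst becomes dead once the final reduction store is sunk
; out of the loop, so the add feeding it (%add.1) turns into a dead recipe.
; Its poison-generating nuw flag must be dropped; note that the vectorized add
; in the checks below carries no wrap flags.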
define void @test_drop_poison_generating_dead_recipe(ptr %dst) {
; CHECK-LABEL: @test_drop_poison_generating_dead_recipe(
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ [[TMP0:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP0]] = add <4 x i64> [[VEC_PHI]], splat (i64 -31364)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 360
; CHECK-NEXT: br i1 [[TMP1]], label %middle.block, label %vector.body
; CHECK: middle.block:
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP0]])
; CHECK-NEXT: store i64 [[TMP2]], ptr [[DST:%.*]], align 8
; CHECK-NEXT: br i1 false, label %exit, label %scalar.ph
; CHECK: scalar.ph:
;
entry:
  br label %body

body:
  %red = phi i64 [ 0, %entry ], [ %red.next, %body ]
  %iv = phi i32 [ 2, %entry ], [ %iv.next, %body ]
  %add.1 = add nuw i64 %red, -23523
  store i64 %add.1, ptr %dst, align 8
  %red.next = add nuw i64 %red, -31364
  store i64 %red.next, ptr %dst, align 8
  %iv.next = add nuw nsw i32 %iv, 1
  %ec = icmp ugt i32 %iv, 363
  br i1 %ec, label %exit, label %body

exit:
  ret void
}

define void @reduc_store_invariant_addr_not_hoisted(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: @reduc_store_invariant_addr_not_hoisted
; CHECK-NOT: vector.body:
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  store i32 %add, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Make sure we can vectorize a loop with a non-reduction value stored to an
; invariant address that is calculated inside the loop.
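;
; Roughly equivalent C for the loop below (illustrative only):
;
; int sum = 0;
; for(i=0..N) {
;   sum += src[i];
;   dst[42] = 0;
; }
; return sum;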
define i32 @non_reduc_store_invariant_addr_not_hoisted(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: @non_reduc_store_invariant_addr_not_hoisted
; CHECK: vector.body:
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  store i32 0, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:                                             ; preds = %for.body
  %add.lcssa = phi i32 [ %add, %for.body ]
  ret i32 %add.lcssa
}

; Make sure that if there are several reductions in the loop, the order of the
; invariant stores sunk outside of the loop is preserved.
; See https://github.com/llvm/llvm-project/issues/64047
define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
;
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %mul = phi i32 [ 1, %entry ], [ %mul.next, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %sum.next = add nsw i32 %sum, %0
  store i32 %sum.next, ptr %dst, align 4
  %mul.next = mul nsw i32 %mul, %0
  store i32 %mul.next, ptr %dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst, align 4
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst, align 4
;
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %mul = phi i32 [ 1, %entry ], [ %mul.next, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %mul.next = mul nsw i32 %mul, %0
  store i32 %mul.next, ptr %dst, align 4
  %sum.next = add nsw i32 %sum, %0
  store i32 %sum.next, ptr %dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Same as above, but the stores are to two different pointers that may alias.
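;
; Roughly equivalent C for the loop below (illustrative only):
;
; int sum = 0;
; int mul = 1;
; for(int i=0; i < 1000; i++) {
;   sum += src[i];
;   *dst1 = sum;
;   mul *= src[i];
;   *dst2 = mul;
; }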
define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst1, align 4
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst2, align 4
;
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %mul = phi i32 [ 1, %entry ], [ %mul.next, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %sum.next = add nsw i32 %sum, %0
  store i32 %sum.next, ptr %dst1, align 4
  %mul.next = mul nsw i32 %mul, %0
  store i32 %mul.next, ptr %dst2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
; CHECK: middle.block:
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT: store i32 [[TMP7]], ptr %dst1, align 4
; CHECK-NEXT: store i32 [[TMP6]], ptr %dst2, align 4
;
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %mul = phi i32 [ 1, %entry ], [ %mul.next, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %mul.next = mul nsw i32 %mul, %0
  store i32 %mul.next, ptr %dst1, align 4
  %sum.next = add nsw i32 %sum, %0
  store i32 %sum.next, ptr %dst2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}