; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -force-vector-width=4 -passes=loop-vectorize -mcpu=haswell < %s | FileCheck %s

;; Basic functional tests for uniform loads and stores. These are cases kept
;; deliberately simple (and unoptimized by other passes) to feed the vectorizer
;; with particular input IR.

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

;; Load from a loop-invariant address; the value loaded on the final iteration
;; is returned. The expected vector body contains a single scalar load per
;; vector iteration and no vector instructions.
define i32 @uniform_load(ptr align(4) %addr) {
; CHECK-LABEL: @uniform_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ADDR]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK:       loopexit:
; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[LOAD_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %load = load i32, ptr %addr
  %iv.next = add nuw nsw i64 %iv, 1
  ; The exit test uses %iv rather than %iv.next, so the body runs 4097 times:
  ; the expected vector loop covers 4096 of them and the rest run in the
  ; scalar loop.
  %exitcond = icmp eq i64 %iv, 4096
  br i1 %exitcond, label %loopexit, label %for.body

loopexit:
  ret i32 %load
}

;; Loop-invariant load feeding an integer add reduction. The expected vector
;; body performs one scalar load, broadcasts it, and adds the splat into four
;; <4 x i32> partial sums that are combined in the middle block.
define i32 @uniform_load2(ptr align(4) %addr) {
; CHECK-LABEL: @uniform_load2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]]
; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]]
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ADDR]], align 4
; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK:       loopexit:
; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %accum = phi i32 [%accum.next, %for.body], [0, %entry]
  %load = load i32, ptr %addr
  %accum.next = add i32 %accum, %load
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv, 4096
  br i1 %exitcond, label %loopexit, label %for.body

loopexit:
  ret i32 %accum.next
}

;; The load address is computed inside the loop, but only from loop-invariant
;; operands (udiv + getelementptr). The expected output computes the address
;; once in vector.ph and performs one scalar load per vector iteration.
define i32 @uniform_address(ptr align(4) %addr, i32 %byte_offset) {
; CHECK-LABEL: @uniform_address(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i32 [[TMP0]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[OFFSET:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[OFFSET]]
; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK:       loopexit:
; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[LOAD_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %offset = udiv i32 %byte_offset, 4
  %gep = getelementptr i32, ptr %addr, i32 %offset
  %load = load i32, ptr %gep
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv, 4096
  br i1 %exitcond, label %loopexit, label %for.body

loopexit:
  ret i32 %load
}



;; Store of a constant to a loop-invariant address. The expected vector body
;; contains a single scalar store per vector iteration.
define void @uniform_store_uniform_value(ptr align(4) %addr) {
; CHECK-LABEL: @uniform_store_uniform_value(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    store i32 0, ptr [[ADDR:%.*]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    store i32 0, ptr [[ADDR]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK:       loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  store i32 0, ptr %addr
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv, 4096
  br i1 %exitcond, label %loopexit, label %for.body

loopexit:
  ret void
}

;; Store of a varying value (the truncated induction variable) to a
;; loop-invariant address. The expected vector body emits a single scalar
;; store of the value for the last of the 16 iterations it covers (index + 15).
define void @uniform_store_varying_value(ptr align(4) %addr) {
; CHECK-LABEL: @uniform_store_varying_value(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 12
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP0]], 13
; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP0]], 14
; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP0]], 15
; CHECK-NEXT:    store i32 [[TMP7]], ptr [[ADDR:%.*]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[IV_I32:%.*]] = trunc i64 [[IV]] to i32
; CHECK-NEXT:    store i32 [[IV_I32]], ptr [[ADDR]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK:       loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %iv.i32 = trunc i64 %iv to i32
  store i32 %iv.i32, ptr %addr
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv, 4096
  br i1 %exitcond, label %loopexit, label %for.body

loopexit:
  ret void
}

;; Read-modify-write of a loop-invariant address (load, add 1, store back).
;; The expected output is the original scalar loop, i.e. this loop is not
;; vectorized.
define void @uniform_rw(ptr align(4) %addr) {
; CHECK-LABEL: @uniform_rw(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LOAD]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ADDR]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]]
; CHECK:       loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %load = load i32, ptr %addr
  %inc = add i32 %load, 1
  store i32 %inc, ptr %addr
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv, 4096
  br i1 %exitcond, label %loopexit, label %for.body

loopexit:
  ret void
}

;; Copy from one loop-invariant address to another. %A and %B carry no noalias
;; attribute, and the expected output guards the vector loop with a runtime
;; overlap check (vector.memcheck) and tags the vector-body load and store with
;; alias-scope metadata.
define void @uniform_copy(ptr %A, ptr %B) {
; CHECK-LABEL: @uniform_copy(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 4
; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[UGLYGEP1]]
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META12:![0-9]+]]
; CHECK-NEXT:    store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META12]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[A]], align 4
; CHECK-NEXT:    store i32 [[LOAD]], ptr [[B]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK:       loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %load = load i32, ptr %A
  store i32 %load, ptr %B
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv, 4096
  br i1 %exitcond, label %loopexit, label %for.body

loopexit:
  ret void
}


;; External function with no definition in this file; @test_count_bits calls
;; it on its local alloca.
declare void @init(ptr)

;; Count the number of bits set in a bit vector -- key point of relevance is
;; that the byte load is uniform across 8 iterations at a time.
;; TODO: At the moment, this is vectorized with VF=4 and UF=4. The load is
;; considered uniform across VF=4, but should be considered uniform across
;; VF=8/VF=4,UF=2.
define i32 @test_count_bits(ptr %test_base) {
; CHECK-LABEL: @test_count_bits(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
; CHECK-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
; CHECK-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP0]], 8
; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP1]], 8
; CHECK-NEXT:    [[TMP6:%.*]] = udiv i64 [[TMP2]], 8
; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP3]], 8
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP7]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i8> poison, i8 [[TMP13]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT7]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[TMP14]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i8> poison, i8 [[TMP15]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT11]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP16:%.*]] = urem <4 x i64> [[VEC_IND]], splat (i64 8)
; CHECK-NEXT:    [[TMP17:%.*]] = urem <4 x i64> [[STEP_ADD]], splat (i64 8)
; CHECK-NEXT:    [[TMP18:%.*]] = urem <4 x i64> [[STEP_ADD_2]], splat (i64 8)
; CHECK-NEXT:    [[TMP19:%.*]] = urem <4 x i64> [[STEP_ADD_3]], splat (i64 8)
; CHECK-NEXT:    [[TMP20:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i8>
; CHECK-NEXT:    [[TMP21:%.*]] = trunc <4 x i64> [[TMP17]] to <4 x i8>
; CHECK-NEXT:    [[TMP22:%.*]] = trunc <4 x i64> [[TMP18]] to <4 x i8>
; CHECK-NEXT:    [[TMP23:%.*]] = trunc <4 x i64> [[TMP19]] to <4 x i8>
; CHECK-NEXT:    [[TMP24:%.*]] = lshr <4 x i8> [[BROADCAST_SPLAT]], [[TMP20]]
; CHECK-NEXT:    [[TMP25:%.*]] = lshr <4 x i8> [[BROADCAST_SPLAT8]], [[TMP21]]
; CHECK-NEXT:    [[TMP26:%.*]] = lshr <4 x i8> [[BROADCAST_SPLAT10]], [[TMP22]]
; CHECK-NEXT:    [[TMP27:%.*]] = lshr <4 x i8> [[BROADCAST_SPLAT12]], [[TMP23]]
; CHECK-NEXT:    [[TMP28:%.*]] = and <4 x i8> [[TMP24]], splat (i8 1)
; CHECK-NEXT:    [[TMP29:%.*]] = and <4 x i8> [[TMP25]], splat (i8 1)
; CHECK-NEXT:    [[TMP30:%.*]] = and <4 x i8> [[TMP26]], splat (i8 1)
; CHECK-NEXT:    [[TMP31:%.*]] = and <4 x i8> [[TMP27]], splat (i8 1)
; CHECK-NEXT:    [[TMP32:%.*]] = zext <4 x i8> [[TMP28]] to <4 x i32>
; CHECK-NEXT:    [[TMP33:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32>
; CHECK-NEXT:    [[TMP34:%.*]] = zext <4 x i8> [[TMP30]] to <4 x i32>
; CHECK-NEXT:    [[TMP35:%.*]] = zext <4 x i8> [[TMP31]] to <4 x i32>
; CHECK-NEXT:    [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[TMP32]]
; CHECK-NEXT:    [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[TMP33]]
; CHECK-NEXT:    [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[TMP34]]
; CHECK-NEXT:    [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[TMP35]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4)
; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]]
; CHECK-NEXT:    [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]]
; CHECK-NEXT:    [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX13]]
; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX14]])
; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP41]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[LOOP:%.*]]
; CHECK:       loop:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP]] ]
; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
; CHECK-NEXT:    [[BYTE:%.*]] = udiv i64 [[IV]], 8
; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[BYTE]]
; CHECK-NEXT:    [[EARLYCND:%.*]] = load i8, ptr [[TEST_ADDR]], align 1
; CHECK-NEXT:    [[BIT:%.*]] = urem i64 [[IV]], 8
; CHECK-NEXT:    [[BIT_TRUNC:%.*]] = trunc i64 [[BIT]] to i8
; CHECK-NEXT:    [[MASK:%.*]] = lshr i8 [[EARLYCND]], [[BIT_TRUNC]]
; CHECK-NEXT:    [[TEST:%.*]] = and i8 [[MASK]], 1
; CHECK-NEXT:    [[VAL:%.*]] = zext i8 [[TEST]] to i32
; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL]]
; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK:       loop_exit:
; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
  ; The alloca is only handed to @init; the loop itself reads from %test_base.
  %alloca = alloca [4096 x i32]
  call void @init(ptr %alloca)
  br label %loop
loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %accum = phi i32 [ 0, %entry ], [ %accum.next, %loop ]
  %iv.next = add i64 %iv, 1
  ; Byte index: the same for each aligned group of 8 consecutive %iv values,
  ; so the load below repeats its address for 8 iterations at a time.
  %byte = udiv i64 %iv, 8
  %test_addr = getelementptr inbounds i8, ptr %test_base, i64 %byte
  %earlycnd = load i8, ptr %test_addr
  ; Bit position within that byte; shifted down and masked to a single 0/1.
  %bit = urem i64 %iv, 8
  %bit.trunc = trunc i64 %bit to i8
  %mask = lshr i8 %earlycnd, %bit.trunc
  %test = and i8 %mask, 1
  %val = zext i8 %test to i32
  %accum.next = add i32 %accum, %val
  ; First true when %iv is 4095, i.e. the body runs exactly 4096 times.
  %exit = icmp ugt i64 %iv, 4094
  br i1 %exit, label %loop_exit, label %loop

loop_exit:
  ret i32 %accum.next
}

;; Same as uniform_load, but show that the uniformity analysis can handle
;; pointer operands which are not local to the function.
@GAddr = external global i32 align 4
define i32 @uniform_load_global() {
; CHECK-LABEL: @uniform_load_global(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @GAddr, align 4
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]]
; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]]
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr @GAddr, align 4
; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK:       loopexit:
; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %accum = phi i32 [%accum.next, %for.body], [0, %entry]
  %load = load i32, ptr @GAddr
  %accum.next = add i32 %accum, %load
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv, 4096
  br i1 %exitcond, label %loopexit, label %for.body

loopexit:
  ret i32 %accum.next
}

;; Same as the global case, but using a constexpr (a constant getelementptr
;; off @GAddr) as the load address.
define i32 @uniform_load_constexpr() {
; CHECK-LABEL: @uniform_load_constexpr(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]]
; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]]
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4
; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK:       loopexit:
; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %accum = phi i32 [ %accum.next, %for.body ], [ 0, %entry ]
  %load = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4
  %accum.next = add i32 %accum, %load
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv, 4096
  br i1 %exitcond, label %loopexit, label %for.body

loopexit:                                         ; preds = %for.body
  ret i32 %accum.next
}
