; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt < %s -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
; REQUIRES: asserts

target triple = "aarch64-unknown-linux-gnu"

;; Based on the following C code:
;;
;; void simple_histogram(int *buckets, unsigned *indices, int N) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]]++;
;; }

;; Confirm finding a histogram operation
; CHECK-LABEL: Checking a loop in 'simple_histogram'
; CHECK: LV: Checking for a histogram on: store i32 %inc, ptr %gep.bucket, align 4
; CHECK: LV: Found histogram for: store i32 %inc, ptr %gep.bucket, align 4

;; Confirm cost calculation for runtime checks
; CHECK-LABEL: LV: Checking a loop in 'simple_histogram_rtdepcheck'
; CHECK: Calculating cost of runtime checks:
; CHECK: Total cost of runtime checks:
; CHECK: LV: Minimum required TC for runtime checks to be profitable:

;; Confirm inability to vectorize with potential alias to buckets
; CHECK-LABEL: LV: Checking a loop in 'simple_histogram_unsafe_alias'
; CHECK: LV: Can't vectorize due to memory conflicts
; CHECK-NEXT: LV: Not vectorizing: Cannot prove legality.

define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

define void @simple_histogram_inc_param(ptr noalias %buckets, ptr readonly %indices, i64 %N, i32 %incval) #0 {
; CHECK-LABEL: define void @simple_histogram_inc_param(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]], i32 [[INCVAL:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 [[INCVAL]], <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], [[INCVAL]]
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, %incval
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

define void @simple_histogram_sub(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_sub(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 -1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], -1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = sext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = sub nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

define void @conditional_histogram(ptr noalias %buckets, ptr readonly %indices, ptr readonly %conds, i64 %N) #0 {
; CHECK-LABEL: define void @conditional_histogram(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], ptr readonly [[CONDS:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP6]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP3]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[CONDS]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD1]], splat (i32 5100)
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> [[TMP13]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[NEXT:%.*]] ]
; CHECK-NEXT:    [[CONDIDX:%.*]] = getelementptr inbounds i32, ptr [[CONDS]], i64 [[IV1]]
; CHECK-NEXT:    [[CONDDATA:%.*]] = load i32, ptr [[CONDIDX]], align 4
; CHECK-NEXT:    [[IFCOND:%.*]] = icmp sgt i32 [[CONDDATA]], 5100
; CHECK-NEXT:    br i1 [[IFCOND]], label [[IFTRUE:%.*]], label [[NEXT]]
; CHECK:       iftrue:
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP15]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX3]], align 4
; CHECK-NEXT:    br label [[NEXT]]
; CHECK:       next:
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %next ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %condidx = getelementptr inbounds i32, ptr %conds, i64 %iv
  %conddata = load i32, ptr %condidx, align 4
  %ifcond = icmp sgt i32 %conddata, 5100
  br i1 %ifcond, label %iftrue, label %next

iftrue:
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  br label %next

next:
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @histogram_8bit(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP9]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP6]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> [[TMP7]], i8 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
; CHECK-NEXT:    [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i8 [[TMP1]], 1
; CHECK-NEXT:    store i8 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i8, ptr %buckets, i64 %idxprom1
  %l.bucket = load i8, ptr %gep.bucket, align 4
  %inc = add nsw i8 %l.bucket, 1
  store i8 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; We don't currently support floating point histograms.
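;;
;; A rough C equivalent of the function below (an illustrative sketch, not
;; part of the autogenerated checks; the fadd's 'fast' flag would come from
;; something like -ffast-math):
;;
;; void histogram_float(float *restrict buckets, unsigned *indices, int N) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]] += 1.0f;
;; }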
define void @histogram_float(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @histogram_float(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
; CHECK-NEXT:    store float [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds float, ptr %buckets, i64 %idxprom1
  %l.bucket = load float, ptr %gep.bucket, align 4
  %inc = fadd fast float %l.bucket, 1.0
  store float %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; We don't support histograms with an update value that isn't loop-invariant.
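;;
;; A rough C equivalent of the function below (an illustrative sketch, not
;; part of the autogenerated checks):
;;
;; void histogram_varying_increment(int *restrict buckets, unsigned *indices,
;;                                  int *incvals, int N) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]] += incvals[i]; /* increment varies per iteration */
;; }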
define void @histogram_varying_increment(ptr noalias %buckets, ptr readonly %indices, ptr readonly %incvals, i64 %N) #0 {
; CHECK-LABEL: define void @histogram_varying_increment(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], ptr readonly [[INCVALS:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INCIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[INCVALS]], i64 [[IV]]
; CHECK-NEXT:    [[INCVAL:%.*]] = load i32, ptr [[INCIDX]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], [[INCVAL]]
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %gep.incvals = getelementptr inbounds i32, ptr %incvals, i64 %iv
  %l.incval = load i32, ptr %gep.incvals, align 4
  %inc = add nsw i32 %l.bucket, %l.incval
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; Test that interleaving works when vectorizing.
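;;
;; A rough C equivalent of the function below (an illustrative sketch; the
;; pragma is one plausible way to produce the !llvm.loop.interleave.count
;; metadata attached to this loop, not how the test was generated):
;;
;; void simple_histogram_user_interleave(int *restrict buckets,
;;                                       unsigned *indices, int N) {
;; #pragma clang loop interleave_count(2)
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]]++;
;; }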
define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_user_interleave(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP15]], 4
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[DOTIDX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP19:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD1]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP19]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP21]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !0

for.exit:
  ret void
}

;; Test that we can handle more than one GEP index.
@idx_array = dso_local local_unnamed_addr global [1048576 x i32] zeroinitializer, align 4
@data_array = dso_local local_unnamed_addr global [1048576 x i32] zeroinitializer, align 4

define void @histogram_array_3op_gep(i64 noundef %N) #0 {
; CHECK-LABEL: define void @histogram_array_3op_gep(
; CHECK-SAME: i64 noundef [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1048576 x i32], ptr @idx_array, i64 0, i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD1]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1048576 x i32], ptr @data_array, i64 0, <vscale x 4 x i64> [[TMP14]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP11]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1048576 x i32], ptr @idx_array, i64 0, i64 [[IV]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[TMP9]] to i64
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [1048576 x i32], ptr @data_array, i64 0, i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP10]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds [1048576 x i32], ptr @idx_array, i64 0, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom5 = sext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds [1048576 x i32], ptr @data_array, i64 0, i64 %idxprom5
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; Add a struct into the mix and use a different constant index.
;; { unused, buckets }
%somestruct = type { [1048576 x i32], [1048576 x i32] }

define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr readonly %indices, ptr noalias %data.struct) #0 {
; CHECK-LABEL: define void @histogram_array_4op_gep_nonzero_const_idx(
; CHECK-SAME: i64 noundef [[N:%.*]], ptr readonly [[INDICES:%.*]], ptr noalias [[DATA_STRUCT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[SOMESTRUCT:%.*]], ptr [[DATA_STRUCT]], i64 1, i32 0, <vscale x 4 x i64> [[TMP6]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP7]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
; CHECK-NEXT:    [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]]
; CHECK-NEXT:    [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[L_IDX]] to i64
; CHECK-NEXT:    [[GEP_BUCKET:%.*]] = getelementptr inbounds [[SOMESTRUCT]], ptr [[DATA_STRUCT]], i64 1, i32 0, i64 [[IDXPROM5]]
; CHECK-NEXT:    [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[L_BUCKET]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[GEP_BUCKET]], align 4
; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom5 = sext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds %somestruct, ptr %data.struct, i32 1, i32 0, i64 %idxprom5
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; Make sure the histogram intrinsic uses the active lane mask when tail folding.
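;;
;; A rough C equivalent of the function below (an illustrative sketch; the
;; pragma is one plausible way to produce the
;; !llvm.loop.vectorize.predicate.enable metadata attached to this loop):
;;
;; void simple_histogram_tailfold(int *restrict buckets, unsigned *indices,
;;                                int N) {
;; #pragma clang loop vectorize_predicate(enable)
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]]++;
;; }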
define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_tailfold(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP2]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP5]])
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; CHECK-NEXT:    br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !2

for.exit:
  ret void
}

;; Check that we can still vectorize a histogram when LAA finds another dependency
;; that doesn't conflict with the buckets.
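;;
;; A rough C equivalent of the function below (an illustrative sketch, not
;; part of the autogenerated checks):
;;
;; void simple_histogram_rtdepcheck(int *restrict buckets, int *array,
;;                                  unsigned *indices, int N) {
;;   for (int i = 0; i < N; ++i) {
;;     buckets[indices[i]]++;
;;     array[i] = i; /* needs a runtime check against indices */
;;   }
;; }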
define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_rtdepcheck(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr [[ARRAY:%.*]], ptr [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8)
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[ARRAY1:%.*]] = ptrtoint ptr [[ARRAY]] to i64
; CHECK-NEXT:    [[INDICES2:%.*]] = ptrtoint ptr [[INDICES]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[ARRAY1]], [[INDICES2]]
; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP6]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; CHECK-NEXT:    [[TMP11:%.*]] = trunc nuw nsw i64 [[TMP8]] to i32
; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT:    [[TMP13:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP13]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP14]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDEX]]
; CHECK-NEXT:    store <vscale x 4 x i32> [[VEC_IND]], ptr [[TMP15]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP17]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP18]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IDX_ADDR:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV]]
; CHECK-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
; CHECK-NEXT:    store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %idx.addr = getelementptr inbounds i32, ptr %array, i64 %iv
  %iv.trunc = trunc i64 %iv to i32
  store i32 %iv.trunc, ptr %idx.addr, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:
  ret void
}

;; Make sure we don't vectorize if there's a potential alias between buckets
;; and indices.
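;;
;; A rough C equivalent of the function below (an illustrative sketch; note
;; that neither pointer is restrict-qualified, so they may alias):
;;
;; void simple_histogram_unsafe_alias(int *buckets, unsigned *indices, int N) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]]++; /* buckets may overlap indices */
;; }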
define void @simple_histogram_unsafe_alias(ptr %buckets, ptr %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_unsafe_alias(
; CHECK-SAME: ptr [[BUCKETS:%.*]], ptr [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:
  ret void
}

define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_64b(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -2
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 1
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[BUCKETS]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> [[TMP6]], i64 1, <vscale x 2 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[GEP_INDICES:%.*]] = getelementptr inbounds i64, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[L_IDX:%.*]] = load i64, ptr [[GEP_INDICES]], align 4
; CHECK-NEXT:    [[GEP_BUCKET:%.*]] = getelementptr inbounds i64, ptr [[BUCKETS]], i64 [[L_IDX]]
; CHECK-NEXT:    [[L_BUCKET:%.*]] = load i64, ptr [[GEP_BUCKET]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i64 [[L_BUCKET]], 1
; CHECK-NEXT:    store i64 [[INC]], ptr [[GEP_BUCKET]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i64, ptr %indices, i64 %iv
  %l.idx = load i64, ptr %gep.indices, align 4
  %gep.bucket = getelementptr inbounds i64, ptr %buckets, i64 %l.idx
  %l.bucket = load i64, ptr %gep.bucket, align 4
  %inc = add nsw i64 %l.bucket, 1
  store i64 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.interleave.count", i32 2}
!2 = distinct !{!2, !3}
!3 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!4 = distinct !{!4, !5}
!5 = !{!"llvm.loop.interleave.count", i32 1}