; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
; RUN: opt < %s -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
; REQUIRES: asserts

target triple = "aarch64-unknown-linux-gnu"

;; Based on the following C code:
;;
;; void simple_histogram(int *buckets, unsigned *indices, int N) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]]++;
;; }

;; Confirm finding a histogram operation
; CHECK-LABEL: Checking a loop in 'simple_histogram'
; CHECK: LV: Checking for a histogram on: store i32 %inc, ptr %gep.bucket, align 4
; CHECK: LV: Found histogram for: store i32 %inc, ptr %gep.bucket, align 4

;; Confirm cost calculation for runtime checks
; CHECK-LABEL: LV: Checking a loop in 'simple_histogram_rtdepcheck'
; CHECK: Calculating cost of runtime checks:
; CHECK: Total cost of runtime checks:
; CHECK: LV: Minimum required TC for runtime checks to be profitable:

;; Confirm inability to vectorize with potential alias to buckets
; CHECK-LABEL: LV: Checking a loop in 'simple_histogram_unsafe_alias'
; CHECK: LV: Can't vectorize due to memory conflicts
; CHECK-NEXT: LV: Not vectorizing: Cannot prove legality.

define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}
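
;; Roughly equivalent C, inferred from the IR below (an illustrative sketch,
;; not taken from the original source):
;;
;; void simple_histogram_inc_param(int *buckets, unsigned *indices, int N, int incval) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]] += incval;
;; }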

define void @simple_histogram_inc_param(ptr noalias %buckets, ptr readonly %indices, i64 %N, i32 %incval) #0 {
; CHECK-LABEL: define void @simple_histogram_inc_param(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]], i32 [[INCVAL:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 [[INCVAL]], <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], [[INCVAL]]
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, %incval
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}
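
;; Roughly equivalent C, inferred from the IR below (illustrative sketch; note
;; the signed indices, matching the sext):
;;
;; void simple_histogram_sub(int *buckets, int *indices, int N) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]]--;
;; }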

define void @simple_histogram_sub(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_sub(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 -1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = sext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], -1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = sext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = sub nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}
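
;; Roughly equivalent C, inferred from the IR below (illustrative sketch):
;;
;; void conditional_histogram(int *buckets, unsigned *indices, int *conds, int N) {
;;   for (int i = 0; i < N; ++i)
;;     if (conds[i] > 5100)
;;       buckets[indices[i]]++;
;; }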

define void @conditional_histogram(ptr noalias %buckets, ptr readonly %indices, ptr readonly %conds, i64 %N) #0 {
; CHECK-LABEL: define void @conditional_histogram(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], ptr readonly [[CONDS:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[TMP6]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP3]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[CONDS]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT:    [[TMP13:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD1]], splat (i32 5100)
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> [[TMP13]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[NEXT:%.*]] ]
; CHECK-NEXT:    [[CONDIDX:%.*]] = getelementptr inbounds i32, ptr [[CONDS]], i64 [[IV1]]
; CHECK-NEXT:    [[CONDDATA:%.*]] = load i32, ptr [[CONDIDX]], align 4
; CHECK-NEXT:    [[IFCOND:%.*]] = icmp sgt i32 [[CONDDATA]], 5100
; CHECK-NEXT:    br i1 [[IFCOND]], label [[IFTRUE:%.*]], label [[NEXT]]
; CHECK:       iftrue:
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP15]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX3]], align 4
; CHECK-NEXT:    br label [[NEXT]]
; CHECK:       next:
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %next ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %condidx = getelementptr inbounds i32, ptr %conds, i64 %iv
  %conddata = load i32, ptr %condidx, align 4
  %ifcond = icmp sgt i32 %conddata, 5100
  br i1 %ifcond, label %iftrue, label %next

iftrue:
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  br label %next

next:
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}
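
;; Roughly equivalent C, inferred from the IR below (single-byte buckets):
;;
;; void histogram_8bit(char *buckets, unsigned *indices, int N) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]]++;
;; }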

define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @histogram_8bit(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP9]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP6]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> [[TMP7]], i8 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
; CHECK-NEXT:    [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i8 [[TMP1]], 1
; CHECK-NEXT:    store i8 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i8, ptr %buckets, i64 %idxprom1
  %l.bucket = load i8, ptr %gep.bucket, align 4
  %inc = add nsw i8 %l.bucket, 1
  store i8 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; We don't currently support floating point histograms.
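;; Roughly equivalent C, inferred from the IR below (illustrative sketch; the
;; fast-math increment would come from something like -ffast-math):
;;
;; void histogram_float(float *buckets, unsigned *indices, int N) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]] += 1.0f;
;; }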
define void @histogram_float(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @histogram_float(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw float, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
; CHECK-NEXT:    store float [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds float, ptr %buckets, i64 %idxprom1
  %l.bucket = load float, ptr %gep.bucket, align 4
  %inc = fadd fast float %l.bucket, 1.0
  store float %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; We don't support histograms with an update value that isn't loop-invariant.
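;; Roughly equivalent C, inferred from the IR below (illustrative sketch):
;;
;; void histogram_varying_increment(int *buckets, unsigned *indices, int *incvals, int N) {
;;   for (int i = 0; i < N; ++i)
;;     buckets[indices[i]] += incvals[i];
;; }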
define void @histogram_varying_increment(ptr noalias %buckets, ptr readonly %indices, ptr readonly %incvals, i64 %N) #0 {
; CHECK-LABEL: define void @histogram_varying_increment(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], ptr readonly [[INCVALS:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INCIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[INCVALS]], i64 [[IV]]
; CHECK-NEXT:    [[INCVAL:%.*]] = load i32, ptr [[INCIDX]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP1]], [[INCVAL]]
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %gep.incvals = getelementptr inbounds i32, ptr %incvals, i64 %iv
  %l.incval = load i32, ptr %gep.incvals, align 4
  %inc = add nsw i32 %l.bucket, %l.incval
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; Test that interleaving works when vectorizing.
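;; The loop body matches simple_histogram; the interleave count of 2 is
;; requested via the llvm.loop.interleave.count metadata (!0/!1 below),
;; roughly what "#pragma clang loop interleave_count(2)" would produce.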
define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_user_interleave(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -8
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP15]], 4
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[DOTIDX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP17]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP19:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD1]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP19]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP21]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !0

for.exit:
  ret void
}

;; Test that we can handle more than one GEP index.
@idx_array = dso_local local_unnamed_addr global [1048576 x i32] zeroinitializer, align 4
@data_array = dso_local local_unnamed_addr global [1048576 x i32] zeroinitializer, align 4
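
;; Roughly equivalent C, inferred from the IR below (illustrative sketch over
;; the two globals above):
;;
;; void histogram_array_3op_gep(long N) {
;;   for (long i = 0; i < N; ++i)
;;     data_array[idx_array[i]]++;
;; }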

define void @histogram_array_3op_gep(i64 noundef %N) #0 {
; CHECK-LABEL: define void @histogram_array_3op_gep(
; CHECK-SAME: i64 noundef [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [1048576 x i32], ptr @idx_array, i64 0, i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD1]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [1048576 x i32], ptr @data_array, i64 0, <vscale x 4 x i64> [[TMP14]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP11]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [1048576 x i32], ptr @idx_array, i64 0, i64 [[IV]]
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[TMP9]] to i64
; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds [1048576 x i32], ptr @data_array, i64 0, i64 [[IDXPROM5]]
; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP10]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds [1048576 x i32], ptr @idx_array, i64 0, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom5 = sext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds [1048576 x i32], ptr @data_array, i64 0, i64 %idxprom5
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; Add a struct into the mix and use a different constant index.
;; { unused, buckets }
%somestruct = type { [1048576 x i32], [1048576 x i32] }
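
;; Roughly equivalent C, inferred from the IR below (illustrative sketch;
;; member names are invented, and the constant GEP indices pick element s[1]
;; and its first array member):
;;
;; struct somestruct { int a[1048576]; int b[1048576]; };
;;
;; void histogram_array_4op_gep_nonzero_const_idx(long N, int *indices, struct somestruct *s) {
;;   for (long i = 0; i < N; ++i)
;;     s[1].a[indices[i]]++;
;; }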

define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr readonly %indices, ptr noalias %data.struct) #0 {
; CHECK-LABEL: define void @histogram_array_4op_gep_nonzero_const_idx(
; CHECK-SAME: i64 noundef [[N:%.*]], ptr readonly [[INDICES:%.*]], ptr noalias [[DATA_STRUCT:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP5]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = sext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[SOMESTRUCT:%.*]], ptr [[DATA_STRUCT]], i64 1, i32 0, <vscale x 4 x i64> [[TMP6]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP7]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY1:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ]
; CHECK-NEXT:    [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]]
; CHECK-NEXT:    [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4
; CHECK-NEXT:    [[IDXPROM5:%.*]] = sext i32 [[L_IDX]] to i64
; CHECK-NEXT:    [[GEP_BUCKET:%.*]] = getelementptr inbounds [[SOMESTRUCT]], ptr [[DATA_STRUCT]], i64 1, i32 0, i64 [[IDXPROM5]]
; CHECK-NEXT:    [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[L_BUCKET]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[GEP_BUCKET]], align 4
; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom5 = sext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds %somestruct, ptr %data.struct, i32 1, i32 0, i64 %idxprom5
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

;; Make sure the histogram intrinsic uses the active lane mask when tail folding.
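;; The loop body matches simple_histogram; predicated vectorization is
;; requested via the llvm.loop.vectorize.predicate.enable metadata (!2/!3
;; below), roughly what "#pragma clang loop vectorize_predicate(enable)"
;; would produce.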
define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_tailfold(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP2]], 2
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2
; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP5]])
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[TMP9:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP9]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP10]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]])
; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
; CHECK-NEXT:    br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !2

for.exit:
  ret void
}

;; Check that we can still vectorize a histogram when LAA finds another
;; dependency that doesn't conflict with the buckets.
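;; Roughly equivalent C, inferred from the IR below (illustrative sketch; the
;; extra store to array is what requires the runtime check):
;;
;; void simple_histogram_rtdepcheck(int *buckets, int *array, unsigned *indices, long N) {
;;   for (long i = 0; i < N; ++i) {
;;     buckets[indices[i]]++;
;;     array[i] = (int)i;
;;   }
;; }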
define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_rtdepcheck(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr [[ARRAY:%.*]], ptr [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8)
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[ARRAY1:%.*]] = ptrtoint ptr [[ARRAY]] to i64
; CHECK-NEXT:    [[INDICES2:%.*]] = ptrtoint ptr [[INDICES]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[ARRAY1]], [[INDICES2]]
; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]]
; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP6]], -4
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
; CHECK-NEXT:    [[TMP11:%.*]] = trunc nuw nsw i64 [[TMP8]] to i32
; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i32> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
; CHECK-NEXT:    [[TMP13:%.*]] = zext <vscale x 4 x i32> [[WIDE_LOAD]] to <vscale x 4 x i64>
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], <vscale x 4 x i64> [[TMP13]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP14]], i32 1, <vscale x 4 x i1> splat (i1 true))
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDEX]]
; CHECK-NEXT:    store <vscale x 4 x i32> [[VEC_IND]], ptr [[TMP15]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP17]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP18]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IDX_ADDR:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[IV]]
; CHECK-NEXT:    [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
; CHECK-NEXT:    store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %idx.addr = getelementptr inbounds i32, ptr %array, i64 %iv
  %iv.trunc = trunc i64 %iv to i32
  store i32 %iv.trunc, ptr %idx.addr, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:
  ret void
}

;; Make sure we don't vectorize if there's a potential alias between buckets
;; and indices.
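;; Roughly equivalent C, inferred from the IR below (illustrative sketch; no
;; restrict on either pointer, so buckets and indices may alias):
;;
;; void simple_histogram_unsafe_alias(int *buckets, unsigned *indices, long N) {
;;   for (long i = 0; i < N; ++i)
;;     buckets[indices[i]]++;
;; }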
define void @simple_histogram_unsafe_alias(ptr %buckets, ptr %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_unsafe_alias(
; CHECK-SAME: ptr [[BUCKETS:%.*]], ptr [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i32, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64
; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i32, ptr [[BUCKETS]], i64 [[IDXPROM1]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP13]], 1
; CHECK-NEXT:    store i32 [[INC]], ptr [[ARRAYIDX2]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i32, ptr %indices, i64 %iv
  %l.idx = load i32, ptr %gep.indices, align 4
  %idxprom1 = zext i32 %l.idx to i64
  %gep.bucket = getelementptr inbounds i32, ptr %buckets, i64 %idxprom1
  %l.bucket = load i32, ptr %gep.bucket, align 4
  %inc = add nsw i32 %l.bucket, 1
  store i32 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body

for.exit:
  ret void
}
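
;; Roughly equivalent C, inferred from the IR below (64-bit buckets and
;; indices):
;;
;; void simple_histogram_64b(long *buckets, long *indices, long N) {
;;   for (long i = 0; i < N; ++i)
;;     buckets[indices[i]]++;
;; }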

define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 {
; CHECK-LABEL: define void @simple_histogram_64b(
; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -2
; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 1
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[INDICES]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP5]], align 4
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[BUCKETS]], <vscale x 2 x i64> [[WIDE_LOAD]]
; CHECK-NEXT:    call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> [[TMP6]], i64 1, <vscale x 2 x i1> splat (i1 true))
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[GEP_INDICES:%.*]] = getelementptr inbounds i64, ptr [[INDICES]], i64 [[IV]]
; CHECK-NEXT:    [[L_IDX:%.*]] = load i64, ptr [[GEP_INDICES]], align 4
; CHECK-NEXT:    [[GEP_BUCKET:%.*]] = getelementptr inbounds i64, ptr [[BUCKETS]], i64 [[L_IDX]]
; CHECK-NEXT:    [[L_BUCKET:%.*]] = load i64, ptr [[GEP_BUCKET]], align 4
; CHECK-NEXT:    [[INC:%.*]] = add nsw i64 [[L_BUCKET]], 1
; CHECK-NEXT:    store i64 [[INC]], ptr [[GEP_BUCKET]], align 4
; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
; CHECK:       for.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.indices = getelementptr inbounds i64, ptr %indices, i64 %iv
  %l.idx = load i64, ptr %gep.indices, align 4
  %gep.bucket = getelementptr inbounds i64, ptr %buckets, i64 %l.idx
  %l.bucket = load i64, ptr %gep.bucket, align 4
  %inc = add nsw i64 %l.bucket, 1
  store i64 %inc, ptr %gep.bucket, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %N
  br i1 %exitcond, label %for.exit, label %for.body, !llvm.loop !4

for.exit:
  ret void
}

attributes #0 = { "target-features"="+sve2" vscale_range(1,16) }
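
;; Loop metadata: !0/!1 request an interleave count of 2 (used by
;; simple_histogram_user_interleave), !2/!3 enable predicated vectorization
;; (used by simple_histogram_tailfold), and !4/!5 pin the interleave count to
;; 1 for most of the other loops.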

!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.interleave.count", i32 2}
!2 = distinct !{!2, !3}
!3 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
!4 = distinct !{!4, !5}
!5 = !{!"llvm.loop.interleave.count", i32 1}