; xref: /llvm-project/llvm/test/Transforms/LoopVectorize/reduction-with-invariant-store.ll (revision 38fffa630ee80163dc65e759392ad29798905679)
; RUN: opt < %s -passes="loop-vectorize" -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; This test checks that we can vectorize loop with reduction variable
; stored in an invariant address.
;
; int sum = 0;
; for(i=0..N) {
;   sum += src[i];
;   dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT:    store i32 [[TMP6]], ptr [[GEP_DST:%.*]], align 4
; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
; Integer add reduction whose running value is stored to the loop-invariant
; address %gep.dst (dst[42]) on every iteration; the store is expected to be
; sunk into the middle block after vectorization.
define void @reduc_store(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  store i32 0, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  ; Invariant store of the current reduction value.
  store i32 %add, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Same as above but with floating point numbers instead.
;
; float sum = 0;
; for(i=0..N) {
;   sum += src[i];
;   dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_fadd_fast
; CHECK: vector.body:
; CHECK: phi <4 x float>
; CHECK: load <4 x float>
; CHECK: fadd fast <4 x float>
; CHECK-NOT: store float %{{[0-9]+}}, ptr %gep.dst
; CHECK: middle.block:
; CHECK-NEXT: [[TMP:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32
; CHECK-NEXT: store float %{{[0-9]+}}, ptr %gep.dst
; Floating-point variant of @reduc_store: fadd with the 'fast' flag allows
; the reduction to be reassociated and vectorized.
define void @reduc_store_fadd_fast(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds float, ptr %dst, i64 42
  store float 0.000000e+00, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds float, ptr %src, i64 %iv
  %0 = load float, ptr %gep.src, align 4
  %add = fadd fast float %sum, %0
  ; Invariant store of the current reduction value.
  store float %add, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Check that if we have a read from an invariant address, we do not vectorize.
;
; int sum = 0;
; for(i=0..N) {
;   sum += src[i];
;   dst.2[i] = dst[42];
;   dst[42] = sum;
; }
; CHECK-LABEL: @reduc_store_load
; CHECK-NOT: vector.body
; The loop also LOADS from the invariant address %gep.dst, so sinking the
; store out of the loop would change the values observed by %lv —
; vectorization must be refused.
define void @reduc_store_load(ptr %dst, ptr readonly %src, ptr noalias %dst.2) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  store i32 0, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  ; Read of the invariant address — blocks the invariant-store transform.
  %lv = load i32, ptr %gep.dst
  %gep.dst.2  = getelementptr inbounds i32, ptr %dst.2, i64 %iv
  store i32 %lv, ptr %gep.dst.2, align 4
  store i32 %add, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Check that if we have a read from an invariant address, we do not vectorize,
; even if we vectorize with runtime checks. The test below is a variant of
; @reduc_store_load with a non-constant dependence distance, resulting in
; vectorization with runtime checks.
;
; CHECK-LABEL: @reduc_store_load_with_non_constant_distance_dependence
; CHECK-NOT: vector.body:
; Variant of @reduc_store_load where accesses through %dst.2 and %dst.2.off
; have a non-constant dependence distance (%off), which would normally be
; handled by runtime checks — but the load of the invariant address still
; forbids vectorization.
define void @reduc_store_load_with_non_constant_distance_dependence(ptr %dst, ptr noalias %dst.2, i64 %off) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  %dst.2.off = getelementptr inbounds i32, ptr %dst.2, i64 %off
  store i32 0, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %dst.2, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  ; NOTE(review): %iv.off has no uses in this function — presumably a
  ; leftover from the test's derivation; confirm before removing.
  %iv.off = mul i64 %iv, 2
  %add = add nsw i32 %sum, %0
  ; Read of the invariant address — blocks the invariant-store transform.
  %lv = load i32, ptr %gep.dst
  store i32 %add, ptr %gep.dst, align 4
  %gep.src.2 = getelementptr inbounds i32, ptr %dst.2.off, i64 %iv
  store i32 %lv, ptr %gep.src.2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}


; Final value is not guaranteed to be stored in an invariant address.
; We don't vectorize in that case.
;
; int sum = 0;
; for(i=0..N) {
;   int diff = y[i] - x[i];
;   if (diff > 0) {
;     sum += diff;
;     *t = sum;
;   }
; }
; CHECK-LABEL: @reduc_cond_store
; CHECK-NOT: vector.body
; The store to *t is predicated on %cmp2, so the last value written is not
; guaranteed to be the final reduction value — vectorization is refused.
define void @reduc_cond_store(ptr %t, ptr readonly %x, ptr readonly %y) {
entry:
  store i32 0, ptr %t, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.2, %if.end ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %if.end ]
  %gep.y = getelementptr inbounds i32, ptr %y, i64 %iv
  %0 = load i32, ptr %gep.y, align 4
  %gep.x = getelementptr inbounds i32, ptr %x, i64 %iv
  %1 = load i32, ptr %gep.x, align 4
  %diff = sub nsw i32 %0, %1
  %cmp2 = icmp sgt i32 %diff, 0
  br i1 %cmp2, label %if.then, label %if.end

if.then:
  %sum.1 = add nsw i32 %diff, %sum
  ; Conditional (predicated) store to the invariant address.
  store i32 %sum.1, ptr %t, align 4
  br label %if.end

if.end:
  ; NOTE(review): on the fall-through edge this phi takes %0 (the value
  ; loaded from y) rather than %sum, diverging from the pseudo-code above.
  ; This also breaks the add-reduction pattern; confirm it is intentional.
  %sum.2 = phi i32 [ %sum.1, %if.then ], [ %0, %for.body ]
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}

; Check that we can vectorize code with several stores to an invariant address
; with condition that final reduction value is stored too.
;
;  int sum = 0;
;  for(int i=0; i < 1000; i+=2) {
;    sum += src[i];
;    dst[42] = sum;
;    sum += src[i+1];
;    dst[42] = sum;
;  }
; CHECK-LABEL: @reduc_store_inside_unrolled
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 2, i64 4, i64 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 4
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 6
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[TMP5]], align 4
; CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[TMP6]], align 4
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[TMP7]], align 4
; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i32> poison, i32 [[TMP8]], i32 0
; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP9]], i32 1
; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i32 2
; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP11]], i32 3
; CHECK-NEXT:    [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[VEC_PHI]]
; CHECK-NEXT:    [[TMP17:%.*]] = or disjoint <4 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x i64> [[TMP17]], i32 0
; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i64> [[TMP17]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x i64> [[TMP17]], i32 2
; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i64> [[TMP17]], i32 3
; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]]
; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[TMP19]], align 4
; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP21]], align 4
; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP23]], align 4
; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[TMP25]], align 4
; CHECK-NEXT:    [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP27]], i32 1
; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP28]], i32 2
; CHECK-NEXT:    [[TMP33:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP29]], i32 3
; CHECK-NEXT:    [[TMP34]] = add <4 x i32> [[TMP33]], [[TMP16]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 8)
; CHECK-NEXT:    [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT]], 500
; CHECK-NEXT:    br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP34]])
; CHECK-NEXT:    store i32 [[TMP36]], ptr [[GEP_DST:%.*]], align 4
; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH:%.*]]
; Manually unrolled-by-2 reduction: two invariant stores per iteration to
; the same address. Vectorizable because the LAST store writes the final
; reduction value; the intermediate store can be dropped.
define void @reduc_store_inside_unrolled(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %sum.1 = add nsw i32 %0, %sum
  ; Intermediate invariant store (dead after the transform).
  store i32 %sum.1, ptr %gep.dst, align 4
  %1 = or disjoint i64 %iv, 1
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i64 %1
  %2 = load i32, ptr %gep.src.1, align 4
  %sum.2 = add nsw i32 %2, %sum.1
  ; Final invariant store of the iteration's reduction value.
  store i32 %sum.2, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 2
  %cmp = icmp slt i64 %iv.next, 1000
  br i1 %cmp, label %for.body, label %exit

exit:
  ret void
}

; Check that we cannot vectorize code if stored value is not the final reduction
; value
;
;  int sum = 0;
;  for(int i=0; i < 1000; i++) {
;    sum += src[i];
;    dst[42] = sum + 1;
;  }
; CHECK-LABEL: @reduc_store_not_final_value
; CHECK-NOT: vector.body:
; The stored value (%sum_plus_one = sum + 1) is derived from, but not equal
; to, the reduction value — so the store cannot be sunk and vectorization is
; refused.
define void @reduc_store_not_final_value(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  store i32 0, ptr %gep.dst, align 4
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  ; Not the reduction value itself — blocks the transform.
  %sum_plus_one = add i32 %add, 1
  store i32 %sum_plus_one, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; We cannot vectorize if two (or more) invariant stores exist in a loop.
;
;  int sum = 0;
;  for(int i=0; i < 1000; i+=2) {
;    sum += src[i];
;    dst[42] = sum;
;    sum += src[i+1];
;    other_dst[42] = sum;
;  }
; CHECK-LABEL: @reduc_double_invariant_store
; CHECK-NOT: vector.body:
; Two invariant stores to two DIFFERENT (possibly aliasing) addresses for
; the same reduction — not supported, so no vectorization.
define void @reduc_double_invariant_store(ptr %dst, ptr %other_dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  %gep.other_dst = getelementptr inbounds i32, ptr %other_dst, i64 42
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi i32 [ 0, %entry ], [ %sum.2, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %sum.1 = add nsw i32 %0, %sum
  ; First invariant store (to %dst).
  store i32 %sum.1, ptr %gep.dst, align 4
  %1 = or disjoint i64 %iv, 1
  %arrayidx4 = getelementptr inbounds i32, ptr %src, i64 %1
  %2 = load i32, ptr %arrayidx4, align 4
  %sum.2 = add nsw i32 %2, %sum.1
  ; Second invariant store (to %other_dst) — different address.
  store i32 %sum.2, ptr %gep.other_dst, align 4
  %iv.next = add nuw nsw i64 %iv, 2
  %cmp = icmp slt i64 %iv.next, 1000
  br i1 %cmp, label %for.body, label %exit

exit:
  ret void
}

;  int sum = 0;
;  for(int i=0; i < 1000; i+=2) {
;    sum += src[i];
;    if (src[i+1] > 0)
;      dst[42] = sum;
;    sum += src[i+1];
;    dst[42] = sum;
;  }
; CHECK-LABEL: @reduc_store_middle_store_predicated
; CHECK: vector.body:
; CHECK-NOT: store i32 %{{[0-9]+}}, ptr %gep.dst
; CHECK: middle.block:
; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32
; CHECK-NEXT: store i32 [[TMP]], ptr %gep.dst
; CHECK: ret void
; A predicated invariant store in the MIDDLE of the iteration is fine: the
; unconditional final store in the latch still writes the final reduction
; value, so the loop is vectorizable.
define void @reduc_store_middle_store_predicated(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:                                         ; preds = %latch, %entry
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %sum = phi i32 [ 0, %entry ], [ %sum.2, %latch ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %sum.1 = add nsw i32 %0, %sum
  %cmp = icmp sgt i32 %0, 0
  br i1 %cmp, label %predicated, label %latch

predicated:                                       ; preds = %for.body
  ; Conditional intermediate store — overwritten by the latch store below.
  store i32 %sum.1, ptr %gep.dst, align 4
  br label %latch

latch:                                            ; preds = %predicated, %for.body
  %1 = or disjoint i64 %iv, 1
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i64 %1
  %2 = load i32, ptr %gep.src.1, align 4
  %sum.2 = add nsw i32 %2, %sum.1
  ; Unconditional final store of the reduction value.
  store i32 %sum.2, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 2
  %cmp.1 = icmp slt i64 %iv.next, 1000
  br i1 %cmp.1, label %for.body, label %exit

exit:                                 ; preds = %latch
  ret void
}

;  int sum = 0;
;  for(int i=0; i < 1000; i+=2) {
;    sum += src[i];
;    dst[42] = sum;
;    sum += src[i+1];
;    if (src[i+1] > 0)
;      dst[42] = sum;
;  }
; CHECK-LABEL: @reduc_store_final_store_predicated
; CHECK-NOT: vector.body:
; Here the LAST invariant store is the predicated one, so the last value
; written is not guaranteed to be the final reduction value — no
; vectorization.
define void @reduc_store_final_store_predicated(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:                                         ; preds = %latch, %entry
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %sum = phi i32 [ 0, %entry ], [ %sum.1, %latch ]
  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %sum.1 = add nsw i32 %0, %sum
  ; Unconditional intermediate store.
  store i32 %sum.1, ptr %gep.dst, align 4
  %1 = or disjoint i64 %iv, 1
  %gep.src.1 = getelementptr inbounds i32, ptr %src, i64 %1
  %2 = load i32, ptr %gep.src.1, align 4
  %sum.2 = add nsw i32 %2, %sum.1
  %cmp1 = icmp sgt i32 %2, 0
  br i1 %cmp1, label %predicated, label %latch

predicated:                                       ; preds = %for.body
  ; Conditional FINAL store — blocks the transform.
  store i32 %sum.2, ptr %gep.dst, align 4
  br label %latch

latch:                                            ; preds = %predicated, %for.body
  %iv.next = add nuw nsw i64 %iv, 2
  %cmp = icmp slt i64 %iv.next, 1000
  br i1 %cmp, label %for.body, label %exit

exit:                                 ; preds = %latch
  ret void
}

; Final reduction value is overwritten inside loop
;
; for(int i=0; i < 1000; i++) {
;   sum += src[i];
;   dst[42] = sum;
;   dst[42] = 0;
; }
; CHECK-LABEL: @reduc_store_final_store_overwritten
; CHECK-NOT: vector.body:
; The reduction-value store is immediately overwritten by `store i32 0`, so
; the last write to the invariant address is not the reduction value — no
; vectorization.
define void @reduc_store_final_store_overwritten(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  store i32 %add, ptr %gep.dst, align 4
  ; Overwrites the reduction value — blocks the transform.
  store i32 0, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Final value used outside of loop does not prevent vectorization
;
; int sum = 0;
; for(int i=0; i < 1000; i++) {
;   sum += src[i];
;   dst[42] = sum;
; }
; dst[43] = sum;
; CHECK-LABEL: @reduc_store_inoutside
; CHECK: vector.body:
; CHECK-NOT: store i32 %{{[0-9]+}}, ptr %gep.src
; CHECK: middle.block:
; CHECK-NEXT: [[TMP:%.*]] = call i32 @llvm.vector.reduce.add.v4i32
; CHECK-NEXT: store i32 [[TMP]], ptr %gep.dst
; CHECK: exit:
; CHECK: [[PHI:%.*]] = phi i32 [ [[TMP1:%.*]], %for.body ], [ [[TMP2:%.*]], %middle.block ]
; CHECK: [[ADDR:%.*]] = getelementptr inbounds i32, ptr %dst, i64 43
; CHECK: store i32 [[PHI]], ptr [[ADDR]]
; CHECK: ret void
; The reduction value is stored to an invariant address inside the loop AND
; used (via the LCSSA phi) after the loop; the outside use does not prevent
; vectorization.
define void @reduc_store_inoutside(ptr %dst, ptr readonly %src) {
entry:
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi i32 [ 0, %entry ], [ %sum.1, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %arrayidx, align 4
  %sum.1 = add nsw i32 %0, %sum
  store i32 %sum.1, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ; LCSSA phi: the final reduction value escapes the loop here.
  %sum.lcssa = phi i32 [ %sum.1, %for.body ]
  %gep.dst.1 = getelementptr inbounds i32, ptr %dst, i64 43
  store i32 %sum.lcssa, ptr %gep.dst.1, align 4
  ret void
}

; Test for PR55540.
; PR55540: the first store's value (%add.1) becomes dead once the invariant
; stores are sunk; its poison-generating `nuw` flag must be dropped safely.
define void @test_drop_poison_generating_dead_recipe(ptr %dst) {
; CHECK-LABEL: @test_drop_poison_generating_dead_recipe(
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %vector.ph ], [ [[TMP0:%.*]], %vector.body ]
; CHECK-NEXT:    [[TMP0]] = add <4 x i64> [[VEC_PHI]], splat (i64 -31364)
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], 360
; CHECK-NEXT:    br i1 [[TMP1]], label %middle.block, label %vector.body
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP0]])
; CHECK-NEXT:    store i64 [[TMP2]], ptr [[DST:%.*]], align 8
; CHECK-NEXT:    br i1 false, label %exit, label %scalar.ph
; CHECK:       scalar.ph:
;
entry:
  br label %body

body:
  %red = phi i64 [ 0, %entry ], [ %red.next, %body ]
  %iv = phi i32 [ 2, %entry ], [ %iv.next, %body ]
  ; Dead after the transform; its nuw flag could wrongly introduce poison.
  %add.1 = add nuw i64 %red, -23523
  store i64 %add.1, ptr %dst, align 8
  %red.next = add nuw i64 %red, -31364
  store i64 %red.next, ptr %dst, align 8
  %iv.next = add nuw nsw i32 %iv, 1
  %ec = icmp ugt i32 %iv, 363
  br i1 %ec, label %exit, label %body

exit:
  ret void
}
; The invariant destination address is computed INSIDE the loop (not hoisted
; to the preheader), so the reduction-store pattern is not recognized.
define void @reduc_store_invariant_addr_not_hoisted(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: @reduc_store_invariant_addr_not_hoisted
; CHECK-NOT: vector.body:
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  ; Loop-invariant address, but computed in the loop body.
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  store i32 %add, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Make sure we can vectorize loop with a non-reduction value stored to an
; invariant address that is calculated inside loop.
; Stores a constant (not the reduction value) to an invariant address
; computed inside the loop — this does not block vectorization of the
; add reduction, whose final value is returned.
define i32 @non_reduc_store_invariant_addr_not_hoisted(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: @non_reduc_store_invariant_addr_not_hoisted
; CHECK: vector.body:
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %sum = phi i32 [ 0, %entry ], [ %add, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %add = add nsw i32 %sum, %0
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 42
  ; Constant store, unrelated to the reduction.
  store i32 0, ptr %gep.dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:                                             ; preds = %for.body
  %add.lcssa = phi i32 [ %add, %for.body ]
  ret i32 %add.lcssa
}

; Make sure that if there are several reductions in the loop, the order of invariant stores sunk outside of the loop is preserved
; See https://github.com/llvm/llvm-project/issues/64047
; Two reductions (add then mul) stored to the SAME pointer; the sunk stores
; in the middle block must keep source order: add result first, mul second.
define void @reduc_add_mul_store_same_ptr(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_add_mul_store_same_ptr
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT:    store i32 [[TMP6]], ptr %dst, align 4
; CHECK-NEXT:    store i32 [[TMP7]], ptr %dst, align 4
;
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %mul = phi i32 [ 1, %entry ], [ %mul.next, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %sum.next = add nsw i32 %sum, %0
  store i32 %sum.next, ptr %dst, align 4
  %mul.next = mul nsw i32 %mul, %0
  store i32 %mul.next, ptr %dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}
; Same as @reduc_add_mul_store_same_ptr with the store order swapped
; (mul first, add last); the sunk stores must preserve that order.
define void @reduc_mul_add_store_same_ptr(ptr %dst, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_mul_add_store_same_ptr
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT:    store i32 [[TMP7]], ptr %dst, align 4
; CHECK-NEXT:    store i32 [[TMP6]], ptr %dst, align 4
;
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %mul = phi i32 [ 1, %entry ], [ %mul.next, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %mul.next = mul nsw i32 %mul, %0
  store i32 %mul.next, ptr %dst, align 4
  %sum.next = add nsw i32 %sum, %0
  store i32 %sum.next, ptr %dst, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}

; Same as above but storing is done to two different pointers and they can be aliased
; Two reductions stored to two possibly-aliasing pointers (%dst1, %dst2);
; the sunk stores must keep source order since the pointers may alias.
define void @reduc_add_mul_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_add_mul_store_different_ptr
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT:    store i32 [[TMP6]], ptr %dst1, align 4
; CHECK-NEXT:    store i32 [[TMP7]], ptr %dst2, align 4
;
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %mul = phi i32 [ 1, %entry ], [ %mul.next, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %sum.next = add nsw i32 %sum, %0
  store i32 %sum.next, ptr %dst1, align 4
  %mul.next = mul nsw i32 %mul, %0
  store i32 %mul.next, ptr %dst2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}
; Same as @reduc_add_mul_store_different_ptr with the store order swapped
; (mul to %dst1 first, add to %dst2 last); order must be preserved.
define void @reduc_mul_add_store_different_ptr(ptr %dst1, ptr %dst2, ptr readonly %src) {
; CHECK-LABEL: define void @reduc_mul_add_store_different_ptr
; CHECK:       middle.block:
; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4:%.*]])
; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP3:%.*]])
; CHECK-NEXT:    store i32 [[TMP7]], ptr %dst1, align 4
; CHECK-NEXT:    store i32 [[TMP6]], ptr %dst2, align 4
;
entry:
  br label %for.body

for.body:
  %sum = phi i32 [ 0, %entry ], [ %sum.next, %for.body ]
  %mul = phi i32 [ 1, %entry ], [ %mul.next, %for.body ]
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %0 = load i32, ptr %gep.src, align 4
  %mul.next = mul nsw i32 %mul, %0
  store i32 %mul.next, ptr %dst1, align 4
  %sum.next = add nsw i32 %sum, %0
  store i32 %sum.next, ptr %dst2, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 1000
  br i1 %exitcond, label %exit, label %for.body

exit:
  ret void
}