xref: /llvm-project/llvm/test/Transforms/LoopVectorize/X86/uniform_mem_op.ll (revision 4ad0fdd1631eeae432714c03ede01a10dc00025d)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -S -force-vector-width=4 -passes=loop-vectorize -mcpu=haswell < %s | FileCheck %s
3
4;; Basic functional tests for uniform loads and stores.  These are cases kept
5;; deliberately simple (and unoptimized by other passes) to feed the vectorizer
6;; with particular input IR.
7
;; Target configuration shared by every function in this file (x86-64 Linux).
8target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
9target triple = "x86_64-unknown-linux-gnu"
10
;; Loads an i32 from the loop-invariant pointer %addr on every iteration and
;; returns the value loaded by the final iteration.  The loaded value has no
;; user inside the loop.  The autogenerated assertions expect a single scalar
;; load per vector iteration, with the vector index advancing by 16.
11define i32 @uniform_load(ptr align(4) %addr) {
12; CHECK-LABEL: @uniform_load(
13; CHECK-NEXT:  entry:
14; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
15; CHECK:       vector.ph:
16; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
17; CHECK:       vector.body:
18; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
19; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
20; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
21; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
22; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
23; CHECK:       middle.block:
24; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
25; CHECK:       scalar.ph:
26; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
27; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
28; CHECK:       for.body:
29; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
30; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ADDR]], align 4
31; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
32; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
33; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
34; CHECK:       loopexit:
35; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ]
36; CHECK-NEXT:    ret i32 [[LOAD_LCSSA]]
37;
38entry:
39  br label %for.body
40
41for.body:
;; The exit test compares %iv (not %iv.next) against 4096, so the body runs
;; for %iv = 0..4096 inclusive.  In the expected output the vector loop stops
;; at index 4096 and middle.block always continues into scalar.ph.
42  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
43  %load = load i32, ptr %addr
44  %iv.next = add nuw nsw i64 %iv, 1
45  %exitcond = icmp eq i64 %iv, 4096
46  br i1 %exitcond, label %loopexit, label %for.body
47
48loopexit:
49  ret i32 %load
50}
51
;; Like @uniform_load, but the value loaded from the loop-invariant pointer
;; %addr feeds an integer add reduction (%accum) whose final value is returned.
;; The autogenerated assertions expect one scalar load per vector iteration,
;; splatted to <4 x i32> and added into four partial accumulators, which are
;; combined in middle.block and reduced with llvm.vector.reduce.add.
52define i32 @uniform_load2(ptr align(4) %addr) {
53; CHECK-LABEL: @uniform_load2(
54; CHECK-NEXT:  entry:
55; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
56; CHECK:       vector.ph:
57; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
58; CHECK:       vector.body:
59; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
60; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
61; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
62; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
63; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
64; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
65; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
66; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
67; CHECK-NEXT:    [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
68; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT9]]
69; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT9]]
70; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
71; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
72; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
73; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
74; CHECK:       middle.block:
75; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
76; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]]
77; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]]
78; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
79; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
80; CHECK:       scalar.ph:
81; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
82; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
83; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
84; CHECK:       for.body:
85; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
86; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
87; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ADDR]], align 4
88; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
89; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
90; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
91; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
92; CHECK:       loopexit:
93; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
94; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
95;
96entry:
97  br label %for.body
98
99for.body:
;; %accum starts at 0 and accumulates the value loaded from %addr each
;; iteration; the exit test uses %iv, so the body runs for %iv = 0..4096.
100  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
101  %accum = phi i32 [%accum.next, %for.body], [0, %entry]
102  %load = load i32, ptr %addr
103  %accum.next = add i32 %accum, %load
104  %iv.next = add nuw nsw i64 %iv, 1
105  %exitcond = icmp eq i64 %iv, 4096
106  br i1 %exitcond, label %loopexit, label %for.body
107
108loopexit:
109  ret i32 %accum.next
110}
111
;; The load address is computed inside the loop, but only from loop-invariant
;; operands: %byte_offset is divided by 4 and used as an i32 index off %addr.
;; The autogenerated assertions expect the udiv and getelementptr to appear in
;; vector.ph and a single scalar load to remain in vector.body.
112define i32 @uniform_address(ptr align(4) %addr, i32 %byte_offset) {
113; CHECK-LABEL: @uniform_address(
114; CHECK-NEXT:  entry:
115; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
116; CHECK:       vector.ph:
117; CHECK-NEXT:    [[TMP0:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
118; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[ADDR:%.*]], i32 [[TMP0]]
119; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
120; CHECK:       vector.body:
121; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
122; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[TMP4]], align 4
123; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
124; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
125; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
126; CHECK:       middle.block:
127; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
128; CHECK:       scalar.ph:
129; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
130; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
131; CHECK:       for.body:
132; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
133; CHECK-NEXT:    [[OFFSET:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
134; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[ADDR]], i32 [[OFFSET]]
135; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
136; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
137; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
138; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
139; CHECK:       loopexit:
140; CHECK-NEXT:    [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
141; CHECK-NEXT:    ret i32 [[LOAD_LCSSA]]
142;
143entry:
144  br label %for.body
145
146for.body:
;; Neither %offset nor %gep depends on %iv, so the loaded address is the same
;; on every iteration even though it is recomputed in the loop body.
147  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
148  %offset = udiv i32 %byte_offset, 4
149  %gep = getelementptr i32, ptr %addr, i32 %offset
150  %load = load i32, ptr %gep
151  %iv.next = add nuw nsw i64 %iv, 1
152  %exitcond = icmp eq i64 %iv, 4096
153  br i1 %exitcond, label %loopexit, label %for.body
154
155loopexit:
156  ret i32 %load
157}
158
159
160
;; Stores the constant 0 to the loop-invariant pointer %addr on every
;; iteration.  Both the stored value and the address are loop-invariant.  The
;; autogenerated assertions expect a single scalar store per vector iteration.
161define void @uniform_store_uniform_value(ptr align(4) %addr) {
162; CHECK-LABEL: @uniform_store_uniform_value(
163; CHECK-NEXT:  entry:
164; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
165; CHECK:       vector.ph:
166; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
167; CHECK:       vector.body:
168; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
169; CHECK-NEXT:    store i32 0, ptr [[ADDR:%.*]], align 4
170; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
171; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
172; CHECK-NEXT:    br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
173; CHECK:       middle.block:
174; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
175; CHECK:       scalar.ph:
176; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
177; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
178; CHECK:       for.body:
179; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
180; CHECK-NEXT:    store i32 0, ptr [[ADDR]], align 4
181; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
182; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
183; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
184; CHECK:       loopexit:
185; CHECK-NEXT:    ret void
186;
187entry:
188  br label %for.body
189
190for.body:
;; The exit test uses %iv, so the body runs for %iv = 0..4096 inclusive.
191  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
192  store i32 0, ptr %addr
193  %iv.next = add nuw nsw i64 %iv, 1
194  %exitcond = icmp eq i64 %iv, 4096
195  br i1 %exitcond, label %loopexit, label %for.body
196
197loopexit:
198  ret void
199}
200
;; Stores a value that changes every iteration (the induction variable
;; truncated to i32) to the loop-invariant pointer %addr.  The autogenerated
;; assertions expect one scalar store per vector iteration, and the stored
;; value is the truncated index plus 15, i.e. the value belonging to the last
;; of the 16 scalar iterations covered by one vector iteration.
201define void @uniform_store_varying_value(ptr align(4) %addr) {
202; CHECK-LABEL: @uniform_store_varying_value(
203; CHECK-NEXT:  entry:
204; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
205; CHECK:       vector.ph:
206; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
207; CHECK:       vector.body:
208; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
209; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
210; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[TMP0]], 12
211; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[TMP0]], 13
212; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP0]], 14
213; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP0]], 15
214; CHECK-NEXT:    store i32 [[TMP7]], ptr [[ADDR:%.*]], align 4
215; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
216; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
217; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
218; CHECK:       middle.block:
219; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
220; CHECK:       scalar.ph:
221; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
222; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
223; CHECK:       for.body:
224; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
225; CHECK-NEXT:    [[IV_I32:%.*]] = trunc i64 [[IV]] to i32
226; CHECK-NEXT:    store i32 [[IV_I32]], ptr [[ADDR]], align 4
227; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
228; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
229; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
230; CHECK:       loopexit:
231; CHECK-NEXT:    ret void
232;
233entry:
234  br label %for.body
235
236for.body:
;; Every iteration overwrites the same location, so after the loop only the
;; value written by the last executed iteration remains at %addr.
237  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
238  %iv.i32 = trunc i64 %iv to i32
239  store i32 %iv.i32, ptr %addr
240  %iv.next = add nuw nsw i64 %iv, 1
241  %exitcond = icmp eq i64 %iv, 4096
242  br i1 %exitcond, label %loopexit, label %for.body
243
244loopexit:
245  ret void
246}
247
;; Read-modify-write of a single loop-invariant location: each iteration loads
;; from %addr, adds 1, and stores the result back to %addr, so every iteration
;; depends on the store of the previous one.  The autogenerated assertions show
;; the loop left in its original scalar form (no vector.ph / vector.body
;; blocks are expected).
248define void @uniform_rw(ptr align(4) %addr) {
249; CHECK-LABEL: @uniform_rw(
250; CHECK-NEXT:  entry:
251; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
252; CHECK:       for.body:
253; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
254; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[ADDR:%.*]], align 4
255; CHECK-NEXT:    [[INC:%.*]] = add i32 [[LOAD]], 1
256; CHECK-NEXT:    store i32 [[INC]], ptr [[ADDR]], align 4
257; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
258; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
259; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT:%.*]], label [[FOR_BODY]]
260; CHECK:       loopexit:
261; CHECK-NEXT:    ret void
262;
263entry:
264  br label %for.body
265
266for.body:
267  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
268  %load = load i32, ptr %addr
269  %inc = add i32 %load, 1
270  store i32 %inc, ptr %addr
271  %iv.next = add nuw nsw i64 %iv, 1
272  %exitcond = icmp eq i64 %iv, 4096
273  br i1 %exitcond, label %loopexit, label %for.body
274
275loopexit:
276  ret void
277}
278
;; Copies an i32 from the loop-invariant pointer %A to the loop-invariant
;; pointer %B on every iteration.  Nothing in the signature says %A and %B are
;; distinct, and the autogenerated assertions expect a vector.memcheck block
;; that tests the two 4-byte ranges for overlap and falls back to the scalar
;; loop on conflict.  In the vector loop the copy stays a scalar load/store
;; pair tagged with alias.scope / noalias metadata.
279define void @uniform_copy(ptr %A, ptr %B) {
280; CHECK-LABEL: @uniform_copy(
281; CHECK-NEXT:  entry:
282; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
283; CHECK:       vector.memcheck:
284; CHECK-NEXT:    [[UGLYGEP:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 4
285; CHECK-NEXT:    [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
286; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[B]], [[UGLYGEP1]]
287; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[UGLYGEP]]
288; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
289; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
290; CHECK:       vector.ph:
291; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
292; CHECK:       vector.body:
293; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
294; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !alias.scope [[META12:![0-9]+]]
295; CHECK-NEXT:    store i32 [[TMP0]], ptr [[B]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META12]]
296; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
297; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
298; CHECK-NEXT:    br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
299; CHECK:       middle.block:
300; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
301; CHECK:       scalar.ph:
302; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ENTRY:%.*]] ]
303; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
304; CHECK:       for.body:
305; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
306; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[A]], align 4
307; CHECK-NEXT:    store i32 [[LOAD]], ptr [[B]], align 4
308; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
309; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
310; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
311; CHECK:       loopexit:
312; CHECK-NEXT:    ret void
313;
314entry:
315  br label %for.body
316
317for.body:
;; Unlike the functions above, the pointer arguments here carry no align
;; attribute; the load and store also have no explicit alignment.
318  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
319  %load = load i32, ptr %A
320  store i32 %load, ptr %B
321  %iv.next = add nuw nsw i64 %iv, 1
322  %exitcond = icmp eq i64 %iv, 4096
323  br i1 %exitcond, label %loopexit, label %for.body
324
325loopexit:
326  ret void
327}
328
329
;; External helper with no body in this file; its only caller here is
;; @test_count_bits, which passes it a stack buffer.
330declare void @init(ptr)
331
332;; Count the number of bits set in a bit vector -- key point of relevance is
333;; that the byte load is uniform across 8 iterations at a time.
334;; TODO: At the moment, this is vectorized with VF=4 and UF=4. The load is
335;; considered uniform across VF=4, but should be considered uniform across
336;; VF=8/VF=4,UF=2.
337define i32 @test_count_bits(ptr %test_base) {
338; CHECK-LABEL: @test_count_bits(
339; CHECK-NEXT:  entry:
340; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [4096 x i32], align 4
341; CHECK-NEXT:    call void @init(ptr [[ALLOCA]])
342; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
343; CHECK:       vector.ph:
344; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
345; CHECK:       vector.body:
346; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
347; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
348; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
349; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
350; CHECK-NEXT:    [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
351; CHECK-NEXT:    [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
352; CHECK-NEXT:    [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 4)
353; CHECK-NEXT:    [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 4)
354; CHECK-NEXT:    [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 4)
355; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
356; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
357; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 8
358; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 12
359; CHECK-NEXT:    [[TMP4:%.*]] = udiv i64 [[TMP0]], 8
360; CHECK-NEXT:    [[TMP5:%.*]] = udiv i64 [[TMP1]], 8
361; CHECK-NEXT:    [[TMP6:%.*]] = udiv i64 [[TMP2]], 8
362; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP3]], 8
363; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE:%.*]], i64 [[TMP4]]
364; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP5]]
365; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP6]]
366; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[TMP7]]
367; CHECK-NEXT:    [[TMP12:%.*]] = load i8, ptr [[TMP8]], align 1
368; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i64 0
369; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
370; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP9]], align 1
371; CHECK-NEXT:    [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i8> poison, i8 [[TMP13]], i64 0
372; CHECK-NEXT:    [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT7]], <4 x i8> poison, <4 x i32> zeroinitializer
373; CHECK-NEXT:    [[TMP14:%.*]] = load i8, ptr [[TMP10]], align 1
374; CHECK-NEXT:    [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[TMP14]], i64 0
375; CHECK-NEXT:    [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer
376; CHECK-NEXT:    [[TMP15:%.*]] = load i8, ptr [[TMP11]], align 1
377; CHECK-NEXT:    [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <4 x i8> poison, i8 [[TMP15]], i64 0
378; CHECK-NEXT:    [[BROADCAST_SPLAT12:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT11]], <4 x i8> poison, <4 x i32> zeroinitializer
379; CHECK-NEXT:    [[TMP16:%.*]] = urem <4 x i64> [[VEC_IND]], splat (i64 8)
380; CHECK-NEXT:    [[TMP17:%.*]] = urem <4 x i64> [[STEP_ADD]], splat (i64 8)
381; CHECK-NEXT:    [[TMP18:%.*]] = urem <4 x i64> [[STEP_ADD_2]], splat (i64 8)
382; CHECK-NEXT:    [[TMP19:%.*]] = urem <4 x i64> [[STEP_ADD_3]], splat (i64 8)
383; CHECK-NEXT:    [[TMP20:%.*]] = trunc <4 x i64> [[TMP16]] to <4 x i8>
384; CHECK-NEXT:    [[TMP21:%.*]] = trunc <4 x i64> [[TMP17]] to <4 x i8>
385; CHECK-NEXT:    [[TMP22:%.*]] = trunc <4 x i64> [[TMP18]] to <4 x i8>
386; CHECK-NEXT:    [[TMP23:%.*]] = trunc <4 x i64> [[TMP19]] to <4 x i8>
387; CHECK-NEXT:    [[TMP24:%.*]] = lshr <4 x i8> [[BROADCAST_SPLAT]], [[TMP20]]
388; CHECK-NEXT:    [[TMP25:%.*]] = lshr <4 x i8> [[BROADCAST_SPLAT8]], [[TMP21]]
389; CHECK-NEXT:    [[TMP26:%.*]] = lshr <4 x i8> [[BROADCAST_SPLAT10]], [[TMP22]]
390; CHECK-NEXT:    [[TMP27:%.*]] = lshr <4 x i8> [[BROADCAST_SPLAT12]], [[TMP23]]
391; CHECK-NEXT:    [[TMP28:%.*]] = and <4 x i8> [[TMP24]], splat (i8 1)
392; CHECK-NEXT:    [[TMP29:%.*]] = and <4 x i8> [[TMP25]], splat (i8 1)
393; CHECK-NEXT:    [[TMP30:%.*]] = and <4 x i8> [[TMP26]], splat (i8 1)
394; CHECK-NEXT:    [[TMP31:%.*]] = and <4 x i8> [[TMP27]], splat (i8 1)
395; CHECK-NEXT:    [[TMP32:%.*]] = zext <4 x i8> [[TMP28]] to <4 x i32>
396; CHECK-NEXT:    [[TMP33:%.*]] = zext <4 x i8> [[TMP29]] to <4 x i32>
397; CHECK-NEXT:    [[TMP34:%.*]] = zext <4 x i8> [[TMP30]] to <4 x i32>
398; CHECK-NEXT:    [[TMP35:%.*]] = zext <4 x i8> [[TMP31]] to <4 x i32>
399; CHECK-NEXT:    [[TMP36]] = add <4 x i32> [[VEC_PHI]], [[TMP32]]
400; CHECK-NEXT:    [[TMP37]] = add <4 x i32> [[VEC_PHI4]], [[TMP33]]
401; CHECK-NEXT:    [[TMP38]] = add <4 x i32> [[VEC_PHI5]], [[TMP34]]
402; CHECK-NEXT:    [[TMP39]] = add <4 x i32> [[VEC_PHI6]], [[TMP35]]
403; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
404; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 4)
405; CHECK-NEXT:    [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
406; CHECK-NEXT:    br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
407; CHECK:       middle.block:
408; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP37]], [[TMP36]]
409; CHECK-NEXT:    [[BIN_RDX13:%.*]] = add <4 x i32> [[TMP38]], [[BIN_RDX]]
410; CHECK-NEXT:    [[BIN_RDX14:%.*]] = add <4 x i32> [[TMP39]], [[BIN_RDX13]]
411; CHECK-NEXT:    [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX14]])
412; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
413; CHECK:       scalar.ph:
414; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
415; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP41]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
416; CHECK-NEXT:    br label [[LOOP:%.*]]
417; CHECK:       loop:
418; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
419; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ACCUM_NEXT:%.*]], [[LOOP]] ]
420; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
421; CHECK-NEXT:    [[BYTE:%.*]] = udiv i64 [[IV]], 8
422; CHECK-NEXT:    [[TEST_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TEST_BASE]], i64 [[BYTE]]
423; CHECK-NEXT:    [[EARLYCND:%.*]] = load i8, ptr [[TEST_ADDR]], align 1
424; CHECK-NEXT:    [[BIT:%.*]] = urem i64 [[IV]], 8
425; CHECK-NEXT:    [[BIT_TRUNC:%.*]] = trunc i64 [[BIT]] to i8
426; CHECK-NEXT:    [[MASK:%.*]] = lshr i8 [[EARLYCND]], [[BIT_TRUNC]]
427; CHECK-NEXT:    [[TEST:%.*]] = and i8 [[MASK]], 1
428; CHECK-NEXT:    [[VAL:%.*]] = zext i8 [[TEST]] to i32
429; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[VAL]]
430; CHECK-NEXT:    [[EXIT:%.*]] = icmp ugt i64 [[IV]], 4094
431; CHECK-NEXT:    br i1 [[EXIT]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP20:![0-9]+]]
432; CHECK:       loop_exit:
433; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[LOOP]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
434; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
435;
436entry:
;; %alloca is only handed to @init; the loop below reads from %test_base and
;; never touches the stack buffer.
437  %alloca = alloca [4096 x i32]
438  call void @init(ptr %alloca)
439  br label %loop
440loop:
;; Per iteration: %byte = %iv / 8 selects the byte to load and %bit = %iv % 8
;; selects the bit within it.  The byte is shifted right by %bit, masked with
;; 1, zero-extended and added to %accum.
441  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
442  %accum = phi i32 [ 0, %entry ], [ %accum.next, %loop ]
443  %iv.next = add i64 %iv, 1
444  %byte = udiv i64 %iv, 8
445  %test_addr = getelementptr inbounds i8, ptr %test_base, i64 %byte
446  %earlycnd = load i8, ptr %test_addr
447  %bit = urem i64 %iv, 8
448  %bit.trunc = trunc i64 %bit to i8
449  %mask = lshr i8 %earlycnd, %bit.trunc
450  %test = and i8 %mask, 1
451  %val = zext i8 %test to i32
452  %accum.next = add i32 %accum, %val
;; Exit once %iv exceeds 4094, i.e. the body runs for %iv = 0..4095.  This is
;; a multiple of 16, and the expected middle.block branches straight to
;; loop_exit.
453  %exit = icmp ugt i64 %iv, 4094
454  br i1 %exit, label %loop_exit, label %loop
455
456loop_exit:
457  ret i32 %accum.next
458}
459
460;; Same as uniform_load, but show that the uniformity analysis can handle
461;; pointer operands which are not local to the function.
462@GAddr = external global i32 align 4
463define i32 @uniform_load_global() {
464; CHECK-LABEL: @uniform_load_global(
465; CHECK-NEXT:  entry:
466; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
467; CHECK:       vector.ph:
468; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
469; CHECK:       vector.body:
470; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
471; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
472; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
473; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
474; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
475; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr @GAddr, align 4
476; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
477; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
478; CHECK-NEXT:    [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
479; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT9]]
480; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT9]]
481; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
482; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
483; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
484; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
485; CHECK:       middle.block:
486; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
487; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]]
488; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]]
489; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
490; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
491; CHECK:       scalar.ph:
492; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
493; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
494; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
495; CHECK:       for.body:
496; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
497; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
498; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr @GAddr, align 4
499; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
500; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
501; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
502; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
503; CHECK:       loopexit:
504; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
505; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
506;
507entry:
508  br label %for.body
509
510for.body:
;; Sum reduction over a load whose pointer operand is the global @GAddr rather
;; than a function argument.  The expected vector body has the same shape as
;; for @uniform_load2: one scalar load, splatted and added to four partial
;; accumulators.
511  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
512  %accum = phi i32 [%accum.next, %for.body], [0, %entry]
513  %load = load i32, ptr @GAddr
514  %accum.next = add i32 %accum, %load
515  %iv.next = add nuw nsw i64 %iv, 1
516  %exitcond = icmp eq i64 %iv, 4096
517  br i1 %exitcond, label %loopexit, label %for.body
518
519loopexit:
520  ret i32 %accum.next
521}
522
523;; Same as the global case, but using a constexpr
524define i32 @uniform_load_constexpr() {
525; CHECK-LABEL: @uniform_load_constexpr(
526; CHECK-NEXT:  entry:
527; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
528; CHECK:       vector.ph:
529; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
530; CHECK:       vector.body:
531; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
532; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
533; CHECK-NEXT:    [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
534; CHECK-NEXT:    [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
535; CHECK-NEXT:    [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
536; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4
537; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i64 0
538; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
539; CHECK-NEXT:    [[TMP1]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT9]]
540; CHECK-NEXT:    [[TMP2]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT9]]
541; CHECK-NEXT:    [[TMP3]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT9]]
542; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
543; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
544; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
545; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
546; CHECK:       middle.block:
547; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP2]], [[TMP1]]
548; CHECK-NEXT:    [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP3]], [[BIN_RDX]]
549; CHECK-NEXT:    [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP4]], [[BIN_RDX10]]
550; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
551; CHECK-NEXT:    br i1 false, label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
552; CHECK:       scalar.ph:
553; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
554; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
555; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
556; CHECK:       for.body:
557; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
558; CHECK-NEXT:    [[ACCUM:%.*]] = phi i32 [ [[ACCUM_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
559; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4
560; CHECK-NEXT:    [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[LOAD]]
561; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
562; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
563; CHECK-NEXT:    br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
564; CHECK:       loopexit:
565; CHECK-NEXT:    [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
566; CHECK-NEXT:    ret i32 [[ACCUM_NEXT_LCSSA]]
567;
568entry:
569  br label %for.body
570
571for.body:                                         ; preds = %for.body, %entry
;; Sum reduction over a load whose pointer operand is a constant expression:
;; a getelementptr off @GAddr with i32 index 5.  The expected vector body
;; keeps the constant expression as the operand of a single scalar load.
572  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
573  %accum = phi i32 [ %accum.next, %for.body ], [ 0, %entry ]
574  %load = load i32, ptr getelementptr (i32, ptr @GAddr, i64 5), align 4
575  %accum.next = add i32 %accum, %load
576  %iv.next = add nuw nsw i64 %iv, 1
577  %exitcond = icmp eq i64 %iv, 4096
578  br i1 %exitcond, label %loopexit, label %for.body
579
580loopexit:                                         ; preds = %for.body
581  ret i32 %accum.next
582}
583