; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -hints-allow-reordering=false -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue < %s | FileCheck %s
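; This file checks SVE tail folding: with
; -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue, the loops
; below should be vectorized using an active lane mask (masked loads/stores,
; and gathers/scatters for non-contiguous accesses) rather than relying on a
; scalar epilogue to handle the remainder iterations.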

target triple = "aarch64-unknown-linux-gnu"


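; A simple memset-style loop; roughly equivalent C (illustrative):
;   i64 i = 0; do { ptr[i] = val; } while (++i < n);
; The bottom-tested form is why the checks start with umax(n, 1). The splat
; of %val should be stored through llvm.masked.store under the lane mask.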
define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-LABEL: @simple_memset(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT:    [[TMP13:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[TMP13]], i32 0
; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
; CHECK:       while.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK:       while.end.loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep = getelementptr i32, ptr %ptr, i64 %index
  store i32 %val, ptr %gep
  %index.next = add nsw i64 %index, 1
  %cmp10 = icmp ult i64 %index.next, %n
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit:                               ; preds = %while.body
  ret void
}


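; Same loop as @simple_memset, but the !3 loop metadata below requests
; vectorize.width 4 without the scalable flag, so the checks expect
; fixed-width <4 x i32>/<4 x i1> operations rather than <vscale x 4 x ...>.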
define void @simple_memset_v4i32(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-LABEL: @simple_memset_v4i32(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], 3
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[UMAX]], 4
; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i64 [[UMAX]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[VAL:%.*]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[TMP4]], i32 0
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], 4
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX1]], i64 [[TMP2]])
; CHECK-NEXT:    [[TMP6:%.*]] = xor <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
; CHECK:       while.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK:       while.end.loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep = getelementptr i32, ptr %ptr, i64 %index
  store i32 %val, ptr %gep
  %index.next = add nsw i64 %index, 1
  %cmp10 = icmp ult i64 %index.next, %n
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !3

while.end.loopexit:                               ; preds = %while.body
  ret void
}


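; An element-wise copy, dst[i] = src[i] (illustrative C). Both accesses are
; contiguous, so we expect a masked load feeding a masked store under the
; same active lane mask.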
define void @simple_memcpy(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-LABEL: @simple_memcpy(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
; CHECK:       while.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP1]], align 4
; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP2]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK:       while.end.loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep1 = getelementptr i32, ptr %src, i64 %index
  %val = load i32, ptr %gep1
  %gep2 = getelementptr i32, ptr %dst, i64 %index
  store i32 %val, ptr %gep2
  %index.next = add nsw i64 %index, 1
  %cmp10 = icmp ult i64 %index.next, %n
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit:                               ; preds = %while.body
  ret void
}


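; The induction variable steps by 4, so the accesses are strided rather than
; contiguous. The checks expect a vector induction built from
; llvm.stepvector, with the loads and stores widened to a masked
; gather/scatter over a vector of pointers.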
define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-LABEL: @copy_stride4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 4)
; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = sub i64 [[TMP4]], 1
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], [[TMP5]]
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP4]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[TMP6]], 4
; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
; CHECK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
; CHECK-NEXT:    [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]]
; CHECK-NEXT:    [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]])
; CHECK-NEXT:    [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
; CHECK-NEXT:    [[TMP15:%.*]] = mul <vscale x 4 x i64> [[TMP13]], splat (i64 4)
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP15]]
; CHECK-NEXT:    [[TMP18:%.*]] = mul i64 4, [[TMP7]]
; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP18]], i64 0
; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP7]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]])
; CHECK-NEXT:    [[TMP21:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <vscale x 4 x i1> [[TMP21]], i32 0
; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
; CHECK:       while.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP1]], align 4
; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP2]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 4
; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK:       while.end.loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep1 = getelementptr i32, ptr %src, i64 %index
  %val = load i32, ptr %gep1
  %gep2 = getelementptr i32, ptr %dst, i64 %index
  store i32 %val, ptr %gep2
  %index.next = add nsw i64 %index, 4
  %cmp10 = icmp ult i64 %index.next, %n
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit:                               ; preds = %while.body
  ret void
}


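; An indirect copy, dst[ind[i]] = src[ind[i]] (illustrative C). The index
; load is contiguous (masked load), while the indexed accesses become a
; masked gather and a masked scatter.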
define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noalias %ind, i64 %n) #0 {
; CHECK-LABEL: @simple_gather_scatter(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[IND:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT:    [[TMP15:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <vscale x 4 x i1> [[TMP15]], i32 0
; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
; CHECK:       while.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[IND]], i64 [[INDEX]]
; CHECK-NEXT:    [[IND_VAL:%.*]] = load i32, ptr [[GEP1]], align 4
; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[SRC]], i32 [[IND_VAL]]
; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP2]], align 4
; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i32, ptr [[DST]], i32 [[IND_VAL]]
; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP3]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK:       while.end.loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep1 = getelementptr i32, ptr %ind, i64 %index
  %ind_val = load i32, ptr %gep1
  %gep2 = getelementptr i32, ptr %src, i32 %ind_val
  %val = load i32, ptr %gep2
  %gep3 = getelementptr i32, ptr %dst, i32 %ind_val
  store i32 %val, ptr %gep3
  %index.next = add nsw i64 %index, 1
  %cmp10 = icmp ult i64 %index.next, %n
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit:                               ; preds = %while.body
  ret void
}


; The original loop had an unconditional uniform load. Let's make sure
; we don't artificially create new predicated blocks for the load.
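; Roughly equivalent C (illustrative): for (i = 0; i < n; i++) dst[i] = *src;
; The scalar load of *src stays unpredicated and its result is splatted.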
define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
; CHECK-LABEL: @uniform_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP2]]
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[SRC:%.*]], align 4
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP11]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0
; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[TMP14]], i32 0
; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[SRC]], align 4
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    store i32 [[VAL]], ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;

entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %val = load i32, ptr %src, align 4
  %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %indvars.iv
  store i32 %val, ptr %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body
  ret void
}


; The original loop had a conditional uniform load. In this case we actually
; do need to perform conditional loads and so we end up using a gather instead.
; However, we at least ensure the mask is the overlap of the loop predicate
; and the original condition.
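; Roughly equivalent C (illustrative):
;   for (i = 0; i < n; i++) dst[i] = cond[i] ? *src : 0;
; so the gather mask below is the lane mask ANDed (via select) with the
; cond[i] != 0 comparison.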
define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr noalias readonly %cond, i64 %n) #0 {
; CHECK-LABEL: @cond_uniform_load(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP2]]
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[SRC:%.*]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
; CHECK-NEXT:    [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP13]], splat (i1 true)
; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[PREDPHI]], ptr [[TMP17]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT:    [[TMP18:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <vscale x 4 x i1> [[TMP18]], i32 0
; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COND]], i64 [[INDEX]]
; CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP20]], 0
; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
; CHECK:       if.then:
; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[SRC]], align 4
; CHECK-NEXT:    br label [[IF_END]]
; CHECK:       if.end:
; CHECK-NEXT:    [[VAL_0:%.*]] = phi i32 [ [[TMP21]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT:    store i32 [[VAL_0]], ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;

entry:
  br label %for.body

for.body:                                         ; preds = %entry, %if.end
  %index = phi i64 [ %index.next, %if.end ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32, ptr %cond, i64 %index
  %0 = load i32, ptr %arrayidx, align 4
  %tobool.not = icmp eq i32 %0, 0
  br i1 %tobool.not, label %if.end, label %if.then

if.then:                                          ; preds = %for.body
  %1 = load i32, ptr %src, align 4
  br label %if.end

if.end:                                           ; preds = %if.then, %for.body
  %val.0 = phi i32 [ %1, %if.then ], [ 0, %for.body ]
  %arrayidx1 = getelementptr inbounds i32, ptr %dst, i64 %index
  store i32 %val.0, ptr %arrayidx1, align 4
  %index.next = add nuw i64 %index, 1
  %exitcond.not = icmp eq i64 %index.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %if.end
  ret void
}


; The original loop had an unconditional uniform store. Let's make sure
; we don't artificially create new predicated blocks for the store.
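; Roughly equivalent C (illustrative): for (i = 0; i < n; i++) *dst = src[i];
; The uniform store becomes a masked scatter to a splat of the destination
; pointer, so only active lanes write to it.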
define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 {
; CHECK-LABEL: @uniform_store(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP2]]
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[DST:%.*]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x ptr> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
; CHECK-NEXT:    [[TMP13:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[TMP13]], i32 0
; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    store i32 [[VAL]], ptr [[DST]], align 4
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK:       for.end:
; CHECK-NEXT:    ret void
;

entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv
  %val = load i32, ptr %arrayidx, align 4
  store i32 %val, ptr %dst, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0

for.end:                                          ; preds = %for.body
  ret void
}


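; dst[i] = src[i] / dst[i] in floating point. In LLVM IR, fdiv in the default
; FP environment is assumed not to trap, so the division itself can stay
; unpredicated; only the memory accesses need the lane mask.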
define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-LABEL: @simple_fdiv(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr float, ptr [[SRC:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr float, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr float, ptr [[TMP11]], i32 0
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr float, ptr [[TMP12]], i32 0
; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
; CHECK-NEXT:    [[TMP15:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
; CHECK-NEXT:    call void @llvm.masked.store.nxv4f32.p0(<vscale x 4 x float> [[TMP15]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT:    [[TMP16:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[TMP16]], i32 0
; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
; CHECK:       while.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr float, ptr [[SRC]], i64 [[INDEX]]
; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr float, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT:    [[VAL1:%.*]] = load float, ptr [[GEP1]], align 4
; CHECK-NEXT:    [[VAL2:%.*]] = load float, ptr [[GEP2]], align 4
; CHECK-NEXT:    [[RES:%.*]] = fdiv float [[VAL1]], [[VAL2]]
; CHECK-NEXT:    store float [[RES]], ptr [[GEP2]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK:       while.end.loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep1 = getelementptr float, ptr %src, i64 %index
  %gep2 = getelementptr float, ptr %dst, i64 %index
  %val1 = load float, ptr %gep1
  %val2 = load float, ptr %gep2
  %res = fdiv float %val1, %val2
  store float %res, ptr %gep2
  %index.next = add nsw i64 %index, 1
  %cmp10 = icmp ult i64 %index.next, %n
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit:                               ; preds = %while.body
  ret void
}

; Integer divides can throw exceptions; if we vectorize, we must ensure
; that speculated lanes don't fault.
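; The checks below therefore expect the inactive divisor lanes to be replaced
; with 1 via a select ahead of the udiv, so no disabled lane can divide by
; zero (or by a poison value).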
define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
; CHECK-LABEL: @simple_idiv(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT:    [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP2]]
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 4
; CHECK-NEXT:    [[TMP7:%.*]] = sub i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = icmp ugt i64 [[UMAX]], [[TMP6]]
; CHECK-NEXT:    [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[UMAX]])
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i32, ptr [[SRC:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT:    [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD2]], <vscale x 4 x i32> splat (i32 1)
; CHECK-NEXT:    [[TMP16:%.*]] = udiv <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[TMP15]]
; CHECK-NEXT:    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> [[TMP16]], ptr [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP4]]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
; CHECK-NEXT:    [[TMP17:%.*]] = xor <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], splat (i1 true)
; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <vscale x 4 x i1> [[TMP17]], i32 0
; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
; CHECK:       while.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[INDEX]]
; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i32, ptr [[DST]], i64 [[INDEX]]
; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[GEP1]], align 4
; CHECK-NEXT:    [[VAL2:%.*]] = load i32, ptr [[GEP2]], align 4
; CHECK-NEXT:    [[RES:%.*]] = udiv i32 [[VAL1]], [[VAL2]]
; CHECK-NEXT:    store i32 [[RES]], ptr [[GEP2]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK:       while.end.loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep1 = getelementptr i32, ptr %src, i64 %index
  %gep2 = getelementptr i32, ptr %dst, i64 %index
  %val1 = load i32, ptr %gep1
  %val2 = load i32, ptr %gep2
  %res = udiv i32 %val1, %val2
  store i32 %res, ptr %gep2
  %index.next = add nsw i64 %index, 1
  %cmp10 = icmp ult i64 %index.next, %n
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit:                               ; preds = %while.body
  ret void
}

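; With a known trip count of 1024 the vectorizer does not fold the tail:
; the checks expect an unpredicated vector body (a plain store, no lane
; mask) with a remainder check and scalar epilogue instead.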
define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 {
; CHECK-LABEL: @simple_memset_trip1024(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0
; CHECK-NEXT:    store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP8]], align 4
; CHECK-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP5]]
; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT:    br label [[WHILE_BODY:%.*]]
; CHECK:       while.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]]
; CHECK-NEXT:    store i32 [[VAL]], ptr [[GEP]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
; CHECK-NEXT:    [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK:       while.end.loopexit:
; CHECK-NEXT:    ret void
;
entry:
  br label %while.body

while.body:                                       ; preds = %while.body, %entry
  %index = phi i64 [ %index.next, %while.body ], [ 0, %entry ]
  %gep = getelementptr i32, ptr %ptr, i64 %index
  store i32 %val, ptr %gep
  %index.next = add nsw i64 %index, 1
  %cmp10 = icmp ult i64 %index.next, 1024
  br i1 %cmp10, label %while.body, label %while.end.loopexit, !llvm.loop !0

while.end.loopexit:                               ; preds = %while.body
  ret void
}

!0 = distinct !{!0, !1, !2}
!1 = !{!"llvm.loop.vectorize.width", i32 4}
!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!3 = distinct !{!3, !4}
!4 = !{!"llvm.loop.vectorize.width", i32 4}

attributes #0 = { "target-features"="+sve" vscale_range(1,16) }