; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=VLENUNK
; RUN: opt < %s -passes=loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=-1 -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=VLEN128

; Note: +v implies a Zvl128b (i.e. minimal VLEN of 128), but as can be seen, we're currently
; not using that information unless an explicit vector width is set. (FIXME)

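; Illustrative sketch only (an assumption, not a RUN line of this test): the
; "explicit vector width" case the FIXME above refers to would be something like
; passing the flag already used above with a concrete value, e.g.
;   opt < %s -passes=loop-vectorize -scalable-vectorization=on \
;     -riscv-v-vector-bits-min=128 -mtriple riscv64-linux-gnu -mattr=+v,+f -S
; so that the cost model can rely on the 128-bit lower bound.
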
; A collection of fairly basic functional tests when both fixed and scalable vectorization are
; allowed.  The primary goal of this is to check for crashes during cost modeling, but it also
; exercises the default heuristics in a useful way.

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"

define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; VLENUNK-LABEL: @vector_add(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK:       vector.ph:
; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLENUNK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLENUNK:       vector.body:
; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLENUNK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
; VLENUNK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; VLENUNK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
; VLENUNK-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VLENUNK-NEXT:    store <vscale x 2 x i64> [[TMP7]], ptr [[TMP6]], align 8
; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; VLENUNK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VLENUNK:       middle.block:
; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLENUNK:       scalar.ph:
; VLENUNK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLENUNK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V]]
; VLENUNK-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @vector_add(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLEN128-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
; VLEN128-NEXT:    [[TMP7:%.*]] = add <vscale x 2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VLEN128-NEXT:    store <vscale x 2 x i64> [[TMP7]], ptr [[TMP6]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; VLEN128-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLEN128-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V]]
; VLEN128-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  %elem = load i64, ptr %arrayidx
  %add = add i64 %elem, %v
  store i64 %add, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

; Same as above, but with op type of i32.  We currently have a bug around
; etype=ELEN profitability in the vectorizer, and having a smaller element
; width test allows us to highlight different aspects of codegen.
define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
; VLENUNK-LABEL: @vector_add_i32(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK:       vector.ph:
; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLENUNK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLENUNK:       vector.body:
; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLENUNK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
; VLENUNK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
; VLENUNK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
; VLENUNK-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VLENUNK-NEXT:    store <vscale x 4 x i32> [[TMP7]], ptr [[TMP6]], align 4
; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; VLENUNK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VLENUNK:       middle.block:
; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLENUNK:       scalar.ph:
; VLENUNK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; VLENUNK-NEXT:    [[ELEM:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; VLENUNK-NEXT:    [[ADD:%.*]] = add i32 [[ELEM]], [[V]]
; VLENUNK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @vector_add_i32(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLEN128-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 4
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[V:%.*]], i64 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP6]], align 4
; VLEN128-NEXT:    [[TMP7:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VLEN128-NEXT:    store <vscale x 4 x i32> [[TMP7]], ptr [[TMP6]], align 4
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; VLEN128-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLEN128-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    [[ELEM:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; VLEN128-NEXT:    [[ADD:%.*]] = add i32 [[ELEM]], [[V]]
; VLEN128-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
  %elem = load i32, ptr %arrayidx
  %add = add i32 %elem, %v
  store i32 %add, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}


; a[b[i]] += v, mostly to exercise scatter/gather costing
; TODO: Currently fails to vectorize due to a memory conflict
define void @indexed_add(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; VLENUNK-LABEL: @indexed_add(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
; VLENUNK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
; VLENUNK-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V:%.*]]
; VLENUNK-NEXT:    store i64 [[ADD]], ptr [[AADDR]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @indexed_add(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
; VLEN128-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLEN128-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
; VLEN128-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
; VLEN128-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V:%.*]]
; VLEN128-NEXT:    store i64 [[ADD]], ptr [[AADDR]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %baddr = getelementptr inbounds i64, ptr %b, i64 %iv
  %aidx = load i64, ptr %baddr
  %aaddr = getelementptr inbounds i64, ptr %a, i64 %aidx
  %elem = load i64, ptr %aaddr
  %add = add i64 %elem, %v
  store i64 %add, ptr %aaddr
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

; a[b[i]] = v, exercise scatter support
define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; VLENUNK-LABEL: @indexed_store(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK:       vector.ph:
; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLENUNK-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLENUNK:       vector.body:
; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLENUNK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
; VLENUNK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; VLENUNK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
; VLENUNK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
; VLENUNK-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> splat (i1 true))
; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; VLENUNK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VLENUNK:       middle.block:
; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLENUNK:       scalar.ph:
; VLENUNK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
; VLENUNK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
; VLENUNK-NEXT:    store i64 [[V]], ptr [[AADDR]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @indexed_store(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLEN128-NEXT:    [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP9:%.*]] = mul i64 [[TMP8]], 2
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
; VLEN128-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
; VLEN128-NEXT:    call void @llvm.masked.scatter.nxv2i64.nxv2p0(<vscale x 2 x i64> [[BROADCAST_SPLAT]], <vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> splat (i1 true))
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
; VLEN128-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLEN128-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
; VLEN128-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLEN128-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
; VLEN128-NEXT:    store i64 [[V]], ptr [[AADDR]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %baddr = getelementptr inbounds i64, ptr %b, i64 %iv
  %aidx = load i64, ptr %baddr
  %aaddr = getelementptr inbounds i64, ptr %a, i64 %aidx
  store i64 %v, ptr %aaddr
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; VLENUNK-LABEL: @indexed_load(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK:       vector.ph:
; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLENUNK-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 2
; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLENUNK:       vector.body:
; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLENUNK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP4]]
; VLENUNK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; VLENUNK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP6]], align 8
; VLENUNK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
; VLENUNK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP7]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
; VLENUNK-NEXT:    [[TMP8]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
; VLENUNK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VLENUNK:       middle.block:
; VLENUNK-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP8]])
; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLENUNK:       scalar.ph:
; VLENUNK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLENUNK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
; VLENUNK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
; VLENUNK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; VLENUNK-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
;
; VLEN128-LABEL: @indexed_load(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLEN128-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP6]]
; VLEN128-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 0
; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64>, ptr [[TMP8]], align 8
; VLEN128-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <vscale x 2 x i64> [[WIDE_LOAD]]
; VLEN128-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64.nxv2p0(<vscale x 2 x ptr> [[TMP9]], i32 8, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i64> poison)
; VLEN128-NEXT:    [[TMP10]] = add <vscale x 2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; VLEN128-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLEN128-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP10]])
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP12]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
; VLEN128-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLEN128-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
; VLEN128-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
; VLEN128-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi i64 [0, %entry], [%sum.next, %for.body]
  %baddr = getelementptr inbounds i64, ptr %b, i64 %iv
  %aidx = load i64, ptr %baddr
  %aaddr = getelementptr inbounds i64, ptr %a, i64 %aidx
  %elem = load i64, ptr %aaddr
  %iv.next = add nuw nsw i64 %iv, 1
  %sum.next = add i64 %sum, %elem
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret i64 %sum.next
}

define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; VLENUNK-LABEL: @splat_int(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK:       vector.ph:
; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLENUNK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLENUNK:       vector.body:
; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLENUNK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
; VLENUNK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; VLENUNK-NEXT:    store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; VLENUNK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VLENUNK:       middle.block:
; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLENUNK:       scalar.ph:
; VLENUNK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLENUNK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @splat_int(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLEN128-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[V:%.*]], i64 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
; VLEN128-NEXT:    store <vscale x 2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; VLEN128-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLEN128-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store i64 %v, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
; VLENUNK-LABEL: @splat_ptr(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK:       vector.ph:
; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLENUNK-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[V:%.*]], i64 0
; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLENUNK:       vector.body:
; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLENUNK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
; VLENUNK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
; VLENUNK-NEXT:    store <vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; VLENUNK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VLENUNK:       middle.block:
; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLENUNK:       scalar.ph:
; VLENUNK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLENUNK-NEXT:    store ptr [[V]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @splat_ptr(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLEN128-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 2
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x ptr> poison, ptr [[V:%.*]], i64 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 2 x ptr> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]]
; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds ptr, ptr [[TMP5]], i32 0
; VLEN128-NEXT:    store <vscale x 2 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; VLEN128-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLEN128-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    store ptr [[V]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store ptr %v, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}
