; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=VLENUNK
; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -riscv-v-vector-bits-min=-1 -mtriple riscv64-linux-gnu -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=VLEN128

; Note: +v implies Zvl128b (i.e., a minimum VLEN of 128), but as can be seen, we're currently
; not using that information unless an explicit vector width is set. (FIXME)

; A collection of fairly basic functional tests when both fixed and scalable vectorization is
; allowed.  The primary goal of this is to check for crashes during cost modeling, but it also
; exercises the default heuristics in a useful way.

target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"

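; a[i] += v over a fixed trip count of 1024; the simplest load/add/store
; loop in this file.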
define void @vector_add(ptr noalias nocapture %a, i64 %v, i64 %n) {
; VLENUNK-LABEL: @vector_add(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V:%.*]]
; VLENUNK-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @vector_add(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
; VLEN128-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
; VLEN128-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VLEN128-NEXT:    [[TMP7:%.*]] = add <2 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
; VLEN128-NEXT:    store <2 x i64> [[TMP6]], ptr [[TMP4]], align 8
; VLEN128-NEXT:    store <2 x i64> [[TMP7]], ptr [[TMP5]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VLEN128-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V]]
; VLEN128-NEXT:    store i64 [[ADD]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  %elem = load i64, ptr %arrayidx
  %add = add i64 %elem, %v
  store i64 %add, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

; Same as above, but with an op type of i32.  We currently have a bug around
; etype=ELEN profitability in the vectorizer, and a test with a smaller element
; width allows us to highlight different aspects of codegen.
define void @vector_add_i32(ptr noalias nocapture %a, i32 %v, i64 %n) {
; VLENUNK-LABEL: @vector_add_i32(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; VLENUNK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLENUNK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLENUNK:       vector.ph:
; VLENUNK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; VLENUNK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLENUNK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i32 0
; VLENUNK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V]], i32 0
; VLENUNK-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; VLENUNK-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLENUNK:       vector.body:
; VLENUNK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLENUNK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLENUNK-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; VLENUNK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
; VLENUNK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; VLENUNK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; VLENUNK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
; VLENUNK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
; VLENUNK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
; VLENUNK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP12]], align 4
; VLENUNK-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
; VLENUNK-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 2
; VLENUNK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP14]]
; VLENUNK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i32>, ptr [[TMP15]], align 4
; VLENUNK-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VLENUNK-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
; VLENUNK-NEXT:    store <vscale x 2 x i32> [[TMP16]], ptr [[TMP12]], align 4
; VLENUNK-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
; VLENUNK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], 2
; VLENUNK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP19]]
; VLENUNK-NEXT:    store <vscale x 2 x i32> [[TMP17]], ptr [[TMP20]], align 4
; VLENUNK-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; VLENUNK-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
; VLENUNK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
; VLENUNK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VLENUNK:       middle.block:
; VLENUNK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLENUNK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLENUNK:       scalar.ph:
; VLENUNK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; VLENUNK-NEXT:    [[ELEM:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; VLENUNK-NEXT:    [[ADD:%.*]] = add i32 [[ELEM]], [[V]]
; VLENUNK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @vector_add_i32(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; VLEN128-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
; VLEN128-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; VLEN128-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
; VLEN128-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V:%.*]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[V]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT2]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; VLEN128-NEXT:    [[TMP7:%.*]] = add i64 [[TMP6]], 0
; VLEN128-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; VLEN128-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
; VLEN128-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
; VLEN128-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
; VLEN128-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP12]], align 4
; VLEN128-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
; VLEN128-NEXT:    [[TMP14:%.*]] = mul i32 [[TMP13]], 2
; VLEN128-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP14]]
; VLEN128-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 2 x i32>, ptr [[TMP15]], align 4
; VLEN128-NEXT:    [[TMP16:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
; VLEN128-NEXT:    [[TMP17:%.*]] = add <vscale x 2 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
; VLEN128-NEXT:    store <vscale x 2 x i32> [[TMP16]], ptr [[TMP12]], align 4
; VLEN128-NEXT:    [[TMP18:%.*]] = call i32 @llvm.vscale.i32()
; VLEN128-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], 2
; VLEN128-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 [[TMP19]]
; VLEN128-NEXT:    store <vscale x 2 x i32> [[TMP17]], ptr [[TMP20]], align 4
; VLEN128-NEXT:    [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
; VLEN128-NEXT:    [[TMP22:%.*]] = mul i64 [[TMP21]], 4
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
; VLEN128-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; VLEN128-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    [[ELEM:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
; VLEN128-NEXT:    [[ADD:%.*]] = add i32 [[ELEM]], [[V]]
; VLEN128-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX]], align 4
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %iv
  %elem = load i32, ptr %arrayidx
  %add = add i32 %elem, %v
  store i32 %add, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}


; a[b[i]] += v, mostly to exercise scatter/gather costing
; TODO: Currently fails to vectorize due to a memory conflict
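; (The conflict: b may contain repeated indices, so the store to a[b[i]] in one
; iteration can alias the load from a[b[j]] in another, and dependence analysis
; cannot rule that out for data-dependent addresses.)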
define void @indexed_add(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; VLENUNK-LABEL: @indexed_add(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
; VLENUNK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
; VLENUNK-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V:%.*]]
; VLENUNK-NEXT:    store i64 [[ADD]], ptr [[AADDR]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @indexed_add(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
; VLEN128-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLEN128-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
; VLEN128-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
; VLEN128-NEXT:    [[ADD:%.*]] = add i64 [[ELEM]], [[V:%.*]]
; VLEN128-NEXT:    store i64 [[ADD]], ptr [[AADDR]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %baddr = getelementptr inbounds i64, ptr %b, i64 %iv
  %aidx = load i64, ptr %baddr
  %aaddr = getelementptr inbounds i64, ptr %a, i64 %aidx
  %elem = load i64, ptr %aaddr
  %add = add i64 %elem, %v
  store i64 %add, ptr %aaddr
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

; a[b[i]] = v, exercise scatter support
define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; VLENUNK-LABEL: @indexed_store(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
; VLENUNK-NEXT:    store i64 [[V:%.*]], ptr [[AADDR]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @indexed_store(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP0]]
; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
; VLEN128-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <2 x i64> [[WIDE_LOAD]]
; VLEN128-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], <2 x i64> [[WIDE_LOAD1]]
; VLEN128-NEXT:    call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT]], <2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>)
; VLEN128-NEXT:    call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT3]], <2 x ptr> [[TMP7]], i32 8, <2 x i1> <i1 true, i1 true>)
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VLEN128-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
; VLEN128-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLEN128-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
; VLEN128-NEXT:    store i64 [[V]], ptr [[AADDR]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %baddr = getelementptr inbounds i64, ptr %b, i64 %iv
  %aidx = load i64, ptr %baddr
  %aaddr = getelementptr inbounds i64, ptr %a, i64 %aidx
  store i64 %v, ptr %aaddr
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

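; sum += a[b[i]], i.e. a gather feeding an integer add reduction.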
define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; VLENUNK-LABEL: @indexed_load(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLENUNK-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]]
; VLENUNK-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
;
; VLEN128-LABEL: @indexed_load(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP0]]
; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]]
; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; VLEN128-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
; VLEN128-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <2 x i64> [[WIDE_LOAD]]
; VLEN128-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], <2 x i64> [[WIDE_LOAD2]]
; VLEN128-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> undef)
; VLEN128-NEXT:    [[WIDE_MASKED_GATHER3:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[TMP7]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> undef)
; VLEN128-NEXT:    [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]]
; VLEN128-NEXT:    [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[WIDE_MASKED_GATHER3]]
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VLEN128-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]]
; VLEN128-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]]
; VLEN128-NEXT:    [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8
; VLEN128-NEXT:    [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]]
; VLEN128-NEXT:    [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]]
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; VLEN128-NEXT:    ret i64 [[SUM_NEXT_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %sum = phi i64 [0, %entry], [%sum.next, %for.body]
  %baddr = getelementptr inbounds i64, ptr %b, i64 %iv
  %aidx = load i64, ptr %baddr
  %aaddr = getelementptr inbounds i64, ptr %a, i64 %aidx
  %elem = load i64, ptr %aaddr
  %iv.next = add nuw nsw i64 %iv, 1
  %sum.next = add i64 %sum, %elem
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret i64 %sum.next
}

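; a[i] = v for a scalar integer v; stores a broadcast of a loop-invariant value.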
define void @splat_int(ptr noalias nocapture %a, i64 %v, i64 %n) {
; VLENUNK-LABEL: @splat_int(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    store i64 [[V:%.*]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @splat_int(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VLEN128-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store i64 %v, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

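; Same as @splat_int, but the stored value is a pointer rather than an integer.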
define void @splat_ptr(ptr noalias nocapture %a, ptr %v, i64 %n) {
; VLENUNK-LABEL: @splat_ptr(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    store ptr [[V:%.*]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @splat_ptr(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x ptr> poison, ptr [[V:%.*]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT]], <2 x ptr> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[V]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i32 0
; VLEN128-NEXT:    store <2 x ptr> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds ptr, ptr [[TMP2]], i32 2
; VLEN128-NEXT:    store <2 x ptr> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VLEN128-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    store ptr [[V]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store ptr %v, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

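; Every iteration stores v to the same address b as well as to a[i];
; exercises a store to a uniform (loop-invariant) address.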
define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; VLENUNK-LABEL: @uniform_store(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    store i64 [[V:%.*]], ptr [[B:%.*]], align 8
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @uniform_store(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
; VLEN128-NEXT:    store i64 [[V]], ptr [[B:%.*]], align 8
; VLEN128-NEXT:    store i64 [[V]], ptr [[B]], align 8
; VLEN128-NEXT:    store i64 [[V]], ptr [[B]], align 8
; VLEN128-NEXT:    store i64 [[V]], ptr [[B]], align 8
; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VLEN128-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    store i64 [[V]], ptr [[B]], align 8
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  store i64 %v, ptr %b, align 8
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store i64 %v, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

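; Same as @uniform_store, but the uniform store is only 1-byte aligned.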
define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) {
; VLENUNK-LABEL: @uniform_store_unaligned(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    store i64 [[V:%.*]], ptr [[B:%.*]], align 1
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    ret void
;
; VLEN128-LABEL: @uniform_store_unaligned(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
; VLEN128-NEXT:    store i64 [[V]], ptr [[B:%.*]], align 1
; VLEN128-NEXT:    store i64 [[V]], ptr [[B]], align 1
; VLEN128-NEXT:    store i64 [[V]], ptr [[B]], align 1
; VLEN128-NEXT:    store i64 [[V]], ptr [[B]], align 1
; VLEN128-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; VLEN128-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VLEN128-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    store i64 [[V]], ptr [[B]], align 1
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    ret void
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  store i64 %v, ptr %b, align 1
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store i64 %v, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret void
}

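; Every iteration reloads the same address b and stores the result to a[i];
; exercises a uniform (loop-invariant) load whose value is also live out.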
define i64 @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
; VLENUNK-LABEL: @uniform_load(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[V:%.*]] = load i64, ptr [[B:%.*]], align 8
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    ret i64 [[V_LCSSA]]
;
; VLEN128-LABEL: @uniform_load(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
; VLEN128-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 8
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[TMP3:%.*]] = load i64, ptr [[B]], align 8
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; VLEN128-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 2
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP7]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VLEN128-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[V:%.*]] = load i64, ptr [[B]], align 8
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
; VLEN128-NEXT:    ret i64 [[V_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %v = load i64, ptr %b, align 8
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store i64 %v, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret i64 %v
}

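; Same as @uniform_load, but the uniform load is only 1-byte aligned.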
define i64 @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %n) {
; VLENUNK-LABEL: @uniform_load_unaligned(
; VLENUNK-NEXT:  entry:
; VLENUNK-NEXT:    br label [[FOR_BODY:%.*]]
; VLENUNK:       for.body:
; VLENUNK-NEXT:    [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    [[V:%.*]] = load i64, ptr [[B:%.*]], align 1
; VLENUNK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
; VLENUNK-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLENUNK-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLENUNK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLENUNK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
; VLENUNK:       for.end:
; VLENUNK-NEXT:    [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ]
; VLENUNK-NEXT:    ret i64 [[V_LCSSA]]
;
; VLEN128-LABEL: @uniform_load_unaligned(
; VLEN128-NEXT:  entry:
; VLEN128-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; VLEN128:       vector.ph:
; VLEN128-NEXT:    br label [[VECTOR_BODY:%.*]]
; VLEN128:       vector.body:
; VLEN128-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VLEN128-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
; VLEN128-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 2
; VLEN128-NEXT:    [[TMP2:%.*]] = load i64, ptr [[B:%.*]], align 1
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[TMP3:%.*]] = load i64, ptr [[B]], align 1
; VLEN128-NEXT:    [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0
; VLEN128-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
; VLEN128-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
; VLEN128-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
; VLEN128-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8
; VLEN128-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 2
; VLEN128-NEXT:    store <2 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP7]], align 8
; VLEN128-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; VLEN128-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; VLEN128:       middle.block:
; VLEN128-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, 1024
; VLEN128-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; VLEN128:       scalar.ph:
; VLEN128-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; VLEN128-NEXT:    br label [[FOR_BODY:%.*]]
; VLEN128:       for.body:
; VLEN128-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; VLEN128-NEXT:    [[V:%.*]] = load i64, ptr [[B]], align 1
; VLEN128-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; VLEN128-NEXT:    store i64 [[V]], ptr [[ARRAYIDX]], align 8
; VLEN128-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; VLEN128-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; VLEN128-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; VLEN128:       for.end:
; VLEN128-NEXT:    [[V_LCSSA:%.*]] = phi i64 [ [[V]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
; VLEN128-NEXT:    ret i64 [[V_LCSSA]]
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %v = load i64, ptr %b, align 1
  %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv
  store i64 %v, ptr %arrayidx
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, 1024
  br i1 %exitcond.not, label %for.end, label %for.body

for.end:
  ret i64 %v
}
