; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V
; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F
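; The riscv-gather-scatter-lowering pass rewrites fixed-vector masked
; gathers/scatters whose addresses advance by a constant (or loop-invariant)
; stride into llvm.experimental.vp.strided.load/store, replacing the vector
; index induction variable with a scalar one. The second RUN line uses
; Zve32f (ELEN=32), so tests with 64-bit (pointer) elements keep their
; gathers/scatters under that configuration.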

%struct.foo = type { i32, i32, i32, i32 }

; void gather(signed char * __restrict  A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 5];
; }
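; B[i * 5] is a strided access: 32 i8 lanes, 5 bytes apart. The gather
; becomes a vp.strided.load with stride 5, and the <32 x i64> vector
; induction variable collapses to a scalar one stepping by 32 * 5 = 160.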
define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @gather(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> splat (i1 true), i32 32)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    store <32 x i8> [[I4]], ptr [[I2]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: @gather_masked(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, i32 32)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> [[MASKEDOFF:%.*]], i32 32)
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    store <32 x i8> [[I4]], ptr [[I2]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @gather_negative_stride(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 155, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 -5, <32 x i1> splat (i1 true), i32 32)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    store <32 x i8> [[I4]], ptr [[I2]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @gather_zero_stride(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 0, <32 x i1> splat (i1 true), i32 32)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    store <32 x i8> [[I4]], ptr [[I2]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; void scatter(signed char * __restrict  A, signed char * __restrict B) {
;   for (int i = 0; i < 1024; ++i)
;       A[i * 5] += B[i];
; }
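; A[i * 5] is both read and written, so the gather/scatter pair on A turns
; into a strided load followed by a strided store through the same pointer.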
define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @scatter(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I]], align 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> splat (i1 true), i32 32)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> splat (i1 true), i32 32)
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
; CHECK-NEXT:    [[I5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: @scatter_masked(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I]], align 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, i32 32)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> [[TMP1]], <32 x i8> [[MASKEDOFF:%.*]], i32 32)
; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, i32 32)
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160
; CHECK-NEXT:    [[I5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; void gather_pow2(signed char * __restrict  A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 4];
; }
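; The index here is formed with shl by 2 instead of a multiply; the pass
; still recognizes the stride. Four i32 elements per lane become a byte
; stride of 16 in the strided load.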
define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @gather_pow2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP1]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 1
; CHECK-NEXT:    [[I4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    store <8 x i32> [[I4]], ptr [[I2]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32
; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

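; The stride (1 << %shift) is only known at run time. The pass hoists the
; byte stride (stride * 4) out of the loop and steps the scalar induction
; variable by 8 << %shift per iteration.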
define void @gather_unknown_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %shift) {
; CHECK-LABEL: @gather_unknown_pow2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[STRIDE:%.*]] = shl i64 1, [[SHIFT:%.*]]
; CHECK-NEXT:    [[STEP:%.*]] = shl i64 8, [[SHIFT]]
; CHECK-NEXT:    [[TMP0:%.*]] = mul i64 [[STRIDE]], 4
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP1]], i64 [[TMP0]], <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP2]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 1
; CHECK-NEXT:    [[I4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    store <8 x i32> [[I4]], ptr [[I2]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], [[STEP]]
; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %.splatinsert = insertelement <8 x i64> poison, i64 %shift, i64 0
  %.splat = shufflevector <8 x i64> %.splatinsert, <8 x i64> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %vec.ind, %.splat
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

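; shl is not commutative: splat(%shift) << %vec.ind does not produce a
; strided index sequence, so the gather must be left unchanged.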
define void @negative_shl_non_commute(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, i64 %shift) {
; CHECK-LABEL: @negative_shl_non_commute(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[SHIFT:%.*]], i64 0
; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[I:%.*]] = shl nsw <8 x i64> [[DOTSPLAT]], [[VEC_IND]]
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], <8 x i64> [[I]]
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[I1]], i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 1
; CHECK-NEXT:    [[I4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    store <8 x i32> [[I4]], ptr [[I2]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]], splat (i64 8)
; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %.splatinsert = insertelement <8 x i64> poison, i64 %shift, i64 0
  %.splat = shufflevector <8 x i64> %.splatinsert, <8 x i64> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %.splat, %vec.ind
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; void scatter_pow2(signed char * __restrict  A, signed char * __restrict B) {
;   for (int i = 0; i < 1024; ++i)
;       A[i * 4] += B[i];
; }
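; As in @scatter, the read-modify-write of A becomes a strided load/store
; pair, here with a 16-byte stride from the power-of-two shift.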
define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @scatter_pow2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I]], align 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP1]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[I4:%.*]] = add <8 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I4]], ptr [[TMP0]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32
; CHECK-NEXT:    [[I5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i32, ptr %B, i64 %index
  %wide.load = load <8 x i32>, ptr %i, align 1
  %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i4 = add <8 x i32> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; struct foo {
;   int a, b, c, d;
; };
;
; void struct_gather(int * __restrict  A, struct foo * __restrict B) {
;   for (int i = 0; i < 1024; ++i)
;       A[i] += B[i].b;
; }
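; %struct.foo is 16 bytes, so successive B[i].b fields are 16 bytes apart.
; The body is unrolled by two, giving two strided loads whose scalar
; induction variables start at struct indices 0 and 8 and both step by 16.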
define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @struct_gather(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]], i32 1
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_FOO]], ptr [[B]], i64 [[VEC_IND_SCALAR1]], i32 1
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP2]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP1]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER9:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP3]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[I2]], align 4
; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds i32, ptr [[I2]], i64 8
; CHECK-NEXT:    [[WIDE_LOAD10:%.*]] = load <8 x i32>, ptr [[I4]], align 4
; CHECK-NEXT:    [[I6:%.*]] = add nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    [[I7:%.*]] = add nsw <8 x i32> [[WIDE_LOAD10]], [[WIDE_MASKED_GATHER9]]
; CHECK-NEXT:    store <8 x i32> [[I6]], ptr [[I2]], align 4
; CHECK-NEXT:    store <8 x i32> [[I7]], ptr [[I4]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 16
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 16
; CHECK-NEXT:    [[I10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I10]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %step.add = add <8 x i64> %vec.ind, splat (i64 8)
  %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
  %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = getelementptr inbounds i32, ptr %i2, i64 8
  %wide.load10 = load <8 x i32>, ptr %i4, align 4
  %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
  %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
  store <8 x i32> %i6, ptr %i2, align 4
  store <8 x i32> %i7, ptr %i4, align 4
  %index.next = add nuw i64 %index, 16
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16)
  %i10 = icmp eq i64 %index.next, 1024
  br i1 %i10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; void gather_unroll(int * __restrict  A, int * __restrict B) {
;   for (int i = 0; i < 1024; i += 4) {
;     A[i] += B[i * 4];
;     A[i+1] += B[(i+1) * 4];
;     A[i+2] += B[(i+2) * 4];
;     A[i+3] += B[(i+3) * 4];
;   }
; }
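; The interleaved, 4x-unrolled accesses become one strided access per
; statement: 64-byte strides into B and 16-byte strides into A, each with
; its own scalar induction variable (eight in total).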
define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @gather_unroll(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR3:%.*]] = phi i64 [ 4, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR5:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR6:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR7:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR9:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR10:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR11:%.*]] = phi i64 [ 12, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR12:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR13:%.*]] = phi i64 [ 3, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR14:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP0]], i64 64, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP1]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]]
; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP2]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER52:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP3]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[I3:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER52]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I3]], ptr [[TMP2]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR3]]
; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP4]], i64 64, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER53:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP5]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR5]]
; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP6]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER54:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP7]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[I8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER54]], [[WIDE_MASKED_GATHER53]]
; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I8]], ptr [[TMP6]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR7]]
; CHECK-NEXT:    [[TMP9:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP8]], i64 64, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER55:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP9]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR9]]
; CHECK-NEXT:    [[TMP11:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP10]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER56:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP11]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[I13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER56]], [[WIDE_MASKED_GATHER55]]
; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I13]], ptr [[TMP10]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[B]], i64 [[VEC_IND_SCALAR11]]
; CHECK-NEXT:    [[TMP13:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP12]], i64 64, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER57:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP13]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[A]], i64 [[VEC_IND_SCALAR13]]
; CHECK-NEXT:    [[TMP15:%.*]] = call <8 x i32> @llvm.experimental.vp.strided.load.v8i32.p0.i64(ptr [[TMP14]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[WIDE_MASKED_GATHER58:%.*]] = call <8 x i32> @llvm.vp.select.v8i32(<8 x i1> splat (i1 true), <8 x i32> [[TMP15]], <8 x i32> undef, i32 8)
; CHECK-NEXT:    [[I18:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER58]], [[WIDE_MASKED_GATHER57]]
; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v8i32.p0.i64(<8 x i32> [[I18]], ptr [[TMP14]], i64 16, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 128
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR4]] = add i64 [[VEC_IND_SCALAR3]], 128
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR6]] = add i64 [[VEC_IND_SCALAR5]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR8]] = add i64 [[VEC_IND_SCALAR7]], 128
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR10]] = add i64 [[VEC_IND_SCALAR9]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR12]] = add i64 [[VEC_IND_SCALAR11]], 128
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR14]] = add i64 [[VEC_IND_SCALAR13]], 32
; CHECK-NEXT:    [[I19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT:    br i1 [[I19]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true))
  %i4 = or disjoint <8 x i64> %vec.ind, splat (i64 1)
  %i5 = shl nsw <8 x i64> %i4, splat (i64 2)
  %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true))
  %i9 = or disjoint <8 x i64> %vec.ind, splat (i64 2)
  %i10 = shl nsw <8 x i64> %i9, splat (i64 2)
  %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true))
  %i14 = or disjoint <8 x i64> %vec.ind, splat (i64 3)
  %i15 = shl nsw <8 x i64> %i14, splat (i64 2)
  %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i19 = icmp eq i64 %index.next, 256
  br i1 %i19, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>)
declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
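; With +v, the <2 x ptr> gathers become strided loads with a 40-byte stride
; (stride 5 times 8-byte pointers). Zve32f cannot hold 64-bit elements, so
; there the masked.gather calls are left as-is.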
define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: @gather_of_pointers(
; V-NEXT:  bb:
; V-NEXT:    br label [[BB2:%.*]]
; V:       bb2:
; V-NEXT:    [[I:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[I15:%.*]], [[BB2]] ]
; V-NEXT:    [[I3_SCALAR:%.*]] = phi i64 [ 0, [[BB]] ], [ [[I16_SCALAR:%.*]], [[BB2]] ]
; V-NEXT:    [[I3_SCALAR1:%.*]] = phi i64 [ 10, [[BB]] ], [ [[I16_SCALAR2:%.*]], [[BB2]] ]
; V-NEXT:    [[TMP0:%.*]] = getelementptr ptr, ptr [[ARG1:%.*]], i64 [[I3_SCALAR]]
; V-NEXT:    [[TMP1:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[I3_SCALAR1]]
; V-NEXT:    [[TMP2:%.*]] = call <2 x ptr> @llvm.experimental.vp.strided.load.v2p0.p0.i64(ptr [[TMP0]], i64 40, <2 x i1> splat (i1 true), i32 2)
; V-NEXT:    [[I9:%.*]] = call <2 x ptr> @llvm.vp.select.v2p0(<2 x i1> splat (i1 true), <2 x ptr> [[TMP2]], <2 x ptr> undef, i32 2)
; V-NEXT:    [[TMP3:%.*]] = call <2 x ptr> @llvm.experimental.vp.strided.load.v2p0.p0.i64(ptr [[TMP1]], i64 40, <2 x i1> splat (i1 true), i32 2)
; V-NEXT:    [[I10:%.*]] = call <2 x ptr> @llvm.vp.select.v2p0(<2 x i1> splat (i1 true), <2 x ptr> [[TMP3]], <2 x ptr> undef, i32 2)
; V-NEXT:    [[I11:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], i64 [[I]]
; V-NEXT:    store <2 x ptr> [[I9]], ptr [[I11]], align 8
; V-NEXT:    [[I13:%.*]] = getelementptr inbounds ptr, ptr [[I11]], i64 2
; V-NEXT:    store <2 x ptr> [[I10]], ptr [[I13]], align 8
; V-NEXT:    [[I15]] = add nuw i64 [[I]], 4
; V-NEXT:    [[I16_SCALAR]] = add i64 [[I3_SCALAR]], 20
; V-NEXT:    [[I16_SCALAR2]] = add i64 [[I3_SCALAR1]], 20
; V-NEXT:    [[I17:%.*]] = icmp eq i64 [[I15]], 1024
; V-NEXT:    br i1 [[I17]], label [[BB18:%.*]], label [[BB2]]
; V:       bb18:
; V-NEXT:    ret void
;
; ZVE32F-LABEL: @gather_of_pointers(
; ZVE32F-NEXT:  bb:
; ZVE32F-NEXT:    br label [[BB2:%.*]]
; ZVE32F:       bb2:
; ZVE32F-NEXT:    [[I:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[I15:%.*]], [[BB2]] ]
; ZVE32F-NEXT:    [[I3:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[BB]] ], [ [[I16:%.*]], [[BB2]] ]
; ZVE32F-NEXT:    [[I4:%.*]] = mul nuw nsw <2 x i64> [[I3]], splat (i64 5)
; ZVE32F-NEXT:    [[I5:%.*]] = mul <2 x i64> [[I3]], splat (i64 5)
; ZVE32F-NEXT:    [[I6:%.*]] = add <2 x i64> [[I5]], splat (i64 10)
; ZVE32F-NEXT:    [[I7:%.*]] = getelementptr inbounds ptr, ptr [[ARG1:%.*]], <2 x i64> [[I4]]
; ZVE32F-NEXT:    [[I8:%.*]] = getelementptr inbounds ptr, ptr [[ARG1]], <2 x i64> [[I6]]
; ZVE32F-NEXT:    [[I9:%.*]] = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> [[I7]], i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
; ZVE32F-NEXT:    [[I10:%.*]] = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> [[I8]], i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
; ZVE32F-NEXT:    [[I11:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], i64 [[I]]
; ZVE32F-NEXT:    store <2 x ptr> [[I9]], ptr [[I11]], align 8
; ZVE32F-NEXT:    [[I13:%.*]] = getelementptr inbounds ptr, ptr [[I11]], i64 2
; ZVE32F-NEXT:    store <2 x ptr> [[I10]], ptr [[I13]], align 8
; ZVE32F-NEXT:    [[I15]] = add nuw i64 [[I]], 4
; ZVE32F-NEXT:    [[I16]] = add <2 x i64> [[I3]], splat (i64 4)
; ZVE32F-NEXT:    [[I17:%.*]] = icmp eq i64 [[I15]], 1024
; ZVE32F-NEXT:    br i1 [[I17]], label [[BB18:%.*]], label [[BB2]]
; ZVE32F:       bb18:
; ZVE32F-NEXT:    ret void
;
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i5 = mul <2 x i64> %i3, splat (i64 5)
  %i6 = add <2 x i64> %i5, <i64 10, i64 10>
  %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
  %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
  %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
  store <2 x ptr> %i9, ptr %i11, align 8
  %i13 = getelementptr inbounds ptr, ptr %i11, i64 2
  store <2 x ptr> %i10, ptr %i13, align 8
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
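; The scatter analogue of the test above: +v forms 40-byte strided stores,
; while Zve32f keeps the masked.scatter calls.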
define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: @scatter_of_pointers(
; V-NEXT:  bb:
; V-NEXT:    br label [[BB2:%.*]]
; V:       bb2:
; V-NEXT:    [[I:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[I15:%.*]], [[BB2]] ]
; V-NEXT:    [[I3_SCALAR:%.*]] = phi i64 [ 0, [[BB]] ], [ [[I16_SCALAR:%.*]], [[BB2]] ]
; V-NEXT:    [[I3_SCALAR1:%.*]] = phi i64 [ 10, [[BB]] ], [ [[I16_SCALAR2:%.*]], [[BB2]] ]
; V-NEXT:    [[I4:%.*]] = getelementptr inbounds ptr, ptr [[ARG1:%.*]], i64 [[I]]
; V-NEXT:    [[I6:%.*]] = load <2 x ptr>, ptr [[I4]], align 8
; V-NEXT:    [[I7:%.*]] = getelementptr inbounds ptr, ptr [[I4]], i64 2
; V-NEXT:    [[I9:%.*]] = load <2 x ptr>, ptr [[I7]], align 8
; V-NEXT:    [[TMP0:%.*]] = getelementptr ptr, ptr [[ARG:%.*]], i64 [[I3_SCALAR]]
; V-NEXT:    [[TMP1:%.*]] = getelementptr ptr, ptr [[ARG]], i64 [[I3_SCALAR1]]
; V-NEXT:    call void @llvm.experimental.vp.strided.store.v2p0.p0.i64(<2 x ptr> [[I6]], ptr [[TMP0]], i64 40, <2 x i1> splat (i1 true), i32 2)
; V-NEXT:    call void @llvm.experimental.vp.strided.store.v2p0.p0.i64(<2 x ptr> [[I9]], ptr [[TMP1]], i64 40, <2 x i1> splat (i1 true), i32 2)
; V-NEXT:    [[I15]] = add nuw i64 [[I]], 4
; V-NEXT:    [[I16_SCALAR]] = add i64 [[I3_SCALAR]], 20
; V-NEXT:    [[I16_SCALAR2]] = add i64 [[I3_SCALAR1]], 20
; V-NEXT:    [[I17:%.*]] = icmp eq i64 [[I15]], 1024
; V-NEXT:    br i1 [[I17]], label [[BB18:%.*]], label [[BB2]]
; V:       bb18:
; V-NEXT:    ret void
;
; ZVE32F-LABEL: @scatter_of_pointers(
; ZVE32F-NEXT:  bb:
; ZVE32F-NEXT:    br label [[BB2:%.*]]
; ZVE32F:       bb2:
; ZVE32F-NEXT:    [[I:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[I15:%.*]], [[BB2]] ]
; ZVE32F-NEXT:    [[I3:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[BB]] ], [ [[I16:%.*]], [[BB2]] ]
; ZVE32F-NEXT:    [[I4:%.*]] = getelementptr inbounds ptr, ptr [[ARG1:%.*]], i64 [[I]]
; ZVE32F-NEXT:    [[I6:%.*]] = load <2 x ptr>, ptr [[I4]], align 8
; ZVE32F-NEXT:    [[I7:%.*]] = getelementptr inbounds ptr, ptr [[I4]], i64 2
; ZVE32F-NEXT:    [[I9:%.*]] = load <2 x ptr>, ptr [[I7]], align 8
; ZVE32F-NEXT:    [[I10:%.*]] = mul nuw nsw <2 x i64> [[I3]], splat (i64 5)
; ZVE32F-NEXT:    [[I11:%.*]] = mul <2 x i64> [[I3]], splat (i64 5)
; ZVE32F-NEXT:    [[I12:%.*]] = add <2 x i64> [[I11]], splat (i64 10)
; ZVE32F-NEXT:    [[I13:%.*]] = getelementptr inbounds ptr, ptr [[ARG:%.*]], <2 x i64> [[I10]]
; ZVE32F-NEXT:    [[I14:%.*]] = getelementptr inbounds ptr, ptr [[ARG]], <2 x i64> [[I12]]
; ZVE32F-NEXT:    call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> [[I6]], <2 x ptr> [[I13]], i32 8, <2 x i1> splat (i1 true))
; ZVE32F-NEXT:    call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> [[I9]], <2 x ptr> [[I14]], i32 8, <2 x i1> splat (i1 true))
; ZVE32F-NEXT:    [[I15]] = add nuw i64 [[I]], 4
; ZVE32F-NEXT:    [[I16]] = add <2 x i64> [[I3]], splat (i64 4)
; ZVE32F-NEXT:    [[I17:%.*]] = icmp eq i64 [[I15]], 1024
; ZVE32F-NEXT:    br i1 [[I17]], label [[BB18:%.*]], label [[BB2]]
; ZVE32F:       bb18:
; ZVE32F-NEXT:    ret void
;
bb:
  br label %bb2

bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i
  %i6 = load <2 x ptr>, ptr %i4, align 8
  %i7 = getelementptr inbounds ptr, ptr %i4, i64 2
  %i9 = load <2 x ptr>, ptr %i7, align 8
  %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i11 = mul <2 x i64> %i3, splat (i64 5)
  %i12 = add <2 x i64> %i11, <i64 10, i64 10>
  %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
  %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true))
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true))
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}

declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>)

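; The strided loop starts at %arg2 rather than 0, so the pass also scales
; the start value, seeding the scalar induction variable with %i4 * 5.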
define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) {
; CHECK-LABEL: @strided_load_startval_add_with_splat(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = icmp eq i32 [[ARG2:%.*]], 1024
; CHECK-NEXT:    br i1 [[I]], label [[BB34:%.*]], label [[BB3:%.*]]
; CHECK:       bb3:
; CHECK-NEXT:    [[I4:%.*]] = sext i32 [[ARG2]] to i64
; CHECK-NEXT:    [[I5:%.*]] = sub i32 1023, [[ARG2]]
; CHECK-NEXT:    [[I6:%.*]] = zext i32 [[I5]] to i64
; CHECK-NEXT:    [[I7:%.*]] = add nuw nsw i64 [[I6]], 1
; CHECK-NEXT:    [[I8:%.*]] = icmp ult i32 [[I5]], 31
; CHECK-NEXT:    br i1 [[I8]], label [[BB32:%.*]], label [[BB9:%.*]]
; CHECK:       bb9:
; CHECK-NEXT:    [[I10:%.*]] = and i64 [[I7]], 8589934560
; CHECK-NEXT:    [[I11:%.*]] = add nsw i64 [[I10]], [[I4]]
; CHECK-NEXT:    [[START:%.*]] = mul i64 [[I4]], 5
; CHECK-NEXT:    br label [[BB15:%.*]]
; CHECK:       bb15:
; CHECK-NEXT:    [[I16:%.*]] = phi i64 [ 0, [[BB9]] ], [ [[I27:%.*]], [[BB15]] ]
; CHECK-NEXT:    [[I17_SCALAR:%.*]] = phi i64 [ [[START]], [[BB9]] ], [ [[I28_SCALAR:%.*]], [[BB15]] ]
; CHECK-NEXT:    [[I18:%.*]] = add i64 [[I16]], [[I4]]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[ARG1:%.*]], i64 [[I17_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> splat (i1 true), i32 32)
; CHECK-NEXT:    [[I21:%.*]] = call <32 x i8> @llvm.vp.select.v32i8(<32 x i1> splat (i1 true), <32 x i8> [[TMP1]], <32 x i8> undef, i32 32)
; CHECK-NEXT:    [[I22:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 [[I18]]
; CHECK-NEXT:    [[I24:%.*]] = load <32 x i8>, ptr [[I22]], align 1
; CHECK-NEXT:    [[I25:%.*]] = add <32 x i8> [[I24]], [[I21]]
; CHECK-NEXT:    store <32 x i8> [[I25]], ptr [[I22]], align 1
; CHECK-NEXT:    [[I27]] = add nuw i64 [[I16]], 32
; CHECK-NEXT:    [[I28_SCALAR]] = add i64 [[I17_SCALAR]], 160
; CHECK-NEXT:    [[I29:%.*]] = icmp eq i64 [[I27]], [[I10]]
; CHECK-NEXT:    br i1 [[I29]], label [[BB30:%.*]], label [[BB15]]
; CHECK:       bb30:
; CHECK-NEXT:    [[I31:%.*]] = icmp eq i64 [[I7]], [[I10]]
; CHECK-NEXT:    br i1 [[I31]], label [[BB34]], label [[BB32]]
; CHECK:       bb32:
; CHECK-NEXT:    [[I33:%.*]] = phi i64 [ [[I4]], [[BB3]] ], [ [[I11]], [[BB30]] ]
; CHECK-NEXT:    br label [[BB35:%.*]]
; CHECK:       bb34:
; CHECK-NEXT:    ret void
; CHECK:       bb35:
; CHECK-NEXT:    [[I36:%.*]] = phi i64 [ [[I43:%.*]], [[BB35]] ], [ [[I33]], [[BB32]] ]
; CHECK-NEXT:    [[I37:%.*]] = mul nsw i64 [[I36]], 5
; CHECK-NEXT:    [[I38:%.*]] = getelementptr inbounds i8, ptr [[ARG1]], i64 [[I37]]
; CHECK-NEXT:    [[I39:%.*]] = load i8, ptr [[I38]], align 1
; CHECK-NEXT:    [[I40:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 [[I36]]
; CHECK-NEXT:    [[I41:%.*]] = load i8, ptr [[I40]], align 1
; CHECK-NEXT:    [[I42:%.*]] = add i8 [[I41]], [[I39]]
; CHECK-NEXT:    store i8 [[I42]], ptr [[I40]], align 1
; CHECK-NEXT:    [[I43]] = add nsw i64 [[I36]], 1
; CHECK-NEXT:    [[I44:%.*]] = trunc i64 [[I43]] to i32
; CHECK-NEXT:    [[I45:%.*]] = icmp eq i32 [[I44]], 1024
; CHECK-NEXT:    br i1 [[I45]], label [[BB34]], label [[BB35]]
;
bb:
  %i = icmp eq i32 %arg2, 1024
  br i1 %i, label %bb34, label %bb3

bb3:                                              ; preds = %bb
  %i4 = sext i32 %arg2 to i64
  %i5 = sub i32 1023, %arg2
  %i6 = zext i32 %i5 to i64
  %i7 = add nuw nsw i64 %i6, 1
  %i8 = icmp ult i32 %i5, 31
  br i1 %i8, label %bb32, label %bb9

bb9:                                              ; preds = %bb3
  %i10 = and i64 %i7, 8589934560
  %i11 = add nsw i64 %i10, %i4
  %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0
  %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer
  %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>
  br label %bb15

bb15:                                             ; preds = %bb15, %bb9
  %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
  %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
  %i18 = add i64 %i16, %i4
  %i19 = mul nsw <32 x i64> %i17, splat (i64 5)
  %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
  %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
  %i24 = load <32 x i8>, ptr %i22, align 1
  %i25 = add <32 x i8> %i24, %i21
  store <32 x i8> %i25, ptr %i22, align 1
  %i27 = add nuw i64 %i16, 32
  %i28 = add <32 x i64> %i17, splat (i64 32)
  %i29 = icmp eq i64 %i27, %i10
  br i1 %i29, label %bb30, label %bb15

bb30:                                             ; preds = %bb15
  %i31 = icmp eq i64 %i7, %i10
  br i1 %i31, label %bb34, label %bb32

bb32:                                             ; preds = %bb30, %bb3
  %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ]
  br label %bb35

bb34:                                             ; preds = %bb35, %bb30, %bb
  ret void

bb35:                                             ; preds = %bb35, %bb32
  %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ]
  %i37 = mul nsw i64 %i36, 5
  %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37
  %i39 = load i8, ptr %i38, align 1
  %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36
  %i41 = load i8, ptr %i40, align 1
  %i42 = add i8 %i41, %i39
  store i8 %i42, ptr %i40, align 1
  %i43 = add nsw i64 %i36, 1
  %i44 = trunc i64 %i43 to i32
  %i45 = icmp eq i32 %i44, 1024
  br i1 %i45, label %bb34, label %bb35
}

declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)

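; Not part of the original test: a hedged C sketch. The trip count is
; arg2 << 4, a multiple of the vector factor 16, so no scalar remainder
; loop is needed.
; void gather_no_scalar_remainder(signed char * __restrict A,
;                                 signed char * __restrict B, long n) {
;   for (long i = 0; i != n * 16; ++i)
;       A[i] += B[i * 5];
; }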
define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
; CHECK-LABEL: @gather_no_scalar_remainder(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[I:%.*]] = shl i64 [[ARG2:%.*]], 4
; CHECK-NEXT:    [[I3:%.*]] = icmp eq i64 [[I]], 0
; CHECK-NEXT:    br i1 [[I3]], label [[BB16:%.*]], label [[BB2:%.*]]
; CHECK:       bb2:
; CHECK-NEXT:    br label [[BB4:%.*]]
; CHECK:       bb4:
; CHECK-NEXT:    [[I5:%.*]] = phi i64 [ [[I13:%.*]], [[BB4]] ], [ 0, [[BB2]] ]
; CHECK-NEXT:    [[I6_SCALAR:%.*]] = phi i64 [ 0, [[BB2]] ], [ [[I14_SCALAR:%.*]], [[BB4]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[ARG1:%.*]], i64 [[I6_SCALAR]]
; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr [[TMP0]], i64 5, <16 x i1> splat (i1 true), i32 16)
; CHECK-NEXT:    [[I9:%.*]] = call <16 x i8> @llvm.vp.select.v16i8(<16 x i1> splat (i1 true), <16 x i8> [[TMP1]], <16 x i8> undef, i32 16)
; CHECK-NEXT:    [[I10:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 [[I5]]
; CHECK-NEXT:    [[I11:%.*]] = load <16 x i8>, ptr [[I10]], align 1
; CHECK-NEXT:    [[I12:%.*]] = add <16 x i8> [[I11]], [[I9]]
; CHECK-NEXT:    store <16 x i8> [[I12]], ptr [[I10]], align 1
; CHECK-NEXT:    [[I13]] = add nuw i64 [[I5]], 16
; CHECK-NEXT:    [[I14_SCALAR]] = add i64 [[I6_SCALAR]], 80
; CHECK-NEXT:    [[I15:%.*]] = icmp eq i64 [[I13]], [[I]]
; CHECK-NEXT:    br i1 [[I15]], label [[BB16]], label [[BB4]]
; CHECK:       bb16:
; CHECK-NEXT:    ret void
;
bb:
  %i = shl i64 %arg2, 4
  %i3 = icmp eq i64 %i, 0
  br i1 %i3, label %bb16, label %bb2

bb2:                                              ; preds = %bb
  br label %bb4

bb4:                                              ; preds = %bb4, %bb2
  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
  %i7 = mul <16 x i64> %i6, splat (i64 5)
  %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> undef)
  %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
  %i11 = load <16 x i8>, ptr %i10, align 1
  %i12 = add <16 x i8> %i11, %i9
  store <16 x i8> %i12, ptr %i10, align 1
  %i13 = add nuw i64 %i5, 16
  %i14 = add <16 x i64> %i6, splat (i64 16)
  %i15 = icmp eq i64 %i13, %i
  br i1 %i15, label %bb16, label %bb4

bb16:                                             ; preds = %bb4, %bb
  ret void
}

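; The address vector below is a broadcast of a scalar base pointer plus the
; constant offsets <0, 64, ..., 448>; the pass is expected to recover the
; scalar base and a constant stride of 64. A hedged C sketch (not in the
; original test):
; for (int i = 0; i != 8; ++i) v[i] = a[i * 64];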
define <8 x i8> @broadcast_ptr_base(ptr %a) {
; CHECK-LABEL: @broadcast_ptr_base(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.experimental.vp.strided.load.v8i8.p0.i64(ptr [[A:%.*]], i64 64, <8 x i1> splat (i1 true), i32 8)
; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i8> @llvm.vp.select.v8i8(<8 x i1> splat (i1 true), <8 x i8> [[TMP0]], <8 x i8> poison, i32 8)
; CHECK-NEXT:    ret <8 x i8> [[TMP1]]
;
entry:
  %0 = insertelement <8 x ptr> poison, ptr %a, i64 0
  %1 = shufflevector <8 x ptr> %0, <8 x ptr> poison, <8 x i32> zeroinitializer
  %2 = getelementptr i8, <8 x ptr> %1, <8 x i64> <i64 0, i64 64, i64 128, i64 192, i64 256, i64 320, i64 384, i64 448>
  %3 = tail call <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr> %2, i32 1, <8 x i1> splat (i1 true), <8 x i8> poison)
  ret <8 x i8> %3
}

declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i8>)

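; Same access pattern as @gather, but with a <32 x i16> index vector. The
; CHECK lines show the masked.gather is left untouched, presumably because
; the narrow 16-bit index arithmetic could wrap and so does not prove a
; constant stride. A hedged C sketch (not in the original test):
; void gather_narrow_idx(signed char * __restrict A,
;                        signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[(short)(i * 5)];
; }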
define void @gather_narrow_idx(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @gather_narrow_idx(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[I:%.*]] = mul nuw nsw <32 x i16> [[VEC_IND]], splat (i16 5)
; CHECK-NEXT:    [[I1:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], <32 x i16> [[I]]
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> [[I1]], i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]]
; CHECK-NEXT:    store <32 x i8> [[I4]], ptr [[I2]], align 1
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i16> [[VEC_IND]], splat (i16 32)
; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19, i16 20, i16 21, i16 22, i16 23, i16 24, i16 25, i16 26, i16 27, i16 28, i16 29, i16 30, i16 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i16> %vec.ind, splat (i16 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i16> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i16> %vec.ind, splat (i16 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

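; A vp.gather whose EVL comes from get.vector.length and whose mask selects
; odd lanes; the expected lowering preserves both the mask and the EVL on
; the experimental.vp.strided.load. A loose, hedged C sketch (not in the
; original test; the IR stores all 32 lanes, so masked-off lanes of A are
; actually overwritten with poison):
; void vp_gather(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     if (i & 1)
;       A[i] += B[i * 5];
; }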
define void @vp_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @vp_gather(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR1]]
; CHECK-NEXT:    [[ELEMS:%.*]] = sub i64 1024, [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 32, i1 false)
; CHECK-NEXT:    [[ODD:%.*]] = and <32 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT:    [[MASK:%.*]] = icmp ne <32 x i64> [[ODD]], zeroinitializer
; CHECK-NEXT:    [[WIDE_VP_GATHER:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> [[MASK]], i32 [[EVL]])
; CHECK-NEXT:    [[I2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I2]], align 1
; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_VP_GATHER]]
; CHECK-NEXT:    store <32 x i8> [[I4]], ptr [[I2]], align 1
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], 160
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], splat (i64 32)
; CHECK-NEXT:    [[I6:%.*]] = icmp eq i64 [[VEC_IND_NEXT_SCALAR]], 1024
; CHECK-NEXT:    br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i

  %elems = sub i64 1024, %index
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 32, i1 false)

  %odd = and <32 x i64> %vec.ind, splat (i64 1)
  %mask = icmp ne <32 x i64> %odd, splat (i64 0)

  %wide.vp.gather = call <32 x i8> @llvm.vp.gather(<32 x ptr> %i1, <32 x i1> %mask, i32 %evl)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.vp.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

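; The store-side counterpart: a contiguous load from B feeds a masked
; vp.gather/vp.scatter pair with stride 5 on A, which should lower to an
; experimental.vp.strided.load and experimental.vp.strided.store. A loose,
; hedged C sketch (not in the original test):
; void vp_scatter(signed char * __restrict A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;     if (i & 1)
;       A[i * 5] += B[i];
; }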
define void @vp_scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: @vp_scatter(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR1:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[I:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[I]], align 1
; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[VEC_IND_SCALAR1]]
; CHECK-NEXT:    [[ELEMS:%.*]] = sub i64 1024, [[VEC_IND_SCALAR]]
; CHECK-NEXT:    [[EVL:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[ELEMS]], i32 32, i1 false)
; CHECK-NEXT:    [[ODD:%.*]] = and <32 x i64> [[VEC_IND]], splat (i64 1)
; CHECK-NEXT:    [[MASK:%.*]] = icmp ne <32 x i64> [[ODD]], zeroinitializer
; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.experimental.vp.strided.load.v32i8.p0.i64(ptr [[TMP0]], i64 5, <32 x i1> [[MASK]], i32 [[EVL]])
; CHECK-NEXT:    [[I4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.v32i8.p0.i64(<32 x i8> [[I4]], ptr [[TMP0]], i64 5, <32 x i1> [[MASK]], i32 [[EVL]])
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR]] = add nuw i64 [[VEC_IND_SCALAR]], 32
; CHECK-NEXT:    [[VEC_IND_NEXT_SCALAR1]] = add i64 [[VEC_IND_SCALAR1]], 160
; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], splat (i64 32)
; CHECK-NEXT:    [[I5:%.*]] = icmp eq i64 [[VEC_IND_NEXT_SCALAR]], 1024
; CHECK-NEXT:    br i1 [[I5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2

  %elems = sub i64 1024, %index
  %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %elems, i32 32, i1 false)

  %odd = and <32 x i64> %vec.ind, splat (i64 1)
  %mask = icmp ne <32 x i64> %odd, splat (i64 0)

  %wide.masked.gather = call <32 x i8> @llvm.vp.gather(<32 x ptr> %i3, <32 x i1> %mask, i32 %evl)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.vp.scatter(<32 x i8> %i4, <32 x ptr> %i3, <32 x i1> %mask, i32 %evl)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
