; xref: /llvm-project/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-load-store-asm.ll (revision b6c0f1bfa79a3a32d841ac5ab1f94c3aee3b5d90)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvl256b | FileCheck %s --check-prefixes=CHECK,V
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+zvl256b | FileCheck %s --check-prefixes=CHECK,ZVE32F
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTZVE32F
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f,+optimized-zero-stride-load,+zvl256b | FileCheck %s --check-prefixes=CHECK,OPTIMIZED,OPTV
; NOTE(review): the OPTZVE32F/OPTV prefix names appear swapped relative to their
; configs (the +v run uses OPTZVE32F, the +zve32f run uses OPTV); the generated
; check lines are consistent with that mapping, so renaming the prefixes would
; require regenerating all assertions — left as-is, but worth confirming upstream.

7%struct.foo = type { i32, i32, i32, i32 }

; void gather(signed char * __restrict  A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 5];
; }
define void @gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB0_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB0_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
52
; Masked variant of @gather: a constant 32-bit mask (loaded into v0) selects
; which strided lanes are loaded; inactive lanes keep the %maskedoff values.
define void @gather_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: gather_masked:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    lui a4, 983765
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    addi a4, a4, 873
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a4
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB1_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT:    vmv1r.v v9, v8
; CHECK-NEXT:    vlse8.v v9, (a1), a4, v0.t
; CHECK-NEXT:    vle8.v v10, (a0)
; CHECK-NEXT:    vadd.vv v9, v10, v9
; CHECK-NEXT:    vse8.v v9, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB1_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
97
; Indices run 31..0, so codegen starts at the last element (B + 31*5 = B + 155)
; and uses a negative stride (-5) for the strided load.
define void @gather_negative_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_negative_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 155
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, -5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB2_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB2_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 31, i64 30, i64 29, i64 28, i64 27, i64 26, i64 25, i64 24, i64 23, i64 22, i64 21, i64 20, i64 19, i64 18, i64 17, i64 16, i64 15, i64 14, i64 13, i64 12, i64 11, i64 10, i64 9, i64 8, i64 7, i64 6, i64 5, i64 4, i64 3, i64 2, i64 1, i64 0>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
138
; All gather indices are zero; without +optimized-zero-stride-load the gather
; becomes a scalar load (lbu) splatted via vadd.vx rather than a zero-stride vlse.
define void @gather_zero_stride(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; CHECK-NEXT:  .LBB3_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a3, 0(a1)
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vadd.vx v8, v8, a3
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB3_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = add <32 x i8> %wide.load, %wide.masked.gather
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
177
; i32 variant of @gather_zero_stride: the zero-stride gather is turned into a
; scalar lw plus vadd.vx splat.
define void @gather_zero_stride_i32(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride_i32:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a0, 1024
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB4_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lw a3, 0(a1)
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vadd.vx v8, v8, a3
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 8
; CHECK-NEXT:    addi a1, a1, 160
; CHECK-NEXT:    bne a0, a2, .LBB4_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 4
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
215
; Zero-stride gather feeding the *left* operand of a udiv, so the scalar value
; cannot be folded into a .vx form: without +optimized-zero-stride-load it is
; materialized with lbu + vmv.v.x; with the feature it stays a zero-stride vlse.
define void @gather_zero_stride_unfold(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; V-LABEL: gather_zero_stride_unfold:
; V:       # %bb.0: # %entry
; V-NEXT:    addi a2, a0, 1024
; V-NEXT:    li a3, 32
; V-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; V-NEXT:  .LBB5_1: # %vector.body
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    lbu a3, 0(a1)
; V-NEXT:    vle8.v v8, (a0)
; V-NEXT:    vmv.v.x v9, a3
; V-NEXT:    vdivu.vv v8, v9, v8
; V-NEXT:    vse8.v v8, (a0)
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bne a0, a2, .LBB5_1
; V-NEXT:  # %bb.2: # %for.cond.cleanup
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_zero_stride_unfold:
; ZVE32F:       # %bb.0: # %entry
; ZVE32F-NEXT:    addi a2, a0, 1024
; ZVE32F-NEXT:    li a3, 32
; ZVE32F-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; ZVE32F-NEXT:  .LBB5_1: # %vector.body
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    lbu a3, 0(a1)
; ZVE32F-NEXT:    vle8.v v8, (a0)
; ZVE32F-NEXT:    vmv.v.x v9, a3
; ZVE32F-NEXT:    vdivu.vv v8, v9, v8
; ZVE32F-NEXT:    vse8.v v8, (a0)
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a1, a1, 160
; ZVE32F-NEXT:    bne a0, a2, .LBB5_1
; ZVE32F-NEXT:  # %bb.2: # %for.cond.cleanup
; ZVE32F-NEXT:    ret
;
; OPTIMIZED-LABEL: gather_zero_stride_unfold:
; OPTIMIZED:       # %bb.0: # %entry
; OPTIMIZED-NEXT:    addi a2, a0, 1024
; OPTIMIZED-NEXT:    li a3, 32
; OPTIMIZED-NEXT:    vsetvli zero, a3, e8, m1, ta, ma
; OPTIMIZED-NEXT:  .LBB5_1: # %vector.body
; OPTIMIZED-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTIMIZED-NEXT:    vlse8.v v8, (a1), zero
; OPTIMIZED-NEXT:    vle8.v v9, (a0)
; OPTIMIZED-NEXT:    vdivu.vv v8, v8, v9
; OPTIMIZED-NEXT:    vse8.v v8, (a0)
; OPTIMIZED-NEXT:    addi a0, a0, 32
; OPTIMIZED-NEXT:    addi a1, a1, 160
; OPTIMIZED-NEXT:    bne a0, a2, .LBB5_1
; OPTIMIZED-NEXT:  # %bb.2: # %for.cond.cleanup
; OPTIMIZED-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds i8, ptr %B, <32 x i64> %i
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i1, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i2 = getelementptr inbounds i8, ptr %A, i64 %index
  %wide.load = load <32 x i8>, ptr %i2, align 1
  %i4 = udiv <32 x i8> %wide.masked.gather, %wide.load
  store <32 x i8> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void scatter(signed char * __restrict  A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;      A[i * 5] += B[i];
;}
define void @scatter(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a1, 1024
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:  .LBB6_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vlse8.v v9, (a0), a3
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse8.v v8, (a0), a3
; CHECK-NEXT:    addi a0, a0, 160
; CHECK-NEXT:    bne a1, a2, .LBB6_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
334
; Masked variant of @scatter: the same constant mask gates both the strided
; gather and the strided scatter; inactive gather lanes take %maskedoff.
define void @scatter_masked(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, <32 x i8> %maskedoff) {
; CHECK-LABEL: scatter_masked:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a2, a1, 1024
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    lui a4, 983765
; CHECK-NEXT:    addi a4, a4, 873
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v0, a4
; CHECK-NEXT:    li a4, 5
; CHECK-NEXT:  .LBB7_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a3, e8, m1, ta, mu
; CHECK-NEXT:    vle8.v v9, (a1)
; CHECK-NEXT:    vmv1r.v v10, v8
; CHECK-NEXT:    vlse8.v v10, (a0), a4, v0.t
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    vadd.vv v9, v10, v9
; CHECK-NEXT:    vsse8.v v9, (a0), a4, v0.t
; CHECK-NEXT:    addi a0, a0, 160
; CHECK-NEXT:    bne a1, a2, .LBB7_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <32 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i8, ptr %B, i64 %index
  %wide.load = load <32 x i8>, ptr %i, align 1
  %i2 = mul nuw nsw <32 x i64> %vec.ind, splat (i64 5)
  %i3 = getelementptr inbounds i8, ptr %A, <32 x i64> %i2
  %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <32 x i8> %maskedoff)
  %i4 = add <32 x i8> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v32i8.v32p0(<32 x i8> %i4, <32 x ptr> %i3, i32 1, <32 x i1> <i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>)
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <32 x i64> %vec.ind, splat (i64 32)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; void gather_pow2(signed char * __restrict  A, signed char * __restrict B) {
;   for (int i = 0; i != 1024; ++i)
;       A[i] += B[i * 4];
; }
define void @gather_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a3, 1
; CHECK-NEXT:    li a2, 16
; CHECK-NEXT:    add a3, a0, a3
; CHECK-NEXT:    li a4, 32
; CHECK-NEXT:  .LBB8_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v8, (a1), a2
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 32
; CHECK-NEXT:    addi a1, a1, 128
; CHECK-NEXT:    bne a0, a3, .LBB8_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 1
  %i4 = add <8 x i32> %wide.load, %wide.masked.gather
  store <8 x i32> %i4, ptr %i2, align 1
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void scatter_pow2(signed char * __restrict  A, signed char * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;      A[i * 4] += B[i];
;}
define void @scatter_pow2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: scatter_pow2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a3, 1
; CHECK-NEXT:    li a2, 32
; CHECK-NEXT:    add a3, a1, a3
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:  .LBB9_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a1)
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    addi a1, a1, 32
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bne a1, a3, .LBB9_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = getelementptr inbounds i32, ptr %B, i64 %index
  %wide.load = load <8 x i32>, ptr %i, align 1
  %i2 = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i3 = getelementptr inbounds i32, ptr %A, <8 x i64> %i2
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i4 = add <8 x i32> %wide.masked.gather, %wide.load
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i4, <8 x ptr> %i3, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 8)
  %i5 = icmp eq i64 %index.next, 1024
  br i1 %i5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;struct foo {
;  int a, b, c, d;
;};
;
;void struct_gather(int * __restrict  A, struct foo * __restrict B) {
;  for (int i = 0; i < 1024; ++i)
;      A[i] += B[i].b;
;}
define void @struct_gather(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: struct_gather:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    addi a1, a1, 132
; CHECK-NEXT:    lui a2, 1
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    li a3, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB10_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    addi a4, a0, 32
; CHECK-NEXT:    addi a5, a1, -128
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vle32.v v9, (a0)
; CHECK-NEXT:    vlse32.v v10, (a5), a3
; CHECK-NEXT:    vle32.v v11, (a4)
; CHECK-NEXT:    vadd.vv v9, v9, v10
; CHECK-NEXT:    vadd.vv v8, v11, v8
; CHECK-NEXT:    vse32.v v9, (a0)
; CHECK-NEXT:    vse32.v v8, (a4)
; CHECK-NEXT:    addi a0, a0, 64
; CHECK-NEXT:    addi a1, a1, 256
; CHECK-NEXT:    bne a0, a2, .LBB10_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, %entry ], [ %vec.ind.next, %vector.body ]
  %step.add = add <8 x i64> %vec.ind, splat (i64 8)
  %i = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %vec.ind, i32 1
  %i1 = getelementptr inbounds %struct.foo, ptr %B, <8 x i64> %step.add, i32 1
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, i64 %index
  %wide.load = load <8 x i32>, ptr %i2, align 4
  %i4 = getelementptr inbounds i32, ptr %i2, i64 8
  %wide.load10 = load <8 x i32>, ptr %i4, align 4
  %i6 = add nsw <8 x i32> %wide.load, %wide.masked.gather
  %i7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9
  store <8 x i32> %i6, ptr %i2, align 4
  store <8 x i32> %i7, ptr %i4, align 4
  %index.next = add nuw i64 %index, 16
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 16)
  %i10 = icmp eq i64 %index.next, 1024
  br i1 %i10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

;void gather_unroll(int * __restrict  A, int * __restrict B) {
;  for (int i = 0; i < 1024; i+= 4 ) {
;    A[i] += B[i * 4];
;    A[i+1] += B[(i+1) * 4];
;    A[i+2] += B[(i+2) * 4];
;    A[i+3] += B[(i+3) * 4];
;  }
;}
define void @gather_unroll(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_unroll:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    li a2, 256
; CHECK-NEXT:    li a3, 64
; CHECK-NEXT:    li a4, 16
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB11_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse32.v v8, (a1), a3
; CHECK-NEXT:    vlse32.v v9, (a0), a4
; CHECK-NEXT:    addi a5, a1, 16
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a0), a4
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 4
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 32
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 8
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a5, a1, 48
; CHECK-NEXT:    vlse32.v v8, (a5), a3
; CHECK-NEXT:    addi a5, a0, 12
; CHECK-NEXT:    vlse32.v v9, (a5), a4
; CHECK-NEXT:    addi a2, a2, -8
; CHECK-NEXT:    addi a1, a1, 512
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsse32.v v8, (a5), a4
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    bnez a2, .LBB11_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>, %entry ], [ %vec.ind.next, %vector.body ]
  %i = shl nuw nsw <8 x i64> %vec.ind, splat (i64 2)
  %i1 = getelementptr inbounds i32, ptr %B, <8 x i64> %i
  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i2 = getelementptr inbounds i32, ptr %A, <8 x i64> %vec.ind
  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i3, <8 x ptr> %i2, i32 4, <8 x i1> splat (i1 true))
  %i4 = or disjoint <8 x i64> %vec.ind, splat (i64 1)
  %i5 = shl nsw <8 x i64> %i4, splat (i64 2)
  %i6 = getelementptr inbounds i32, ptr %B, <8 x i64> %i5
  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i6, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i7 = getelementptr inbounds i32, ptr %A, <8 x i64> %i4
  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i8, <8 x ptr> %i7, i32 4, <8 x i1> splat (i1 true))
  %i9 = or disjoint <8 x i64> %vec.ind, splat (i64 2)
  %i10 = shl nsw <8 x i64> %i9, splat (i64 2)
  %i11 = getelementptr inbounds i32, ptr %B, <8 x i64> %i10
  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i11, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i12 = getelementptr inbounds i32, ptr %A, <8 x i64> %i9
  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i13, <8 x ptr> %i12, i32 4, <8 x i1> splat (i1 true))
  %i14 = or disjoint <8 x i64> %vec.ind, <i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3, i64 3>
  %i15 = shl nsw <8 x i64> %i14, <i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2, i64 2>
  %i16 = getelementptr inbounds i32, ptr %B, <8 x i64> %i15
  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i16, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i17 = getelementptr inbounds i32, ptr %A, <8 x i64> %i14
  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true), <8 x i32> undef)
  %i18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
  call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %i18, <8 x ptr> %i17, i32 4, <8 x i1> splat (i1 true))
  %index.next = add nuw i64 %index, 8
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i19 = icmp eq i64 %index.next, 256
  br i1 %i19, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
625
; Declarations for the masked gather/scatter intrinsics used above.
declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32 immarg, <32 x i1>, <32 x i8>)
declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32 immarg, <8 x i1>, <8 x i32>)
declare void @llvm.masked.scatter.v32i8.v32p0(<32 x i8>, <32 x ptr>, i32 immarg, <32 x i1>)
declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32>, <8 x ptr>, i32 immarg, <8 x i1>)

; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
define void @gather_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: gather_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    lui a2, 2
; V-NEXT:    add a2, a0, a2
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB12_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    vlse64.v v8, (a1), a3
; V-NEXT:    addi a4, a1, 80
; V-NEXT:    vlse64.v v9, (a4), a3
; V-NEXT:    addi a4, a0, 16
; V-NEXT:    vse64.v v8, (a0)
; V-NEXT:    addi a0, a0, 32
; V-NEXT:    vse64.v v9, (a4)
; V-NEXT:    addi a1, a1, 160
; V-NEXT:    bne a0, a2, .LBB12_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: gather_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    lui a4, 2
; ZVE32F-NEXT:    li a3, 1
; ZVE32F-NEXT:    add a4, a0, a4
; ZVE32F-NEXT:    li a5, 40
; ZVE32F-NEXT:  .LBB12_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    mul a6, a3, a5
; ZVE32F-NEXT:    mul a7, a2, a5
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    add a6, a1, a6
; ZVE32F-NEXT:    add a7, a1, a7
; ZVE32F-NEXT:    ld t0, 0(a7)
; ZVE32F-NEXT:    ld t1, 0(a6)
; ZVE32F-NEXT:    ld a7, 80(a7)
; ZVE32F-NEXT:    ld a6, 80(a6)
; ZVE32F-NEXT:    sd t0, 0(a0)
; ZVE32F-NEXT:    sd t1, 8(a0)
; ZVE32F-NEXT:    sd a7, 16(a0)
; ZVE32F-NEXT:    sd a6, 24(a0)
; ZVE32F-NEXT:    addi a0, a0, 32
; ZVE32F-NEXT:    addi a3, a3, 4
; ZVE32F-NEXT:    bne a0, a4, .LBB12_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
;
; OPTZVE32F-LABEL: gather_of_pointers:
; OPTZVE32F:       # %bb.0: # %bb
; OPTZVE32F-NEXT:    lui a2, 2
; OPTZVE32F-NEXT:    add a2, a0, a2
; OPTZVE32F-NEXT:    li a3, 40
; OPTZVE32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; OPTZVE32F-NEXT:  .LBB12_1: # %bb2
; OPTZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTZVE32F-NEXT:    vlse64.v v8, (a1), a3
; OPTZVE32F-NEXT:    addi a4, a1, 80
; OPTZVE32F-NEXT:    vlse64.v v9, (a4), a3
; OPTZVE32F-NEXT:    addi a4, a0, 16
; OPTZVE32F-NEXT:    vse64.v v8, (a0)
; OPTZVE32F-NEXT:    addi a0, a0, 32
; OPTZVE32F-NEXT:    vse64.v v9, (a4)
; OPTZVE32F-NEXT:    addi a1, a1, 160
; OPTZVE32F-NEXT:    bne a0, a2, .LBB12_1
; OPTZVE32F-NEXT:  # %bb.2: # %bb18
; OPTZVE32F-NEXT:    ret
;
; OPTV-LABEL: gather_of_pointers:
; OPTV:       # %bb.0: # %bb
; OPTV-NEXT:    li a2, 0
; OPTV-NEXT:    lui a4, 2
; OPTV-NEXT:    li a3, 1
; OPTV-NEXT:    add a4, a0, a4
; OPTV-NEXT:    li a5, 40
; OPTV-NEXT:  .LBB12_1: # %bb2
; OPTV-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTV-NEXT:    mul a6, a3, a5
; OPTV-NEXT:    mul a7, a2, a5
; OPTV-NEXT:    addi a2, a2, 4
; OPTV-NEXT:    add a6, a1, a6
; OPTV-NEXT:    add a7, a1, a7
; OPTV-NEXT:    ld t0, 0(a7)
; OPTV-NEXT:    ld t1, 0(a6)
; OPTV-NEXT:    ld a7, 80(a7)
; OPTV-NEXT:    ld a6, 80(a6)
; OPTV-NEXT:    sd t0, 0(a0)
; OPTV-NEXT:    sd t1, 8(a0)
; OPTV-NEXT:    sd a7, 16(a0)
; OPTV-NEXT:    sd a6, 24(a0)
; OPTV-NEXT:    addi a0, a0, 32
; OPTV-NEXT:    addi a3, a3, 4
; OPTV-NEXT:    bne a0, a4, .LBB12_1
; OPTV-NEXT:  # %bb.2: # %bb18
; OPTV-NEXT:    ret
bb:
  br label %bb2
; Loop over 1024 pointer elements, 4 per iteration (two <2 x ptr> gathers).
; %i is the scalar store index (step 4); %i3 is the <2 x i64> vector IV.
bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  ; Gather indices: %i3 * 5 and %i3 * 5 + 10. On 8-byte pointer elements that
  ; is a 40-byte stride, with the second gather 80 bytes further along - see
  ; the li a3, 40 / addi a4, a1, 80 in the V/OPTZVE32F check lines.
  %i4 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i5 = mul <2 x i64> %i3, splat (i64 5)
  %i6 = add <2 x i64> %i5, <i64 10, i64 10>
  %i7 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i4
  %i8 = getelementptr inbounds ptr, ptr %arg1, <2 x i64> %i6
  ; Gathers whose element type is a pointer; this is the case the comment
  ; above the function guards against crashing on in getTgtMemIntrinsic.
  %i9 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i7, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  %i10 = call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> %i8, i32 8, <2 x i1> splat (i1 true), <2 x ptr> undef)
  ; The gathered pointers are stored contiguously into %arg.
  %i11 = getelementptr inbounds ptr, ptr %arg, i64 %i
  store <2 x ptr> %i9, ptr %i11, align 8
  %i13 = getelementptr inbounds ptr, ptr %i11, i64 2
  store <2 x ptr> %i10, ptr %i13, align 8
  ; Advance both IVs by 4 elements; exit after 1024 elements.
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}
753
754declare <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x ptr>)
755
756; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
define void @scatter_of_pointers(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1) {
; V-LABEL: scatter_of_pointers:
; V:       # %bb.0: # %bb
; V-NEXT:    lui a2, 2
; V-NEXT:    add a2, a1, a2
; V-NEXT:    li a3, 40
; V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; V-NEXT:  .LBB13_1: # %bb2
; V-NEXT:    # =>This Inner Loop Header: Depth=1
; V-NEXT:    addi a4, a1, 16
; V-NEXT:    vle64.v v8, (a1)
; V-NEXT:    vle64.v v9, (a4)
; V-NEXT:    addi a4, a0, 80
; V-NEXT:    addi a1, a1, 32
; V-NEXT:    vsse64.v v8, (a0), a3
; V-NEXT:    vsse64.v v9, (a4), a3
; V-NEXT:    addi a0, a0, 160
; V-NEXT:    bne a1, a2, .LBB13_1
; V-NEXT:  # %bb.2: # %bb18
; V-NEXT:    ret
;
; ZVE32F-LABEL: scatter_of_pointers:
; ZVE32F:       # %bb.0: # %bb
; ZVE32F-NEXT:    li a2, 0
; ZVE32F-NEXT:    lui a4, 2
; ZVE32F-NEXT:    li a3, 1
; ZVE32F-NEXT:    add a4, a1, a4
; ZVE32F-NEXT:    li a5, 40
; ZVE32F-NEXT:  .LBB13_1: # %bb2
; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; ZVE32F-NEXT:    ld a6, 0(a1)
; ZVE32F-NEXT:    ld a7, 8(a1)
; ZVE32F-NEXT:    ld t0, 16(a1)
; ZVE32F-NEXT:    ld t1, 24(a1)
; ZVE32F-NEXT:    mul t2, a3, a5
; ZVE32F-NEXT:    mul t3, a2, a5
; ZVE32F-NEXT:    addi a2, a2, 4
; ZVE32F-NEXT:    addi a1, a1, 32
; ZVE32F-NEXT:    add t2, a0, t2
; ZVE32F-NEXT:    add t3, a0, t3
; ZVE32F-NEXT:    sd a6, 0(t3)
; ZVE32F-NEXT:    sd a7, 0(t2)
; ZVE32F-NEXT:    sd t0, 80(t3)
; ZVE32F-NEXT:    sd t1, 80(t2)
; ZVE32F-NEXT:    addi a3, a3, 4
; ZVE32F-NEXT:    bne a1, a4, .LBB13_1
; ZVE32F-NEXT:  # %bb.2: # %bb18
; ZVE32F-NEXT:    ret
;
; OPTZVE32F-LABEL: scatter_of_pointers:
; OPTZVE32F:       # %bb.0: # %bb
; OPTZVE32F-NEXT:    lui a2, 2
; OPTZVE32F-NEXT:    add a2, a1, a2
; OPTZVE32F-NEXT:    li a3, 40
; OPTZVE32F-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; OPTZVE32F-NEXT:  .LBB13_1: # %bb2
; OPTZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTZVE32F-NEXT:    addi a4, a1, 16
; OPTZVE32F-NEXT:    vle64.v v8, (a1)
; OPTZVE32F-NEXT:    vle64.v v9, (a4)
; OPTZVE32F-NEXT:    addi a4, a0, 80
; OPTZVE32F-NEXT:    addi a1, a1, 32
; OPTZVE32F-NEXT:    vsse64.v v8, (a0), a3
; OPTZVE32F-NEXT:    vsse64.v v9, (a4), a3
; OPTZVE32F-NEXT:    addi a0, a0, 160
; OPTZVE32F-NEXT:    bne a1, a2, .LBB13_1
; OPTZVE32F-NEXT:  # %bb.2: # %bb18
; OPTZVE32F-NEXT:    ret
;
; OPTV-LABEL: scatter_of_pointers:
; OPTV:       # %bb.0: # %bb
; OPTV-NEXT:    li a2, 0
; OPTV-NEXT:    lui a4, 2
; OPTV-NEXT:    li a3, 1
; OPTV-NEXT:    add a4, a1, a4
; OPTV-NEXT:    li a5, 40
; OPTV-NEXT:  .LBB13_1: # %bb2
; OPTV-NEXT:    # =>This Inner Loop Header: Depth=1
; OPTV-NEXT:    ld a6, 0(a1)
; OPTV-NEXT:    ld a7, 8(a1)
; OPTV-NEXT:    ld t0, 16(a1)
; OPTV-NEXT:    ld t1, 24(a1)
; OPTV-NEXT:    mul t2, a3, a5
; OPTV-NEXT:    mul t3, a2, a5
; OPTV-NEXT:    addi a2, a2, 4
; OPTV-NEXT:    addi a1, a1, 32
; OPTV-NEXT:    add t2, a0, t2
; OPTV-NEXT:    add t3, a0, t3
; OPTV-NEXT:    sd a6, 0(t3)
; OPTV-NEXT:    sd a7, 0(t2)
; OPTV-NEXT:    sd t0, 80(t3)
; OPTV-NEXT:    sd t1, 80(t2)
; OPTV-NEXT:    addi a3, a3, 4
; OPTV-NEXT:    bne a1, a4, .LBB13_1
; OPTV-NEXT:  # %bb.2: # %bb18
; OPTV-NEXT:    ret
bb:
  br label %bb2
; Mirror image of @gather_of_pointers: contiguous <2 x ptr> loads from %arg1,
; scattered into %arg with indices %i3 * 5 and %i3 * 5 + 10 (40-byte stride,
; second scatter 80 bytes along). %i is the scalar load index (step 4); %i3
; is the <2 x i64> vector IV.
bb2:                                              ; preds = %bb2, %bb
  %i = phi i64 [ 0, %bb ], [ %i15, %bb2 ]
  %i3 = phi <2 x i64> [ <i64 0, i64 1>, %bb ], [ %i16, %bb2 ]
  %i4 = getelementptr inbounds ptr, ptr %arg1, i64 %i
  %i6 = load <2 x ptr>, ptr %i4, align 8
  %i7 = getelementptr inbounds ptr, ptr %i4, i64 2
  %i9 = load <2 x ptr>, ptr %i7, align 8
  %i10 = mul nuw nsw <2 x i64> %i3, splat (i64 5)
  %i11 = mul <2 x i64> %i3, splat (i64 5)
  %i12 = add <2 x i64> %i11, <i64 10, i64 10>
  %i13 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i10
  %i14 = getelementptr inbounds ptr, ptr %arg, <2 x i64> %i12
  ; Scatters whose element type is a pointer; this is the case the comment
  ; above the function guards against crashing on in getTgtMemIntrinsic.
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i6, <2 x ptr> %i13, i32 8, <2 x i1> splat (i1 true))
  call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> %i9, <2 x ptr> %i14, i32 8, <2 x i1> splat (i1 true))
  ; Advance both IVs by 4 elements; exit after 1024 elements.
  %i15 = add nuw i64 %i, 4
  %i16 = add <2 x i64> %i3, <i64 4, i64 4>
  %i17 = icmp eq i64 %i15, 1024
  br i1 %i17, label %bb18, label %bb2

bb18:                                             ; preds = %bb2
  ret void
}
878
879declare void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr>, <2 x ptr>, i32 immarg, <2 x i1>)
880
define void @strided_load_startval_add_with_splat(ptr noalias nocapture %arg, ptr noalias nocapture readonly %arg1, i32 signext %arg2) {
; CHECK-LABEL: strided_load_startval_add_with_splat:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    li a3, 1024
; CHECK-NEXT:    beq a2, a3, .LBB14_7
; CHECK-NEXT:  # %bb.1: # %bb3
; CHECK-NEXT:    li a3, 1023
; CHECK-NEXT:    subw a5, a3, a2
; CHECK-NEXT:    li a6, 31
; CHECK-NEXT:    mv a4, a2
; CHECK-NEXT:    bltu a5, a6, .LBB14_5
; CHECK-NEXT:  # %bb.2: # %bb9
; CHECK-NEXT:    slli a4, a5, 32
; CHECK-NEXT:    slli t0, a2, 2
; CHECK-NEXT:    add a5, a0, a2
; CHECK-NEXT:    add a6, a1, a2
; CHECK-NEXT:    li t2, 32
; CHECK-NEXT:    srli a4, a4, 32
; CHECK-NEXT:    add t0, a6, t0
; CHECK-NEXT:    addi a6, a4, 1
; CHECK-NEXT:    andi a7, a6, -32
; CHECK-NEXT:    add a4, a7, a2
; CHECK-NEXT:    add a2, a4, a0
; CHECK-NEXT:    li t1, 5
; CHECK-NEXT:    vsetvli zero, t2, e8, m1, ta, ma
; CHECK-NEXT:  .LBB14_3: # %bb15
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (t0), t1
; CHECK-NEXT:    vle8.v v9, (a5)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a5)
; CHECK-NEXT:    addi a5, a5, 32
; CHECK-NEXT:    addi t0, t0, 160
; CHECK-NEXT:    bne a5, a2, .LBB14_3
; CHECK-NEXT:  # %bb.4: # %bb30
; CHECK-NEXT:    beq a6, a7, .LBB14_7
; CHECK-NEXT:  .LBB14_5: # %bb32
; CHECK-NEXT:    add a2, a0, a4
; CHECK-NEXT:    slli a5, a4, 2
; CHECK-NEXT:    add a1, a1, a4
; CHECK-NEXT:    subw a3, a3, a4
; CHECK-NEXT:    add a1, a1, a5
; CHECK-NEXT:    slli a3, a3, 32
; CHECK-NEXT:    srli a3, a3, 32
; CHECK-NEXT:    add a0, a4, a0
; CHECK-NEXT:    add a0, a0, a3
; CHECK-NEXT:    addi a0, a0, 1
; CHECK-NEXT:  .LBB14_6: # %bb35
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    lbu a3, 0(a1)
; CHECK-NEXT:    lbu a4, 0(a2)
; CHECK-NEXT:    add a3, a4, a3
; CHECK-NEXT:    sb a3, 0(a2)
; CHECK-NEXT:    addi a2, a2, 1
; CHECK-NEXT:    addi a1, a1, 5
; CHECK-NEXT:    bne a2, a0, .LBB14_6
; CHECK-NEXT:  .LBB14_7: # %bb34
; CHECK-NEXT:    ret
; Early exit: nothing to do when the start index is already 1024.
bb:
  %i = icmp eq i32 %arg2, 1024
  br i1 %i, label %bb34, label %bb3

; Compute the remaining element count: %i7 = zext(1023 - %arg2) + 1.
; Fewer than 31 remaining elements go straight to the scalar loop.
bb3:                                              ; preds = %bb
  %i4 = sext i32 %arg2 to i64
  %i5 = sub i32 1023, %arg2
  %i6 = zext i32 %i5 to i64
  %i7 = add nuw nsw i64 %i6, 1
  %i8 = icmp ult i32 %i5, 31
  br i1 %i8, label %bb32, label %bb9

; Vector preheader: %i10 rounds the count down to a multiple of 32
; (8589934560 = 0x1FFFFFFE0), and %i14 builds the <32 x i64> IV
; splat(%arg2) + <0..31> so the gather starts at the runtime start index.
bb9:                                              ; preds = %bb3
  %i10 = and i64 %i7, 8589934560
  %i11 = add nsw i64 %i10, %i4
  %i12 = insertelement <32 x i64> poison, i64 %i4, i64 0
  %i13 = shufflevector <32 x i64> %i12, <32 x i64> poison, <32 x i32> zeroinitializer
  %i14 = add <32 x i64> %i13, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15, i64 16, i64 17, i64 18, i64 19, i64 20, i64 21, i64 22, i64 23, i64 24, i64 25, i64 26, i64 27, i64 28, i64 29, i64 30, i64 31>
  br label %bb15

; Vector loop: A[i] += B[i*5] for 32 bytes per iteration; the stride-5
; gather should become vlse8.v with stride 5 (see .LBB14_3 above).
bb15:                                             ; preds = %bb15, %bb9
  %i16 = phi i64 [ 0, %bb9 ], [ %i27, %bb15 ]
  %i17 = phi <32 x i64> [ %i14, %bb9 ], [ %i28, %bb15 ]
  %i18 = add i64 %i16, %i4
  %i19 = mul nsw <32 x i64> %i17, splat (i64 5)
  %i20 = getelementptr inbounds i8, ptr %arg1, <32 x i64> %i19
  %i21 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr> %i20, i32 1, <32 x i1> splat (i1 true), <32 x i8> undef)
  %i22 = getelementptr inbounds i8, ptr %arg, i64 %i18
  %i24 = load <32 x i8>, ptr %i22, align 1
  %i25 = add <32 x i8> %i24, %i21
  store <32 x i8> %i25, ptr %i22, align 1
  %i27 = add nuw i64 %i16, 32
  %i28 = add <32 x i64> %i17, splat (i64 32)
  %i29 = icmp eq i64 %i27, %i10
  br i1 %i29, label %bb30, label %bb15

; If the trip count was an exact multiple of 32 there is no remainder.
bb30:                                             ; preds = %bb15
  %i31 = icmp eq i64 %i7, %i10
  br i1 %i31, label %bb34, label %bb32

; Scalar remainder start index: %arg2 if we skipped the vector loop,
; otherwise the first index past the vectorized portion.
bb32:                                             ; preds = %bb30, %bb3
  %i33 = phi i64 [ %i4, %bb3 ], [ %i11, %bb30 ]
  br label %bb35

bb34:                                             ; preds = %bb35, %bb30, %bb
  ret void

; Scalar remainder loop: A[i] += B[i*5], one byte at a time up to index 1023.
bb35:                                             ; preds = %bb35, %bb32
  %i36 = phi i64 [ %i43, %bb35 ], [ %i33, %bb32 ]
  %i37 = mul nsw i64 %i36, 5
  %i38 = getelementptr inbounds i8, ptr %arg1, i64 %i37
  %i39 = load i8, ptr %i38, align 1
  %i40 = getelementptr inbounds i8, ptr %arg, i64 %i36
  %i41 = load i8, ptr %i40, align 1
  %i42 = add i8 %i41, %i39
  store i8 %i42, ptr %i40, align 1
  %i43 = add nsw i64 %i36, 1
  %i44 = trunc i64 %i43 to i32
  %i45 = icmp eq i32 %i44, 1024
  br i1 %i45, label %bb34, label %bb35
}
1000
1001declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32 immarg, <16 x i1>, <16 x i8>)
1002declare void @llvm.masked.scatter.v16i8.v16p0(<16 x i8>, <16 x ptr>, i32 immarg, <16 x i1>)
1003
define void @gather_no_scalar_remainder(ptr noalias nocapture noundef %arg, ptr noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
; CHECK-LABEL: gather_no_scalar_remainder:
; CHECK:       # %bb.0: # %bb
; CHECK-NEXT:    slli a2, a2, 4
; CHECK-NEXT:    beqz a2, .LBB15_3
; CHECK-NEXT:  # %bb.1: # %bb2
; CHECK-NEXT:    addi a2, a2, -16
; CHECK-NEXT:    andi a2, a2, -16
; CHECK-NEXT:    add a2, a2, a0
; CHECK-NEXT:    addi a2, a2, 16
; CHECK-NEXT:    li a3, 5
; CHECK-NEXT:    vsetivli zero, 16, e8, mf2, ta, ma
; CHECK-NEXT:  .LBB15_2: # %bb4
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vlse8.v v8, (a1), a3
; CHECK-NEXT:    vle8.v v9, (a0)
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 16
; CHECK-NEXT:    addi a1, a1, 80
; CHECK-NEXT:    bne a0, a2, .LBB15_2
; CHECK-NEXT:  .LBB15_3: # %bb16
; CHECK-NEXT:    ret
; The trip count %i = %arg2 * 16 is always a multiple of the VF (16), so the
; vector loop needs no scalar remainder; only the zero-trip check remains.
bb:
  %i = shl i64 %arg2, 4
  %i3 = icmp eq i64 %i, 0
  br i1 %i3, label %bb16, label %bb2

bb2:                                              ; preds = %bb
  br label %bb4

; Vector loop: A[i] += B[i*5], 16 bytes per iteration; the stride-5 gather
; should become vlse8.v with stride 5 (see .LBB15_2 above).
bb4:                                              ; preds = %bb4, %bb2
  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
  %i7 = mul <16 x i64> %i6, splat (i64 5)
  %i8 = getelementptr inbounds i8, ptr %arg1, <16 x i64> %i7
  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %i8, i32 1, <16 x i1> splat (i1 true), <16 x i8> undef)
  %i10 = getelementptr inbounds i8, ptr %arg, i64 %i5
  %i11 = load <16 x i8>, ptr %i10, align 1
  %i12 = add <16 x i8> %i11, %i9
  store <16 x i8> %i12, ptr %i10, align 1
  %i13 = add nuw i64 %i5, 16
  %i14 = add <16 x i64> %i6, splat (i64 16)
  %i15 = icmp eq i64 %i13, %i
  br i1 %i15, label %bb16, label %bb4

bb16:                                             ; preds = %bb4, %bb
  ret void
}
1053
define void @gather_zero_stride_fp(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) {
; CHECK-LABEL: gather_zero_stride_fp:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    lui a2, 1
; CHECK-NEXT:    add a2, a0, a2
; CHECK-NEXT:    vsetivli zero, 8, e32, m1, ta, ma
; CHECK-NEXT:  .LBB16_1: # %vector.body
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    flw fa5, 0(a1)
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vfadd.vf v8, v8, fa5
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    addi a1, a1, 640
; CHECK-NEXT:    bne a0, a2, .LBB16_1
; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  br label %vector.body

; All lanes of %vec.ind are equal (zeroinitializer start, uniform +32 step),
; so every gather address vector is a splat: a zero-stride gather. The checks
; above verify it lowers to a scalar flw plus vfadd.vf rather than a vector
; gather.
vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
  %i = mul nuw nsw <8 x i64> %vec.ind, splat (i64 5)
  %i1 = getelementptr inbounds float, ptr %B, <8 x i64> %i
  ; Intrinsic suffix corrected from v32p0 to v8p0: the pointer operand is
  ; <8 x ptr>, and the parser was silently remangling the old name. Same
  ; semantics, so the generated code (and the checks) are unchanged.
  %wide.masked.gather = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %i1, i32 4, <8 x i1> splat (i1 true), <8 x float> undef)
  %i2 = getelementptr inbounds float, ptr %A, i64 %index
  %wide.load = load <8 x float>, ptr %i2, align 4
  %i4 = fadd <8 x float> %wide.load, %wide.masked.gather
  store <8 x float> %i4, ptr %i2, align 4
  ; Scalar index advances by 32 floats per iteration (addi a0, a0, 128),
  ; matching the vector IV step; exit after covering index 1024.
  %index.next = add nuw i64 %index, 32
  %vec.ind.next = add <8 x i64> %vec.ind, splat (i64 32)
  %i6 = icmp eq i64 %index.next, 1024
  br i1 %i6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}
1092