; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s

; Test cases for extending the vectorization factor, if small memory operations
; are not profitable.

; Test with a loop that contains memory accesses of i8 and i32 types. The
; maximum VF for NEON is calculated by 128/size of smallest type in loop.
; And while we don't have an instruction to load 4 x i8, vectorization
; might still be profitable.
;
; Scalar loop body: load an i8 from %src[iv], zero-extend it to i32, add %off
; and store the i32 result to %dst[iv]. %iv starts at 0 and the loop exits
; once %iv + 1 equals %N. The checks below only require that a <16 x i8>
; vector type appears in the output for this function.
define void @test_load_i8_store_i32(ptr noalias %src, ptr noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i32(
; CHECK:       <16 x i8>
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  ; Narrow (i8) memory access: the smallest type in the loop.
  %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv
  %lv = load i8, ptr %gep.src, align 1
  %lv.ext = zext i8 %lv to i32
  %add = add i32 %lv.ext, %off
  ; Wide (i32) memory access.
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %iv
  store i32 %add, ptr %gep.dst
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret void
}

; Same as test_load_i8_store_i32, but with types flipped for load and store.
;
; Scalar loop body: load an i32 from %src[iv] (with align 1), add %off,
; truncate the sum to i8 and store it to %dst[iv]. %iv starts at 0 and the
; loop exits once %iv + 1 equals %N. The checks below only require that a
; <16 x i8> vector type appears in the output for this function.
define void @test_load_i32_store_i8(ptr noalias %src, ptr noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i8(
; CHECK:     <16 x i8>
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  ; Wide (i32) memory access.
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %lv = load i32, ptr %gep.src, align 1
  %add = add i32 %lv, %off
  %add.trunc = trunc i32 %add to i8
  ; Narrow (i8) memory access: the smallest type in the loop.
  %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
  store i8 %add.trunc, ptr %gep.dst
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret void
}

; All memory operations use i32, all memory operations are profitable with VF 4.
;
; Scalar loop body: load an i32 from %src[iv] (with align 1), truncate it to
; i8, add the i8 %off, zero-extend the sum back to i32 and store it to
; %dst[iv]. Only the arithmetic is done in i8; both memory accesses are i32.
; The checks below require a <4 x i32> vector type after the vector.body
; label of this function.
define void @test_load_i32_store_i32(ptr noalias %src, ptr noalias %dst, i8 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i32(
; CHECK: vector.body:
; CHECK:   <4 x i32>
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %lv = load i32, ptr %gep.src, align 1
  ; i8 appears only in the arithmetic, never in a load or store.
  %lv.trunc = trunc i32 %lv to i8
  %add = add i8 %lv.trunc, %off
  %add.ext = zext i8 %add to i32
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %iv
  store i32 %add.ext, ptr %gep.dst
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret void
}

; Test with loop body that requires a large number of vector registers if the
; vectorization factor is large. Make sure the register estimates limit the
; vectorization factor.
;
; Scalar loop body: two i64 loads (see the review note on the second one), one
; i8 load from %src[iv] that is zero-extended to i64, a chain of i64 adds, and
; five i64 stores to %dst, %dst.2, %dst.3, %dst.4 and %dst.5 at index %iv.
; %iv starts at 0 and the loop exits once %iv + 1 equals %N. The checks below
; only require that an <8 x i64> vector type appears in the output.
define void @test_load_i8_store_i64_large(ptr noalias %src, ptr noalias %dst, ptr noalias %dst.2, ptr noalias %dst.3, ptr noalias %dst.4, ptr noalias %dst.5, i64 %off, i64 %off.2, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i64_large(
; CHECK: <8 x i64>
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv
  %gep.dst.3 = getelementptr inbounds i64, ptr %dst.3, i64 %iv
  %lv.dst.3 = load i64, ptr %gep.dst.3, align 1
  %gep.dst.5 = getelementptr inbounds i64, ptr %dst.5, i64 %iv
  ; NOTE(review): this load reads through %gep.dst.3, not %gep.dst.5, so
  ; %lv.dst.5 is a second load of the %dst.3 element. This may be a copy-paste
  ; slip; it is left unchanged because the expected vector type below was
  ; presumably recorded against this exact input. Confirm before altering.
  %lv.dst.5 = load i64, ptr %gep.dst.3, align 1

  ; Narrow (i8) memory access: the smallest type in the loop.
  %lv = load i8, ptr %gep.src, align 1
  %lv.ext = zext i8 %lv to i64
  %add = add i64 %lv.ext, %off
  %add.2 = add i64 %add, %off.2
  %gep.dst = getelementptr inbounds i64, ptr %dst, i64 %iv
  %gep.dst.2 = getelementptr inbounds i64, ptr %dst.2, i64 %iv

  %add.3 = add i64 %add.2, %lv.dst.3
  %add.4 = add i64 %add.3, %add
  %gep.dst.4 = getelementptr inbounds i64, ptr %dst.4, i64 %iv
  %add.5 = add i64 %add.2, %lv.dst.5
  ; All five sums are stored only here, after every add has been computed.
  store i64 %add.2, ptr %gep.dst.2
  store i64 %add, ptr %gep.dst
  store i64 %add.3, ptr %gep.dst.3
  store i64 %add.4, ptr %gep.dst.4
  store i64 %add.5, ptr %gep.dst.5

  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret void
}
