; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s

; Test cases for extending the vectorization factor when the small memory
; operations in a loop are not profitable on their own.

; Test with a loop that contains memory accesses of i8 and i32 types. The
; maximum VF for NEON is calculated as 128 / (size of the smallest type in the
; loop). And while we don't have an instruction to load 4 x i8, vectorization
; might still be profitable.
;
; Loop body: load an i8 from %src[%iv], zero-extend it to i32, add %off and
; store the i32 result to %dst[%iv]. %iv counts up from 0 in steps of 1 and the
; loop exits once %iv.next equals %N. The check lines expect a <16 x i8> vector
; type in the output.
define void @test_load_i8_store_i32(ptr noalias %src, ptr noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i32(
; CHECK: <16 x i8>
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv
  %lv = load i8, ptr %gep.src, align 1
  ; Widen the narrow load so the arithmetic and the store operate on i32.
  %lv.ext = zext i8 %lv to i32
  %add = add i32 %lv.ext, %off
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %iv
  store i32 %add, ptr %gep.dst
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret void
}

; Same as test_load_i8_store_i32, but with types flipped for load and store:
; the loop loads an i32 (with align 1), adds %off, truncates the sum to i8 and
; stores the i8. The check lines again expect a <16 x i8> vector type.
define void @test_load_i32_store_i8(ptr noalias %src, ptr noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i8(
; CHECK: <16 x i8>
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %lv = load i32, ptr %gep.src, align 1
  %add = add i32 %lv, %off
  ; Narrow the result so the store is the small (i8) memory operation.
  %add.trunc = trunc i32 %add to i8
  %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
  store i8 %add.trunc, ptr %gep.dst
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret void
}

; All memory operations use i32, all memory operations are profitable with VF 4.
; In @test_load_i32_store_i32 the i8 type only appears in intermediate
; arithmetic (trunc, add with the i8 %off, zext); both the load and the store
; are i32. The check lines expect a <4 x i32> vector type after the
; vector.body label.
define void @test_load_i32_store_i32(ptr noalias %src, ptr noalias %dst, i8 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i32(
; CHECK: vector.body:
; CHECK: <4 x i32>
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %iv
  %lv = load i32, ptr %gep.src, align 1
  %lv.trunc = trunc i32 %lv to i8
  %add = add i8 %lv.trunc, %off
  %add.ext = zext i8 %add to i32
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %iv
  store i32 %add.ext, ptr %gep.dst
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret void
}

; Test with loop body that requires a large number of vector registers if the
; vectorization factor is large. Make sure the register estimates limit the
; vectorization factor.
;
; The loop has one i8 load, two i64 loads and five i64 stores (one per
; destination pointer), with a chain of i64 adds feeding the stores. The check
; lines expect an <8 x i64> vector type.
define void @test_load_i8_store_i64_large(ptr noalias %src, ptr noalias %dst, ptr noalias %dst.2, ptr noalias %dst.3, ptr noalias %dst.4, ptr noalias %dst.5, i64 %off, i64 %off.2, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i64_large
; CHECK: <8 x i64>
;
entry:
  br label %loop

loop:
  %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
  %gep.src = getelementptr inbounds i8, ptr %src, i64 %iv
  %gep.dst.3 = getelementptr inbounds i64, ptr %dst.3, i64 %iv
  %lv.dst.3 = load i64, ptr %gep.dst.3, align 1
  %gep.dst.5 = getelementptr inbounds i64, ptr %dst.5, i64 %iv
  ; NOTE(review): this load goes through %gep.dst.3, not %gep.dst.5, so
  ; %lv.dst.5 re-reads the %dst.3 element. Possibly a copy/paste slip; confirm
  ; before changing it, since the expected vector type above may depend on it.
  %lv.dst.5 = load i64, ptr %gep.dst.3, align 1

  %lv = load i8, ptr %gep.src, align 1
  %lv.ext = zext i8 %lv to i64
  %add = add i64 %lv.ext, %off
  %add.2 = add i64 %add, %off.2
  %gep.dst = getelementptr inbounds i64, ptr %dst, i64 %iv
  %gep.dst.2 = getelementptr inbounds i64, ptr %dst.2, i64 %iv

  %add.3 = add i64 %add.2, %lv.dst.3
  %add.4 = add i64 %add.3, %add
  %gep.dst.4 = getelementptr inbounds i64, ptr %dst.4, i64 %iv
  %add.5 = add i64 %add.2, %lv.dst.5
  store i64 %add.2, ptr %gep.dst.2
  store i64 %add, ptr %gep.dst
  store i64 %add.3, ptr %gep.dst.3
  store i64 %add.4, ptr %gep.dst.4
  store i64 %add.5, ptr %gep.dst.5

  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond.not = icmp eq i64 %iv.next, %N
  br i1 %exitcond.not, label %exit, label %loop

exit:
  ret void
}