; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -o - -S -load-store-vectorizer -dce %s | FileCheck %s

; Make sure LoadStoreVectorizer vectorizes the loads below.
; In order to prove that the vectorization is safe, it tries to
; match nested adds and find an expression that adds a constant
; value to an existing index, where the result doesn't overflow.
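;
; In @ld_v4i8_add_nsw, for example, the four load indices reduce to
; (v1 + v0) - 1, (v1 + v0), (v1 + v0) + 1, and (v1 + v0) + 2; because
; the adds are nsw, sign extension preserves that consecutive layout,
; so the four i8 loads can be merged into a single <4 x i8> load.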

target triple = "x86_64--"

define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nsw i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nuw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nuw i32 %v0, -1
  %tmp1 = add nuw i32 %v1, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nuw i32 %v1, %v0
  %tmp6 = zext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nuw i32 %v0, 1
  %tmp10 = add nuw i32 %v1, %tmp9
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nuw i32 %v0, 2
  %tmp15 = add nuw i32 %v1, %tmp14
  %tmp16 = zext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Make sure we don't vectorize the loads below because the adds feeding
; the sext instructions don't have the nsw flag.
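; Without nsw on those adds, an index such as v1 + (v0 + 1) may wrap,
; so sext(v1 + (v0 + 1)) need not equal sext(v1 + v0) + 1, and the i64
; indices cannot be proven consecutive.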

define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_not_safe(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[TMP7]], align 1
; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i32 [[V0]], 1
; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[V1]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP11]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP12]], align 1
; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[V0]], 2
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[V1]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP17]], align 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}