; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -o - -S -load-store-vectorizer -dce %s | FileCheck %s

; Make sure LoadStoreVectorizer vectorizes the loads below.
; To prove that the vectorization is safe, it tries to match nested
; adds, looking for an expression that adds a constant value to an
; existing index without overflowing.

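; As a rough C analogue (hypothetical, for illustration only; char4
; stands in for an assumed 4 x i8 vector type such as
; typedef char char4 __attribute__((vector_size(4)))):
;
;   void ld_v4i8_add_nsw(int v0, int v1, const char *src, char4 *dst) {
;     int base = v1 + v0;                 /* all adds below are nsw */
;     *dst = (char4){ src[base - 1], src[base],
;                     src[base + 1], src[base + 2] };
;   }
;
; The four indices differ only by small constants, so once the pass
; proves the nested adds cannot wrap, it can rewrite the four i8 loads
; as a single <4 x i8> load at base - 1.
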
target triple = "x86_64--"

define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nsw i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

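; Same pattern as above, but built from nuw adds and zext'ed indices;
; the vectorizer must prove the index chain wrap-free in the unsigned
; sense to vectorize here as well.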
define void @ld_v4i8_add_nuw(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_nuw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nuw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nuw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nuw i32 %v0, -1
  %tmp1 = add nuw i32 %v1, %tmp
  %tmp2 = zext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add nuw i32 %v1, %v0
  %tmp6 = zext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nuw i32 %v0, 1
  %tmp10 = add nuw i32 %v1, %tmp9
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nuw i32 %v0, 2
  %tmp15 = add nuw i32 %v1, %tmp14
  %tmp16 = zext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}

; Make sure we don't vectorize the loads below: the adds feeding the
; sext instructions lack the nsw flag, so sext(v1 + (v0 + c)) need not
; equal sext(v1 + v0) + c, and the pass cannot prove that the four
; addresses are consecutive.
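; A concrete illustration (numbers are hypothetical, not part of the
; test): with v0 = 0 and v1 = 2147483647 (INT32_MAX), %tmp10 = v1 + (v0 + 1)
; wraps to -2147483648, so after the sext the "element 2" address lands
; roughly 4 GiB below the others rather than at base + 1.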

define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, i8* %src, <4 x i8>* %dst) {
; CHECK-LABEL: @ld_v4i8_add_not_safe(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[TMP7]], align 1
; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i32 [[V0]], 1
; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[V1]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP11]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i8, i8* [[TMP12]], align 1
; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[V0]], 2
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[V1]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i8, i8* [[TMP17]], align 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], <4 x i8>* [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, i8* %src, i64 %tmp2
  %tmp4 = load i8, i8* %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, i8* %src, i64 %tmp6
  %tmp8 = load i8, i8* %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, i8* %src, i64 %tmp11
  %tmp13 = load i8, i8* %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, i8* %src, i64 %tmp16
  %tmp18 = load i8, i8* %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, <4 x i8>* %dst
  ret void
}