; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -o - -S -passes=load-store-vectorizer,dce %s | FileCheck %s

; Make sure LoadStoreVectorizer vectorizes the loads below.
; To prove that the vectorization is safe, it tries to match
; nested adds and find an expression that adds a constant
; value to an existing index, where the result does not overflow.
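;
; For example, in @ld_v4i8_add_nsw below the four load indices are
;   sext(v1 + (v0 - 1)), sext(v1 + v0), sext(v1 + (v0 + 1)), sext(v1 + (v0 + 2))
; and every add carries the nsw flag, so with i = v1 + v0 the loads are
; provably at the consecutive offsets i-1, i, i+1 and i+2 from %src, even
; after the sign extension to i64.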

target triple = "x86_64--"

define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP0]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add nsw i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

; Apply different operand orders for the nested add sequences
define void @ld_v4i8_add_nsw_operand_orders(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw_operand_orders(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP0]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add nsw i32 %v0, %v1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %tmp9, %v1
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

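; The adds below carry no nsw flags, but %v0 and %v1 are multiples of 4, so
; the low two bits of the base index v1 + v0 are known to be zero. That rules
; out signed overflow when adding 1 or 2, so those three loads become a
; <3 x i8> load; the load at offset -1 cannot be proven safe the same way
; (v1 + v0 - 1 may be INT_MAX) and stays scalar.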
define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

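; Same known-bits setup, but with offsets 0, 1, 2 and 3 instead of -1, 0, 1
; and 2. Since the low two bits of the base index are zero, none of the
; additions can overflow, and all four loads become a single <4 x i8> load
; (the +3 element is extracted into lane 0 of the stored vector).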
define void @ld_v4i8_add_known_bits1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP0]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

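; Multiplying by 3 leaves no known-zero low bits, so here the known bits
; come from the llvm.assume calls instead, which assert that %v0 and %v1
; are multiples of 4.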
define void @ld_v4i8_add_known_bits_by_assume(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits_by_assume(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 3
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP0]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 3
  %v1 = mul i32 %ind1, 3
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

declare void @llvm.assume(i1)

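; Same idea, but the llvm.assume calls constrain the function arguments
; directly. As in @ld_v4i8_add_known_bits, the load at offset -1 stays
; scalar and the other three loads become a <3 x i8> load.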
define void @ld_v4i8_add_assume_on_arg(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

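; With offsets 0, 1, 2 and 3 and the same assumptions on the arguments,
; all four loads become a single <4 x i8> load.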
define void @ld_v4i8_add_assume_on_arg1(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP0]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP0]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

; Address computations are partly separated by control flow, with llvm.assume
; placed in the second basic block
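; (inside bb.loads the llvm.assume guarantees %v1 == 0, so the base index
; v1 + v0 is a multiple of 4 there and the +1 offset cannot overflow)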

define void @ld_v2i8_add_different_contexts(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  br label %bb.skip

bb.skip:
  ret void
}

; Same as ld_v2i8_add_different_contexts but with llvm.assume placed between loads

define void @ld_v2i8_add_different_contexts1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  br label %bb.skip

bb.skip:
  ret void
}

; llvm.assume is placed between loads in a single basic block

define void @ld_v2i8_add_context(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_context(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  ret void
}

; Placing llvm.assume after all the loads and stores in the basic block still works

define void @ld_v2i8_add_context1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_context1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP0]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP0]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  ret void
}

; Make sure we don't vectorize the loads below: the sources of the
; sext instructions have neither the nsw flag nor known bits that
; would allow the vectorization.
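; For example, with v0 = 0 and v1 = INT_MIN the index v1 + (v0 - 1) wraps
; around to INT_MAX while v1 + v0 stays INT_MIN, so after the sext the two
; addresses are not adjacent at all.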

define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_not_safe(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i32 [[V0]], 1
; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[V1]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP11]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1
; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[V0]], 2
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[V1]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}