xref: /llvm-project/llvm/test/CodeGen/ARM/loop-indexing.ll (revision eecb99c5f66c8491766628a2925587e20f3b1dbd)
1; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | \
2; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
3
4; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 -lsr-preferred-addressing-mode=none %s -o - | \
5; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
6
7; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | \
8; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
9
10; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=postindexed %s -o - | \
11; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
12
13; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=preindexed %s -o - | \
14; RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-T2
15
16; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
17; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
18
19; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | \
20; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
21
22; Tests to check that post increment addressing modes are used instead of
23; updating base pointers with add instructions.
24
25; TODO: I think we should be able to use post inc addressing with VLDM
26; instructions.
27; CHECK-LABEL: test_fma
28; CHECK: @ %loop
29
30; CHECK-DEFAULT: vldr s{{.*}}, #8]
31; CHECK-DEFAULT: vldr s{{.*}}, #8]
32; CHECK-DEFAULT: vldr s{{.*}}, #12]
33; CHECK-DEFAULT: vldr s{{.*}}, #12]
34
35; CHECK-COMPLEX: vldr s{{.*}}, #8]
36; CHECK-COMPLEX: vldr s{{.*}}, #8]
37; CHECK-COMPLEX: vldr s{{.*}}, #12]
38; CHECK-COMPLEX: vldr s{{.*}}, #12]
39
; test_fma: single-block loop that multiplies pairs of floats loaded from %a
; and %b and adds the products into a running sum. Each iteration handles two
; elements (index %idx.1 and %idx.1 | 1). The multiply and the add are separate
; fmul/fadd instructions in the IR. Returns the final running sum.
40define float @test_fma(ptr %a, ptr %b, i32 %N) {
41entry:
42  br label %loop
43
44loop:
  ; %i is the exit counter, %idx.1 the element index, %res the running sum.
45  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
46  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
47  %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ]
  ; First element: res += a[idx] * b[idx].
48  %gep.a.1 = getelementptr inbounds float, ptr %a, i32 %idx.1
49  %a.1 = load float, ptr %gep.a.1
50  %gep.b.1 = getelementptr inbounds float, ptr %b, i32 %idx.1
51  %b.1 = load float, ptr %gep.b.1
52  %fmul.1 = fmul float %a.1, %b.1
53  %fma.1 = fadd float %fmul.1, %res
  ; Second element: the index is formed with "or disjoint ..., 1" rather than
  ; an add.
54  %idx.2 = or disjoint i32 %idx.1, 1
55  %gep.a.2 = getelementptr inbounds float, ptr %a, i32 %idx.2
56  %a.2 = load float, ptr %gep.a.2
57  %gep.b.2 = getelementptr inbounds float, ptr %b, i32 %idx.2
58  %b.2 = load float, ptr %gep.b.2
59  %fmul.2 = fmul float %a.2, %b.2
60  %fma.2 = fadd float %fmul.2, %fma.1
  ; NOTE(review): the exit counter steps by -2 from 0 while carrying both nsw
  ; and nuw, and is then compared unsigned against %N. This looks unusual, but
  ; it is left untouched because the expected llc output depends on this exact
  ; IR -- confirm the intent before changing it.
61  %i.next = add nsw nuw i32 %i, -2
62  %idx.next = add nsw nuw i32 %idx.1, 2
63  %cmp = icmp ult i32 %i.next, %N
64  br i1 %cmp, label %loop, label %exit
65
66exit:
67  ret float %fma.2
68}
69
70; CHECK-LABEL: convolve_16bit
71; TODO: Both arrays should use indexing
72; CHECK-DEFAULT: ldr{{.*}}, #8]!
73; CHECK-DEFAULT-NOT: ldr{{.*}}]!
74
75; CHECK-COMPLEX: ldr{{.*}}, #8]!
76; CHECK-COMPLEX-NOT: ldr{{.*}}]!
77
78; DISABLED-NOT: ldr{{.*}}]!
79; DISABLED-NOT: str{{.*}}]!
80
; convolve_16bit: four nested loops over res_y, res_x, filter_y and filter_x.
; %filter and %input_image are indexed as arrays of row pointers; the rows are
; read as i16 elements, sign-extended to i32, multiplied and accumulated.
; The accumulated i32 is stored at index res_x of the row pointer loaded from
; %convolved[res_y]. The innermost loop is unrolled by four; this function has
; no remainder (epilogue) loop for the filter_dim & 3 leftover elements.
81define void @convolve_16bit(ptr nocapture readonly %input_image, ptr nocapture readonly %filter,
82                            i32 %filter_dim, i32 %out_width, i32 %out_height,
83                            ptr nocapture readonly %convolved) {
84entry:
  ; Nothing to do when out_height is zero.
85  %cmp92 = icmp eq i32 %out_height, 0
86  br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
87
88for.cond1.preheader.lr.ph:                        ; preds = %entry
  ; %unroll_iter = filter_dim rounded down to a multiple of four; it seeds the
  ; countdown of the unrolled inner loop.
89  %xtraiter = and i32 %filter_dim, 3
90  %unroll_iter = sub i32 %filter_dim, %xtraiter
91  br label %for.cond1.preheader
92
93for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
  ; res_y loop header: load the output row pointer convolved[res_y].
94  %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
95  %arrayidx22 = getelementptr inbounds ptr, ptr %convolved, i32 %res_y.093
96  %tmp3 = load ptr, ptr %arrayidx22, align 4
97  br label %for.cond9.preheader.us.us.preheader
98
99for.cond9.preheader.us.us.preheader:              ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond1.preheader
  ; res_x loop header.
100  %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
101  br label %for.cond9.preheader.us.us
102
103for.cond9.preheader.us.us:                        ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, %for.cond9.preheader.us.us.preheader
  ; filter_y loop header: the accumulator restarts at 0 for each res_x and is
  ; carried across filter_y iterations. Loads the row pointers
  ; filter[filter_y] and input_image[filter_y + res_y].
104  %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
105  %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
106  %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
107  %arrayidx.us.us = getelementptr inbounds ptr, ptr %filter, i32 %filter_y.056.us.us
108  %tmp5 = load ptr, ptr %arrayidx.us.us, align 4
109  %arrayidx15.us.us = getelementptr inbounds ptr, ptr %input_image, i32 %add.us.us
110  %tmp6 = load ptr, ptr %arrayidx15.us.us, align 4
111  br label %for.body12.us.us
112
113for.body12.us.us:                                 ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
  ; filter_x loop, unrolled by four. %niter counts down from %unroll_iter in
  ; steps of four and the loop exits when it reaches zero.
114  %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
115  %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
116  %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
  ; Unrolled copy 0: filter row at filter_x, image row at filter_x + res_x.
117  %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
118  %arrayidx14.us.us = getelementptr inbounds i16, ptr %tmp5, i32 %filter_x.053.us.us
119  %tmp9 = load i16, ptr %arrayidx14.us.us, align 2
120  %conv.us.us = sext i16 %tmp9 to i32
121  %arrayidx16.us.us = getelementptr inbounds i16, ptr %tmp6, i32 %add13.us.us
122  %tmp10 = load i16, ptr %arrayidx16.us.us, align 2
123  %conv17.us.us = sext i16 %tmp10 to i32
124  %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
125  %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
  ; Unrolled copy 1: offset filter_x | 1.
126  %inc.us.us = or disjoint i32 %filter_x.053.us.us, 1
127  %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
128  %arrayidx14.us.us.1 = getelementptr inbounds i16, ptr %tmp5, i32 %inc.us.us
129  %tmp11 = load i16, ptr %arrayidx14.us.us.1, align 2
130  %conv.us.us.1 = sext i16 %tmp11 to i32
131  %arrayidx16.us.us.1 = getelementptr inbounds i16, ptr %tmp6, i32 %add13.us.us.1
132  %tmp12 = load i16, ptr %arrayidx16.us.us.1, align 2
133  %conv17.us.us.1 = sext i16 %tmp12 to i32
134  %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
135  %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
  ; Unrolled copy 2: offset filter_x | 2.
136  %inc.us.us.1 = or disjoint i32 %filter_x.053.us.us, 2
137  %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
138  %arrayidx14.us.us.2 = getelementptr inbounds i16, ptr %tmp5, i32 %inc.us.us.1
139  %tmp13 = load i16, ptr %arrayidx14.us.us.2, align 2
140  %conv.us.us.2 = sext i16 %tmp13 to i32
141  %arrayidx16.us.us.2 = getelementptr inbounds i16, ptr %tmp6, i32 %add13.us.us.2
142  %tmp14 = load i16, ptr %arrayidx16.us.us.2, align 2
143  %conv17.us.us.2 = sext i16 %tmp14 to i32
144  %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
145  %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
  ; Unrolled copy 3: offset filter_x | 3.
146  %inc.us.us.2 = or disjoint i32 %filter_x.053.us.us, 3
147  %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
148  %arrayidx14.us.us.3 = getelementptr inbounds i16, ptr %tmp5, i32 %inc.us.us.2
149  %tmp15 = load i16, ptr %arrayidx14.us.us.3, align 2
150  %conv.us.us.3 = sext i16 %tmp15 to i32
151  %arrayidx16.us.us.3 = getelementptr inbounds i16, ptr %tmp6, i32 %add13.us.us.3
152  %tmp16 = load i16, ptr %arrayidx16.us.us.3, align 2
153  %conv17.us.us.3 = sext i16 %tmp16 to i32
154  %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
155  %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
  ; Advance filter_x by four and count the unrolled iterations down.
156  %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
157  %niter.nsub.3 = add i32 %niter, -4
158  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
159  br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us
160
161for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us
  ; filter_y latch: continue until filter_y reaches filter_dim.
162  %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
163  %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
164  br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us
165
166for.cond5.for.cond.cleanup7_crit_edge.us:         ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa
  ; res_x latch: store the accumulated value at index res_x of the output row.
167  %arrayidx23.us = getelementptr inbounds i32, ptr %tmp3, i32 %res_x.060.us
168  store i32 %add18.us.us.3, ptr %arrayidx23.us, align 4
169  %add25.us = add nuw i32 %res_x.060.us, 1
170  %exitcond99 = icmp eq i32 %add25.us, %out_width
171  br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader
172
173for.cond.cleanup3:                                ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us
  ; res_y latch: continue until res_y reaches out_height.
174  %add28 = add nuw i32 %res_y.093, 1
175  %exitcond100 = icmp eq i32 %add28, %out_height
176  br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader
177
178for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
179  ret void
180}
181
182; CHECK-LABEL: mul_8x8
183; CHECK: @ %for.body
184
185; CHECK-DEFAULT: str{{.*}}, #16]!
186; CHECK-DEFAULT: ldrb{{.*}}, #4]!
187; CHECK-DEFAULT: ldrb{{.*}}, #4]!
188
189; CHECK-COMPLEX: str{{.*}}, #16]!
190; CHECK-COMPLEX: ldrb{{.*}}, #4]!
191; CHECK-COMPLEX: ldrb{{.*}}, #4]!
192
193; DISABLED-NOT: ldr{{.*}}]!
194; DISABLED-NOT: str{{.*}}]!
195
196; CHECK-T2: @ %for.body.epil
197; CHECK-T2: ldrb{{.*}}, #1]!
198; CHECK-T2: ldrb{{.*}}, #1]!
199; CHECK-T2: str{{.*}}, #4]!
200
; mul_8x8: for each i in [0, N) stores zext(A[i]) * zext(B[i]) as an i32 into
; C[i], where A and B are read as i8 elements. The main loop (%for.body) is
; unrolled by four; %for.body.epil handles the N & 3 leftover elements one at
; a time.
201define void @mul_8x8(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N) {
202entry:
  ; Nothing to do when N is zero.
203  %cmp9 = icmp eq i32 %N, 0
204  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
205
206for.body.preheader:                               ; preds = %entry
  ; Skip the unrolled loop entirely when N - 1 < 3 (unsigned), i.e. N < 4 here.
207  %tmp = add i32 %N, -1
208  %xtraiter = and i32 %N, 3
209  %tmp1 = icmp ult i32 %tmp, 3
210  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
211
212for.body.preheader.new:                           ; preds = %for.body.preheader
  ; %unroll_iter = N rounded down to a multiple of four.
213  %unroll_iter = sub i32 %N, %xtraiter
214  br label %for.body
215
216for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  ; %i.010.unr is the index at which the epilogue starts; the epilogue is
  ; skipped when there are no leftover elements.
217  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
218  %lcmp.mod = icmp eq i32 %xtraiter, 0
219  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
220
221for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  ; Epilogue: one element per iteration, %epil.iter counts down from N & 3.
222  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
223  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
224  %arrayidx.epil = getelementptr inbounds i8, ptr %A, i32 %i.010.epil
225  %tmp2 = load i8, ptr %arrayidx.epil, align 1
226  %conv.epil = zext i8 %tmp2 to i32
227  %arrayidx1.epil = getelementptr inbounds i8, ptr %B, i32 %i.010.epil
228  %tmp3 = load i8, ptr %arrayidx1.epil, align 1
229  %conv2.epil = zext i8 %tmp3 to i32
230  %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
231  %arrayidx3.epil = getelementptr inbounds i32, ptr %C, i32 %i.010.epil
232  store i32 %mul.epil, ptr %arrayidx3.epil, align 4
233  %inc.epil = add nuw i32 %i.010.epil, 1
234  %epil.iter.sub = add i32 %epil.iter, -1
235  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
236  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
237
238for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
239  ret void
240
241for.body:                                         ; preds = %for.body, %for.body.preheader.new
  ; Main loop, unrolled by four: elements i, i|1, i|2 and i|3 per iteration.
  ; %niter counts down from %unroll_iter in steps of four.
242  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
243  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  ; Unrolled copy 0: element i.
244  %arrayidx = getelementptr inbounds i8, ptr %A, i32 %i.010
245  %tmp4 = load i8, ptr %arrayidx, align 1
246  %conv = zext i8 %tmp4 to i32
247  %arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.010
248  %tmp5 = load i8, ptr %arrayidx1, align 1
249  %conv2 = zext i8 %tmp5 to i32
250  %mul = mul nuw nsw i32 %conv2, %conv
251  %arrayidx3 = getelementptr inbounds i32, ptr %C, i32 %i.010
252  store i32 %mul, ptr %arrayidx3, align 4
  ; Unrolled copy 1: element i | 1.
253  %inc = or disjoint i32 %i.010, 1
254  %arrayidx.1 = getelementptr inbounds i8, ptr %A, i32 %inc
255  %tmp6 = load i8, ptr %arrayidx.1, align 1
256  %conv.1 = zext i8 %tmp6 to i32
257  %arrayidx1.1 = getelementptr inbounds i8, ptr %B, i32 %inc
258  %tmp7 = load i8, ptr %arrayidx1.1, align 1
259  %conv2.1 = zext i8 %tmp7 to i32
260  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
261  %arrayidx3.1 = getelementptr inbounds i32, ptr %C, i32 %inc
262  store i32 %mul.1, ptr %arrayidx3.1, align 4
  ; Unrolled copy 2: element i | 2.
263  %inc.1 = or disjoint i32 %i.010, 2
264  %arrayidx.2 = getelementptr inbounds i8, ptr %A, i32 %inc.1
265  %tmp8 = load i8, ptr %arrayidx.2, align 1
266  %conv.2 = zext i8 %tmp8 to i32
267  %arrayidx1.2 = getelementptr inbounds i8, ptr %B, i32 %inc.1
268  %tmp9 = load i8, ptr %arrayidx1.2, align 1
269  %conv2.2 = zext i8 %tmp9 to i32
270  %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
271  %arrayidx3.2 = getelementptr inbounds i32, ptr %C, i32 %inc.1
272  store i32 %mul.2, ptr %arrayidx3.2, align 4
  ; Unrolled copy 3: element i | 3.
273  %inc.2 = or disjoint i32 %i.010, 3
274  %arrayidx.3 = getelementptr inbounds i8, ptr %A, i32 %inc.2
275  %tmp10 = load i8, ptr %arrayidx.3, align 1
276  %conv.3 = zext i8 %tmp10 to i32
277  %arrayidx1.3 = getelementptr inbounds i8, ptr %B, i32 %inc.2
278  %tmp11 = load i8, ptr %arrayidx1.3, align 1
279  %conv2.3 = zext i8 %tmp11 to i32
280  %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
281  %arrayidx3.3 = getelementptr inbounds i32, ptr %C, i32 %inc.2
282  store i32 %mul.3, ptr %arrayidx3.3, align 4
  ; Advance i by four and count the unrolled iterations down.
283  %inc.3 = add i32 %i.010, 4
284  %niter.nsub.3 = add i32 %niter, -4
285  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
286  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
287}
288
289; CHECK-LABEL: mul_16x8
290; CHECK: @ %for.body
291
292; CHECK-DEFAULT: str{{.*}}, #16]!
293; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
294
295; CHECK-COMPLEX: ldrsh{{.*}}, #8]!
296; CHECK-COMPLEX: str{{.*}}, #16]!
297; CHECK-COMPLEX: ldrb{{.*}}, #4]!
298
299; DISABLED-NOT: ldr{{.*}}]!
300; DISABLED-NOT: str{{.*}}]!
301
302; CHECK-T2: @ %for.body.epil
303; CHECK-T2: ldrsh{{.*}}, #2]!
304; CHECK-T2: ldrb{{.*}}, #1]!
305; CHECK-T2: str{{.*}}, #4]!
306
; mul_16x8: for each i in [0, N) stores sext(A[i]) * zext(B[i]) as an i32 into
; C[i], where A is read as i16 elements and B as i8 elements. The main loop
; (%for.body) is unrolled by four; %for.body.epil handles the N & 3 leftover
; elements one at a time.
307define void @mul_16x8(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N) {
308entry:
  ; Nothing to do when N is zero.
309  %cmp9 = icmp eq i32 %N, 0
310  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
311
312for.body.preheader:                               ; preds = %entry
  ; Skip the unrolled loop entirely when N - 1 < 3 (unsigned), i.e. N < 4 here.
313  %tmp = add i32 %N, -1
314  %xtraiter = and i32 %N, 3
315  %tmp1 = icmp ult i32 %tmp, 3
316  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
317
318for.body.preheader.new:                           ; preds = %for.body.preheader
  ; %unroll_iter = N rounded down to a multiple of four.
319  %unroll_iter = sub i32 %N, %xtraiter
320  br label %for.body
321
322for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  ; %i.010.unr is the index at which the epilogue starts; the epilogue is
  ; skipped when there are no leftover elements.
323  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
324  %lcmp.mod = icmp eq i32 %xtraiter, 0
325  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
326
327for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  ; Epilogue: one element per iteration, %epil.iter counts down from N & 3.
328  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
329  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
330  %arrayidx.epil = getelementptr inbounds i16, ptr %A, i32 %i.010.epil
331  %tmp2 = load i16, ptr %arrayidx.epil, align 2
332  %conv.epil = sext i16 %tmp2 to i32
333  %arrayidx1.epil = getelementptr inbounds i8, ptr %B, i32 %i.010.epil
334  %tmp3 = load i8, ptr %arrayidx1.epil, align 1
335  %conv2.epil = zext i8 %tmp3 to i32
336  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
337  %arrayidx3.epil = getelementptr inbounds i32, ptr %C, i32 %i.010.epil
338  store i32 %mul.epil, ptr %arrayidx3.epil, align 4
339  %inc.epil = add nuw i32 %i.010.epil, 1
340  %epil.iter.sub = add i32 %epil.iter, -1
341  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
342  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
343
344for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
345  ret void
346
347for.body:                                         ; preds = %for.body, %for.body.preheader.new
  ; Main loop, unrolled by four: elements i, i|1, i|2 and i|3 per iteration.
  ; %niter counts down from %unroll_iter in steps of four.
348  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
349  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  ; Unrolled copy 0: element i.
350  %arrayidx = getelementptr inbounds i16, ptr %A, i32 %i.010
351  %tmp4 = load i16, ptr %arrayidx, align 2
352  %conv = sext i16 %tmp4 to i32
353  %arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.010
354  %tmp5 = load i8, ptr %arrayidx1, align 1
355  %conv2 = zext i8 %tmp5 to i32
356  %mul = mul nsw i32 %conv2, %conv
357  %arrayidx3 = getelementptr inbounds i32, ptr %C, i32 %i.010
358  store i32 %mul, ptr %arrayidx3, align 4
  ; Unrolled copy 1: element i | 1.
359  %inc = or disjoint i32 %i.010, 1
360  %arrayidx.1 = getelementptr inbounds i16, ptr %A, i32 %inc
361  %tmp6 = load i16, ptr %arrayidx.1, align 2
362  %conv.1 = sext i16 %tmp6 to i32
363  %arrayidx1.1 = getelementptr inbounds i8, ptr %B, i32 %inc
364  %tmp7 = load i8, ptr %arrayidx1.1, align 1
365  %conv2.1 = zext i8 %tmp7 to i32
366  %mul.1 = mul nsw i32 %conv2.1, %conv.1
367  %arrayidx3.1 = getelementptr inbounds i32, ptr %C, i32 %inc
368  store i32 %mul.1, ptr %arrayidx3.1, align 4
  ; Unrolled copy 2: element i | 2.
369  %inc.1 = or disjoint i32 %i.010, 2
370  %arrayidx.2 = getelementptr inbounds i16, ptr %A, i32 %inc.1
371  %tmp8 = load i16, ptr %arrayidx.2, align 2
372  %conv.2 = sext i16 %tmp8 to i32
373  %arrayidx1.2 = getelementptr inbounds i8, ptr %B, i32 %inc.1
374  %tmp9 = load i8, ptr %arrayidx1.2, align 1
375  %conv2.2 = zext i8 %tmp9 to i32
376  %mul.2 = mul nsw i32 %conv2.2, %conv.2
377  %arrayidx3.2 = getelementptr inbounds i32, ptr %C, i32 %inc.1
378  store i32 %mul.2, ptr %arrayidx3.2, align 4
  ; Unrolled copy 3: element i | 3.
379  %inc.2 = or disjoint i32 %i.010, 3
380  %arrayidx.3 = getelementptr inbounds i16, ptr %A, i32 %inc.2
381  %tmp10 = load i16, ptr %arrayidx.3, align 2
382  %conv.3 = sext i16 %tmp10 to i32
383  %arrayidx1.3 = getelementptr inbounds i8, ptr %B, i32 %inc.2
384  %tmp11 = load i8, ptr %arrayidx1.3, align 1
385  %conv2.3 = zext i8 %tmp11 to i32
386  %mul.3 = mul nsw i32 %conv2.3, %conv.3
387  %arrayidx3.3 = getelementptr inbounds i32, ptr %C, i32 %inc.2
388  store i32 %mul.3, ptr %arrayidx3.3, align 4
  ; Advance i by four and count the unrolled iterations down.
389  %inc.3 = add i32 %i.010, 4
390  %niter.nsub.3 = add i32 %niter, -4
391  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
392  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
393}
394
395; CHECK-LABEL: mul_16x16
396; CHECK: @ %for.body
397
398; TODO: pre-indexed loads
399; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
400; CHECK-DEFAULT: str{{.*}}, #16]!
401; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
402
403; CHECK-COMPLEX: ldrsh{{.*}}]!
404; CHECK-COMPLEX: ldrsh{{.*}}]!
405; CHECK-COMPLEX: str{{.*}}]!
406
407; DISABLED-NOT: ldr{{.*}}]!
408; DISABLED-NOT: str{{.*}}]!
409
410; CHECK-T2: @ %for.body.epil
411; CHECK-T2: ldrsh{{.*}}, #2]!
412; CHECK-T2: ldrsh{{.*}}, #2]!
413; CHECK-T2: str{{.*}}, #4]!
414
; mul_16x16: for each i in [0, N) stores sext(A[i]) * sext(B[i]) as an i32
; into C[i], where A and B are both read as i16 elements. The main loop
; (%for.body) is unrolled by four; %for.body.epil handles the N & 3 leftover
; elements one at a time.
415define void @mul_16x16(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N) {
416entry:
  ; Nothing to do when N is zero.
417  %cmp9 = icmp eq i32 %N, 0
418  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
419
420for.body.preheader:                               ; preds = %entry
  ; Skip the unrolled loop entirely when N - 1 < 3 (unsigned), i.e. N < 4 here.
421  %tmp = add i32 %N, -1
422  %xtraiter = and i32 %N, 3
423  %tmp1 = icmp ult i32 %tmp, 3
424  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
425
426for.body.preheader.new:                           ; preds = %for.body.preheader
  ; %unroll_iter = N rounded down to a multiple of four.
427  %unroll_iter = sub i32 %N, %xtraiter
428  br label %for.body
429
430for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  ; %i.010.unr is the index at which the epilogue starts; the epilogue is
  ; skipped when there are no leftover elements.
431  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
432  %lcmp.mod = icmp eq i32 %xtraiter, 0
433  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
434
435for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  ; Epilogue: one element per iteration, %epil.iter counts down from N & 3.
436  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
437  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
438  %arrayidx.epil = getelementptr inbounds i16, ptr %A, i32 %i.010.epil
439  %tmp2 = load i16, ptr %arrayidx.epil, align 2
440  %conv.epil = sext i16 %tmp2 to i32
441  %arrayidx1.epil = getelementptr inbounds i16, ptr %B, i32 %i.010.epil
442  %tmp3 = load i16, ptr %arrayidx1.epil, align 2
443  %conv2.epil = sext i16 %tmp3 to i32
444  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
445  %arrayidx3.epil = getelementptr inbounds i32, ptr %C, i32 %i.010.epil
446  store i32 %mul.epil, ptr %arrayidx3.epil, align 4
447  %inc.epil = add nuw i32 %i.010.epil, 1
448  %epil.iter.sub = add i32 %epil.iter, -1
449  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
450  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
451
452for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
453  ret void
454
455for.body:                                         ; preds = %for.body, %for.body.preheader.new
  ; Main loop, unrolled by four: elements i, i|1, i|2 and i|3 per iteration.
  ; %niter counts down from %unroll_iter in steps of four.
456  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
457  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  ; Unrolled copy 0: element i.
458  %arrayidx = getelementptr inbounds i16, ptr %A, i32 %i.010
459  %tmp4 = load i16, ptr %arrayidx, align 2
460  %conv = sext i16 %tmp4 to i32
461  %arrayidx1 = getelementptr inbounds i16, ptr %B, i32 %i.010
462  %tmp5 = load i16, ptr %arrayidx1, align 2
463  %conv2 = sext i16 %tmp5 to i32
464  %mul = mul nsw i32 %conv2, %conv
465  %arrayidx3 = getelementptr inbounds i32, ptr %C, i32 %i.010
466  store i32 %mul, ptr %arrayidx3, align 4
  ; Unrolled copy 1: element i | 1.
467  %inc = or disjoint i32 %i.010, 1
468  %arrayidx.1 = getelementptr inbounds i16, ptr %A, i32 %inc
469  %tmp6 = load i16, ptr %arrayidx.1, align 2
470  %conv.1 = sext i16 %tmp6 to i32
471  %arrayidx1.1 = getelementptr inbounds i16, ptr %B, i32 %inc
472  %tmp7 = load i16, ptr %arrayidx1.1, align 2
473  %conv2.1 = sext i16 %tmp7 to i32
474  %mul.1 = mul nsw i32 %conv2.1, %conv.1
475  %arrayidx3.1 = getelementptr inbounds i32, ptr %C, i32 %inc
476  store i32 %mul.1, ptr %arrayidx3.1, align 4
  ; Unrolled copy 2: element i | 2.
477  %inc.1 = or disjoint i32 %i.010, 2
478  %arrayidx.2 = getelementptr inbounds i16, ptr %A, i32 %inc.1
479  %tmp8 = load i16, ptr %arrayidx.2, align 2
480  %conv.2 = sext i16 %tmp8 to i32
481  %arrayidx1.2 = getelementptr inbounds i16, ptr %B, i32 %inc.1
482  %tmp9 = load i16, ptr %arrayidx1.2, align 2
483  %conv2.2 = sext i16 %tmp9 to i32
484  %mul.2 = mul nsw i32 %conv2.2, %conv.2
485  %arrayidx3.2 = getelementptr inbounds i32, ptr %C, i32 %inc.1
486  store i32 %mul.2, ptr %arrayidx3.2, align 4
  ; Unrolled copy 3: element i | 3.
487  %inc.2 = or disjoint i32 %i.010, 3
488  %arrayidx.3 = getelementptr inbounds i16, ptr %A, i32 %inc.2
489  %tmp10 = load i16, ptr %arrayidx.3, align 2
490  %conv.3 = sext i16 %tmp10 to i32
491  %arrayidx1.3 = getelementptr inbounds i16, ptr %B, i32 %inc.2
492  %tmp11 = load i16, ptr %arrayidx1.3, align 2
493  %conv2.3 = sext i16 %tmp11 to i32
494  %mul.3 = mul nsw i32 %conv2.3, %conv.3
495  %arrayidx3.3 = getelementptr inbounds i32, ptr %C, i32 %inc.2
496  store i32 %mul.3, ptr %arrayidx3.3, align 4
  ; Advance i by four and count the unrolled iterations down.
497  %inc.3 = add i32 %i.010, 4
498  %niter.nsub.3 = add i32 %niter, -4
499  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
500  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
501}
502
503; CHECK-LABEL: mul_8x8_2d
504; CHECK: @ %for.body4.us
505
506; CHECK-DEFAULT: ldr{{.*}}, #16]!
507; CHECK-DEFAULT: ldrb{{.*}}, #4]!
508
509; DISABLED-NOT: ldr{{.*}}]!
510; DISABLED-NOT: str{{.*}}]!
511
512; CHECK-T2: @ %for.body4.us.epil
513; CHECK-T2: ldrb{{.*}}, #1]!
514; CHECK-T2: ldr{{.*}}, #4]!
515
; mul_8x8_2d: two nested loops, i in [0, N) and j in [0, M). %B and %C are
; indexed as arrays of row pointers. For each (i, j) the i32 at index j of the
; row loaded from C[i] is incremented by zext(A[i]) * zext(row_B[j]), where A
; and the B row are read as i8 elements. A[i] is reloaded for every element.
; The inner loop (%for.body4.us) is unrolled by four; %for.body4.us.epil
; handles the M & 3 leftover elements one at a time.
516define void @mul_8x8_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture readonly %C, i32 %N, i32 %M) {
517entry:
  ; Nothing to do when either N or M is zero.
518  %cmp24 = icmp eq i32 %N, 0
519  %cmp222 = icmp eq i32 %M, 0
520  %or.cond = or i1 %cmp24, %cmp222
521  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
522
523for.cond1.preheader.us.preheader:                 ; preds = %entry
  ; Values computed once, outside both loops: %tmp1 is true when M - 1 < 3
  ; (unsigned), %unroll_iter is M rounded down to a multiple of four, and
  ; %lcmp.mod is true when there are no leftover elements.
524  %tmp = add i32 %M, -1
525  %xtraiter = and i32 %M, 3
526  %tmp1 = icmp ult i32 %tmp, 3
527  %unroll_iter = sub i32 %M, %xtraiter
528  %lcmp.mod = icmp eq i32 %xtraiter, 0
529  br label %for.cond1.preheader.us
530
531for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  ; Outer loop header: form the address of A[i] and load the row pointers
  ; B[i] and C[i]. The unrolled inner loop is skipped when %tmp1 is true.
532  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
533  %arrayidx.us = getelementptr inbounds i8, ptr %A, i32 %i.025.us
534  %arrayidx5.us = getelementptr inbounds ptr, ptr %B, i32 %i.025.us
535  %arrayidx8.us = getelementptr inbounds ptr, ptr %C, i32 %i.025.us
536  %.pre = load ptr, ptr %arrayidx5.us, align 4
537  %.pre30 = load ptr, ptr %arrayidx8.us, align 4
538  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
539
540for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
  ; Inner loop, unrolled by four: elements j, j|1, j|2 and j|3 per iteration.
  ; %niter counts down from %unroll_iter in steps of four.
541  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
542  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  ; Unrolled copy 0: element j.
543  %tmp2 = load i8, ptr %arrayidx.us, align 1
544  %conv.us = zext i8 %tmp2 to i32
545  %arrayidx6.us = getelementptr inbounds i8, ptr %.pre, i32 %j.023.us
546  %tmp3 = load i8, ptr %arrayidx6.us, align 1
547  %conv7.us = zext i8 %tmp3 to i32
548  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
549  %arrayidx9.us = getelementptr inbounds i32, ptr %.pre30, i32 %j.023.us
550  %tmp4 = load i32, ptr %arrayidx9.us, align 4
551  %add.us = add nsw i32 %tmp4, %mul.us
552  store i32 %add.us, ptr %arrayidx9.us, align 4
  ; Unrolled copy 1: element j | 1.
553  %inc.us = or disjoint i32 %j.023.us, 1
554  %tmp5 = load i8, ptr %arrayidx.us, align 1
555  %conv.us.1 = zext i8 %tmp5 to i32
556  %arrayidx6.us.1 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us
557  %tmp6 = load i8, ptr %arrayidx6.us.1, align 1
558  %conv7.us.1 = zext i8 %tmp6 to i32
559  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
560  %arrayidx9.us.1 = getelementptr inbounds i32, ptr %.pre30, i32 %inc.us
561  %tmp7 = load i32, ptr %arrayidx9.us.1, align 4
562  %add.us.1 = add nsw i32 %tmp7, %mul.us.1
563  store i32 %add.us.1, ptr %arrayidx9.us.1, align 4
  ; Unrolled copy 2: element j | 2.
564  %inc.us.1 = or disjoint i32 %j.023.us, 2
565  %tmp8 = load i8, ptr %arrayidx.us, align 1
566  %conv.us.2 = zext i8 %tmp8 to i32
567  %arrayidx6.us.2 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us.1
568  %tmp9 = load i8, ptr %arrayidx6.us.2, align 1
569  %conv7.us.2 = zext i8 %tmp9 to i32
570  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
571  %arrayidx9.us.2 = getelementptr inbounds i32, ptr %.pre30, i32 %inc.us.1
572  %tmp10 = load i32, ptr %arrayidx9.us.2, align 4
573  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
574  store i32 %add.us.2, ptr %arrayidx9.us.2, align 4
  ; Unrolled copy 3: element j | 3.
575  %inc.us.2 = or disjoint i32 %j.023.us, 3
576  %tmp11 = load i8, ptr %arrayidx.us, align 1
577  %conv.us.3 = zext i8 %tmp11 to i32
578  %arrayidx6.us.3 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us.2
579  %tmp12 = load i8, ptr %arrayidx6.us.3, align 1
580  %conv7.us.3 = zext i8 %tmp12 to i32
581  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
582  %arrayidx9.us.3 = getelementptr inbounds i32, ptr %.pre30, i32 %inc.us.2
583  %tmp13 = load i32, ptr %arrayidx9.us.3, align 4
584  %add.us.3 = add nsw i32 %tmp13, %mul.us.3
585  store i32 %add.us.3, ptr %arrayidx9.us.3, align 4
  ; Advance j by four and count the unrolled iterations down.
586  %inc.us.3 = add i32 %j.023.us, 4
587  %niter.nsub.3 = add i32 %niter, -4
588  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
589  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
590
591for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  ; %j.023.us.unr is the index at which the epilogue starts; the epilogue is
  ; skipped when there are no leftover elements.
592  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
593  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
594
595for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  ; Epilogue: one element per iteration, %epil.iter counts down from M & 3.
596  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
597  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
598  %tmp14 = load i8, ptr %arrayidx.us, align 1
599  %conv.us.epil = zext i8 %tmp14 to i32
600  %arrayidx6.us.epil = getelementptr inbounds i8, ptr %.pre, i32 %j.023.us.epil
601  %tmp15 = load i8, ptr %arrayidx6.us.epil, align 1
602  %conv7.us.epil = zext i8 %tmp15 to i32
603  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
604  %arrayidx9.us.epil = getelementptr inbounds i32, ptr %.pre30, i32 %j.023.us.epil
605  %tmp16 = load i32, ptr %arrayidx9.us.epil, align 4
606  %add.us.epil = add nsw i32 %tmp16, %mul.us.epil
607  store i32 %add.us.epil, ptr %arrayidx9.us.epil, align 4
608  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
609  %epil.iter.sub = add i32 %epil.iter, -1
610  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
611  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
612
613for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  ; Outer loop latch: continue until i reaches N.
614  %inc11.us = add nuw i32 %i.025.us, 1
615  %exitcond28 = icmp eq i32 %inc11.us, %N
616  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
617
618for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
619  ret void
620}
621
622; CHECK-LABEL: mul_16x16_2d
623; CHECK: @ %for.body4.us
624
625; CHECK-DEFAULT: ldr{{.*}}, #16]!
626; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
627
628; DISABLED-NOT: ldr{{.*}}]!
629; DISABLED-NOT: str{{.*}}]!
630
631; CHECK-T2: @ %for.body4.us.epil
632; CHECK-T2: ldrsh{{.*}}, #2]!
633; CHECK-T2: ldr{{.*}}, #4]!
634
; mul_16x16_2d: for every i in [0, N) and every j in [0, M) performs
;   C[i][j] += sext(B[i][j]) * sext(A[i])
; A is indexed as an array of i16, B as an array of pointers to i16 data and
; C as an array of pointers to i32 data. The inner j loop is present in
; already-unrolled form: a four-element body (%for.body4.us) followed by a
; one-element remainder loop (%for.body4.us.epil).
635define void @mul_16x16_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture readonly %C, i32 %N, i32 %M) {
; Return immediately when either N or M is zero.
636entry:
637  %cmp24 = icmp eq i32 %N, 0
638  %cmp222 = icmp eq i32 %M, 0
639  %or.cond = or i1 %cmp24, %cmp222
640  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
641
; Loop-invariant trip-count values:
;   %xtraiter    = M & 3, the trip count of the remainder loop
;   %unroll_iter = M - %xtraiter, the element count for the unrolled loop
;   %tmp1        = (M - 1) <u 3, true when the unrolled loop must be bypassed
;   %lcmp.mod    = %xtraiter == 0, true when the remainder loop is not needed
642for.cond1.preheader.us.preheader:                 ; preds = %entry
643  %tmp = add i32 %M, -1
644  %xtraiter = and i32 %M, 3
645  %tmp1 = icmp ult i32 %tmp, 3
646  %unroll_iter = sub i32 %M, %xtraiter
647  %lcmp.mod = icmp eq i32 %xtraiter, 0
648  br label %for.cond1.preheader.us
649
; Outer loop header: loads the per-row values once, namely the sign-extended
; A[i] and the two row pointers B[i] (%tmp3) and C[i] (%tmp4).
650for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
651  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
652  %arrayidx.us = getelementptr inbounds i16, ptr %A, i32 %i.025.us
653  %tmp2 = load i16, ptr %arrayidx.us, align 2
654  %conv.us = sext i16 %tmp2 to i32
655  %arrayidx5.us = getelementptr inbounds ptr, ptr %B, i32 %i.025.us
656  %tmp3 = load ptr, ptr %arrayidx5.us, align 4
657  %arrayidx8.us = getelementptr inbounds ptr, ptr %C, i32 %i.025.us
658  %tmp4 = load ptr, ptr %arrayidx8.us, align 4
659  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
660
; Unrolled inner loop: handles indices j, j|1, j|2 and j|3 per iteration. Each
; step loads an i16 from the B row, multiplies it by %conv.us and adds the
; product into the matching i32 element of the C row. %niter counts down by
; four from %unroll_iter and the loop exits when it reaches zero.
661for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
662  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
663  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
664  %arrayidx6.us = getelementptr inbounds i16, ptr %tmp3, i32 %j.023.us
665  %tmp5 = load i16, ptr %arrayidx6.us, align 2
666  %conv7.us = sext i16 %tmp5 to i32
667  %mul.us = mul nsw i32 %conv7.us, %conv.us
668  %arrayidx9.us = getelementptr inbounds i32, ptr %tmp4, i32 %j.023.us
669  %tmp6 = load i32, ptr %arrayidx9.us, align 4
670  %add.us = add nsw i32 %tmp6, %mul.us
671  store i32 %add.us, ptr %arrayidx9.us, align 4
672  %inc.us = or disjoint i32 %j.023.us, 1
673  %arrayidx6.us.1 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us
674  %tmp7 = load i16, ptr %arrayidx6.us.1, align 2
675  %conv7.us.1 = sext i16 %tmp7 to i32
676  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
677  %arrayidx9.us.1 = getelementptr inbounds i32, ptr %tmp4, i32 %inc.us
678  %tmp8 = load i32, ptr %arrayidx9.us.1, align 4
679  %add.us.1 = add nsw i32 %tmp8, %mul.us.1
680  store i32 %add.us.1, ptr %arrayidx9.us.1, align 4
681  %inc.us.1 = or disjoint i32 %j.023.us, 2
682  %arrayidx6.us.2 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us.1
683  %tmp9 = load i16, ptr %arrayidx6.us.2, align 2
684  %conv7.us.2 = sext i16 %tmp9 to i32
685  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
686  %arrayidx9.us.2 = getelementptr inbounds i32, ptr %tmp4, i32 %inc.us.1
687  %tmp10 = load i32, ptr %arrayidx9.us.2, align 4
688  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
689  store i32 %add.us.2, ptr %arrayidx9.us.2, align 4
690  %inc.us.2 = or disjoint i32 %j.023.us, 3
691  %arrayidx6.us.3 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us.2
692  %tmp11 = load i16, ptr %arrayidx6.us.3, align 2
693  %conv7.us.3 = sext i16 %tmp11 to i32
694  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
695  %arrayidx9.us.3 = getelementptr inbounds i32, ptr %tmp4, i32 %inc.us.2
696  %tmp12 = load i32, ptr %arrayidx9.us.3, align 4
697  %add.us.3 = add nsw i32 %tmp12, %mul.us.3
698  store i32 %add.us.3, ptr %arrayidx9.us.3, align 4
699  %inc.us.3 = add i32 %j.023.us, 4
700  %niter.nsub.3 = add i32 %niter, -4
701  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
702  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
703
; Join point after (or instead of) the unrolled loop: %j.023.us.unr is the
; first index that still has to be processed by the remainder loop.
704for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
705  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
706  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
707
; Remainder loop: one element per iteration, %xtraiter iterations in total.
708for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
709  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
710  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
711  %arrayidx6.us.epil = getelementptr inbounds i16, ptr %tmp3, i32 %j.023.us.epil
712  %tmp13 = load i16, ptr %arrayidx6.us.epil, align 2
713  %conv7.us.epil = sext i16 %tmp13 to i32
714  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
715  %arrayidx9.us.epil = getelementptr inbounds i32, ptr %tmp4, i32 %j.023.us.epil
716  %tmp14 = load i32, ptr %arrayidx9.us.epil, align 4
717  %add.us.epil = add nsw i32 %tmp14, %mul.us.epil
718  store i32 %add.us.epil, ptr %arrayidx9.us.epil, align 4
719  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
720  %epil.iter.sub = add i32 %epil.iter, -1
721  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
722  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
723
; Outer loop latch: advance i and leave once it equals N.
724for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
725  %inc11.us = add nuw i32 %i.025.us, 1
726  %exitcond28 = icmp eq i32 %inc11.us, %N
727  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
728
729for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
730  ret void
731}
732
733; CHECK-LABEL: mac_8x8_2d
734; CHECK: @ %for.body4.us
735
736; TODO: Both input arrays could use pre-indexed loads.
737; TODO: pre-indexed stores.
738; CHECK-DEFAULT: ldrb{{.*}}, #4]!
739; CHECK-DEFAULT-NOT: ldr{{.*}}]!
740; CHECK-DEFAULT-NOT: str{{.*}}]!
741
742; TODO: Increased complexity shouldn't prevent indexed accesses.
743; CHECK-COMPLEX-NOT: ldr{{.*}}]!
744; CHECK-COMPLEX-NOT: str{{.*}}]!
745
746; DISABLED-NOT: ldr{{.*}}]!
747; DISABLED-NOT: str{{.*}}]!
748
749; CHECK-T2: @ %for.body4.us.epil
750; CHECK-T2: ldrb{{.*}}, #1]!
751
; mac_8x8_2d: for every i in [0, N) accumulates into C[i] the products
;   zext(B[i][j]) * zext(A[i])   for j in [0, M)
; A is indexed as an array of i8, B as an array of pointers to i8 data and C
; as an array of i32. A[i] is reloaded from memory for every product and the
; running sum is stored back to C[i] after every product. The inner loop is
; present as a four-element unrolled body plus a one-element remainder loop.
752define void @mac_8x8_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N, i32 %M) {
; Return immediately when either N or M is zero.
753entry:
754  %cmp22 = icmp eq i32 %N, 0
755  %cmp220 = icmp eq i32 %M, 0
756  %or.cond = or i1 %cmp22, %cmp220
757  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
758
; Loop-invariant trip-count values:
;   %xtraiter    = M & 3, the trip count of the remainder loop
;   %unroll_iter = M - %xtraiter, the element count for the unrolled loop
;   %tmp1        = (M - 1) <u 3, true when the unrolled loop must be bypassed
;   %lcmp.mod    = %xtraiter == 0, true when the remainder loop is not needed
759for.cond1.preheader.us.preheader:                 ; preds = %entry
760  %tmp = add i32 %M, -1
761  %xtraiter = and i32 %M, 3
762  %tmp1 = icmp ult i32 %tmp, 3
763  %unroll_iter = sub i32 %M, %xtraiter
764  %lcmp.mod = icmp eq i32 %xtraiter, 0
765  br label %for.cond1.preheader.us
766
; Outer loop header: forms the addresses of A[i], B[i] and C[i], then loads
; the row pointer B[i] (%.pre) and the initial accumulator value C[i] (%.pre28).
767for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
768  %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
769  %arrayidx.us = getelementptr inbounds i8, ptr %A, i32 %i.023.us
770  %arrayidx5.us = getelementptr inbounds ptr, ptr %B, i32 %i.023.us
771  %arrayidx8.us = getelementptr inbounds i32, ptr %C, i32 %i.023.us
772  %.pre = load ptr, ptr %arrayidx5.us, align 4
773  %.pre28 = load i32, ptr %arrayidx8.us, align 4
774  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
775
; Unrolled inner loop: four multiply-accumulate steps per iteration at indices
; j, j|1, j|2 and j|3. %tmp2 carries the running sum between iterations; each
; step reloads A[i], loads one byte of the B row and stores the updated sum to
; C[i].
776for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
777  %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ]
778  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
779  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
780  %tmp3 = load i8, ptr %arrayidx.us, align 1
781  %conv.us = zext i8 %tmp3 to i32
782  %arrayidx6.us = getelementptr inbounds i8, ptr %.pre, i32 %j.021.us
783  %tmp4 = load i8, ptr %arrayidx6.us, align 1
784  %conv7.us = zext i8 %tmp4 to i32
785  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
786  %add.us = add nsw i32 %mul.us, %tmp2
787  store i32 %add.us, ptr %arrayidx8.us, align 4
788  %inc.us = or disjoint i32 %j.021.us, 1
789  %tmp5 = load i8, ptr %arrayidx.us, align 1
790  %conv.us.1 = zext i8 %tmp5 to i32
791  %arrayidx6.us.1 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us
792  %tmp6 = load i8, ptr %arrayidx6.us.1, align 1
793  %conv7.us.1 = zext i8 %tmp6 to i32
794  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
795  %add.us.1 = add nsw i32 %mul.us.1, %add.us
796  store i32 %add.us.1, ptr %arrayidx8.us, align 4
797  %inc.us.1 = or disjoint i32 %j.021.us, 2
798  %tmp7 = load i8, ptr %arrayidx.us, align 1
799  %conv.us.2 = zext i8 %tmp7 to i32
800  %arrayidx6.us.2 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us.1
801  %tmp8 = load i8, ptr %arrayidx6.us.2, align 1
802  %conv7.us.2 = zext i8 %tmp8 to i32
803  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
804  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
805  store i32 %add.us.2, ptr %arrayidx8.us, align 4
806  %inc.us.2 = or disjoint i32 %j.021.us, 3
807  %tmp9 = load i8, ptr %arrayidx.us, align 1
808  %conv.us.3 = zext i8 %tmp9 to i32
809  %arrayidx6.us.3 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us.2
810  %tmp10 = load i8, ptr %arrayidx6.us.3, align 1
811  %conv7.us.3 = zext i8 %tmp10 to i32
812  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
813  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
814  store i32 %add.us.3, ptr %arrayidx8.us, align 4
815  %inc.us.3 = add i32 %j.021.us, 4
816  %niter.nsub.3 = add i32 %niter, -4
817  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
818  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
819
; Join point after (or instead of) the unrolled loop: %.unr is the running sum
; so far and %j.021.us.unr the first index left for the remainder loop.
820for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
821  %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
822  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
823  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
824
; Remainder loop: one multiply-accumulate step per iteration, %xtraiter
; iterations in total.
825for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
826  %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
827  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
828  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
829  %tmp12 = load i8, ptr %arrayidx.us, align 1
830  %conv.us.epil = zext i8 %tmp12 to i32
831  %arrayidx6.us.epil = getelementptr inbounds i8, ptr %.pre, i32 %j.021.us.epil
832  %tmp13 = load i8, ptr %arrayidx6.us.epil, align 1
833  %conv7.us.epil = zext i8 %tmp13 to i32
834  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
835  %add.us.epil = add nsw i32 %mul.us.epil, %tmp11
836  store i32 %add.us.epil, ptr %arrayidx8.us, align 4
837  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
838  %epil.iter.sub = add i32 %epil.iter, -1
839  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
840  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
841
; Outer loop latch: advance i and leave once it equals N.
842for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
843  %inc10.us = add nuw i32 %i.023.us, 1
844  %exitcond26 = icmp eq i32 %inc10.us, %N
845  br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us
846
847for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
848  ret void
849}
850
851; CHECK-LABEL: mac_16x16_2d
852; CHECK: @ %for.body4.us
853
854; TODO: pre-indexed loads for both input arrays.
855; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
856; CHECK-DEFAULT-NOT: ldr{{.*}}]!
857
858; TODO: increased complexity should lead to better codegen.
859; CHECK-COMPLEX-NOT: ldr{{.*}}]!
860
861; DISABLED-NOT: ldr{{.*}}]!
862
863; CHECK-T2: @ %for.body4.us.epil
864; CHECK-T2: ldrsh{{.*}}, #2]!
865
; mac_16x16_2d: for every i in [0, N) accumulates into C[i] the products
;   sext(B[i][j]) * sext(A[i])   for j in [0, M)
; A is indexed as an array of i16, B as an array of pointers to i16 data and
; C as an array of i32. Here A[i] and C[i] are loaded once per outer
; iteration, the sum is carried in SSA values through the inner loops, and
; C[i] is written once in the outer loop latch.
866define void @mac_16x16_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N, i32 %M) {
; Return immediately when either N or M is zero.
867entry:
868  %cmp23 = icmp eq i32 %N, 0
869  %cmp220 = icmp eq i32 %M, 0
870  %or.cond = or i1 %cmp23, %cmp220
871  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
872
; Loop-invariant trip-count values:
;   %xtraiter    = M & 3, the trip count of the remainder loop
;   %unroll_iter = M - %xtraiter, the element count for the unrolled loop
;   %tmp1        = (M - 1) <u 3, true when the unrolled loop must be bypassed
;   %lcmp.mod    = %xtraiter == 0, true when the remainder loop is not needed
873for.cond1.preheader.us.preheader:                 ; preds = %entry
874  %tmp = add i32 %M, -1
875  %xtraiter = and i32 %M, 3
876  %tmp1 = icmp ult i32 %tmp, 3
877  %unroll_iter = sub i32 %M, %xtraiter
878  %lcmp.mod = icmp eq i32 %xtraiter, 0
879  br label %for.cond1.preheader.us
880
; Outer loop header: loads the sign-extended A[i], the row pointer B[i]
; (%tmp3) and the initial accumulator value C[i] (%arrayidx8.promoted.us).
881for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
882  %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
883  %arrayidx.us = getelementptr inbounds i16, ptr %A, i32 %i.024.us
884  %tmp2 = load i16, ptr %arrayidx.us, align 2
885  %conv.us = sext i16 %tmp2 to i32
886  %arrayidx5.us = getelementptr inbounds ptr, ptr %B, i32 %i.024.us
887  %tmp3 = load ptr, ptr %arrayidx5.us, align 4
888  %arrayidx8.us = getelementptr inbounds i32, ptr %C, i32 %i.024.us
889  %arrayidx8.promoted.us = load i32, ptr %arrayidx8.us, align 4
890  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
891
; Unrolled inner loop: four multiply-accumulate steps per iteration at indices
; j, j|1, j|2 and j|3. The only memory accesses are the i16 loads from the B
; row; the sum is carried in %add22.us.
892for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
893  %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ]
894  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
895  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
896  %arrayidx6.us = getelementptr inbounds i16, ptr %tmp3, i32 %j.021.us
897  %tmp4 = load i16, ptr %arrayidx6.us, align 2
898  %conv7.us = sext i16 %tmp4 to i32
899  %mul.us = mul nsw i32 %conv7.us, %conv.us
900  %add.us = add nsw i32 %mul.us, %add22.us
901  %inc.us = or disjoint i32 %j.021.us, 1
902  %arrayidx6.us.1 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us
903  %tmp5 = load i16, ptr %arrayidx6.us.1, align 2
904  %conv7.us.1 = sext i16 %tmp5 to i32
905  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
906  %add.us.1 = add nsw i32 %mul.us.1, %add.us
907  %inc.us.1 = or disjoint i32 %j.021.us, 2
908  %arrayidx6.us.2 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us.1
909  %tmp6 = load i16, ptr %arrayidx6.us.2, align 2
910  %conv7.us.2 = sext i16 %tmp6 to i32
911  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
912  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
913  %inc.us.2 = or disjoint i32 %j.021.us, 3
914  %arrayidx6.us.3 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us.2
915  %tmp7 = load i16, ptr %arrayidx6.us.3, align 2
916  %conv7.us.3 = sext i16 %tmp7 to i32
917  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
918  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
919  %inc.us.3 = add i32 %j.021.us, 4
920  %niter.nsub.3 = add i32 %niter, -4
921  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
922  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
923
; Join point after (or instead of) the unrolled loop. %add.us.lcssa.ph is the
; value stored when the remainder loop is skipped; its undef input belongs to
; the edge that bypasses the unrolled loop. That edge requires (M - 1) <u 3,
; and since M is nonzero here, %xtraiter is then nonzero as well, so the
; remainder loop runs and supplies the stored value on that path.
924for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
925  %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
926  %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
927  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
928  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
929
; Remainder loop: one multiply-accumulate step per iteration, %xtraiter
; iterations in total, continuing from the sum in %add22.us.unr.
930for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
931  %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
932  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
933  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
934  %arrayidx6.us.epil = getelementptr inbounds i16, ptr %tmp3, i32 %j.021.us.epil
935  %tmp8 = load i16, ptr %arrayidx6.us.epil, align 2
936  %conv7.us.epil = sext i16 %tmp8 to i32
937  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
938  %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil
939  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
940  %epil.iter.sub = add i32 %epil.iter, -1
941  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
942  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
943
; Outer loop latch: write the final sum to C[i], advance i and leave once it
; equals N.
944for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
945  %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ]
946  store i32 %add.us.lcssa, ptr %arrayidx8.us, align 4
947  %inc10.us = add nuw i32 %i.024.us, 1
948  %exitcond27 = icmp eq i32 %inc10.us, %N
949  br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us
950
951for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
952  ret void
953}
954
955; CHECK-LABEL: mul32x32_backwards
956; CHECK: @ %for.body
957
958; TODO: post increments for decreasing addresses
959; CHECK-DEFAULT-NOT: ldr{{.*}}]!
960; CHECK-DEFAULT-NOT: str{{.*}}]!
961
962; CHECK-COMPLEX-NOT: ldr{{.*}}]!
963; CHECK-COMPLEX-NOT: str{{.*}}]!
964
; mul32x32_backwards: computes a[i] = c[i] * b[i] on i32 elements, walking i
; downwards starting from N - 1. The N & 3 leftover iterations are executed
; first by a one-element prologue loop (%for.body.prol); the rest is handled
; by a loop body that processes four decreasing indices per iteration
; (%for.body).
965define void @mul32x32_backwards(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %N) {
; The loops are entered only when N - 1 is greater than -1 as a signed value.
966entry:
967  %i.08 = add i32 %N, -1
968  %cmp9 = icmp sgt i32 %i.08, -1
969  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
970
; %xtraiter = N & 3 is the prologue trip count; the prologue is skipped when
; it is zero.
971for.body.preheader:                               ; preds = %entry
972  %xtraiter = and i32 %N, 3
973  %lcmp.mod = icmp eq i32 %xtraiter, 0
974  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
975
; Prologue loop: one element per iteration, index decreasing by one,
; %xtraiter iterations in total.
976for.body.prol:                                    ; preds = %for.body.prol, %for.body.preheader
977  %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ]
978  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ]
979  %arrayidx.prol = getelementptr inbounds i32, ptr %b, i32 %i.010.prol
980  %tmp = load i32, ptr %arrayidx.prol, align 4
981  %arrayidx1.prol = getelementptr inbounds i32, ptr %c, i32 %i.010.prol
982  %tmp1 = load i32, ptr %arrayidx1.prol, align 4
983  %mul.prol = mul nsw i32 %tmp1, %tmp
984  %arrayidx2.prol = getelementptr inbounds i32, ptr %a, i32 %i.010.prol
985  store i32 %mul.prol, ptr %arrayidx2.prol, align 4
986  %i.0.prol = add i32 %i.010.prol, -1
987  %prol.iter.sub = add i32 %prol.iter, -1
988  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
989  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
990
; After the prologue: %i.010.unr is the index the main loop starts from. The
; main loop is skipped when (N - 1) <u 3.
991for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader
992  %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ]
993  %tmp2 = icmp ult i32 %i.08, 3
994  br i1 %tmp2, label %for.cond.cleanup, label %for.body
995
996for.cond.cleanup:                                 ; preds = %for.body, %for.body.prol.loopexit, %entry
997  ret void
998
; Main loop: handles indices i, i - 1, i - 2 and i - 3 per iteration, then
; steps i down by four and repeats while the new index is greater than -1
; (signed).
999for.body:                                         ; preds = %for.body, %for.body.prol.loopexit
1000  %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ]
1001  %arrayidx = getelementptr inbounds i32, ptr %b, i32 %i.010
1002  %tmp3 = load i32, ptr %arrayidx, align 4
1003  %arrayidx1 = getelementptr inbounds i32, ptr %c, i32 %i.010
1004  %tmp4 = load i32, ptr %arrayidx1, align 4
1005  %mul = mul nsw i32 %tmp4, %tmp3
1006  %arrayidx2 = getelementptr inbounds i32, ptr %a, i32 %i.010
1007  store i32 %mul, ptr %arrayidx2, align 4
1008  %i.0 = add i32 %i.010, -1
1009  %arrayidx.1 = getelementptr inbounds i32, ptr %b, i32 %i.0
1010  %tmp5 = load i32, ptr %arrayidx.1, align 4
1011  %arrayidx1.1 = getelementptr inbounds i32, ptr %c, i32 %i.0
1012  %tmp6 = load i32, ptr %arrayidx1.1, align 4
1013  %mul.1 = mul nsw i32 %tmp6, %tmp5
1014  %arrayidx2.1 = getelementptr inbounds i32, ptr %a, i32 %i.0
1015  store i32 %mul.1, ptr %arrayidx2.1, align 4
1016  %i.0.1 = add i32 %i.010, -2
1017  %arrayidx.2 = getelementptr inbounds i32, ptr %b, i32 %i.0.1
1018  %tmp7 = load i32, ptr %arrayidx.2, align 4
1019  %arrayidx1.2 = getelementptr inbounds i32, ptr %c, i32 %i.0.1
1020  %tmp8 = load i32, ptr %arrayidx1.2, align 4
1021  %mul.2 = mul nsw i32 %tmp8, %tmp7
1022  %arrayidx2.2 = getelementptr inbounds i32, ptr %a, i32 %i.0.1
1023  store i32 %mul.2, ptr %arrayidx2.2, align 4
1024  %i.0.2 = add i32 %i.010, -3
1025  %arrayidx.3 = getelementptr inbounds i32, ptr %b, i32 %i.0.2
1026  %tmp9 = load i32, ptr %arrayidx.3, align 4
1027  %arrayidx1.3 = getelementptr inbounds i32, ptr %c, i32 %i.0.2
1028  %tmp10 = load i32, ptr %arrayidx1.3, align 4
1029  %mul.3 = mul nsw i32 %tmp10, %tmp9
1030  %arrayidx2.3 = getelementptr inbounds i32, ptr %a, i32 %i.0.2
1031  store i32 %mul.3, ptr %arrayidx2.3, align 4
1032  %i.0.3 = add i32 %i.010, -4
1033  %cmp.3 = icmp sgt i32 %i.0.3, -1
1034  br i1 %cmp.3, label %for.body, label %for.cond.cleanup
1035}
1036
1037; CHECK-LABEL: mul32x32_forwards
1038; CHECK: @ %for.body
1039
1040; TODO: It would be good if the complexity limit didn't have to be increased to
1041; enable the pre-indexed accesses.
1042
1043; CHECK-DEFAULT-NOT: ldr{{.*}}]!
1044; CHECK-DEFAULT-NOT: str{{.*}}]!
1045
1046; CHECK-COMPLEX: ldr{{.*}}, #16]!
1047; CHECK-COMPLEX: ldr{{.*}}, #16]!
1048; CHECK-COMPLEX: str{{.*}}, #16]!
1049
1050; CHECK-T2: @ %for.body.epil
1051; CHECK-T2: ldr{{.*}}, #4]!
1052; CHECK-T2: ldr{{.*}}, #4]!
1053; CHECK-T2: str{{.*}}, #4]!
1054
; mul32x32_forwards: computes a[i] = c[i] * b[i] on i32 elements for i in
; [0, N), walking i upwards. The loop is present in already-unrolled form: a
; four-element body (%for.body) followed by a one-element remainder loop
; (%for.body.epil).
1055define void @mul32x32_forwards(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %N) {
; Return immediately when N is zero.
1056entry:
1057  %cmp8 = icmp eq i32 %N, 0
1058  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1059
; %xtraiter = N & 3 is the remainder-loop trip count. The unrolled loop is
; bypassed when (N - 1) <u 3.
1060for.body.preheader:                               ; preds = %entry
1061  %tmp = add i32 %N, -1
1062  %xtraiter = and i32 %N, 3
1063  %tmp1 = icmp ult i32 %tmp, 3
1064  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1065
; %unroll_iter = N - %xtraiter is the element count for the unrolled loop.
1066for.body.preheader.new:                           ; preds = %for.body.preheader
1067  %unroll_iter = sub i32 %N, %xtraiter
1068  br label %for.body
1069
; Join point after (or instead of) the unrolled loop: %i.09.unr is the first
; index left for the remainder loop, which is skipped when %xtraiter is zero.
1070for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
1071  %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1072  %lcmp.mod = icmp eq i32 %xtraiter, 0
1073  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1074
; Remainder loop: one element per iteration, %xtraiter iterations in total.
1075for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
1076  %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1077  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1078  %arrayidx.epil = getelementptr inbounds i32, ptr %b, i32 %i.09.epil
1079  %tmp2 = load i32, ptr %arrayidx.epil, align 4
1080  %arrayidx1.epil = getelementptr inbounds i32, ptr %c, i32 %i.09.epil
1081  %tmp3 = load i32, ptr %arrayidx1.epil, align 4
1082  %mul.epil = mul nsw i32 %tmp3, %tmp2
1083  %arrayidx2.epil = getelementptr inbounds i32, ptr %a, i32 %i.09.epil
1084  store i32 %mul.epil, ptr %arrayidx2.epil, align 4
1085  %inc.epil = add nuw nsw i32 %i.09.epil, 1
1086  %epil.iter.sub = add i32 %epil.iter, -1
1087  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1088  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1089
1090for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
1091  ret void
1092
; Unrolled loop: handles indices i, i|1, i|2 and i|3 per iteration, then steps
; i up by four. %niter counts down by four from %unroll_iter and the loop
; exits when it reaches zero.
1093for.body:                                         ; preds = %for.body, %for.body.preheader.new
1094  %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1095  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1096  %arrayidx = getelementptr inbounds i32, ptr %b, i32 %i.09
1097  %tmp4 = load i32, ptr %arrayidx, align 4
1098  %arrayidx1 = getelementptr inbounds i32, ptr %c, i32 %i.09
1099  %tmp5 = load i32, ptr %arrayidx1, align 4
1100  %mul = mul nsw i32 %tmp5, %tmp4
1101  %arrayidx2 = getelementptr inbounds i32, ptr %a, i32 %i.09
1102  store i32 %mul, ptr %arrayidx2, align 4
1103  %inc = or disjoint i32 %i.09, 1
1104  %arrayidx.1 = getelementptr inbounds i32, ptr %b, i32 %inc
1105  %tmp6 = load i32, ptr %arrayidx.1, align 4
1106  %arrayidx1.1 = getelementptr inbounds i32, ptr %c, i32 %inc
1107  %tmp7 = load i32, ptr %arrayidx1.1, align 4
1108  %mul.1 = mul nsw i32 %tmp7, %tmp6
1109  %arrayidx2.1 = getelementptr inbounds i32, ptr %a, i32 %inc
1110  store i32 %mul.1, ptr %arrayidx2.1, align 4
1111  %inc.1 = or disjoint i32 %i.09, 2
1112  %arrayidx.2 = getelementptr inbounds i32, ptr %b, i32 %inc.1
1113  %tmp8 = load i32, ptr %arrayidx.2, align 4
1114  %arrayidx1.2 = getelementptr inbounds i32, ptr %c, i32 %inc.1
1115  %tmp9 = load i32, ptr %arrayidx1.2, align 4
1116  %mul.2 = mul nsw i32 %tmp9, %tmp8
1117  %arrayidx2.2 = getelementptr inbounds i32, ptr %a, i32 %inc.1
1118  store i32 %mul.2, ptr %arrayidx2.2, align 4
1119  %inc.2 = or disjoint i32 %i.09, 3
1120  %arrayidx.3 = getelementptr inbounds i32, ptr %b, i32 %inc.2
1121  %tmp10 = load i32, ptr %arrayidx.3, align 4
1122  %arrayidx1.3 = getelementptr inbounds i32, ptr %c, i32 %inc.2
1123  %tmp11 = load i32, ptr %arrayidx1.3, align 4
1124  %mul.3 = mul nsw i32 %tmp11, %tmp10
1125  %arrayidx2.3 = getelementptr inbounds i32, ptr %a, i32 %inc.2
1126  store i32 %mul.3, ptr %arrayidx2.3, align 4
1127  %inc.3 = add nuw nsw i32 %i.09, 4
1128  %niter.nsub.3 = add i32 %niter, -4
1129  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1130  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1131}
1132