xref: /llvm-project/llvm/test/CodeGen/ARM/loop-indexing.ll (revision 3fbacd4964edb44bce797de8fe248512a835524c)
1; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
2; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
3; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
4; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
5; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
6; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2
7
8; Tests to check that post increment addressing modes are used instead of
9; updating base pointers with add instructions.
10
11; TODO: I think we should be able to use post inc addressing with VLDM
12; instructions.
13; CHECK-LABEL: test_fma
14; CHECK: @ %loop
15
16; CHECK-DEFAULT: vldr s{{.*}}, #8]
17; CHECK-DEFAULT: vldr s{{.*}}, #8]
18; CHECK-DEFAULT: vldr s{{.*}}, #12]
19; CHECK-DEFAULT: vldr s{{.*}}, #12]
20
21; CHECK-COMPLEX: vldr s{{.*}}, #8]
22; CHECK-COMPLEX: vldr s{{.*}}, #8]
23; CHECK-COMPLEX: vldr s{{.*}}, #12]
24; CHECK-COMPLEX: vldr s{{.*}}, #12]
25
; Multiply-accumulate reduction over the float arrays %a and %b. Each loop
; iteration handles two consecutive elements and the final sum is returned.
26define float @test_fma(float* %a, float* %b, i32 %N) {
27entry:
28  br label %loop
29
30loop:
; %i is the loop counter, %idx.1 the element index, %res the running sum.
31  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
32  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
33  %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ]
; First element: fma.1 = a[idx.1] * b[idx.1] + res.
34  %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1
35  %a.1 = load float, float* %gep.a.1
36  %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1
37  %b.1 = load float, float* %gep.b.1
38  %fmul.1 = fmul float %a.1, %b.1
39  %fma.1 = fadd float %fmul.1, %res
; Second element: %idx.1 starts at 0 and steps by 2, so it is always even and
; 'or 1' addresses the next element.
40  %idx.2 = or i32 %idx.1, 1
41  %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2
42  %a.2 = load float, float* %gep.a.2
43  %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2
44  %b.2 = load float, float* %gep.b.2
45  %fmul.2 = fmul float %a.2, %b.2
46  %fma.2 = fadd float %fmul.2, %fma.1
; NOTE(review): %i starts at 0 and steps by -2 while the index steps by +2, so
; %i.next is 0xFFFFFFFE after the first iteration and the unsigned compare
; below only continues for N == 0xFFFFFFFF. Presumably only the loop shape
; matters for this codegen test; confirm before changing.
47  %i.next = add nsw nuw i32 %i, -2
48  %idx.next = add nsw nuw i32 %idx.1, 2
49  %cmp = icmp ult i32 %i.next, %N
50  br i1 %cmp, label %loop, label %exit
51
52exit:
53  ret float %fma.2
54}
55
56; CHECK-LABEL: convolve_16bit
57; TODO: Both arrays should use indexing
58; CHECK-DEFAULT: ldr{{.*}}, #8]!
59; CHECK-DEFAULT-NOT: ldr{{.*}}]!
60
61; CHECK-COMPLEX: ldr{{.*}}, #8]!
62; CHECK-COMPLEX-NOT: ldr{{.*}}]!
63
64; DISABLED-NOT: ldr{{.*}}]!
65; DISABLED-NOT: str{{.*}}]!
66
; 2-D convolution over i16 data. For every output position (res_y, res_x) the
; products filter[fy][fx] * input_image[res_y + fy][res_x + fx] are summed as
; i32 and the total is stored to convolved[res_y][res_x]. The innermost (fx)
; loop body covers four elements per iteration.
67define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter,
68                            i32 %filter_dim, i32 %out_width, i32 %out_height,
69                            i32** nocapture readonly %convolved) {
70entry:
; Nothing to do when there are no output rows.
71  %cmp92 = icmp eq i32 %out_height, 0
72  br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph
73
74for.cond1.preheader.lr.ph:                        ; preds = %entry
; %unroll_iter = filter_dim rounded down to a multiple of 4; it seeds the
; innermost loop's down-counter.
75  %xtraiter = and i32 %filter_dim, 3
76  %unroll_iter = sub i32 %filter_dim, %xtraiter
77  br label %for.cond1.preheader
78
; Output-row loop: %tmp3 is the destination row convolved[res_y].
79for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
80  %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
81  %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093
82  %tmp3 = load i32*, i32** %arrayidx22, align 4
83  br label %for.cond9.preheader.us.us.preheader
84
; Output-column loop header.
85for.cond9.preheader.us.us.preheader:              ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond1.preheader
86  %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
87  br label %for.cond9.preheader.us.us
88
; Filter-row loop: %tmp5 is filter[fy], %tmp6 is input_image[res_y + fy]. The
; partial sum is carried across filter rows in %result_element.055.us.us.
89for.cond9.preheader.us.us:                        ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, %for.cond9.preheader.us.us.preheader
90  %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
91  %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
92  %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
93  %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us
94  %tmp5 = load i16*, i16** %arrayidx.us.us, align 4
95  %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us
96  %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4
97  br label %for.body12.us.us
98
; Innermost loop over filter columns, four multiply-accumulates per iteration.
; NOTE(review): this block is entered unconditionally and there is no
; remainder loop for the %xtraiter leftover columns; with %unroll_iter == 0
; the down-counter would not hit zero on the first pass. Presumably a reduced
; test input; confirm before reusing.
99for.body12.us.us:                                 ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
100  %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
101  %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
102  %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
; Column fx.
103  %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
104  %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us
105  %tmp9 = load i16, i16* %arrayidx14.us.us, align 2
106  %conv.us.us = sext i16 %tmp9 to i32
107  %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us
108  %tmp10 = load i16, i16* %arrayidx16.us.us, align 2
109  %conv17.us.us = sext i16 %tmp10 to i32
110  %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
111  %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
; Column fx | 1 (fx starts at 0 and steps by 4, so the low two bits are clear).
112  %inc.us.us = or i32 %filter_x.053.us.us, 1
113  %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
114  %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us
115  %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2
116  %conv.us.us.1 = sext i16 %tmp11 to i32
117  %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1
118  %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2
119  %conv17.us.us.1 = sext i16 %tmp12 to i32
120  %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
121  %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
; Column fx | 2.
122  %inc.us.us.1 = or i32 %filter_x.053.us.us, 2
123  %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
124  %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1
125  %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2
126  %conv.us.us.2 = sext i16 %tmp13 to i32
127  %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2
128  %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2
129  %conv17.us.us.2 = sext i16 %tmp14 to i32
130  %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
131  %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
; Column fx | 3.
132  %inc.us.us.2 = or i32 %filter_x.053.us.us, 3
133  %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
134  %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2
135  %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2
136  %conv.us.us.3 = sext i16 %tmp15 to i32
137  %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3
138  %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2
139  %conv17.us.us.3 = sext i16 %tmp16 to i32
140  %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
141  %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
; Advance the column index by 4 and count the down-counter towards zero.
142  %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
143  %niter.nsub.3 = add i32 %niter, -4
144  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
145  br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us
146
; Filter-row latch.
147for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us
148  %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
149  %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
150  br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us
151
; Output-column latch: store the accumulated sum to convolved[res_y][res_x].
152for.cond5.for.cond.cleanup7_crit_edge.us:         ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa
153  %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us
154  store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4
155  %add25.us = add nuw i32 %res_x.060.us, 1
156  %exitcond99 = icmp eq i32 %add25.us, %out_width
157  br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader
158
; Output-row latch.
159for.cond.cleanup3:                                ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us
160  %add28 = add nuw i32 %res_y.093, 1
161  %exitcond100 = icmp eq i32 %add28, %out_height
162  br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader
163
164for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
165  ret void
166}
167
168; CHECK-LABEL: mul_8x8
169; CHECK: @ %for.body
170
171; CHECK-DEFAULT: str{{.*}}, #16]!
172; CHECK-DEFAULT: ldrb{{.*}}, #4]!
173; CHECK-DEFAULT: ldrb{{.*}}, #4]!
174
175; CHECK-COMPLEX: str{{.*}}, #16]!
176; CHECK-COMPLEX: ldrb{{.*}}, #4]!
177; CHECK-COMPLEX: ldrb{{.*}}, #4]!
178
179; DISABLED-NOT: ldr{{.*}}]!
180; DISABLED-NOT: str{{.*}}]!
181
182; CHECK-T2: @ %for.body.epil
183; CHECK-T2: ldrb{{.*}}, #1]!
184; CHECK-T2: ldrb{{.*}}, #1]!
185; CHECK-T2: str{{.*}}, #4]!
186
; Element-wise product C[i] = zext(A[i]) * zext(B[i]) for i in [0, N), with
; i8 inputs and i32 results. The main loop (for.body) handles four elements
; per iteration; for.body.epil handles the remaining N & 3 elements.
187define void @mul_8x8(i8* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
188entry:
; Early exit for N == 0.
189  %cmp9 = icmp eq i32 %N, 0
190  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
191
192for.body.preheader:                               ; preds = %entry
; %xtraiter = leftover element count; (N - 1) u< 3 means N is 1..3, so the
; four-element loop is skipped entirely.
193  %tmp = add i32 %N, -1
194  %xtraiter = and i32 %N, 3
195  %tmp1 = icmp ult i32 %tmp, 3
196  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
197
198for.body.preheader.new:                           ; preds = %for.body.preheader
; Number of elements covered by the four-element loop.
199  %unroll_iter = sub i32 %N, %xtraiter
200  br label %for.body
201
202for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
; %i.010.unr is the index at which the remainder loop starts.
203  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
204  %lcmp.mod = icmp eq i32 %xtraiter, 0
205  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
206
; Remainder loop: one element per iteration, counting %epil.iter down to zero.
207for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
208  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
209  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
210  %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.010.epil
211  %tmp2 = load i8, i8* %arrayidx.epil, align 1
212  %conv.epil = zext i8 %tmp2 to i32
213  %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
214  %tmp3 = load i8, i8* %arrayidx1.epil, align 1
215  %conv2.epil = zext i8 %tmp3 to i32
216  %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
217  %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
218  store i32 %mul.epil, i32* %arrayidx3.epil, align 4
219  %inc.epil = add nuw i32 %i.010.epil, 1
220  %epil.iter.sub = add i32 %epil.iter, -1
221  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
222  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
223
224for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
225  ret void
226
; Main loop: elements i, i|1, i|2 and i|3 per iteration. %i.010 starts at 0
; and steps by 4, so the 'or' forms address the following three elements.
227for.body:                                         ; preds = %for.body, %for.body.preheader.new
228  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
229  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
; Element i.
230  %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.010
231  %tmp4 = load i8, i8* %arrayidx, align 1
232  %conv = zext i8 %tmp4 to i32
233  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
234  %tmp5 = load i8, i8* %arrayidx1, align 1
235  %conv2 = zext i8 %tmp5 to i32
236  %mul = mul nuw nsw i32 %conv2, %conv
237  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
238  store i32 %mul, i32* %arrayidx3, align 4
; Element i | 1.
239  %inc = or i32 %i.010, 1
240  %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc
241  %tmp6 = load i8, i8* %arrayidx.1, align 1
242  %conv.1 = zext i8 %tmp6 to i32
243  %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
244  %tmp7 = load i8, i8* %arrayidx1.1, align 1
245  %conv2.1 = zext i8 %tmp7 to i32
246  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
247  %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
248  store i32 %mul.1, i32* %arrayidx3.1, align 4
; Element i | 2.
249  %inc.1 = or i32 %i.010, 2
250  %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1
251  %tmp8 = load i8, i8* %arrayidx.2, align 1
252  %conv.2 = zext i8 %tmp8 to i32
253  %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
254  %tmp9 = load i8, i8* %arrayidx1.2, align 1
255  %conv2.2 = zext i8 %tmp9 to i32
256  %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
257  %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
258  store i32 %mul.2, i32* %arrayidx3.2, align 4
; Element i | 3.
259  %inc.2 = or i32 %i.010, 3
260  %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2
261  %tmp10 = load i8, i8* %arrayidx.3, align 1
262  %conv.3 = zext i8 %tmp10 to i32
263  %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
264  %tmp11 = load i8, i8* %arrayidx1.3, align 1
265  %conv2.3 = zext i8 %tmp11 to i32
266  %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
267  %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
268  store i32 %mul.3, i32* %arrayidx3.3, align 4
; Advance the index by 4 and count %niter down to zero.
269  %inc.3 = add i32 %i.010, 4
270  %niter.nsub.3 = add i32 %niter, -4
271  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
272  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
273}
274
275; CHECK-LABEL: mul_16x8
276; CHECK: @ %for.body
277
278; CHECK-DEFAULT: str{{.*}}, #16]!
279; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
280
281; CHECK-COMPLEX: ldrsh{{.*}}, #8]!
282; CHECK-COMPLEX: str{{.*}}, #16]!
283; CHECK-COMPLEX: ldrb{{.*}}, #4]!
284
285; DISABLED-NOT: ldr{{.*}}]!
286; DISABLED-NOT: str{{.*}}]!
287
288; CHECK-T2: @ %for.body.epil
289; CHECK-T2: ldrsh{{.*}}, #2]!
290; CHECK-T2: ldrb{{.*}}, #1]!
291; CHECK-T2: str{{.*}}, #4]!
292
; Element-wise product C[i] = sext(A[i]) * zext(B[i]) for i in [0, N), with
; i16 elements in A, i8 elements in B and i32 results. The main loop
; (for.body) handles four elements per iteration; for.body.epil handles the
; remaining N & 3 elements.
293define void @mul_16x8(i16* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
294entry:
; Early exit for N == 0.
295  %cmp9 = icmp eq i32 %N, 0
296  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
297
298for.body.preheader:                               ; preds = %entry
; %xtraiter = leftover element count; (N - 1) u< 3 means N is 1..3, so the
; four-element loop is skipped entirely.
299  %tmp = add i32 %N, -1
300  %xtraiter = and i32 %N, 3
301  %tmp1 = icmp ult i32 %tmp, 3
302  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
303
304for.body.preheader.new:                           ; preds = %for.body.preheader
; Number of elements covered by the four-element loop.
305  %unroll_iter = sub i32 %N, %xtraiter
306  br label %for.body
307
308for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
; %i.010.unr is the index at which the remainder loop starts.
309  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
310  %lcmp.mod = icmp eq i32 %xtraiter, 0
311  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
312
; Remainder loop: one element per iteration, counting %epil.iter down to zero.
313for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
314  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
315  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
316  %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
317  %tmp2 = load i16, i16* %arrayidx.epil, align 2
318  %conv.epil = sext i16 %tmp2 to i32
319  %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
320  %tmp3 = load i8, i8* %arrayidx1.epil, align 1
321  %conv2.epil = zext i8 %tmp3 to i32
322  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
323  %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
324  store i32 %mul.epil, i32* %arrayidx3.epil, align 4
325  %inc.epil = add nuw i32 %i.010.epil, 1
326  %epil.iter.sub = add i32 %epil.iter, -1
327  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
328  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
329
330for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
331  ret void
332
; Main loop: elements i, i|1, i|2 and i|3 per iteration. %i.010 starts at 0
; and steps by 4, so the 'or' forms address the following three elements.
333for.body:                                         ; preds = %for.body, %for.body.preheader.new
334  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
335  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
; Element i.
336  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
337  %tmp4 = load i16, i16* %arrayidx, align 2
338  %conv = sext i16 %tmp4 to i32
339  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
340  %tmp5 = load i8, i8* %arrayidx1, align 1
341  %conv2 = zext i8 %tmp5 to i32
342  %mul = mul nsw i32 %conv2, %conv
343  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
344  store i32 %mul, i32* %arrayidx3, align 4
; Element i | 1.
345  %inc = or i32 %i.010, 1
346  %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
347  %tmp6 = load i16, i16* %arrayidx.1, align 2
348  %conv.1 = sext i16 %tmp6 to i32
349  %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
350  %tmp7 = load i8, i8* %arrayidx1.1, align 1
351  %conv2.1 = zext i8 %tmp7 to i32
352  %mul.1 = mul nsw i32 %conv2.1, %conv.1
353  %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
354  store i32 %mul.1, i32* %arrayidx3.1, align 4
; Element i | 2.
355  %inc.1 = or i32 %i.010, 2
356  %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
357  %tmp8 = load i16, i16* %arrayidx.2, align 2
358  %conv.2 = sext i16 %tmp8 to i32
359  %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
360  %tmp9 = load i8, i8* %arrayidx1.2, align 1
361  %conv2.2 = zext i8 %tmp9 to i32
362  %mul.2 = mul nsw i32 %conv2.2, %conv.2
363  %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
364  store i32 %mul.2, i32* %arrayidx3.2, align 4
; Element i | 3.
365  %inc.2 = or i32 %i.010, 3
366  %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
367  %tmp10 = load i16, i16* %arrayidx.3, align 2
368  %conv.3 = sext i16 %tmp10 to i32
369  %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
370  %tmp11 = load i8, i8* %arrayidx1.3, align 1
371  %conv2.3 = zext i8 %tmp11 to i32
372  %mul.3 = mul nsw i32 %conv2.3, %conv.3
373  %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
374  store i32 %mul.3, i32* %arrayidx3.3, align 4
; Advance the index by 4 and count %niter down to zero.
375  %inc.3 = add i32 %i.010, 4
376  %niter.nsub.3 = add i32 %niter, -4
377  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
378  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
379}
380
381; CHECK-LABEL: mul_16x16
382; CHECK: @ %for.body
383
384; TODO: pre-indexed loads
385; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
386; CHECK-DEFAULT: str{{.*}}, #16]!
387; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
388
389; CHECK-COMPLEX: ldrsh{{.*}}]!
390; CHECK-COMPLEX: ldrsh{{.*}}]!
391; CHECK-COMPLEX: str{{.*}}]!
392
393; DISABLED-NOT: ldr{{.*}}]!
394; DISABLED-NOT: str{{.*}}]!
395
396; CHECK-T2: @ %for.body.epil
397; CHECK-T2: ldrsh{{.*}}, #2]!
398; CHECK-T2: ldrsh{{.*}}, #2]!
399; CHECK-T2: str{{.*}}, #4]!
400
; Element-wise product C[i] = sext(A[i]) * sext(B[i]) for i in [0, N), with
; i16 inputs and i32 results. The main loop (for.body) handles four elements
; per iteration; for.body.epil handles the remaining N & 3 elements.
401define void @mul_16x16(i16* nocapture readonly %A, i16* nocapture readonly %B, i32* nocapture %C, i32 %N) {
402entry:
; Early exit for N == 0.
403  %cmp9 = icmp eq i32 %N, 0
404  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
405
406for.body.preheader:                               ; preds = %entry
; %xtraiter = leftover element count; (N - 1) u< 3 means N is 1..3, so the
; four-element loop is skipped entirely.
407  %tmp = add i32 %N, -1
408  %xtraiter = and i32 %N, 3
409  %tmp1 = icmp ult i32 %tmp, 3
410  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
411
412for.body.preheader.new:                           ; preds = %for.body.preheader
; Number of elements covered by the four-element loop.
413  %unroll_iter = sub i32 %N, %xtraiter
414  br label %for.body
415
416for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
; %i.010.unr is the index at which the remainder loop starts.
417  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
418  %lcmp.mod = icmp eq i32 %xtraiter, 0
419  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
420
; Remainder loop: one element per iteration, counting %epil.iter down to zero.
421for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
422  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
423  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
424  %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
425  %tmp2 = load i16, i16* %arrayidx.epil, align 2
426  %conv.epil = sext i16 %tmp2 to i32
427  %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.010.epil
428  %tmp3 = load i16, i16* %arrayidx1.epil, align 2
429  %conv2.epil = sext i16 %tmp3 to i32
430  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
431  %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
432  store i32 %mul.epil, i32* %arrayidx3.epil, align 4
433  %inc.epil = add nuw i32 %i.010.epil, 1
434  %epil.iter.sub = add i32 %epil.iter, -1
435  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
436  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
437
438for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
439  ret void
440
; Main loop: elements i, i|1, i|2 and i|3 per iteration. %i.010 starts at 0
; and steps by 4, so the 'or' forms address the following three elements.
441for.body:                                         ; preds = %for.body, %for.body.preheader.new
442  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
443  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
; Element i.
444  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
445  %tmp4 = load i16, i16* %arrayidx, align 2
446  %conv = sext i16 %tmp4 to i32
447  %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010
448  %tmp5 = load i16, i16* %arrayidx1, align 2
449  %conv2 = sext i16 %tmp5 to i32
450  %mul = mul nsw i32 %conv2, %conv
451  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
452  store i32 %mul, i32* %arrayidx3, align 4
; Element i | 1.
453  %inc = or i32 %i.010, 1
454  %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
455  %tmp6 = load i16, i16* %arrayidx.1, align 2
456  %conv.1 = sext i16 %tmp6 to i32
457  %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc
458  %tmp7 = load i16, i16* %arrayidx1.1, align 2
459  %conv2.1 = sext i16 %tmp7 to i32
460  %mul.1 = mul nsw i32 %conv2.1, %conv.1
461  %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
462  store i32 %mul.1, i32* %arrayidx3.1, align 4
; Element i | 2.
463  %inc.1 = or i32 %i.010, 2
464  %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
465  %tmp8 = load i16, i16* %arrayidx.2, align 2
466  %conv.2 = sext i16 %tmp8 to i32
467  %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1
468  %tmp9 = load i16, i16* %arrayidx1.2, align 2
469  %conv2.2 = sext i16 %tmp9 to i32
470  %mul.2 = mul nsw i32 %conv2.2, %conv.2
471  %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
472  store i32 %mul.2, i32* %arrayidx3.2, align 4
; Element i | 3.
473  %inc.2 = or i32 %i.010, 3
474  %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
475  %tmp10 = load i16, i16* %arrayidx.3, align 2
476  %conv.3 = sext i16 %tmp10 to i32
477  %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2
478  %tmp11 = load i16, i16* %arrayidx1.3, align 2
479  %conv2.3 = sext i16 %tmp11 to i32
480  %mul.3 = mul nsw i32 %conv2.3, %conv.3
481  %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
482  store i32 %mul.3, i32* %arrayidx3.3, align 4
; Advance the index by 4 and count %niter down to zero.
483  %inc.3 = add i32 %i.010, 4
484  %niter.nsub.3 = add i32 %niter, -4
485  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
486  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
487}
488
489; CHECK-LABEL: mul_8x8_2d
490; CHECK: @ %for.body4.us
491
492; CHECK-DEFAULT: ldr{{.*}}, #16]!
493; CHECK-DEFAULT: ldrb{{.*}}, #4]!
494
495; DISABLED-NOT: ldr{{.*}}]!
496; DISABLED-NOT: str{{.*}}]!
497
498; CHECK-T2: @ %for.body4.us.epil
499; CHECK-T2: ldrb{{.*}}, #1]!
500; CHECK-T2: ldr{{.*}}, #4]!
501
; Nested-loop multiply-accumulate: for i in [0, N) and j in [0, M),
; C[i][j] += zext(A[i]) * zext(B[i][j]), with i8 inputs and i32 accumulators.
; The inner (j) loop handles four elements per iteration in for.body4.us and
; the remaining M & 3 elements in for.body4.us.epil.
502define void @mul_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
503entry:
; Early exit when either dimension is zero.
504  %cmp24 = icmp eq i32 %N, 0
505  %cmp222 = icmp eq i32 %M, 0
506  %or.cond = or i1 %cmp24, %cmp222
507  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
508
509for.cond1.preheader.us.preheader:                 ; preds = %entry
; Values computed once for all rows: %xtraiter = leftover element count,
; %tmp1 = M is 1..3 (skip the four-element loop), %unroll_iter = elements
; covered by the four-element loop, %lcmp.mod = no leftover elements.
510  %tmp = add i32 %M, -1
511  %xtraiter = and i32 %M, 3
512  %tmp1 = icmp ult i32 %tmp, 3
513  %unroll_iter = sub i32 %M, %xtraiter
514  %lcmp.mod = icmp eq i32 %xtraiter, 0
515  br label %for.cond1.preheader.us
516
; Outer (row) loop: the row pointers B[i] and C[i] are loaded once per row.
517for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
518  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
519  %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.025.us
520  %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.025.us
521  %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
522  %.pre = load i8*, i8** %arrayidx5.us, align 4
523  %.pre30 = load i32*, i32** %arrayidx8.us, align 4
524  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
525
; Inner loop, four columns per iteration. A[i] is reloaded from %arrayidx.us
; before every multiply.
526for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
527  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
528  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
; Column j.
529  %tmp2 = load i8, i8* %arrayidx.us, align 1
530  %conv.us = zext i8 %tmp2 to i32
531  %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us
532  %tmp3 = load i8, i8* %arrayidx6.us, align 1
533  %conv7.us = zext i8 %tmp3 to i32
534  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
535  %arrayidx9.us = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us
536  %tmp4 = load i32, i32* %arrayidx9.us, align 4
537  %add.us = add nsw i32 %tmp4, %mul.us
538  store i32 %add.us, i32* %arrayidx9.us, align 4
; Column j | 1 (j starts at 0 and steps by 4, so the low two bits are clear).
539  %inc.us = or i32 %j.023.us, 1
540  %tmp5 = load i8, i8* %arrayidx.us, align 1
541  %conv.us.1 = zext i8 %tmp5 to i32
542  %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
543  %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
544  %conv7.us.1 = zext i8 %tmp6 to i32
545  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
546  %arrayidx9.us.1 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us
547  %tmp7 = load i32, i32* %arrayidx9.us.1, align 4
548  %add.us.1 = add nsw i32 %tmp7, %mul.us.1
549  store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
; Column j | 2.
550  %inc.us.1 = or i32 %j.023.us, 2
551  %tmp8 = load i8, i8* %arrayidx.us, align 1
552  %conv.us.2 = zext i8 %tmp8 to i32
553  %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
554  %tmp9 = load i8, i8* %arrayidx6.us.2, align 1
555  %conv7.us.2 = zext i8 %tmp9 to i32
556  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
557  %arrayidx9.us.2 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.1
558  %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
559  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
560  store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
; Column j | 3.
561  %inc.us.2 = or i32 %j.023.us, 3
562  %tmp11 = load i8, i8* %arrayidx.us, align 1
563  %conv.us.3 = zext i8 %tmp11 to i32
564  %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
565  %tmp12 = load i8, i8* %arrayidx6.us.3, align 1
566  %conv7.us.3 = zext i8 %tmp12 to i32
567  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
568  %arrayidx9.us.3 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.2
569  %tmp13 = load i32, i32* %arrayidx9.us.3, align 4
570  %add.us.3 = add nsw i32 %tmp13, %mul.us.3
571  store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
; Advance the column index by 4 and count %niter down to zero.
572  %inc.us.3 = add i32 %j.023.us, 4
573  %niter.nsub.3 = add i32 %niter, -4
574  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
575  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
576
; %j.023.us.unr is the column at which the remainder loop starts.
577for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
578  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
579  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
580
; Remainder loop: one column per iteration, counting %epil.iter down to zero.
581for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
582  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
583  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
584  %tmp14 = load i8, i8* %arrayidx.us, align 1
585  %conv.us.epil = zext i8 %tmp14 to i32
586  %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us.epil
587  %tmp15 = load i8, i8* %arrayidx6.us.epil, align 1
588  %conv7.us.epil = zext i8 %tmp15 to i32
589  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
590  %arrayidx9.us.epil = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us.epil
591  %tmp16 = load i32, i32* %arrayidx9.us.epil, align 4
592  %add.us.epil = add nsw i32 %tmp16, %mul.us.epil
593  store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
594  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
595  %epil.iter.sub = add i32 %epil.iter, -1
596  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
597  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
598
; Outer-loop latch: next row until i == N.
599for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
600  %inc11.us = add nuw i32 %i.025.us, 1
601  %exitcond28 = icmp eq i32 %inc11.us, %N
602  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
603
604for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
605  ret void
606}
607
608; CHECK-LABEL: mul_16x16_2d
609; CHECK: @ %for.body4.us
610
611; CHECK-DEFAULT: ldr{{.*}}, #16]!
612; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
613
614; DISABLED-NOT: ldr{{.*}}]!
615; DISABLED-NOT: str{{.*}}]!
616
617; CHECK-T2: @ %for.body4.us.epil
618; CHECK-T2: ldrsh{{.*}}, #2]!
619; CHECK-T2: ldr{{.*}}, #4]!
620
; mul_16x16_2d: for every i in [0, N) and j in [0, M) performs
;   C[i][j] += sext(B[i][j]) * sext(A[i])
; where A holds i16 values, B holds pointers to i16 rows and C holds pointers
; to i32 rows. The inner (j) loop is present in a 4x-unrolled form
; (%for.body4.us) followed by a remainder loop (%for.body4.us.epil).
621define void @mul_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
622entry:
  ; Nothing to do when either trip count is zero.
623  %cmp24 = icmp eq i32 %N, 0
624  %cmp222 = icmp eq i32 %M, 0
625  %or.cond = or i1 %cmp24, %cmp222
626  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
627
628for.cond1.preheader.us.preheader:                 ; preds = %entry
  ; Loop-invariant unrolling bookkeeping:
  ;   %xtraiter    = M & 3, the number of remainder iterations
  ;   %tmp1        = (M - 1) <u 3, true when the unrolled body must be skipped
  ;   %unroll_iter = M - %xtraiter, the iterations handled by the unrolled body
629  %tmp = add i32 %M, -1
630  %xtraiter = and i32 %M, 3
631  %tmp1 = icmp ult i32 %tmp, 3
632  %unroll_iter = sub i32 %M, %xtraiter
633  %lcmp.mod = icmp eq i32 %xtraiter, 0
634  br label %for.cond1.preheader.us
635
636for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  ; Outer loop header: load the per-row values A[i], B[i] and C[i] once.
637  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
638  %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.025.us
639  %tmp2 = load i16, i16* %arrayidx.us, align 2
640  %conv.us = sext i16 %tmp2 to i32
641  %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.025.us
642  %tmp3 = load i16*, i16** %arrayidx5.us, align 4
643  %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
644  %tmp4 = load i32*, i32** %arrayidx8.us, align 4
645  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
646
647for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
  ; Unrolled inner loop. %j.023.us starts at 0 and advances by 4, so it is
  ; always a multiple of 4 and the `or` with 1, 2 and 3 below yields j+1, j+2
  ; and j+3. %niter counts down from %unroll_iter in steps of 4.
648  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
649  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  ; Element j.
650  %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us
651  %tmp5 = load i16, i16* %arrayidx6.us, align 2
652  %conv7.us = sext i16 %tmp5 to i32
653  %mul.us = mul nsw i32 %conv7.us, %conv.us
654  %arrayidx9.us = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us
655  %tmp6 = load i32, i32* %arrayidx9.us, align 4
656  %add.us = add nsw i32 %tmp6, %mul.us
657  store i32 %add.us, i32* %arrayidx9.us, align 4
  ; Element j+1.
658  %inc.us = or i32 %j.023.us, 1
659  %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
660  %tmp7 = load i16, i16* %arrayidx6.us.1, align 2
661  %conv7.us.1 = sext i16 %tmp7 to i32
662  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
663  %arrayidx9.us.1 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us
664  %tmp8 = load i32, i32* %arrayidx9.us.1, align 4
665  %add.us.1 = add nsw i32 %tmp8, %mul.us.1
666  store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
  ; Element j+2.
667  %inc.us.1 = or i32 %j.023.us, 2
668  %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
669  %tmp9 = load i16, i16* %arrayidx6.us.2, align 2
670  %conv7.us.2 = sext i16 %tmp9 to i32
671  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
672  %arrayidx9.us.2 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.1
673  %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
674  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
675  store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
  ; Element j+3.
676  %inc.us.2 = or i32 %j.023.us, 3
677  %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
678  %tmp11 = load i16, i16* %arrayidx6.us.3, align 2
679  %conv7.us.3 = sext i16 %tmp11 to i32
680  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
681  %arrayidx9.us.3 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.2
682  %tmp12 = load i32, i32* %arrayidx9.us.3, align 4
683  %add.us.3 = add nsw i32 %tmp12, %mul.us.3
684  store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
  ; Advance j by 4 and leave once the unrolled iteration count is exhausted.
685  %inc.us.3 = add i32 %j.023.us, 4
686  %niter.nsub.3 = add i32 %niter, -4
687  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
688  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
689
690for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  ; %j.023.us.unr is the first j not yet processed; the remainder loop is
  ; skipped when M is a multiple of 4 (%lcmp.mod).
691  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
692  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
693
694for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  ; Remainder loop: one element per iteration, run %xtraiter times.
695  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
696  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
697  %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us.epil
698  %tmp13 = load i16, i16* %arrayidx6.us.epil, align 2
699  %conv7.us.epil = sext i16 %tmp13 to i32
700  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
701  %arrayidx9.us.epil = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us.epil
702  %tmp14 = load i32, i32* %arrayidx9.us.epil, align 4
703  %add.us.epil = add nsw i32 %tmp14, %mul.us.epil
704  store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
705  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
706  %epil.iter.sub = add i32 %epil.iter, -1
707  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
708  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
709
710for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  ; Outer loop latch: next row, until i reaches N.
711  %inc11.us = add nuw i32 %i.025.us, 1
712  %exitcond28 = icmp eq i32 %inc11.us, %N
713  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us
714
715for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
716  ret void
717}
718
719; CHECK-LABEL: mac_8x8_2d
720; CHECK: @ %for.body4.us
721
722; TODO: Both input arrays could use pre-indexed loads.
723; TODO: pre-indexed stores.
724; CHECK-DEFAULT: ldrb{{.*}}, #4]!
725; CHECK-DEFAULT-NOT: ldr{{.*}}]!
726; CHECK-DEFAULT-NOT: str{{.*}}]!
727
728; TODO: Increased complexity shouldn't prevent indexed accesses.
729; CHECK-COMPLEX-NOT: ldr{{.*}}]!
730; CHECK-COMPLEX-NOT: str{{.*}}]!
731
732; DISABLED-NOT: ldr{{.*}}]!
733; DISABLED-NOT: str{{.*}}]!
734
735; CHECK-T2: @ %for.body4.us.epil
736; CHECK-T2: ldrb{{.*}}, #1]!
737
; mac_8x8_2d: for every i in [0, N) and j in [0, M) performs
;   C[i] += zext(B[i][j]) * zext(A[i])
; where A holds i8 values, B holds pointers to i8 rows and C holds i32
; accumulators. In this version A[i] is reloaded and C[i] is stored on every
; multiply-accumulate step; the running sum is also carried in a phi. The
; inner loop is 4x-unrolled (%for.body4.us) with a remainder loop
; (%for.body4.us.epil).
738define void @mac_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
739entry:
  ; Nothing to do when either trip count is zero.
740  %cmp22 = icmp eq i32 %N, 0
741  %cmp220 = icmp eq i32 %M, 0
742  %or.cond = or i1 %cmp22, %cmp220
743  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
744
745for.cond1.preheader.us.preheader:                 ; preds = %entry
  ; Loop-invariant unrolling bookkeeping:
  ;   %xtraiter    = M & 3, the number of remainder iterations
  ;   %tmp1        = (M - 1) <u 3, true when the unrolled body must be skipped
  ;   %unroll_iter = M - %xtraiter, the iterations handled by the unrolled body
746  %tmp = add i32 %M, -1
747  %xtraiter = and i32 %M, 3
748  %tmp1 = icmp ult i32 %tmp, 3
749  %unroll_iter = sub i32 %M, %xtraiter
750  %lcmp.mod = icmp eq i32 %xtraiter, 0
751  br label %for.cond1.preheader.us
752
753for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  ; Outer loop header: form &A[i], &B[i] and &C[i], then preload the row
  ; pointer B[i] (%.pre) and the initial accumulator value C[i] (%.pre28).
754  %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
755  %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.023.us
756  %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.023.us
757  %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.023.us
758  %.pre = load i8*, i8** %arrayidx5.us, align 4
759  %.pre28 = load i32, i32* %arrayidx8.us, align 4
760  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
761
762for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
  ; Unrolled inner loop. %tmp2 is the running sum, %j.021.us advances by 4
  ; from 0 (so `or` with 1..3 yields j+1..j+3) and %niter counts down from
  ; %unroll_iter in steps of 4.
763  %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ]
764  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
765  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  ; Element j.
766  %tmp3 = load i8, i8* %arrayidx.us, align 1
767  %conv.us = zext i8 %tmp3 to i32
768  %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us
769  %tmp4 = load i8, i8* %arrayidx6.us, align 1
770  %conv7.us = zext i8 %tmp4 to i32
771  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
772  %add.us = add nsw i32 %mul.us, %tmp2
773  store i32 %add.us, i32* %arrayidx8.us, align 4
  ; Element j+1.
774  %inc.us = or i32 %j.021.us, 1
775  %tmp5 = load i8, i8* %arrayidx.us, align 1
776  %conv.us.1 = zext i8 %tmp5 to i32
777  %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
778  %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
779  %conv7.us.1 = zext i8 %tmp6 to i32
780  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
781  %add.us.1 = add nsw i32 %mul.us.1, %add.us
782  store i32 %add.us.1, i32* %arrayidx8.us, align 4
  ; Element j+2.
783  %inc.us.1 = or i32 %j.021.us, 2
784  %tmp7 = load i8, i8* %arrayidx.us, align 1
785  %conv.us.2 = zext i8 %tmp7 to i32
786  %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
787  %tmp8 = load i8, i8* %arrayidx6.us.2, align 1
788  %conv7.us.2 = zext i8 %tmp8 to i32
789  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
790  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
791  store i32 %add.us.2, i32* %arrayidx8.us, align 4
  ; Element j+3.
792  %inc.us.2 = or i32 %j.021.us, 3
793  %tmp9 = load i8, i8* %arrayidx.us, align 1
794  %conv.us.3 = zext i8 %tmp9 to i32
795  %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
796  %tmp10 = load i8, i8* %arrayidx6.us.3, align 1
797  %conv7.us.3 = zext i8 %tmp10 to i32
798  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
799  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
800  store i32 %add.us.3, i32* %arrayidx8.us, align 4
  ; Advance j by 4 and leave once the unrolled iteration count is exhausted.
801  %inc.us.3 = add i32 %j.021.us, 4
802  %niter.nsub.3 = add i32 %niter, -4
803  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
804  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
805
806for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  ; Merge the running sum and the next unprocessed j from either the skipped
  ; or the completed unrolled loop; skip the remainder loop when M is a
  ; multiple of 4 (%lcmp.mod).
807  %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
808  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
809  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
810
811for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  ; Remainder loop: one multiply-accumulate per iteration, run %xtraiter times.
812  %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
813  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
814  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
815  %tmp12 = load i8, i8* %arrayidx.us, align 1
816  %conv.us.epil = zext i8 %tmp12 to i32
817  %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us.epil
818  %tmp13 = load i8, i8* %arrayidx6.us.epil, align 1
819  %conv7.us.epil = zext i8 %tmp13 to i32
820  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
821  %add.us.epil = add nsw i32 %mul.us.epil, %tmp11
822  store i32 %add.us.epil, i32* %arrayidx8.us, align 4
823  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
824  %epil.iter.sub = add i32 %epil.iter, -1
825  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
826  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
827
828for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  ; Outer loop latch: next row, until i reaches N.
829  %inc10.us = add nuw i32 %i.023.us, 1
830  %exitcond26 = icmp eq i32 %inc10.us, %N
831  br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us
832
833for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
834  ret void
835}
836
837; CHECK-LABEL: mac_16x16_2d
838; CHECK: @ %for.body4.us
839
840; TODO: pre-indexed loads for both input arrays.
841; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
842; CHECK-DEFAULT-NOT: ldr{{.*}}]!
843
844; TODO: increased complexity should lead to better codegen.
845; CHECK-COMPLEX-NOT: ldr{{.*}}]!
846
847; DISABLED-NOT: ldr{{.*}}]!
848
849; CHECK-T2: @ %for.body4.us.epil
850; CHECK-T2: ldrsh{{.*}}, #2]!
851
; mac_16x16_2d: for every i in [0, N) and j in [0, M) accumulates
;   C[i] += sext(B[i][j]) * sext(A[i])
; where A holds i16 values, B holds pointers to i16 rows and C holds i32
; accumulators. Here A[i] and C[i] are loaded once per row, the sum is kept
; in phis across the inner loop, and C[i] is stored once per row in the outer
; latch. The inner loop is 4x-unrolled (%for.body4.us) with a remainder loop
; (%for.body4.us.epil).
852define void @mac_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
853entry:
  ; Nothing to do when either trip count is zero.
854  %cmp23 = icmp eq i32 %N, 0
855  %cmp220 = icmp eq i32 %M, 0
856  %or.cond = or i1 %cmp23, %cmp220
857  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader
858
859for.cond1.preheader.us.preheader:                 ; preds = %entry
  ; Loop-invariant unrolling bookkeeping:
  ;   %xtraiter    = M & 3, the number of remainder iterations
  ;   %tmp1        = (M - 1) <u 3, true when the unrolled body must be skipped
  ;   %unroll_iter = M - %xtraiter, the iterations handled by the unrolled body
860  %tmp = add i32 %M, -1
861  %xtraiter = and i32 %M, 3
862  %tmp1 = icmp ult i32 %tmp, 3
863  %unroll_iter = sub i32 %M, %xtraiter
864  %lcmp.mod = icmp eq i32 %xtraiter, 0
865  br label %for.cond1.preheader.us
866
867for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  ; Outer loop header: load A[i] (sign-extended), the row pointer B[i] and
  ; the initial accumulator value C[i].
868  %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
869  %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.024.us
870  %tmp2 = load i16, i16* %arrayidx.us, align 2
871  %conv.us = sext i16 %tmp2 to i32
872  %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.024.us
873  %tmp3 = load i16*, i16** %arrayidx5.us, align 4
874  %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us
875  %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4
876  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
877
878for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
  ; Unrolled inner loop. %add22.us is the running sum, %j.021.us advances by
  ; 4 from 0 (so `or` with 1..3 yields j+1..j+3) and %niter counts down from
  ; %unroll_iter in steps of 4. No stores happen inside this loop.
879  %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ]
880  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
881  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  ; Element j.
882  %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us
883  %tmp4 = load i16, i16* %arrayidx6.us, align 2
884  %conv7.us = sext i16 %tmp4 to i32
885  %mul.us = mul nsw i32 %conv7.us, %conv.us
886  %add.us = add nsw i32 %mul.us, %add22.us
  ; Element j+1.
887  %inc.us = or i32 %j.021.us, 1
888  %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
889  %tmp5 = load i16, i16* %arrayidx6.us.1, align 2
890  %conv7.us.1 = sext i16 %tmp5 to i32
891  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
892  %add.us.1 = add nsw i32 %mul.us.1, %add.us
  ; Element j+2.
893  %inc.us.1 = or i32 %j.021.us, 2
894  %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
895  %tmp6 = load i16, i16* %arrayidx6.us.2, align 2
896  %conv7.us.2 = sext i16 %tmp6 to i32
897  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
898  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
  ; Element j+3.
899  %inc.us.2 = or i32 %j.021.us, 3
900  %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
901  %tmp7 = load i16, i16* %arrayidx6.us.3, align 2
902  %conv7.us.3 = sext i16 %tmp7 to i32
903  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
904  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
  ; Advance j by 4 and leave once the unrolled iteration count is exhausted.
905  %inc.us.3 = add i32 %j.021.us, 4
906  %niter.nsub.3 = add i32 %niter, -4
907  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
908  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us
909
910for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  ; %add.us.lcssa.ph is undef when the unrolled loop was skipped. That path is
  ; taken only when M < 4, and since M != 0 here %xtraiter is then non-zero,
  ; so the remainder loop runs and supplies the value that is finally stored.
911  %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
912  %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
913  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
914  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
915
916for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  ; Remainder loop: one multiply-accumulate per iteration, run %xtraiter times.
917  %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
918  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
919  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
920  %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us.epil
921  %tmp8 = load i16, i16* %arrayidx6.us.epil, align 2
922  %conv7.us.epil = sext i16 %tmp8 to i32
923  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
924  %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil
925  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
926  %epil.iter.sub = add i32 %epil.iter, -1
927  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
928  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil
929
930for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  ; Outer loop latch: write the final sum back to C[i], then move to the next
  ; row until i reaches N.
931  %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ]
932  store i32 %add.us.lcssa, i32* %arrayidx8.us, align 4
933  %inc10.us = add nuw i32 %i.024.us, 1
934  %exitcond27 = icmp eq i32 %inc10.us, %N
935  br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us
936
937for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
938  ret void
939}
940
941; CHECK-LABEL: mul32x32_backwards
942; CHECK: @ %for.body
943
944; TODO: post increments for decreasing addresses
945; CHECK-DEFAULT-NOT: ldr{{.*}}]!
946; CHECK-DEFAULT-NOT: str{{.*}}]!
947
948; CHECK-COMPLEX-NOT: ldr{{.*}}]!
949; CHECK-COMPLEX-NOT: str{{.*}}]!
950
; mul32x32_backwards: computes a[i] = c[i] * b[i] for i running from N-1 down
; to 0. The remainder iterations (N & 3 of them) are peeled into a prologue
; loop (%for.body.prol) that runs first; the main loop (%for.body) then
; handles four elements per iteration with decreasing indices.
951define void @mul32x32_backwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
952entry:
  ; Start index is N-1; return immediately unless it is non-negative (signed).
953  %i.08 = add i32 %N, -1
954  %cmp9 = icmp sgt i32 %i.08, -1
955  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
956
957for.body.preheader:                               ; preds = %entry
  ; %xtraiter = N & 3 is the prologue trip count; skip the prologue when zero.
958  %xtraiter = and i32 %N, 3
959  %lcmp.mod = icmp eq i32 %xtraiter, 0
960  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol
961
962for.body.prol:                                    ; preds = %for.body.prol, %for.body.preheader
  ; Prologue loop: one element per iteration, index decreasing by 1,
  ; %prol.iter counting down from %xtraiter.
963  %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ]
964  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ]
965  %arrayidx.prol = getelementptr inbounds i32, i32* %b, i32 %i.010.prol
966  %tmp = load i32, i32* %arrayidx.prol, align 4
967  %arrayidx1.prol = getelementptr inbounds i32, i32* %c, i32 %i.010.prol
968  %tmp1 = load i32, i32* %arrayidx1.prol, align 4
969  %mul.prol = mul nsw i32 %tmp1, %tmp
970  %arrayidx2.prol = getelementptr inbounds i32, i32* %a, i32 %i.010.prol
971  store i32 %mul.prol, i32* %arrayidx2.prol, align 4
972  %i.0.prol = add i32 %i.010.prol, -1
973  %prol.iter.sub = add i32 %prol.iter, -1
974  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
975  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol
976
977for.body.prol.loopexit:                           ; preds = %for.body.prol, %for.body.preheader
  ; %i.010.unr is the index the main loop starts from. The main loop is
  ; skipped when (N - 1) <u 3, i.e. when there were fewer than four elements.
978  %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ]
979  %tmp2 = icmp ult i32 %i.08, 3
980  br i1 %tmp2, label %for.cond.cleanup, label %for.body
981
982for.cond.cleanup:                                 ; preds = %for.body, %for.body.prol.loopexit, %entry
983  ret void
984
985for.body:                                         ; preds = %for.body, %for.body.prol.loopexit
  ; Main loop: processes indices i, i-1, i-2 and i-3, then steps i by -4.
986  %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ]
  ; Element i.
987  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.010
988  %tmp3 = load i32, i32* %arrayidx, align 4
989  %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.010
990  %tmp4 = load i32, i32* %arrayidx1, align 4
991  %mul = mul nsw i32 %tmp4, %tmp3
992  %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.010
993  store i32 %mul, i32* %arrayidx2, align 4
  ; Element i-1.
994  %i.0 = add i32 %i.010, -1
995  %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %i.0
996  %tmp5 = load i32, i32* %arrayidx.1, align 4
997  %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %i.0
998  %tmp6 = load i32, i32* %arrayidx1.1, align 4
999  %mul.1 = mul nsw i32 %tmp6, %tmp5
1000  %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %i.0
1001  store i32 %mul.1, i32* %arrayidx2.1, align 4
  ; Element i-2.
1002  %i.0.1 = add i32 %i.010, -2
1003  %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %i.0.1
1004  %tmp7 = load i32, i32* %arrayidx.2, align 4
1005  %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %i.0.1
1006  %tmp8 = load i32, i32* %arrayidx1.2, align 4
1007  %mul.2 = mul nsw i32 %tmp8, %tmp7
1008  %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %i.0.1
1009  store i32 %mul.2, i32* %arrayidx2.2, align 4
  ; Element i-3.
1010  %i.0.2 = add i32 %i.010, -3
1011  %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %i.0.2
1012  %tmp9 = load i32, i32* %arrayidx.3, align 4
1013  %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %i.0.2
1014  %tmp10 = load i32, i32* %arrayidx1.3, align 4
1015  %mul.3 = mul nsw i32 %tmp10, %tmp9
1016  %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %i.0.2
1017  store i32 %mul.3, i32* %arrayidx2.3, align 4
  ; Step down by four and keep going while the next index is non-negative.
1018  %i.0.3 = add i32 %i.010, -4
1019  %cmp.3 = icmp sgt i32 %i.0.3, -1
1020  br i1 %cmp.3, label %for.body, label %for.cond.cleanup
1021}
1022
1023; CHECK-LABEL: mul32x32_forwards
1024; CHECK: @ %for.body
1025
1026; TODO: It would be good if the complexity limit didn't have to be increased to
1027; enable the pre-indexed accesses.
1028
1029; CHECK-DEFAULT-NOT: ldr{{.*}}]!
1030; CHECK-DEFAULT-NOT: str{{.*}}]!
1031
1032; CHECK-COMPLEX: ldr{{.*}}, #16]!
1033; CHECK-COMPLEX: ldr{{.*}}, #16]!
1034; CHECK-COMPLEX: str{{.*}}, #16]!
1035
1036; CHECK-T2: @ %for.body.epil
1037; CHECK-T2: ldr{{.*}}, #4]!
1038; CHECK-T2: ldr{{.*}}, #4]!
1039; CHECK-T2: str{{.*}}, #4]!
1040
; mul32x32_forwards: computes a[i] = c[i] * b[i] for i in [0, N). The loop is
; present in a 4x-unrolled form (%for.body, placed textually after the return
; block) followed by a remainder loop (%for.body.epil) that handles the last
; N & 3 elements.
1041define void @mul32x32_forwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
1042entry:
  ; Nothing to do for N == 0.
1043  %cmp8 = icmp eq i32 %N, 0
1044  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
1045
1046for.body.preheader:                               ; preds = %entry
  ; %xtraiter = N & 3 is the remainder trip count; the unrolled loop is
  ; skipped when (N - 1) <u 3, i.e. when N < 4.
1047  %tmp = add i32 %N, -1
1048  %xtraiter = and i32 %N, 3
1049  %tmp1 = icmp ult i32 %tmp, 3
1050  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1051
1052for.body.preheader.new:                           ; preds = %for.body.preheader
  ; Number of iterations covered by the unrolled loop (a multiple of 4).
1053  %unroll_iter = sub i32 %N, %xtraiter
1054  br label %for.body
1055
1056for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  ; %i.09.unr is the first index not yet processed; skip the remainder loop
  ; when N is a multiple of 4.
1057  %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1058  %lcmp.mod = icmp eq i32 %xtraiter, 0
1059  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1060
1061for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  ; Remainder loop: one element per iteration, run %xtraiter times.
1062  %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1063  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1064  %arrayidx.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
1065  %tmp2 = load i32, i32* %arrayidx.epil, align 4
1066  %arrayidx1.epil = getelementptr inbounds i32, i32* %c, i32 %i.09.epil
1067  %tmp3 = load i32, i32* %arrayidx1.epil, align 4
1068  %mul.epil = mul nsw i32 %tmp3, %tmp2
1069  %arrayidx2.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
1070  store i32 %mul.epil, i32* %arrayidx2.epil, align 4
1071  %inc.epil = add nuw nsw i32 %i.09.epil, 1
1072  %epil.iter.sub = add i32 %epil.iter, -1
1073  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1074  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1075
1076for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
1077  ret void
1078
1079for.body:                                         ; preds = %for.body, %for.body.preheader.new
  ; Unrolled loop. %i.09 starts at 0 and advances by 4, so it is always a
  ; multiple of 4 and the `or` with 1, 2 and 3 below yields i+1, i+2 and i+3.
  ; %niter counts down from %unroll_iter in steps of 4.
1080  %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1081  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  ; Element i.
1082  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09
1083  %tmp4 = load i32, i32* %arrayidx, align 4
1084  %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09
1085  %tmp5 = load i32, i32* %arrayidx1, align 4
1086  %mul = mul nsw i32 %tmp5, %tmp4
1087  %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09
1088  store i32 %mul, i32* %arrayidx2, align 4
  ; Element i+1.
1089  %inc = or i32 %i.09, 1
1090  %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %inc
1091  %tmp6 = load i32, i32* %arrayidx.1, align 4
1092  %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %inc
1093  %tmp7 = load i32, i32* %arrayidx1.1, align 4
1094  %mul.1 = mul nsw i32 %tmp7, %tmp6
1095  %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %inc
1096  store i32 %mul.1, i32* %arrayidx2.1, align 4
  ; Element i+2.
1097  %inc.1 = or i32 %i.09, 2
1098  %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
1099  %tmp8 = load i32, i32* %arrayidx.2, align 4
1100  %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %inc.1
1101  %tmp9 = load i32, i32* %arrayidx1.2, align 4
1102  %mul.2 = mul nsw i32 %tmp9, %tmp8
1103  %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
1104  store i32 %mul.2, i32* %arrayidx2.2, align 4
  ; Element i+3.
1105  %inc.2 = or i32 %i.09, 3
1106  %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
1107  %tmp10 = load i32, i32* %arrayidx.3, align 4
1108  %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %inc.2
1109  %tmp11 = load i32, i32* %arrayidx1.3, align 4
1110  %mul.3 = mul nsw i32 %tmp11, %tmp10
1111  %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
1112  store i32 %mul.3, i32* %arrayidx2.3, align 4
  ; Advance i by 4 and leave once the unrolled iteration count is exhausted.
1113  %inc.3 = add nuw nsw i32 %i.09, 4
1114  %niter.nsub.3 = add i32 %niter, -4
1115  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1116  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1117}
1117}
1118