; RUN: llc -mtriple=thumbv7em -mattr=+fp-armv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2
; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-backedge-indexing=false %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-complexity-limit=2147483647 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2

; Tests to check that post-increment addressing modes are used instead of
; updating base pointers with add instructions.

; TODO: I think we should be able to use post-inc addressing with VLDM
; instructions.
; CHECK-LABEL: test_fma
; CHECK: @ %loop

; CHECK-DEFAULT: vldr s{{.*}}, #8]
; CHECK-DEFAULT: vldr s{{.*}}, #8]
; CHECK-DEFAULT: vldr s{{.*}}, #12]
; CHECK-DEFAULT: vldr s{{.*}}, #12]

; CHECK-COMPLEX: vldr s{{.*}}, #8]
; CHECK-COMPLEX: vldr s{{.*}}, #8]
; CHECK-COMPLEX: vldr s{{.*}}, #12]
; CHECK-COMPLEX: vldr s{{.*}}, #12]

define float @test_fma(float* %a, float* %b, i32 %N) {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ]
  %gep.a.1 = getelementptr inbounds float, float* %a, i32 %idx.1
  %a.1 = load float, float* %gep.a.1
  %gep.b.1 = getelementptr inbounds float, float* %b, i32 %idx.1
  %b.1 = load float, float* %gep.b.1
  %fmul.1 = fmul float %a.1, %b.1
  %fma.1 = fadd float %fmul.1, %res
  %idx.2 = or i32 %idx.1, 1
  %gep.a.2 = getelementptr inbounds float, float* %a, i32 %idx.2
  %a.2 = load float, float* %gep.a.2
  %gep.b.2 = getelementptr inbounds float, float* %b, i32 %idx.2
  %b.2 = load float, float* %gep.b.2
  %fmul.2 = fmul float %a.2, %b.2
  %fma.2 = fadd float %fmul.2, %fma.1
  %i.next = add nsw nuw i32 %i, -2
  %idx.next = add nsw nuw i32 %idx.1, 2
  %cmp = icmp ult i32 %i.next, %N
  br i1 %cmp, label %loop, label %exit

exit:
  ret float %fma.2
}

; CHECK-LABEL: convolve_16bit
; TODO: Both arrays should use indexing
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT-NOT: ldr{{.*}}]!

; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX-NOT: ldr{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
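
; For reference only (no run line matches on this): the transformation under
; test folds the base-pointer update into the memory access itself. A rough
; Thumb-2 sketch of the difference, with made-up registers:
;
;   ldr r3, [r0]        @ load, then bump the base with a separate add
;   adds r0, r0, #4
; vs.
;   ldr r3, [r0], #4    @ post-indexed: load, then write r0+4 back to r0
;   ldr r3, [r0, #4]!   @ pre-indexed: write r0+4 back to r0, then load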

define void @convolve_16bit(i16** nocapture readonly %input_image, i16** nocapture readonly %filter,
                            i32 %filter_dim, i32 %out_width, i32 %out_height,
                            i32** nocapture readonly %convolved) {
entry:
  %cmp92 = icmp eq i32 %out_height, 0
  br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph

for.cond1.preheader.lr.ph: ; preds = %entry
  %xtraiter = and i32 %filter_dim, 3
  %unroll_iter = sub i32 %filter_dim, %xtraiter
  br label %for.cond1.preheader

for.cond1.preheader: ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
  %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
  %arrayidx22 = getelementptr inbounds i32*, i32** %convolved, i32 %res_y.093
  %tmp3 = load i32*, i32** %arrayidx22, align 4
  br label %for.cond9.preheader.us.us.preheader

for.cond9.preheader.us.us.preheader: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.lr.ph
  %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
  br label %for.cond9.preheader.us.us

for.cond9.preheader.us.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us, %for.cond9.preheader.us.us.preheader
  %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
  %arrayidx.us.us = getelementptr inbounds i16*, i16** %filter, i32 %filter_y.056.us.us
  %tmp5 = load i16*, i16** %arrayidx.us.us, align 4
  %arrayidx15.us.us = getelementptr inbounds i16*, i16** %input_image, i32 %add.us.us
  %tmp6 = load i16*, i16** %arrayidx15.us.us, align 4
  br label %for.body12.us.us

for.body12.us.us: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
  %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
  %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
  %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
  %arrayidx14.us.us = getelementptr inbounds i16, i16* %tmp5, i32 %filter_x.053.us.us
  %tmp9 = load i16, i16* %arrayidx14.us.us, align 2
  %conv.us.us = sext i16 %tmp9 to i32
  %arrayidx16.us.us = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us
  %tmp10 = load i16, i16* %arrayidx16.us.us, align 2
  %conv17.us.us = sext i16 %tmp10 to i32
  %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
  %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
  %inc.us.us = or i32 %filter_x.053.us.us, 1
  %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
  %arrayidx14.us.us.1 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us
  %tmp11 = load i16, i16* %arrayidx14.us.us.1, align 2
  %conv.us.us.1 = sext i16 %tmp11 to i32
  %arrayidx16.us.us.1 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.1
  %tmp12 = load i16, i16* %arrayidx16.us.us.1, align 2
  %conv17.us.us.1 = sext i16 %tmp12 to i32
  %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
  %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
  %inc.us.us.1 = or i32 %filter_x.053.us.us, 2
  %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
  %arrayidx14.us.us.2 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.1
  %tmp13 = load i16, i16* %arrayidx14.us.us.2, align 2
  %conv.us.us.2 = sext i16 %tmp13 to i32
  %arrayidx16.us.us.2 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.2
  %tmp14 = load i16, i16* %arrayidx16.us.us.2, align 2
  %conv17.us.us.2 = sext i16 %tmp14 to i32
  %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
  %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
  %inc.us.us.2 = or i32 %filter_x.053.us.us, 3
  %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
  %arrayidx14.us.us.3 = getelementptr inbounds i16, i16* %tmp5, i32 %inc.us.us.2
  %tmp15 = load i16, i16* %arrayidx14.us.us.3, align 2
  %conv.us.us.3 = sext i16 %tmp15 to i32
  %arrayidx16.us.us.3 = getelementptr inbounds i16, i16* %tmp6, i32 %add13.us.us.3
  %tmp16 = load i16, i16* %arrayidx16.us.us.3, align 2
  %conv17.us.us.3 = sext i16 %tmp16 to i32
  %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
  %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
  %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us

for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
  %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
  %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
  br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us

for.cond5.for.cond.cleanup7_crit_edge.us: ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us
  %arrayidx23.us = getelementptr inbounds i32, i32* %tmp3, i32 %res_x.060.us
  store i32 %add18.us.us.3, i32* %arrayidx23.us, align 4
  %add25.us = add nuw i32 %res_x.060.us, 1
  %exitcond99 = icmp eq i32 %add25.us, %out_width
  br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader

for.cond.cleanup3: ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond5.preheader.preheader, %for.cond1.preheader
  %add28 = add nuw i32 %res_y.093, 1
  %exitcond100 = icmp eq i32 %add28, %out_height
  br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader

for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry
  ret void
}

; CHECK-LABEL: mul_8x8
; CHECK: @ %for.body

; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-DEFAULT: ldrb{{.*}}, #4]!
; CHECK-DEFAULT: ldrb{{.*}}, #4]!

; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldrb{{.*}}, #4]!
; CHECK-COMPLEX: ldrb{{.*}}, #4]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: str{{.*}}, #4]!
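
; mul_8x8 below is an unrolled-by-4 element-wise multiply, roughly equivalent
; to the following C (an assumed source sketch, kept here only for context):
;
;   void mul_8x8(const unsigned char *A, const unsigned char *B, int *C, unsigned N) {
;     for (unsigned i = 0; i < N; i++)
;       C[i] = (int)A[i] * (int)B[i];
;   }
;
; The i8 loads step by 1 (epilogue) or 4 (unrolled body) and the i32 store by
; 4 or 16, which is where the writeback immediates checked above come from.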

define void @mul_8x8(i8* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i8, i8* %A, i32 %i.010.epil
  %tmp2 = load i8, i8* %arrayidx.epil, align 1
  %conv.epil = zext i8 %tmp2 to i32
  %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
  %tmp3 = load i8, i8* %arrayidx1.epil, align 1
  %conv2.epil = zext i8 %tmp3 to i32
  %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
  %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
  store i32 %mul.epil, i32* %arrayidx3.epil, align 4
  %inc.epil = add nuw i32 %i.010.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader.new
  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i8, i8* %A, i32 %i.010
  %tmp4 = load i8, i8* %arrayidx, align 1
  %conv = zext i8 %tmp4 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
  %tmp5 = load i8, i8* %arrayidx1, align 1
  %conv2 = zext i8 %tmp5 to i32
  %mul = mul nuw nsw i32 %conv2, %conv
  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
  store i32 %mul, i32* %arrayidx3, align 4
  %inc = or i32 %i.010, 1
  %arrayidx.1 = getelementptr inbounds i8, i8* %A, i32 %inc
  %tmp6 = load i8, i8* %arrayidx.1, align 1
  %conv.1 = zext i8 %tmp6 to i32
  %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
  %tmp7 = load i8, i8* %arrayidx1.1, align 1
  %conv2.1 = zext i8 %tmp7 to i32
  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
  store i32 %mul.1, i32* %arrayidx3.1, align 4
  %inc.1 = or i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds i8, i8* %A, i32 %inc.1
  %tmp8 = load i8, i8* %arrayidx.2, align 1
  %conv.2 = zext i8 %tmp8 to i32
  %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
  %tmp9 = load i8, i8* %arrayidx1.2, align 1
  %conv2.2 = zext i8 %tmp9 to i32
  %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
  %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
  store i32 %mul.2, i32* %arrayidx3.2, align 4
  %inc.2 = or i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds i8, i8* %A, i32 %inc.2
  %tmp10 = load i8, i8* %arrayidx.3, align 1
  %conv.3 = zext i8 %tmp10 to i32
  %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
  %tmp11 = load i8, i8* %arrayidx1.3, align 1
  %conv2.3 = zext i8 %tmp11 to i32
  %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
  %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
  store i32 %mul.3, i32* %arrayidx3.3, align 4
  %inc.3 = add i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; CHECK-LABEL: mul_16x8
; CHECK: @ %for.body

; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-DEFAULT: ldrsh{{.*}}, #8]!

; CHECK-COMPLEX: ldrsh{{.*}}, #8]!
; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldrb{{.*}}, #4]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: str{{.*}}, #4]!

define void @mul_16x8(i16* nocapture readonly %A, i8* nocapture readonly %B, i32* nocapture %C, i32 %N) {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
  %tmp2 = load i16, i16* %arrayidx.epil, align 2
  %conv.epil = sext i16 %tmp2 to i32
  %arrayidx1.epil = getelementptr inbounds i8, i8* %B, i32 %i.010.epil
  %tmp3 = load i8, i8* %arrayidx1.epil, align 1
  %conv2.epil = zext i8 %tmp3 to i32
  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
  %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
  store i32 %mul.epil, i32* %arrayidx3.epil, align 4
  %inc.epil = add nuw i32 %i.010.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader.new
  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
  %tmp4 = load i16, i16* %arrayidx, align 2
  %conv = sext i16 %tmp4 to i32
  %arrayidx1 = getelementptr inbounds i8, i8* %B, i32 %i.010
  %tmp5 = load i8, i8* %arrayidx1, align 1
  %conv2 = zext i8 %tmp5 to i32
  %mul = mul nsw i32 %conv2, %conv
  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
  store i32 %mul, i32* %arrayidx3, align 4
  %inc = or i32 %i.010, 1
  %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
  %tmp6 = load i16, i16* %arrayidx.1, align 2
  %conv.1 = sext i16 %tmp6 to i32
  %arrayidx1.1 = getelementptr inbounds i8, i8* %B, i32 %inc
  %tmp7 = load i8, i8* %arrayidx1.1, align 1
  %conv2.1 = zext i8 %tmp7 to i32
  %mul.1 = mul nsw i32 %conv2.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
  store i32 %mul.1, i32* %arrayidx3.1, align 4
  %inc.1 = or i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
  %tmp8 = load i16, i16* %arrayidx.2, align 2
  %conv.2 = sext i16 %tmp8 to i32
  %arrayidx1.2 = getelementptr inbounds i8, i8* %B, i32 %inc.1
  %tmp9 = load i8, i8* %arrayidx1.2, align 1
  %conv2.2 = zext i8 %tmp9 to i32
  %mul.2 = mul nsw i32 %conv2.2, %conv.2
  %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
  store i32 %mul.2, i32* %arrayidx3.2, align 4
  %inc.2 = or i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
  %tmp10 = load i16, i16* %arrayidx.3, align 2
  %conv.3 = sext i16 %tmp10 to i32
  %arrayidx1.3 = getelementptr inbounds i8, i8* %B, i32 %inc.2
  %tmp11 = load i8, i8* %arrayidx1.3, align 1
  %conv2.3 = zext i8 %tmp11 to i32
  %mul.3 = mul nsw i32 %conv2.3, %conv.3
  %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
  store i32 %mul.3, i32* %arrayidx3.3, align 4
  %inc.3 = add i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; CHECK-LABEL: mul_16x16
; CHECK: @ %for.body

; TODO: pre-indexed loads
; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!

; CHECK-COMPLEX: ldrsh{{.*}}]!
; CHECK-COMPLEX: ldrsh{{.*}}]!
; CHECK-COMPLEX: str{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: str{{.*}}, #4]!
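
; mul_16x16 below is the same pattern with two i16 inputs, roughly
; (assumed C sketch): C[i] = (int)A[i] * (int)B[i] with short A and B.
; With the default complexity limit only the store gets writeback (#16 in the
; unrolled body); the raised-limit run also indexes both ldrsh loads.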

define void @mul_16x16(i16* nocapture readonly %A, i16* nocapture readonly %B, i32* nocapture %C, i32 %N) {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i16, i16* %A, i32 %i.010.epil
  %tmp2 = load i16, i16* %arrayidx.epil, align 2
  %conv.epil = sext i16 %tmp2 to i32
  %arrayidx1.epil = getelementptr inbounds i16, i16* %B, i32 %i.010.epil
  %tmp3 = load i16, i16* %arrayidx1.epil, align 2
  %conv2.epil = sext i16 %tmp3 to i32
  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
  %arrayidx3.epil = getelementptr inbounds i32, i32* %C, i32 %i.010.epil
  store i32 %mul.epil, i32* %arrayidx3.epil, align 4
  %inc.epil = add nuw i32 %i.010.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader.new
  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i16, i16* %A, i32 %i.010
  %tmp4 = load i16, i16* %arrayidx, align 2
  %conv = sext i16 %tmp4 to i32
  %arrayidx1 = getelementptr inbounds i16, i16* %B, i32 %i.010
  %tmp5 = load i16, i16* %arrayidx1, align 2
  %conv2 = sext i16 %tmp5 to i32
  %mul = mul nsw i32 %conv2, %conv
  %arrayidx3 = getelementptr inbounds i32, i32* %C, i32 %i.010
  store i32 %mul, i32* %arrayidx3, align 4
  %inc = or i32 %i.010, 1
  %arrayidx.1 = getelementptr inbounds i16, i16* %A, i32 %inc
  %tmp6 = load i16, i16* %arrayidx.1, align 2
  %conv.1 = sext i16 %tmp6 to i32
  %arrayidx1.1 = getelementptr inbounds i16, i16* %B, i32 %inc
  %tmp7 = load i16, i16* %arrayidx1.1, align 2
  %conv2.1 = sext i16 %tmp7 to i32
  %mul.1 = mul nsw i32 %conv2.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i32, i32* %C, i32 %inc
  store i32 %mul.1, i32* %arrayidx3.1, align 4
  %inc.1 = or i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds i16, i16* %A, i32 %inc.1
  %tmp8 = load i16, i16* %arrayidx.2, align 2
  %conv.2 = sext i16 %tmp8 to i32
  %arrayidx1.2 = getelementptr inbounds i16, i16* %B, i32 %inc.1
  %tmp9 = load i16, i16* %arrayidx1.2, align 2
  %conv2.2 = sext i16 %tmp9 to i32
  %mul.2 = mul nsw i32 %conv2.2, %conv.2
  %arrayidx3.2 = getelementptr inbounds i32, i32* %C, i32 %inc.1
  store i32 %mul.2, i32* %arrayidx3.2, align 4
  %inc.2 = or i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds i16, i16* %A, i32 %inc.2
  %tmp10 = load i16, i16* %arrayidx.3, align 2
  %conv.3 = sext i16 %tmp10 to i32
  %arrayidx1.3 = getelementptr inbounds i16, i16* %B, i32 %inc.2
  %tmp11 = load i16, i16* %arrayidx1.3, align 2
  %conv2.3 = sext i16 %tmp11 to i32
  %mul.3 = mul nsw i32 %conv2.3, %conv.3
  %arrayidx3.3 = getelementptr inbounds i32, i32* %C, i32 %inc.2
  store i32 %mul.3, i32* %arrayidx3.3, align 4
  %inc.3 = add i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; CHECK-LABEL: mul_8x8_2d
; CHECK: @ %for.body4.us

; CHECK-DEFAULT: ldr{{.*}}, #16]!
; CHECK-DEFAULT: ldrb{{.*}}, #4]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: ldr{{.*}}, #4]!

define void @mul_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
entry:
  %cmp24 = icmp eq i32 %N, 0
  %cmp222 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp24, %cmp222
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.025.us
  %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.025.us
  %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
  %.pre = load i8*, i8** %arrayidx5.us, align 4
  %.pre30 = load i32*, i32** %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %tmp2 = load i8, i8* %arrayidx.us, align 1
  %conv.us = zext i8 %tmp2 to i32
  %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us
  %tmp3 = load i8, i8* %arrayidx6.us, align 1
  %conv7.us = zext i8 %tmp3 to i32
  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
  %arrayidx9.us = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us
  %tmp4 = load i32, i32* %arrayidx9.us, align 4
  %add.us = add nsw i32 %tmp4, %mul.us
  store i32 %add.us, i32* %arrayidx9.us, align 4
  %inc.us = or i32 %j.023.us, 1
  %tmp5 = load i8, i8* %arrayidx.us, align 1
  %conv.us.1 = zext i8 %tmp5 to i32
  %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
  %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
  %conv7.us.1 = zext i8 %tmp6 to i32
  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
  %arrayidx9.us.1 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us
  %tmp7 = load i32, i32* %arrayidx9.us.1, align 4
  %add.us.1 = add nsw i32 %tmp7, %mul.us.1
  store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
  %inc.us.1 = or i32 %j.023.us, 2
  %tmp8 = load i8, i8* %arrayidx.us, align 1
  %conv.us.2 = zext i8 %tmp8 to i32
  %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
  %tmp9 = load i8, i8* %arrayidx6.us.2, align 1
  %conv7.us.2 = zext i8 %tmp9 to i32
  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
  %arrayidx9.us.2 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.1
  %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
  store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
  %inc.us.2 = or i32 %j.023.us, 3
  %tmp11 = load i8, i8* %arrayidx.us, align 1
  %conv.us.3 = zext i8 %tmp11 to i32
  %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
  %tmp12 = load i8, i8* %arrayidx6.us.3, align 1
  %conv7.us.3 = zext i8 %tmp12 to i32
  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
  %arrayidx9.us.3 = getelementptr inbounds i32, i32* %.pre30, i32 %inc.us.2
  %tmp13 = load i32, i32* %arrayidx9.us.3, align 4
  %add.us.3 = add nsw i32 %tmp13, %mul.us.3
  store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
  %inc.us.3 = add i32 %j.023.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %tmp14 = load i8, i8* %arrayidx.us, align 1
  %conv.us.epil = zext i8 %tmp14 to i32
  %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.023.us.epil
  %tmp15 = load i8, i8* %arrayidx6.us.epil, align 1
  %conv7.us.epil = zext i8 %tmp15 to i32
  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
  %arrayidx9.us.epil = getelementptr inbounds i32, i32* %.pre30, i32 %j.023.us.epil
  %tmp16 = load i32, i32* %arrayidx9.us.epil, align 4
  %add.us.epil = add nsw i32 %tmp16, %mul.us.epil
  store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %inc11.us = add nuw i32 %i.025.us, 1
  %exitcond28 = icmp eq i32 %inc11.us, %N
  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mul_16x16_2d
; CHECK: @ %for.body4.us

; CHECK-DEFAULT: ldr{{.*}}, #16]!
; CHECK-DEFAULT: ldrsh{{.*}}, #8]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: ldr{{.*}}, #4]!

define void @mul_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32** nocapture readonly %C, i32 %N, i32 %M) {
entry:
  %cmp24 = icmp eq i32 %N, 0
  %cmp222 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp24, %cmp222
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.025.us
  %tmp2 = load i16, i16* %arrayidx.us, align 2
  %conv.us = sext i16 %tmp2 to i32
  %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.025.us
  %tmp3 = load i16*, i16** %arrayidx5.us, align 4
  %arrayidx8.us = getelementptr inbounds i32*, i32** %C, i32 %i.025.us
  %tmp4 = load i32*, i32** %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us
  %tmp5 = load i16, i16* %arrayidx6.us, align 2
  %conv7.us = sext i16 %tmp5 to i32
  %mul.us = mul nsw i32 %conv7.us, %conv.us
  %arrayidx9.us = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us
  %tmp6 = load i32, i32* %arrayidx9.us, align 4
  %add.us = add nsw i32 %tmp6, %mul.us
  store i32 %add.us, i32* %arrayidx9.us, align 4
  %inc.us = or i32 %j.023.us, 1
  %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
  %tmp7 = load i16, i16* %arrayidx6.us.1, align 2
  %conv7.us.1 = sext i16 %tmp7 to i32
  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
  %arrayidx9.us.1 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us
  %tmp8 = load i32, i32* %arrayidx9.us.1, align 4
  %add.us.1 = add nsw i32 %tmp8, %mul.us.1
  store i32 %add.us.1, i32* %arrayidx9.us.1, align 4
  %inc.us.1 = or i32 %j.023.us, 2
  %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
  %tmp9 = load i16, i16* %arrayidx6.us.2, align 2
  %conv7.us.2 = sext i16 %tmp9 to i32
  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
  %arrayidx9.us.2 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.1
  %tmp10 = load i32, i32* %arrayidx9.us.2, align 4
  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
  store i32 %add.us.2, i32* %arrayidx9.us.2, align 4
  %inc.us.2 = or i32 %j.023.us, 3
  %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
  %tmp11 = load i16, i16* %arrayidx6.us.3, align 2
  %conv7.us.3 = sext i16 %tmp11 to i32
  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
  %arrayidx9.us.3 = getelementptr inbounds i32, i32* %tmp4, i32 %inc.us.2
  %tmp12 = load i32, i32* %arrayidx9.us.3, align 4
  %add.us.3 = add nsw i32 %tmp12, %mul.us.3
  store i32 %add.us.3, i32* %arrayidx9.us.3, align 4
  %inc.us.3 = add i32 %j.023.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.023.us.epil
  %tmp13 = load i16, i16* %arrayidx6.us.epil, align 2
  %conv7.us.epil = sext i16 %tmp13 to i32
  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
  %arrayidx9.us.epil = getelementptr inbounds i32, i32* %tmp4, i32 %j.023.us.epil
  %tmp14 = load i32, i32* %arrayidx9.us.epil, align 4
  %add.us.epil = add nsw i32 %tmp14, %mul.us.epil
  store i32 %add.us.epil, i32* %arrayidx9.us.epil, align 4
  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %inc11.us = add nuw i32 %i.025.us, 1
  %exitcond28 = icmp eq i32 %inc11.us, %N
  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mac_8x8_2d
; CHECK: @ %for.body4.us

; TODO: Both input arrays could use pre-indexed loads.
; TODO: pre-indexed stores.
; CHECK-DEFAULT: ldrb{{.*}}, #4]!
; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; CHECK-DEFAULT-NOT: str{{.*}}]!

; TODO: Increased complexity shouldn't prevent indexed accesses.
; CHECK-COMPLEX-NOT: ldr{{.*}}]!
; CHECK-COMPLEX-NOT: str{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrb{{.*}}, #1]!
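
; mac_8x8_2d below accumulates into a single output element per outer
; iteration, roughly (assumed C sketch, for context only):
;
;   for (j = 0; j < M; j++)
;     C[i] += (int)A[i] * (int)B[i][j];
;
; so the A[i] operand and the C[i] address stay fixed across the inner loop.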

define void @mac_8x8_2d(i8* nocapture readonly %A, i8** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
entry:
  %cmp22 = icmp eq i32 %N, 0
  %cmp220 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp22, %cmp220
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, i8* %A, i32 %i.023.us
  %arrayidx5.us = getelementptr inbounds i8*, i8** %B, i32 %i.023.us
  %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.023.us
  %.pre = load i8*, i8** %arrayidx5.us, align 4
  %.pre28 = load i32, i32* %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ]
  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %tmp3 = load i8, i8* %arrayidx.us, align 1
  %conv.us = zext i8 %tmp3 to i32
  %arrayidx6.us = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us
  %tmp4 = load i8, i8* %arrayidx6.us, align 1
  %conv7.us = zext i8 %tmp4 to i32
  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
  %add.us = add nsw i32 %mul.us, %tmp2
  store i32 %add.us, i32* %arrayidx8.us, align 4
  %inc.us = or i32 %j.021.us, 1
  %tmp5 = load i8, i8* %arrayidx.us, align 1
  %conv.us.1 = zext i8 %tmp5 to i32
  %arrayidx6.us.1 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us
  %tmp6 = load i8, i8* %arrayidx6.us.1, align 1
  %conv7.us.1 = zext i8 %tmp6 to i32
  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
  %add.us.1 = add nsw i32 %mul.us.1, %add.us
  store i32 %add.us.1, i32* %arrayidx8.us, align 4
  %inc.us.1 = or i32 %j.021.us, 2
  %tmp7 = load i8, i8* %arrayidx.us, align 1
  %conv.us.2 = zext i8 %tmp7 to i32
  %arrayidx6.us.2 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.1
  %tmp8 = load i8, i8* %arrayidx6.us.2, align 1
  %conv7.us.2 = zext i8 %tmp8 to i32
  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
  store i32 %add.us.2, i32* %arrayidx8.us, align 4
  %inc.us.2 = or i32 %j.021.us, 3
  %tmp9 = load i8, i8* %arrayidx.us, align 1
  %conv.us.3 = zext i8 %tmp9 to i32
  %arrayidx6.us.3 = getelementptr inbounds i8, i8* %.pre, i32 %inc.us.2
  %tmp10 = load i8, i8* %arrayidx6.us.3, align 1
  %conv7.us.3 = zext i8 %tmp10 to i32
  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
  store i32 %add.us.3, i32* %arrayidx8.us, align 4
  %inc.us.3 = add i32 %j.021.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %tmp12 = load i8, i8* %arrayidx.us, align 1
  %conv.us.epil = zext i8 %tmp12 to i32
  %arrayidx6.us.epil = getelementptr inbounds i8, i8* %.pre, i32 %j.021.us.epil
  %tmp13 = load i8, i8* %arrayidx6.us.epil, align 1
  %conv7.us.epil = zext i8 %tmp13 to i32
  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
  %add.us.epil = add nsw i32 %mul.us.epil, %tmp11
  store i32 %add.us.epil, i32* %arrayidx8.us, align 4
  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %inc10.us = add nuw i32 %i.023.us, 1
  %exitcond26 = icmp eq i32 %inc10.us, %N
  br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mac_16x16_2d
; CHECK: @ %for.body4.us

; TODO: pre-indexed loads for both input arrays.
; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
; CHECK-DEFAULT-NOT: ldr{{.*}}]!

; TODO: increased complexity should lead to better codegen.
; CHECK-COMPLEX-NOT: ldr{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
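
; mac_16x16_2d below is the i16 variant of the reduction above, roughly
; (assumed C sketch): C[i] += (int)A[i] * (int)B[i][j], except that here the
; accumulator is promoted to a register and the store is sunk out of the
; inner loop, which leaves the B[i][j] ldrsh as the only inner-loop access
; that can take a writeback.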

define void @mac_16x16_2d(i16* nocapture readonly %A, i16** nocapture readonly %B, i32* nocapture %C, i32 %N, i32 %M) {
entry:
  %cmp23 = icmp eq i32 %N, 0
  %cmp220 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp23, %cmp220
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i16, i16* %A, i32 %i.024.us
  %tmp2 = load i16, i16* %arrayidx.us, align 2
  %conv.us = sext i16 %tmp2 to i32
  %arrayidx5.us = getelementptr inbounds i16*, i16** %B, i32 %i.024.us
  %tmp3 = load i16*, i16** %arrayidx5.us, align 4
  %arrayidx8.us = getelementptr inbounds i32, i32* %C, i32 %i.024.us
  %arrayidx8.promoted.us = load i32, i32* %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ]
  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %arrayidx6.us = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us
  %tmp4 = load i16, i16* %arrayidx6.us, align 2
  %conv7.us = sext i16 %tmp4 to i32
  %mul.us = mul nsw i32 %conv7.us, %conv.us
  %add.us = add nsw i32 %mul.us, %add22.us
  %inc.us = or i32 %j.021.us, 1
  %arrayidx6.us.1 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us
  %tmp5 = load i16, i16* %arrayidx6.us.1, align 2
  %conv7.us.1 = sext i16 %tmp5 to i32
  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
  %add.us.1 = add nsw i32 %mul.us.1, %add.us
  %inc.us.1 = or i32 %j.021.us, 2
  %arrayidx6.us.2 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.1
  %tmp6 = load i16, i16* %arrayidx6.us.2, align 2
  %conv7.us.2 = sext i16 %tmp6 to i32
  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
  %inc.us.2 = or i32 %j.021.us, 3
  %arrayidx6.us.3 = getelementptr inbounds i16, i16* %tmp3, i32 %inc.us.2
  %tmp7 = load i16, i16* %arrayidx6.us.3, align 2
  %conv7.us.3 = sext i16 %tmp7 to i32
  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
  %inc.us.3 = add i32 %j.021.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
  %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %arrayidx6.us.epil = getelementptr inbounds i16, i16* %tmp3, i32 %j.021.us.epil
  %tmp8 = load i16, i16* %arrayidx6.us.epil, align 2
  %conv7.us.epil = sext i16 %tmp8 to i32
  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
  %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil
  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ]
  store i32 %add.us.lcssa, i32* %arrayidx8.us, align 4
  %inc10.us = add nuw i32 %i.024.us, 1
  %exitcond27 = icmp eq i32 %inc10.us, %N
  br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mul32x32_backwards
; CHECK: @ %for.body

; TODO: post increments for decreasing addresses
; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; CHECK-DEFAULT-NOT: str{{.*}}]!

; CHECK-COMPLEX-NOT: ldr{{.*}}]!
; CHECK-COMPLEX-NOT: str{{.*}}]!
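
; mul32x32_backwards below walks the arrays from N-1 down to 0, roughly
; (assumed C sketch): for (i = N - 1; i >= 0; i--) a[i] = b[i] * c[i];
; A decreasing address can in principle still use writeback with a negative
; immediate, e.g. ldr r3, [r0], #-4 (made-up registers), which is what the
; TODO above is asking for.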

define void @mul32x32_backwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
entry:
  %i.08 = add i32 %N, -1
  %cmp9 = icmp sgt i32 %i.08, -1
  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.prol, %for.body.preheader
  %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ]
  %arrayidx.prol = getelementptr inbounds i32, i32* %b, i32 %i.010.prol
  %tmp = load i32, i32* %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds i32, i32* %c, i32 %i.010.prol
  %tmp1 = load i32, i32* %arrayidx1.prol, align 4
  %mul.prol = mul nsw i32 %tmp1, %tmp
  %arrayidx2.prol = getelementptr inbounds i32, i32* %a, i32 %i.010.prol
  store i32 %mul.prol, i32* %arrayidx2.prol, align 4
  %i.0.prol = add i32 %i.010.prol, -1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader
  %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ]
  %tmp2 = icmp ult i32 %i.08, 3
  br i1 %tmp2, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %for.body.prol.loopexit, %entry
  ret void

for.body: ; preds = %for.body, %for.body.prol.loopexit
  %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.010
  %tmp3 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.010
  %tmp4 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %tmp4, %tmp3
  %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.010
  store i32 %mul, i32* %arrayidx2, align 4
  %i.0 = add i32 %i.010, -1
  %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %i.0
  %tmp5 = load i32, i32* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %i.0
  %tmp6 = load i32, i32* %arrayidx1.1, align 4
  %mul.1 = mul nsw i32 %tmp6, %tmp5
  %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %i.0
  store i32 %mul.1, i32* %arrayidx2.1, align 4
  %i.0.1 = add i32 %i.010, -2
  %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %i.0.1
  %tmp7 = load i32, i32* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %i.0.1
  %tmp8 = load i32, i32* %arrayidx1.2, align 4
  %mul.2 = mul nsw i32 %tmp8, %tmp7
  %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %i.0.1
  store i32 %mul.2, i32* %arrayidx2.2, align 4
  %i.0.2 = add i32 %i.010, -3
  %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %i.0.2
  %tmp9 = load i32, i32* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %i.0.2
  %tmp10 = load i32, i32* %arrayidx1.3, align 4
  %mul.3 = mul nsw i32 %tmp10, %tmp9
  %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %i.0.2
  store i32 %mul.3, i32* %arrayidx2.3, align 4
  %i.0.3 = add i32 %i.010, -4
  %cmp.3 = icmp sgt i32 %i.0.3, -1
  br i1 %cmp.3, label %for.body, label %for.cond.cleanup
}

; CHECK-LABEL: mul32x32_forwards
; CHECK: @ %for.body

; TODO: Would be good if the complexity limit didn't have to be increased to
; enable the pre-indexed accesses.

; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; CHECK-DEFAULT-NOT: str{{.*}}]!

; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: str{{.*}}, #16]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldr{{.*}}, #4]!
; CHECK-T2: ldr{{.*}}, #4]!
; CHECK-T2: str{{.*}}, #4]!

define void @mul32x32_forwards(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
  %tmp2 = load i32, i32* %arrayidx.epil, align 4
  %arrayidx1.epil = getelementptr inbounds i32, i32* %c, i32 %i.09.epil
  %tmp3 = load i32, i32* %arrayidx1.epil, align 4
  %mul.epil = mul nsw i32 %tmp3, %tmp2
  %arrayidx2.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
  store i32 %mul.epil, i32* %arrayidx2.epil, align 4
  %inc.epil = add nuw nsw i32 %i.09.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader.new
  %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %i.09
  %tmp4 = load i32, i32* %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, i32* %c, i32 %i.09
  %tmp5 = load i32, i32* %arrayidx1, align 4
  %mul = mul nsw i32 %tmp5, %tmp4
  %arrayidx2 = getelementptr inbounds i32, i32* %a, i32 %i.09
  store i32 %mul, i32* %arrayidx2, align 4
  %inc = or i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds i32, i32* %b, i32 %inc
  %tmp6 = load i32, i32* %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds i32, i32* %c, i32 %inc
  %tmp7 = load i32, i32* %arrayidx1.1, align 4
  %mul.1 = mul nsw i32 %tmp7, %tmp6
  %arrayidx2.1 = getelementptr inbounds i32, i32* %a, i32 %inc
  store i32 %mul.1, i32* %arrayidx2.1, align 4
  %inc.1 = or i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
  %tmp8 = load i32, i32* %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds i32, i32* %c, i32 %inc.1
  %tmp9 = load i32, i32* %arrayidx1.2, align 4
  %mul.2 = mul nsw i32 %tmp9, %tmp8
  %arrayidx2.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
  store i32 %mul.2, i32* %arrayidx2.2, align 4
  %inc.2 = or i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
  %tmp10 = load i32, i32* %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds i32, i32* %c, i32 %inc.2
  %tmp11 = load i32, i32* %arrayidx1.3, align 4
  %mul.3 = mul nsw i32 %tmp11, %tmp10
  %arrayidx2.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
  store i32 %mul.3, i32* %arrayidx2.3, align 4
  %inc.3 = add nuw nsw i32 %i.09, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}