; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 %s -o - | \
; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2

; RUN: llc --mtriple=thumbv7em -mattr=+fp-armv8 -O3 -lsr-preferred-addressing-mode=none %s -o - | \
; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED

; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 %s -o - | \
; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT --check-prefix=CHECK-T2

; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=postindexed %s -o - | \
; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED

; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -lsr-preferred-addressing-mode=preindexed %s -o - | \
; RUN:    FileCheck %s --check-prefixes=CHECK,CHECK-T2

; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED
; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLED

; RUN: llc -mtriple=thumbv8m.main -mattr=+fp-armv8,+dsp -O3 -lsr-complexity-limit=2147483647 %s -o - | \
; RUN:    FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-COMPLEX --check-prefix=CHECK-T2

; Tests to check that post-increment addressing modes are used instead of
; updating base pointers with add instructions.

; TODO: I think we should be able to use post-increment addressing with VLDM
; instructions.
; CHECK-LABEL: test_fma
; CHECK: @ %loop

; CHECK-DEFAULT: vldr s{{.*}}, #8]
; CHECK-DEFAULT: vldr s{{.*}}, #8]
; CHECK-DEFAULT: vldr s{{.*}}, #12]
; CHECK-DEFAULT: vldr s{{.*}}, #12]

; CHECK-COMPLEX: vldr s{{.*}}, #8]
; CHECK-COMPLEX: vldr s{{.*}}, #8]
; CHECK-COMPLEX: vldr s{{.*}}, #12]
; CHECK-COMPLEX: vldr s{{.*}}, #12]

; test_fma: float multiply-accumulate over two arrays, unrolled by two.
;   %a, %b : base pointers of the two float arrays
;   %N     : bound compared (unsigned) against the %i counter to stay in the loop
;   returns: the running sum %fma.2 from the last executed iteration
; Each trip loads a[idx], b[idx], a[idx|1] and b[idx|1], multiplies the pairs
; with fmul and folds them into the accumulator with separate fadd
; instructions. %idx.1 advances by 2 per trip; %i is an independent counter
; that steps by -2 and drives the exit test.
; NOTE(review): %i starts at 0 and is decremented by 2 while carrying both
; nsw and nuw, so a second decrement would wrap as unsigned. Presumably this
; is intentional for the loop shape under test; confirm before changing it,
; since the expected output above depends on this exact IR.
define float @test_fma(ptr %a, ptr %b, i32 %N) {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %idx.1 = phi i32 [ 0, %entry ], [ %idx.next, %loop ]
  %res = phi float [ 0.0, %entry ], [ %fma.2, %loop ]
  %gep.a.1 = getelementptr inbounds float, ptr %a, i32 %idx.1
  %a.1 = load float, ptr %gep.a.1
  %gep.b.1 = getelementptr inbounds float, ptr %b, i32 %idx.1
  %b.1 = load float, ptr %gep.b.1
  %fmul.1 = fmul float %a.1, %b.1
  %fma.1 = fadd float %fmul.1, %res
  %idx.2 = or disjoint i32 %idx.1, 1
  %gep.a.2 = getelementptr inbounds float, ptr %a, i32 %idx.2
  %a.2 = load float, ptr %gep.a.2
  %gep.b.2 = getelementptr inbounds float, ptr %b, i32 %idx.2
  %b.2 = load float, ptr %gep.b.2
  %fmul.2 = fmul float %a.2, %b.2
  %fma.2 = fadd float %fmul.2, %fma.1
  %i.next = add nsw nuw i32 %i, -2
  %idx.next = add nsw nuw i32 %idx.1, 2
  %cmp = icmp ult i32 %i.next, %N
  br i1 %cmp, label %loop, label %exit

exit:
  ret float %fma.2
}

; CHECK-LABEL: convolve_16bit
; TODO: Both arrays should use indexing
; CHECK-DEFAULT: ldr{{.*}}, #8]!
; CHECK-DEFAULT-NOT: ldr{{.*}}]!

; CHECK-COMPLEX: ldr{{.*}}, #8]!
; CHECK-COMPLEX-NOT: ldr{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!
; convolve_16bit: 2-D convolution over arrays of row pointers.
;   %input_image : array of pointers to i16 rows
;   %filter      : array of pointers to i16 rows
;   %filter_dim  : bound of the filter_y loop; the filter_x trip counter starts
;                  at filter_dim - (filter_dim & 3)
;   %out_width, %out_height : bounds of the res_x / res_y loops
;   %convolved   : array of pointers to i32 output rows
; For every (res_y, res_x) the innermost loop is unrolled by four and sums
; sext(filter[fy][fx]) * sext(input_image[fy + res_y][fx + res_x]); the total
; is stored to convolved[res_y][res_x]. There is no remainder loop in this IR:
; the unr-lcssa block goes straight to the filter_y increment.
define void @convolve_16bit(ptr nocapture readonly %input_image, ptr nocapture readonly %filter,
                            i32 %filter_dim, i32 %out_width, i32 %out_height,
                            ptr nocapture readonly %convolved) {
entry:
  %cmp92 = icmp eq i32 %out_height, 0
  br i1 %cmp92, label %for.cond.cleanup, label %for.cond1.preheader.lr.ph

for.cond1.preheader.lr.ph:                        ; preds = %entry
  %xtraiter = and i32 %filter_dim, 3
  %unroll_iter = sub i32 %filter_dim, %xtraiter
  br label %for.cond1.preheader

; Outer loop over output rows: fetch the output row pointer for res_y.
for.cond1.preheader:                              ; preds = %for.cond.cleanup3, %for.cond1.preheader.lr.ph
  %res_y.093 = phi i32 [ 0, %for.cond1.preheader.lr.ph ], [ %add28, %for.cond.cleanup3 ]
  %arrayidx22 = getelementptr inbounds ptr, ptr %convolved, i32 %res_y.093
  %tmp3 = load ptr, ptr %arrayidx22, align 4
  br label %for.cond9.preheader.us.us.preheader

; Loop over output columns.
for.cond9.preheader.us.us.preheader:              ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us, %for.cond1.preheader
  %res_x.060.us = phi i32 [ %add25.us, %for.cond5.for.cond.cleanup7_crit_edge.us ], [ 0, %for.cond1.preheader ]
  br label %for.cond9.preheader.us.us

; Loop over filter rows: fetch the filter row and the matching image row.
for.cond9.preheader.us.us:                        ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, %for.cond9.preheader.us.us.preheader
  %filter_y.056.us.us = phi i32 [ %inc20.us.us, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  %result_element.055.us.us = phi i32 [ %add18.us.us.3, %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa ], [ 0, %for.cond9.preheader.us.us.preheader ]
  %add.us.us = add i32 %filter_y.056.us.us, %res_y.093
  %arrayidx.us.us = getelementptr inbounds ptr, ptr %filter, i32 %filter_y.056.us.us
  %tmp5 = load ptr, ptr %arrayidx.us.us, align 4
  %arrayidx15.us.us = getelementptr inbounds ptr, ptr %input_image, i32 %add.us.us
  %tmp6 = load ptr, ptr %arrayidx15.us.us, align 4
  br label %for.body12.us.us

; Innermost loop, unrolled by four: one i16 x i16 multiply-accumulate per lane.
for.body12.us.us:                                 ; preds = %for.body12.us.us, %for.cond9.preheader.us.us
  %filter_x.053.us.us = phi i32 [ %inc.us.us.3, %for.body12.us.us ], [ 0, %for.cond9.preheader.us.us ]
  %result_element.152.us.us = phi i32 [ %add18.us.us.3, %for.body12.us.us ], [ %result_element.055.us.us, %for.cond9.preheader.us.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body12.us.us ], [ %unroll_iter, %for.cond9.preheader.us.us ]
  %add13.us.us = add i32 %filter_x.053.us.us, %res_x.060.us
  %arrayidx14.us.us = getelementptr inbounds i16, ptr %tmp5, i32 %filter_x.053.us.us
  %tmp9 = load i16, ptr %arrayidx14.us.us, align 2
  %conv.us.us = sext i16 %tmp9 to i32
  %arrayidx16.us.us = getelementptr inbounds i16, ptr %tmp6, i32 %add13.us.us
  %tmp10 = load i16, ptr %arrayidx16.us.us, align 2
  %conv17.us.us = sext i16 %tmp10 to i32
  %mul.us.us = mul nsw i32 %conv17.us.us, %conv.us.us
  %add18.us.us = add nsw i32 %mul.us.us, %result_element.152.us.us
  %inc.us.us = or disjoint i32 %filter_x.053.us.us, 1
  %add13.us.us.1 = add i32 %inc.us.us, %res_x.060.us
  %arrayidx14.us.us.1 = getelementptr inbounds i16, ptr %tmp5, i32 %inc.us.us
  %tmp11 = load i16, ptr %arrayidx14.us.us.1, align 2
  %conv.us.us.1 = sext i16 %tmp11 to i32
  %arrayidx16.us.us.1 = getelementptr inbounds i16, ptr %tmp6, i32 %add13.us.us.1
  %tmp12 = load i16, ptr %arrayidx16.us.us.1, align 2
  %conv17.us.us.1 = sext i16 %tmp12 to i32
  %mul.us.us.1 = mul nsw i32 %conv17.us.us.1, %conv.us.us.1
  %add18.us.us.1 = add nsw i32 %mul.us.us.1, %add18.us.us
  %inc.us.us.1 = or disjoint i32 %filter_x.053.us.us, 2
  %add13.us.us.2 = add i32 %inc.us.us.1, %res_x.060.us
  %arrayidx14.us.us.2 = getelementptr inbounds i16, ptr %tmp5, i32 %inc.us.us.1
  %tmp13 = load i16, ptr %arrayidx14.us.us.2, align 2
  %conv.us.us.2 = sext i16 %tmp13 to i32
  %arrayidx16.us.us.2 = getelementptr inbounds i16, ptr %tmp6, i32 %add13.us.us.2
  %tmp14 = load i16, ptr %arrayidx16.us.us.2, align 2
  %conv17.us.us.2 = sext i16 %tmp14 to i32
  %mul.us.us.2 = mul nsw i32 %conv17.us.us.2, %conv.us.us.2
  %add18.us.us.2 = add nsw i32 %mul.us.us.2, %add18.us.us.1
  %inc.us.us.2 = or disjoint i32 %filter_x.053.us.us, 3
  %add13.us.us.3 = add i32 %inc.us.us.2, %res_x.060.us
  %arrayidx14.us.us.3 = getelementptr inbounds i16, ptr %tmp5, i32 %inc.us.us.2
  %tmp15 = load i16, ptr %arrayidx14.us.us.3, align 2
  %conv.us.us.3 = sext i16 %tmp15 to i32
  %arrayidx16.us.us.3 = getelementptr inbounds i16, ptr %tmp6, i32 %add13.us.us.3
  %tmp16 = load i16, ptr %arrayidx16.us.us.3, align 2
  %conv17.us.us.3 = sext i16 %tmp16 to i32
  %mul.us.us.3 = mul nsw i32 %conv17.us.us.3, %conv.us.us.3
  %add18.us.us.3 = add nsw i32 %mul.us.us.3, %add18.us.us.2
  %inc.us.us.3 = add i32 %filter_x.053.us.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa, label %for.body12.us.us

for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa: ; preds = %for.body12.us.us
  %inc20.us.us = add nuw i32 %filter_y.056.us.us, 1
  %exitcond98 = icmp eq i32 %inc20.us.us, %filter_dim
  br i1 %exitcond98, label %for.cond5.for.cond.cleanup7_crit_edge.us, label %for.cond9.preheader.us.us

; Write the accumulated value for this output element.
for.cond5.for.cond.cleanup7_crit_edge.us:         ; preds = %for.cond9.for.cond.cleanup11_crit_edge.us.us.unr-lcssa
  %arrayidx23.us = getelementptr inbounds i32, ptr %tmp3, i32 %res_x.060.us
  store i32 %add18.us.us.3, ptr %arrayidx23.us, align 4
  %add25.us = add nuw i32 %res_x.060.us, 1
  %exitcond99 = icmp eq i32 %add25.us, %out_width
  br i1 %exitcond99, label %for.cond.cleanup3, label %for.cond9.preheader.us.us.preheader

for.cond.cleanup3:                                ; preds = %for.cond5.for.cond.cleanup7_crit_edge.us
  %add28 = add nuw i32 %res_y.093, 1
  %exitcond100 = icmp eq i32 %add28, %out_height
  br i1 %exitcond100, label %for.cond.cleanup, label %for.cond1.preheader

for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
  ret void
}

; CHECK-LABEL: mul_8x8
; CHECK: @ %for.body

; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-DEFAULT: ldrb{{.*}}, #4]!
; CHECK-DEFAULT: ldrb{{.*}}, #4]!

; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldrb{{.*}}, #4]!
; CHECK-COMPLEX: ldrb{{.*}}, #4]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: str{{.*}}, #4]!

; mul_8x8: element-wise product C[i] = zext(A[i]) * zext(B[i]) for i in [0, N).
;   %A, %B : i8 input arrays (read only)
;   %C     : i32 output array
;   %N     : element count; N == 0 returns immediately
; %for.body is the main loop unrolled by four and is entered only when
; N - 1 >= 3; %for.body.epil processes the remaining N & 3 elements one at a
; time.
define void @mul_8x8(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N) {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new:                           ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

; Remainder loop: one element per trip.
for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i8, ptr %A, i32 %i.010.epil
  %tmp2 = load i8, ptr %arrayidx.epil, align 1
  %conv.epil = zext i8 %tmp2 to i32
  %arrayidx1.epil = getelementptr inbounds i8, ptr %B, i32 %i.010.epil
  %tmp3 = load i8, ptr %arrayidx1.epil, align 1
  %conv2.epil = zext i8 %tmp3 to i32
  %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
  %arrayidx3.epil = getelementptr inbounds i32, ptr %C, i32 %i.010.epil
  store i32 %mul.epil, ptr %arrayidx3.epil, align 4
  %inc.epil = add nuw i32 %i.010.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

; Main loop: four elements per trip at indices i, i|1, i|2, i|3.
for.body:                                         ; preds = %for.body, %for.body.preheader.new
  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i8, ptr %A, i32 %i.010
  %tmp4 = load i8, ptr %arrayidx, align 1
  %conv = zext i8 %tmp4 to i32
  %arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.010
  %tmp5 = load i8, ptr %arrayidx1, align 1
  %conv2 = zext i8 %tmp5 to i32
  %mul = mul nuw nsw i32 %conv2, %conv
  %arrayidx3 = getelementptr inbounds i32, ptr %C, i32 %i.010
  store i32 %mul, ptr %arrayidx3, align 4
  %inc = or disjoint i32 %i.010, 1
  %arrayidx.1 = getelementptr inbounds i8, ptr %A, i32 %inc
  %tmp6 = load i8, ptr %arrayidx.1, align 1
  %conv.1 = zext i8 %tmp6 to i32
  %arrayidx1.1 = getelementptr inbounds i8, ptr %B, i32 %inc
  %tmp7 = load i8, ptr %arrayidx1.1, align 1
  %conv2.1 = zext i8 %tmp7 to i32
  %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i32, ptr %C, i32 %inc
  store i32 %mul.1, ptr %arrayidx3.1, align 4
  %inc.1 = or disjoint i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds i8, ptr %A, i32 %inc.1
  %tmp8 = load i8, ptr %arrayidx.2, align 1
  %conv.2 = zext i8 %tmp8 to i32
  %arrayidx1.2 = getelementptr inbounds i8, ptr %B, i32 %inc.1
  %tmp9 = load i8, ptr %arrayidx1.2, align 1
  %conv2.2 = zext i8 %tmp9 to i32
  %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
  %arrayidx3.2 = getelementptr inbounds i32, ptr %C, i32 %inc.1
  store i32 %mul.2, ptr %arrayidx3.2, align 4
  %inc.2 = or disjoint i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds i8, ptr %A, i32 %inc.2
  %tmp10 = load i8, ptr %arrayidx.3, align 1
  %conv.3 = zext i8 %tmp10 to i32
  %arrayidx1.3 = getelementptr inbounds i8, ptr %B, i32 %inc.2
  %tmp11 = load i8, ptr %arrayidx1.3, align 1
  %conv2.3 = zext i8 %tmp11 to i32
  %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
  %arrayidx3.3 = getelementptr inbounds i32, ptr %C, i32 %inc.2
  store i32 %mul.3, ptr %arrayidx3.3, align 4
  %inc.3 = add i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; CHECK-LABEL: mul_16x8
; CHECK: @ %for.body

; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-DEFAULT: ldrsh{{.*}}, #8]!

; CHECK-COMPLEX: ldrsh{{.*}}, #8]!
; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-COMPLEX: ldrb{{.*}}, #4]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: str{{.*}}, #4]!
; mul_16x8: element-wise product C[i] = sext(A[i]) * zext(B[i]) for i in [0, N).
;   %A : i16 input array (read only), sign-extended
;   %B : i8 input array (read only), zero-extended
;   %C : i32 output array
;   %N : element count; N == 0 returns immediately
; %for.body is the main loop unrolled by four and is entered only when
; N - 1 >= 3; %for.body.epil processes the remaining N & 3 elements.
define void @mul_16x8(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N) {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new:                           ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

; Remainder loop: one element per trip.
for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i16, ptr %A, i32 %i.010.epil
  %tmp2 = load i16, ptr %arrayidx.epil, align 2
  %conv.epil = sext i16 %tmp2 to i32
  %arrayidx1.epil = getelementptr inbounds i8, ptr %B, i32 %i.010.epil
  %tmp3 = load i8, ptr %arrayidx1.epil, align 1
  %conv2.epil = zext i8 %tmp3 to i32
  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
  %arrayidx3.epil = getelementptr inbounds i32, ptr %C, i32 %i.010.epil
  store i32 %mul.epil, ptr %arrayidx3.epil, align 4
  %inc.epil = add nuw i32 %i.010.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

; Main loop: four elements per trip at indices i, i|1, i|2, i|3.
for.body:                                         ; preds = %for.body, %for.body.preheader.new
  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i16, ptr %A, i32 %i.010
  %tmp4 = load i16, ptr %arrayidx, align 2
  %conv = sext i16 %tmp4 to i32
  %arrayidx1 = getelementptr inbounds i8, ptr %B, i32 %i.010
  %tmp5 = load i8, ptr %arrayidx1, align 1
  %conv2 = zext i8 %tmp5 to i32
  %mul = mul nsw i32 %conv2, %conv
  %arrayidx3 = getelementptr inbounds i32, ptr %C, i32 %i.010
  store i32 %mul, ptr %arrayidx3, align 4
  %inc = or disjoint i32 %i.010, 1
  %arrayidx.1 = getelementptr inbounds i16, ptr %A, i32 %inc
  %tmp6 = load i16, ptr %arrayidx.1, align 2
  %conv.1 = sext i16 %tmp6 to i32
  %arrayidx1.1 = getelementptr inbounds i8, ptr %B, i32 %inc
  %tmp7 = load i8, ptr %arrayidx1.1, align 1
  %conv2.1 = zext i8 %tmp7 to i32
  %mul.1 = mul nsw i32 %conv2.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i32, ptr %C, i32 %inc
  store i32 %mul.1, ptr %arrayidx3.1, align 4
  %inc.1 = or disjoint i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds i16, ptr %A, i32 %inc.1
  %tmp8 = load i16, ptr %arrayidx.2, align 2
  %conv.2 = sext i16 %tmp8 to i32
  %arrayidx1.2 = getelementptr inbounds i8, ptr %B, i32 %inc.1
  %tmp9 = load i8, ptr %arrayidx1.2, align 1
  %conv2.2 = zext i8 %tmp9 to i32
  %mul.2 = mul nsw i32 %conv2.2, %conv.2
  %arrayidx3.2 = getelementptr inbounds i32, ptr %C, i32 %inc.1
  store i32 %mul.2, ptr %arrayidx3.2, align 4
  %inc.2 = or disjoint i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds i16, ptr %A, i32 %inc.2
  %tmp10 = load i16, ptr %arrayidx.3, align 2
  %conv.3 = sext i16 %tmp10 to i32
  %arrayidx1.3 = getelementptr inbounds i8, ptr %B, i32 %inc.2
  %tmp11 = load i8, ptr %arrayidx1.3, align 1
  %conv2.3 = zext i8 %tmp11 to i32
  %mul.3 = mul nsw i32 %conv2.3, %conv.3
  %arrayidx3.3 = getelementptr inbounds i32, ptr %C, i32 %inc.2
  store i32 %mul.3, ptr %arrayidx3.3, align 4
  %inc.3 = add i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; CHECK-LABEL: mul_16x16
; CHECK: @ %for.body

; TODO: pre-indexed loads
; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!
; CHECK-DEFAULT: str{{.*}}, #16]!
; CHECK-DEFAULT-NOT: ldrsh{{.*}}]!

; CHECK-COMPLEX: ldrsh{{.*}}]!
; CHECK-COMPLEX: ldrsh{{.*}}]!
; CHECK-COMPLEX: str{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: str{{.*}}, #4]!

; mul_16x16: element-wise product C[i] = sext(A[i]) * sext(B[i]) for i in [0, N).
;   %A, %B : i16 input arrays (read only), both sign-extended
;   %C     : i32 output array
;   %N     : element count; N == 0 returns immediately
; %for.body is the main loop unrolled by four and is entered only when
; N - 1 >= 3; %for.body.epil processes the remaining N & 3 elements.
define void @mul_16x16(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N) {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new:                           ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

; Remainder loop: one element per trip.
for.body.epil:                                    ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.010.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i16, ptr %A, i32 %i.010.epil
  %tmp2 = load i16, ptr %arrayidx.epil, align 2
  %conv.epil = sext i16 %tmp2 to i32
  %arrayidx1.epil = getelementptr inbounds i16, ptr %B, i32 %i.010.epil
  %tmp3 = load i16, ptr %arrayidx1.epil, align 2
  %conv2.epil = sext i16 %tmp3 to i32
  %mul.epil = mul nsw i32 %conv2.epil, %conv.epil
  %arrayidx3.epil = getelementptr inbounds i32, ptr %C, i32 %i.010.epil
  store i32 %mul.epil, ptr %arrayidx3.epil, align 4
  %inc.epil = add nuw i32 %i.010.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup:                                 ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

; Main loop: four elements per trip at indices i, i|1, i|2, i|3.
for.body:                                         ; preds = %for.body, %for.body.preheader.new
  %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i16, ptr %A, i32 %i.010
  %tmp4 = load i16, ptr %arrayidx, align 2
  %conv = sext i16 %tmp4 to i32
  %arrayidx1 = getelementptr inbounds i16, ptr %B, i32 %i.010
  %tmp5 = load i16, ptr %arrayidx1, align 2
  %conv2 = sext i16 %tmp5 to i32
  %mul = mul nsw i32 %conv2, %conv
  %arrayidx3 = getelementptr inbounds i32, ptr %C, i32 %i.010
  store i32 %mul, ptr %arrayidx3, align 4
  %inc = or disjoint i32 %i.010, 1
  %arrayidx.1 = getelementptr inbounds i16, ptr %A, i32 %inc
  %tmp6 = load i16, ptr %arrayidx.1, align 2
  %conv.1 = sext i16 %tmp6 to i32
  %arrayidx1.1 = getelementptr inbounds i16, ptr %B, i32 %inc
  %tmp7 = load i16, ptr %arrayidx1.1, align 2
  %conv2.1 = sext i16 %tmp7 to i32
  %mul.1 = mul nsw i32 %conv2.1, %conv.1
  %arrayidx3.1 = getelementptr inbounds i32, ptr %C, i32 %inc
  store i32 %mul.1, ptr %arrayidx3.1, align 4
  %inc.1 = or disjoint i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds i16, ptr %A, i32 %inc.1
  %tmp8 = load i16, ptr %arrayidx.2, align 2
  %conv.2 = sext i16 %tmp8 to i32
  %arrayidx1.2 = getelementptr inbounds i16, ptr %B, i32 %inc.1
  %tmp9 = load i16, ptr %arrayidx1.2, align 2
  %conv2.2 = sext i16 %tmp9 to i32
  %mul.2 = mul nsw i32 %conv2.2, %conv.2
  %arrayidx3.2 = getelementptr inbounds i32, ptr %C, i32 %inc.1
  store i32 %mul.2, ptr %arrayidx3.2, align 4
  %inc.2 = or disjoint i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds i16, ptr %A, i32 %inc.2
  %tmp10 = load i16, ptr %arrayidx.3, align 2
  %conv.3 = sext i16 %tmp10 to i32
  %arrayidx1.3 = getelementptr inbounds i16, ptr %B, i32 %inc.2
  %tmp11 = load i16, ptr %arrayidx1.3, align 2
  %conv2.3 = sext i16 %tmp11 to i32
  %mul.3 = mul nsw i32 %conv2.3, %conv.3
  %arrayidx3.3 = getelementptr inbounds i32, ptr %C, i32 %inc.2
  store i32 %mul.3, ptr %arrayidx3.3, align 4
  %inc.3 = add i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; CHECK-LABEL: mul_8x8_2d
; CHECK: @ %for.body4.us

; CHECK-DEFAULT: ldr{{.*}}, #16]!
; CHECK-DEFAULT: ldrb{{.*}}, #4]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrb{{.*}}, #1]!
; CHECK-T2: ldr{{.*}}, #4]!
; mul_8x8_2d: C[i][j] += zext(A[i]) * zext(B[i][j]) for i in [0, N), j in [0, M).
;   %A : i8 array indexed by i (read only)
;   %B : array of pointers to i8 rows (read only)
;   %C : array of pointers to i32 rows; the rows are read and written
;   %N, %M : outer and inner trip counts; returns immediately if either is 0
; A[i] is reloaded from memory before every multiply inside the inner loops.
; %for.body4.us is the inner loop unrolled by four (skipped when M - 1 < 3);
; %for.body4.us.epil handles the remaining M & 3 columns.
define void @mul_8x8_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture readonly %C, i32 %N, i32 %M) {
entry:
  %cmp24 = icmp eq i32 %N, 0
  %cmp222 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp24, %cmp222
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader:                 ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

; Outer loop: compute &A[i] and fetch the row pointers B[i] and C[i].
for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, ptr %A, i32 %i.025.us
  %arrayidx5.us = getelementptr inbounds ptr, ptr %B, i32 %i.025.us
  %arrayidx8.us = getelementptr inbounds ptr, ptr %C, i32 %i.025.us
  %.pre = load ptr, ptr %arrayidx5.us, align 4
  %.pre30 = load ptr, ptr %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

; Inner loop, unrolled by four: columns j, j|1, j|2, j|3.
for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %tmp2 = load i8, ptr %arrayidx.us, align 1
  %conv.us = zext i8 %tmp2 to i32
  %arrayidx6.us = getelementptr inbounds i8, ptr %.pre, i32 %j.023.us
  %tmp3 = load i8, ptr %arrayidx6.us, align 1
  %conv7.us = zext i8 %tmp3 to i32
  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
  %arrayidx9.us = getelementptr inbounds i32, ptr %.pre30, i32 %j.023.us
  %tmp4 = load i32, ptr %arrayidx9.us, align 4
  %add.us = add nsw i32 %tmp4, %mul.us
  store i32 %add.us, ptr %arrayidx9.us, align 4
  %inc.us = or disjoint i32 %j.023.us, 1
  %tmp5 = load i8, ptr %arrayidx.us, align 1
  %conv.us.1 = zext i8 %tmp5 to i32
  %arrayidx6.us.1 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us
  %tmp6 = load i8, ptr %arrayidx6.us.1, align 1
  %conv7.us.1 = zext i8 %tmp6 to i32
  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
  %arrayidx9.us.1 = getelementptr inbounds i32, ptr %.pre30, i32 %inc.us
  %tmp7 = load i32, ptr %arrayidx9.us.1, align 4
  %add.us.1 = add nsw i32 %tmp7, %mul.us.1
  store i32 %add.us.1, ptr %arrayidx9.us.1, align 4
  %inc.us.1 = or disjoint i32 %j.023.us, 2
  %tmp8 = load i8, ptr %arrayidx.us, align 1
  %conv.us.2 = zext i8 %tmp8 to i32
  %arrayidx6.us.2 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us.1
  %tmp9 = load i8, ptr %arrayidx6.us.2, align 1
  %conv7.us.2 = zext i8 %tmp9 to i32
  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
  %arrayidx9.us.2 = getelementptr inbounds i32, ptr %.pre30, i32 %inc.us.1
  %tmp10 = load i32, ptr %arrayidx9.us.2, align 4
  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
  store i32 %add.us.2, ptr %arrayidx9.us.2, align 4
  %inc.us.2 = or disjoint i32 %j.023.us, 3
  %tmp11 = load i8, ptr %arrayidx.us, align 1
  %conv.us.3 = zext i8 %tmp11 to i32
  %arrayidx6.us.3 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us.2
  %tmp12 = load i8, ptr %arrayidx6.us.3, align 1
  %conv7.us.3 = zext i8 %tmp12 to i32
  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
  %arrayidx9.us.3 = getelementptr inbounds i32, ptr %.pre30, i32 %inc.us.2
  %tmp13 = load i32, ptr %arrayidx9.us.3, align 4
  %add.us.3 = add nsw i32 %tmp13, %mul.us.3
  store i32 %add.us.3, ptr %arrayidx9.us.3, align 4
  %inc.us.3 = add i32 %j.023.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

; Remainder loop: one column per trip.
for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %tmp14 = load i8, ptr %arrayidx.us, align 1
  %conv.us.epil = zext i8 %tmp14 to i32
  %arrayidx6.us.epil = getelementptr inbounds i8, ptr %.pre, i32 %j.023.us.epil
  %tmp15 = load i8, ptr %arrayidx6.us.epil, align 1
  %conv7.us.epil = zext i8 %tmp15 to i32
  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
  %arrayidx9.us.epil = getelementptr inbounds i32, ptr %.pre30, i32 %j.023.us.epil
  %tmp16 = load i32, ptr %arrayidx9.us.epil, align 4
  %add.us.epil = add nsw i32 %tmp16, %mul.us.epil
  store i32 %add.us.epil, ptr %arrayidx9.us.epil, align 4
  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %inc11.us = add nuw i32 %i.025.us, 1
  %exitcond28 = icmp eq i32 %inc11.us, %N
  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mul_16x16_2d
; CHECK: @ %for.body4.us

; CHECK-DEFAULT: ldr{{.*}}, #16]!
; CHECK-DEFAULT: ldrsh{{.*}}, #8]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrsh{{.*}}, #2]!
; CHECK-T2: ldr{{.*}}, #4]!

; mul_16x16_2d: C[i][j] += sext(A[i]) * sext(B[i][j]) for i in [0, N), j in [0, M).
;   %A : i16 array indexed by i (read only)
;   %B : array of pointers to i16 rows (read only)
;   %C : array of pointers to i32 rows; the rows are read and written
;   %N, %M : outer and inner trip counts; returns immediately if either is 0
; A[i] is loaded and sign-extended once per outer iteration (%conv.us) and
; reused by every multiply of the inner loops.
; %for.body4.us is the inner loop unrolled by four (skipped when M - 1 < 3);
; %for.body4.us.epil handles the remaining M & 3 columns.
define void @mul_16x16_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture readonly %C, i32 %N, i32 %M) {
entry:
  %cmp24 = icmp eq i32 %N, 0
  %cmp222 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp24, %cmp222
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader:                 ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

; Outer loop: load A[i] and fetch the row pointers B[i] and C[i].
for.cond1.preheader.us:                           ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.025.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i16, ptr %A, i32 %i.025.us
  %tmp2 = load i16, ptr %arrayidx.us, align 2
  %conv.us = sext i16 %tmp2 to i32
  %arrayidx5.us = getelementptr inbounds ptr, ptr %B, i32 %i.025.us
  %tmp3 = load ptr, ptr %arrayidx5.us, align 4
  %arrayidx8.us = getelementptr inbounds ptr, ptr %C, i32 %i.025.us
  %tmp4 = load ptr, ptr %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

; Inner loop, unrolled by four: columns j, j|1, j|2, j|3.
for.body4.us:                                     ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %arrayidx6.us = getelementptr inbounds i16, ptr %tmp3, i32 %j.023.us
  %tmp5 = load i16, ptr %arrayidx6.us, align 2
  %conv7.us = sext i16 %tmp5 to i32
  %mul.us = mul nsw i32 %conv7.us, %conv.us
  %arrayidx9.us = getelementptr inbounds i32, ptr %tmp4, i32 %j.023.us
  %tmp6 = load i32, ptr %arrayidx9.us, align 4
  %add.us = add nsw i32 %tmp6, %mul.us
  store i32 %add.us, ptr %arrayidx9.us, align 4
  %inc.us = or disjoint i32 %j.023.us, 1
  %arrayidx6.us.1 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us
  %tmp7 = load i16, ptr %arrayidx6.us.1, align 2
  %conv7.us.1 = sext i16 %tmp7 to i32
  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
  %arrayidx9.us.1 = getelementptr inbounds i32, ptr %tmp4, i32 %inc.us
  %tmp8 = load i32, ptr %arrayidx9.us.1, align 4
  %add.us.1 = add nsw i32 %tmp8, %mul.us.1
  store i32 %add.us.1, ptr %arrayidx9.us.1, align 4
  %inc.us.1 = or disjoint i32 %j.023.us, 2
  %arrayidx6.us.2 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us.1
  %tmp9 = load i16, ptr %arrayidx6.us.2, align 2
  %conv7.us.2 = sext i16 %tmp9 to i32
  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
  %arrayidx9.us.2 = getelementptr inbounds i32, ptr %tmp4, i32 %inc.us.1
  %tmp10 = load i32, ptr %arrayidx9.us.2, align 4
  %add.us.2 = add nsw i32 %tmp10, %mul.us.2
  store i32 %add.us.2, ptr %arrayidx9.us.2, align 4
  %inc.us.2 = or disjoint i32 %j.023.us, 3
  %arrayidx6.us.3 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us.2
  %tmp11 = load i16, ptr %arrayidx6.us.3, align 2
  %conv7.us.3 = sext i16 %tmp11 to i32
  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
  %arrayidx9.us.3 = getelementptr inbounds i32, ptr %tmp4, i32 %inc.us.2
  %tmp12 = load i32, ptr %arrayidx9.us.3, align 4
  %add.us.3 = add nsw i32 %tmp12, %mul.us.3
  store i32 %add.us.3, ptr %arrayidx9.us.3, align 4
  %inc.us.3 = add i32 %j.023.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %j.023.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

; Remainder loop: one column per trip.
for.body4.us.epil:                                ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %j.023.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.023.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %arrayidx6.us.epil = getelementptr inbounds i16, ptr %tmp3, i32 %j.023.us.epil
  %tmp13 = load i16, ptr %arrayidx6.us.epil, align 2
  %conv7.us.epil = sext i16 %tmp13 to i32
  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
  %arrayidx9.us.epil = getelementptr inbounds i32, ptr %tmp4, i32 %j.023.us.epil
  %tmp14 = load i32, ptr %arrayidx9.us.epil, align 4
  %add.us.epil = add nsw i32 %tmp14, %mul.us.epil
  store i32 %add.us.epil, ptr %arrayidx9.us.epil, align 4
  %inc.us.epil = add nuw i32 %j.023.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %inc11.us = add nuw i32 %i.025.us, 1
  %exitcond28 = icmp eq i32 %inc11.us, %N
  br i1 %exitcond28, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mac_8x8_2d
; CHECK: @ %for.body4.us

; TODO: Both input arrays could use pre-indexed loads.
; TODO: pre-indexed stores.
; CHECK-DEFAULT: ldrb{{.*}}, #4]!
; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; CHECK-DEFAULT-NOT: str{{.*}}]!
; TODO: Increased complexity shouldn't prevent indexed accesses.
; CHECK-COMPLEX-NOT: ldr{{.*}}]!
; CHECK-COMPLEX-NOT: str{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!
; DISABLED-NOT: str{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrb{{.*}}, #1]!

; For each i in [0, N): loads the row pointer B[i] and the starting value of
; C[i], then for each j in [0, M) accumulates
;   C[i] += zext(A[i]) * zext(B[i][j])
; A[i] is re-loaded and the running sum is stored back to C[i] on every
; accumulation step. The inner loop is unrolled by four (%for.body4.us); the
; remaining M & 3 elements are handled one at a time in %for.body4.us.epil.
; Returns immediately when N or M is zero.
define void @mac_8x8_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N, i32 %M) {
entry:
  %cmp22 = icmp eq i32 %N, 0
  %cmp220 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp22, %cmp220
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.023.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, ptr %A, i32 %i.023.us
  %arrayidx5.us = getelementptr inbounds ptr, ptr %B, i32 %i.023.us
  %arrayidx8.us = getelementptr inbounds i32, ptr %C, i32 %i.023.us
  %.pre = load ptr, ptr %arrayidx5.us, align 4
  %.pre28 = load i32, ptr %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %tmp2 = phi i32 [ %add.us.3, %for.body4.us ], [ %.pre28, %for.cond1.preheader.us ]
  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %tmp3 = load i8, ptr %arrayidx.us, align 1
  %conv.us = zext i8 %tmp3 to i32
  %arrayidx6.us = getelementptr inbounds i8, ptr %.pre, i32 %j.021.us
  %tmp4 = load i8, ptr %arrayidx6.us, align 1
  %conv7.us = zext i8 %tmp4 to i32
  %mul.us = mul nuw nsw i32 %conv7.us, %conv.us
  %add.us = add nsw i32 %mul.us, %tmp2
  store i32 %add.us, ptr %arrayidx8.us, align 4
  %inc.us = or disjoint i32 %j.021.us, 1
  %tmp5 = load i8, ptr %arrayidx.us, align 1
  %conv.us.1 = zext i8 %tmp5 to i32
  %arrayidx6.us.1 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us
  %tmp6 = load i8, ptr %arrayidx6.us.1, align 1
  %conv7.us.1 = zext i8 %tmp6 to i32
  %mul.us.1 = mul nuw nsw i32 %conv7.us.1, %conv.us.1
  %add.us.1 = add nsw i32 %mul.us.1, %add.us
  store i32 %add.us.1, ptr %arrayidx8.us, align 4
  %inc.us.1 = or disjoint i32 %j.021.us, 2
  %tmp7 = load i8, ptr %arrayidx.us, align 1
  %conv.us.2 = zext i8 %tmp7 to i32
  %arrayidx6.us.2 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us.1
  %tmp8 = load i8, ptr %arrayidx6.us.2, align 1
  %conv7.us.2 = zext i8 %tmp8 to i32
  %mul.us.2 = mul nuw nsw i32 %conv7.us.2, %conv.us.2
  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
  store i32 %add.us.2, ptr %arrayidx8.us, align 4
  %inc.us.2 = or disjoint i32 %j.021.us, 3
  %tmp9 = load i8, ptr %arrayidx.us, align 1
  %conv.us.3 = zext i8 %tmp9 to i32
  %arrayidx6.us.3 = getelementptr inbounds i8, ptr %.pre, i32 %inc.us.2
  %tmp10 = load i8, ptr %arrayidx6.us.3, align 1
  %conv7.us.3 = zext i8 %tmp10 to i32
  %mul.us.3 = mul nuw nsw i32 %conv7.us.3, %conv.us.3
  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
  store i32 %add.us.3, ptr %arrayidx8.us, align 4
  %inc.us.3 = add i32 %j.021.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %.unr = phi i32 [ %.pre28, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %tmp11 = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %tmp12 = load i8, ptr %arrayidx.us, align 1
  %conv.us.epil = zext i8 %tmp12 to i32
  %arrayidx6.us.epil = getelementptr inbounds i8, ptr %.pre, i32 %j.021.us.epil
  %tmp13 = load i8, ptr %arrayidx6.us.epil, align 1
  %conv7.us.epil = zext i8 %tmp13 to i32
  %mul.us.epil = mul nuw nsw i32 %conv7.us.epil, %conv.us.epil
  %add.us.epil = add nsw i32 %mul.us.epil, %tmp11
  store i32 %add.us.epil, ptr %arrayidx8.us, align 4
  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %inc10.us = add nuw i32 %i.023.us, 1
  %exitcond26 = icmp eq i32 %inc10.us, %N
  br i1 %exitcond26, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mac_16x16_2d
; CHECK: @ %for.body4.us

; TODO: pre-indexed loads for both input arrays.
; CHECK-DEFAULT: ldrsh{{.*}}, #8]!
; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; TODO: increased complexity should lead to better codegen.
; CHECK-COMPLEX-NOT: ldr{{.*}}]!

; DISABLED-NOT: ldr{{.*}}]!

; CHECK-T2: @ %for.body4.us.epil
; CHECK-T2: ldrsh{{.*}}, #2]!

; For each i in [0, N): loads A[i] (i16, sign-extended), the row pointer B[i]
; and the starting value of C[i], then for each j in [0, M) accumulates
;   C[i] += sext(B[i][j]) * sext(A[i])
; The sum is carried in an SSA value and stored to C[i] once per outer
; iteration, in %for.cond1.for.cond.cleanup3_crit_edge.us.
; The inner loop is unrolled by four (%for.body4.us); the remaining M & 3
; elements are handled one at a time in %for.body4.us.epil.
; Returns immediately when N or M is zero.
define void @mac_16x16_2d(ptr nocapture readonly %A, ptr nocapture readonly %B, ptr nocapture %C, i32 %N, i32 %M) {
entry:
  %cmp23 = icmp eq i32 %N, 0
  %cmp220 = icmp eq i32 %M, 0
  %or.cond = or i1 %cmp23, %cmp220
  br i1 %or.cond, label %for.cond.cleanup, label %for.cond1.preheader.us.preheader

for.cond1.preheader.us.preheader: ; preds = %entry
  %tmp = add i32 %M, -1
  %xtraiter = and i32 %M, 3
  %tmp1 = icmp ult i32 %tmp, 3
  %unroll_iter = sub i32 %M, %xtraiter
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br label %for.cond1.preheader.us

for.cond1.preheader.us: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.us.preheader
  %i.024.us = phi i32 [ %inc10.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %arrayidx.us = getelementptr inbounds i16, ptr %A, i32 %i.024.us
  %tmp2 = load i16, ptr %arrayidx.us, align 2
  %conv.us = sext i16 %tmp2 to i32
  %arrayidx5.us = getelementptr inbounds ptr, ptr %B, i32 %i.024.us
  %tmp3 = load ptr, ptr %arrayidx5.us, align 4
  %arrayidx8.us = getelementptr inbounds i32, ptr %C, i32 %i.024.us
  %arrayidx8.promoted.us = load i32, ptr %arrayidx8.us, align 4
  br i1 %tmp1, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
  %add22.us = phi i32 [ %add.us.3, %for.body4.us ], [ %arrayidx8.promoted.us, %for.cond1.preheader.us ]
  %j.021.us = phi i32 [ %inc.us.3, %for.body4.us ], [ 0, %for.cond1.preheader.us ]
  %niter = phi i32 [ %niter.nsub.3, %for.body4.us ], [ %unroll_iter, %for.cond1.preheader.us ]
  %arrayidx6.us = getelementptr inbounds i16, ptr %tmp3, i32 %j.021.us
  %tmp4 = load i16, ptr %arrayidx6.us, align 2
  %conv7.us = sext i16 %tmp4 to i32
  %mul.us = mul nsw i32 %conv7.us, %conv.us
  %add.us = add nsw i32 %mul.us, %add22.us
  %inc.us = or disjoint i32 %j.021.us, 1
  %arrayidx6.us.1 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us
  %tmp5 = load i16, ptr %arrayidx6.us.1, align 2
  %conv7.us.1 = sext i16 %tmp5 to i32
  %mul.us.1 = mul nsw i32 %conv7.us.1, %conv.us
  %add.us.1 = add nsw i32 %mul.us.1, %add.us
  %inc.us.1 = or disjoint i32 %j.021.us, 2
  %arrayidx6.us.2 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us.1
  %tmp6 = load i16, ptr %arrayidx6.us.2, align 2
  %conv7.us.2 = sext i16 %tmp6 to i32
  %mul.us.2 = mul nsw i32 %conv7.us.2, %conv.us
  %add.us.2 = add nsw i32 %mul.us.2, %add.us.1
  %inc.us.2 = or disjoint i32 %j.021.us, 3
  %arrayidx6.us.3 = getelementptr inbounds i16, ptr %tmp3, i32 %inc.us.2
  %tmp7 = load i16, ptr %arrayidx6.us.3, align 2
  %conv7.us.3 = sext i16 %tmp7 to i32
  %mul.us.3 = mul nsw i32 %conv7.us.3, %conv.us
  %add.us.3 = add nsw i32 %mul.us.3, %add.us.2
  %inc.us.3 = add i32 %j.021.us, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa: ; preds = %for.body4.us, %for.cond1.preheader.us
  %add.us.lcssa.ph = phi i32 [ undef, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
  %add22.us.unr = phi i32 [ %arrayidx8.promoted.us, %for.cond1.preheader.us ], [ %add.us.3, %for.body4.us ]
  %j.021.us.unr = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us.3, %for.body4.us ]
  br i1 %lcmp.mod, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.body4.us.epil: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %add22.us.epil = phi i32 [ %add.us.epil, %for.body4.us.epil ], [ %add22.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %j.021.us.epil = phi i32 [ %inc.us.epil, %for.body4.us.epil ], [ %j.021.us.unr, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body4.us.epil ], [ %xtraiter, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ]
  %arrayidx6.us.epil = getelementptr inbounds i16, ptr %tmp3, i32 %j.021.us.epil
  %tmp8 = load i16, ptr %arrayidx6.us.epil, align 2
  %conv7.us.epil = sext i16 %tmp8 to i32
  %mul.us.epil = mul nsw i32 %conv7.us.epil, %conv.us
  %add.us.epil = add nsw i32 %mul.us.epil, %add22.us.epil
  %inc.us.epil = add nuw i32 %j.021.us.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.epil

for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us.epil, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa
  %add.us.lcssa = phi i32 [ %add.us.lcssa.ph, %for.cond1.for.cond.cleanup3_crit_edge.us.unr-lcssa ], [ %add.us.epil, %for.body4.us.epil ]
  store i32 %add.us.lcssa, ptr %arrayidx8.us, align 4
  %inc10.us = add nuw i32 %i.024.us, 1
  %exitcond27 = icmp eq i32 %inc10.us, %N
  br i1 %exitcond27, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}

; CHECK-LABEL: mul32x32_backwards
; CHECK: @ %for.body

; TODO: post increments for decreasing addresses
; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; CHECK-DEFAULT-NOT: str{{.*}}]!

; CHECK-COMPLEX-NOT: ldr{{.*}}]!
; CHECK-COMPLEX-NOT: str{{.*}}]!
; Computes a[i] = b[i] * c[i] with i running from N-1 down to 0.
; A prologue loop (%for.body.prol) first handles N & 3 elements one at a time;
; %for.body then handles four elements per iteration at indices i, i-1, i-2
; and i-3. Nothing is stored when N-1 is negative as a signed value.
define void @mul32x32_backwards(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %N) {
entry:
  %i.08 = add i32 %N, -1
  %cmp9 = icmp sgt i32 %i.08, -1
  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.prol, %for.body.preheader
  %i.010.prol = phi i32 [ %i.0.prol, %for.body.prol ], [ %i.08, %for.body.preheader ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader ]
  %arrayidx.prol = getelementptr inbounds i32, ptr %b, i32 %i.010.prol
  %tmp = load i32, ptr %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds i32, ptr %c, i32 %i.010.prol
  %tmp1 = load i32, ptr %arrayidx1.prol, align 4
  %mul.prol = mul nsw i32 %tmp1, %tmp
  %arrayidx2.prol = getelementptr inbounds i32, ptr %a, i32 %i.010.prol
  store i32 %mul.prol, ptr %arrayidx2.prol, align 4
  %i.0.prol = add i32 %i.010.prol, -1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader
  %i.010.unr = phi i32 [ %i.08, %for.body.preheader ], [ %i.0.prol, %for.body.prol ]
  %tmp2 = icmp ult i32 %i.08, 3
  br i1 %tmp2, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %for.body.prol.loopexit, %entry
  ret void

for.body: ; preds = %for.body, %for.body.prol.loopexit
  %i.010 = phi i32 [ %i.0.3, %for.body ], [ %i.010.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds i32, ptr %b, i32 %i.010
  %tmp3 = load i32, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, ptr %c, i32 %i.010
  %tmp4 = load i32, ptr %arrayidx1, align 4
  %mul = mul nsw i32 %tmp4, %tmp3
  %arrayidx2 = getelementptr inbounds i32, ptr %a, i32 %i.010
  store i32 %mul, ptr %arrayidx2, align 4
  %i.0 = add i32 %i.010, -1
  %arrayidx.1 = getelementptr inbounds i32, ptr %b, i32 %i.0
  %tmp5 = load i32, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds i32, ptr %c, i32 %i.0
  %tmp6 = load i32, ptr %arrayidx1.1, align 4
  %mul.1 = mul nsw i32 %tmp6, %tmp5
  %arrayidx2.1 = getelementptr inbounds i32, ptr %a, i32 %i.0
  store i32 %mul.1, ptr %arrayidx2.1, align 4
  %i.0.1 = add i32 %i.010, -2
  %arrayidx.2 = getelementptr inbounds i32, ptr %b, i32 %i.0.1
  %tmp7 = load i32, ptr %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds i32, ptr %c, i32 %i.0.1
  %tmp8 = load i32, ptr %arrayidx1.2, align 4
  %mul.2 = mul nsw i32 %tmp8, %tmp7
  %arrayidx2.2 = getelementptr inbounds i32, ptr %a, i32 %i.0.1
  store i32 %mul.2, ptr %arrayidx2.2, align 4
  %i.0.2 = add i32 %i.010, -3
  %arrayidx.3 = getelementptr inbounds i32, ptr %b, i32 %i.0.2
  %tmp9 = load i32, ptr %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds i32, ptr %c, i32 %i.0.2
  %tmp10 = load i32, ptr %arrayidx1.3, align 4
  %mul.3 = mul nsw i32 %tmp10, %tmp9
  %arrayidx2.3 = getelementptr inbounds i32, ptr %a, i32 %i.0.2
  store i32 %mul.3, ptr %arrayidx2.3, align 4
  %i.0.3 = add i32 %i.010, -4
  %cmp.3 = icmp sgt i32 %i.0.3, -1
  br i1 %cmp.3, label %for.body, label %for.cond.cleanup
}

; CHECK-LABEL: mul32x32_forwards
; CHECK: @ %for.body

; TODO: Would be good if the complexity limit didn't have to be increased to
; enable the pre-indexed accesses.

; CHECK-DEFAULT-NOT: ldr{{.*}}]!
; CHECK-DEFAULT-NOT: str{{.*}}]!

; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: ldr{{.*}}, #16]!
; CHECK-COMPLEX: str{{.*}}, #16]!
; CHECK-T2: @ %for.body.epil
; CHECK-T2: ldr{{.*}}, #4]!
; CHECK-T2: ldr{{.*}}, #4]!
; CHECK-T2: str{{.*}}, #4]!

; Computes a[i] = b[i] * c[i] for i in [0, N).
; %for.body handles four elements per iteration (indices i, i|1, i|2, i|3);
; %for.body.epil handles the remaining N & 3 elements one at a time.
; Returns immediately when N is zero.
define void @mul32x32_forwards(ptr nocapture %a, ptr nocapture readonly %b, ptr nocapture readonly %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %tmp = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %tmp1 = icmp ult i32 %tmp, 3
  br i1 %tmp1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa
  %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds i32, ptr %b, i32 %i.09.epil
  %tmp2 = load i32, ptr %arrayidx.epil, align 4
  %arrayidx1.epil = getelementptr inbounds i32, ptr %c, i32 %i.09.epil
  %tmp3 = load i32, ptr %arrayidx1.epil, align 4
  %mul.epil = mul nsw i32 %tmp3, %tmp2
  %arrayidx2.epil = getelementptr inbounds i32, ptr %a, i32 %i.09.epil
  store i32 %mul.epil, ptr %arrayidx2.epil, align 4
  %inc.epil = add nuw nsw i32 %i.09.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret void

for.body: ; preds = %for.body, %for.body.preheader.new
  %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds i32, ptr %b, i32 %i.09
  %tmp4 = load i32, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, ptr %c, i32 %i.09
  %tmp5 = load i32, ptr %arrayidx1, align 4
  %mul = mul nsw i32 %tmp5, %tmp4
  %arrayidx2 = getelementptr inbounds i32, ptr %a, i32 %i.09
  store i32 %mul, ptr %arrayidx2, align 4
  %inc = or disjoint i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds i32, ptr %b, i32 %inc
  %tmp6 = load i32, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds i32, ptr %c, i32 %inc
  %tmp7 = load i32, ptr %arrayidx1.1, align 4
  %mul.1 = mul nsw i32 %tmp7, %tmp6
  %arrayidx2.1 = getelementptr inbounds i32, ptr %a, i32 %inc
  store i32 %mul.1, ptr %arrayidx2.1, align 4
  %inc.1 = or disjoint i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds i32, ptr %b, i32 %inc.1
  %tmp8 = load i32, ptr %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds i32, ptr %c, i32 %inc.1
  %tmp9 = load i32, ptr %arrayidx1.2, align 4
  %mul.2 = mul nsw i32 %tmp9, %tmp8
  %arrayidx2.2 = getelementptr inbounds i32, ptr %a, i32 %inc.1
  store i32 %mul.2, ptr %arrayidx2.2, align 4
  %inc.2 = or disjoint i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds i32, ptr %b, i32 %inc.2
  %tmp10 = load i32, ptr %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds i32, ptr %c, i32 %inc.2
  %tmp11 = load i32, ptr %arrayidx1.3, align 4
  %mul.3 = mul nsw i32 %tmp11, %tmp10
  %arrayidx2.3 = getelementptr inbounds i32, ptr %a, i32 %inc.2
  store i32 %mul.3, ptr %arrayidx2.3, align 4
  %inc.3 = add nuw nsw i32 %i.09, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}