; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 %s -o - | FileCheck %s

; Codegen test for floating-point loops compiled for thumbv8.1m.main with
; +mve.fp (see the llc invocation above). Each function takes three pointers
; (a, b, c) and an element count N and stores c[i] = a[i] <op> b[i] for every
; i in [0, N). The input IR is already in vectorized and unrolled form.
;
; The expected-assembly comment blocks are produced by
; utils/update_llc_test_checks.py; regenerate them with that script rather
; than editing them by hand.

; float_float_mul: c[i] = a[i] * b[i] on float arrays.
; Control flow of the input IR:
;   - entry returns immediately when N == 0.
;   - for.body.preheader sends N < 4 straight to the scalar loops.
;   - vector.memcheck tests at run time whether c overlaps a or b and, if so,
;     also uses the scalar loops.
;   - vector.body handles four floats per iteration over the first N & -4
;     elements.
;   - for.body.prol runs N & 3 scalar iterations; for.body is the scalar loop
;     unrolled by four.
define arm_aapcs_vfpcc void @float_float_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB0_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB0_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB0_4
; CHECK-NEXT: .LBB0_3: @ %vector.memcheck
; CHECK-NEXT: add.w r7, r1, r3, lsl #2
; CHECK-NEXT: add.w r6, r2, r3, lsl #2
; CHECK-NEXT: cmp r7, r2
; CHECK-NEXT: add.w r5, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: csel r7, zr, r7, ls
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r2
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: tst r5, r6
; CHECK-NEXT: it eq
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB0_11
; CHECK-NEXT: .LBB0_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: add.w r8, r12, r7
; CHECK-NEXT: wls lr, r7, .LBB0_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r6, r0, r12, lsl #2
; CHECK-NEXT: add.w r7, r1, r12, lsl #2
; CHECK-NEXT: add.w r5, r2, r12, lsl #2
; CHECK-NEXT: mov r12, r8
; CHECK-NEXT: .LBB0_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r7!, {s0}
; CHECK-NEXT: vldmia r6!, {s2}
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstmia r5!, {s0}
; CHECK-NEXT: le lr, .LBB0_6
; CHECK-NEXT: .LBB0_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: blo .LBB0_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r8, r3
; CHECK-NEXT: movs r7, #1
; CHECK-NEXT: rsb r3, r3, r3, lsl #30
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: add.w lr, r7, r3, lsr #2
; CHECK-NEXT: lsl.w r3, r12, #2
; CHECK-NEXT: .LBB0_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r7, r1, r3
; CHECK-NEXT: adds r6, r0, r3
; CHECK-NEXT: adds r5, r2, r3
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: le lr, .LBB0_9
; CHECK-NEXT: .LBB0_10: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
; CHECK-NEXT: .LBB0_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB0_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB0_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB0_4
; CHECK-NEXT: b .LBB0_10
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  ; %i.09.ph is the first index the scalar loops must process: 0, or %n.vec
  ; when the vector loop has already run.
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  ; xor with -1 is bitwise NOT, so %1 = N - %i.09.ph - 1.
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  ; Trip count of the prologue loop: N mod 4.
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
  %2 = load float, ptr %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
  %3 = load float, ptr %arrayidx1.prol, align 4
  %mul.prol = fmul float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
  store float %mul.prol, ptr %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  ; %1 < 3 means fewer than four elements remained on entry to
  ; for.body.preheader22, so the unrolled loop has nothing to do.
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck: ; preds = %for.body.preheader
  ; Runtime overlap test using the one-past-the-end pointers a+N, b+N, c+N:
  ; a conflict is reported when [c, c+N) intersects [a, a+N) or [b, b+N).
  %scevgep = getelementptr float, ptr %c, i32 %N
  %scevgep13 = getelementptr float, ptr %a, i32 %N
  %scevgep16 = getelementptr float, ptr %b, i32 %N
  %bound0 = icmp ugt ptr %scevgep13, %c
  %bound1 = icmp ugt ptr %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt ptr %scevgep16, %c
  %bound119 = icmp ugt ptr %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

vector.ph: ; preds = %vector.memcheck
  ; %n.vec = N rounded down to a multiple of 4.
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, ptr %a, i32 %index
  %wide.load = load <4 x float>, ptr %5, align 4
  %6 = getelementptr inbounds float, ptr %b, i32 %index
  %wide.load21 = load <4 x float>, ptr %6, align 4
  %7 = fmul <4 x float> %wide.load, %wide.load21
  %8 = getelementptr inbounds float, ptr %c, i32 %index
  store <4 x float> %7, ptr %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  ; Finished when N is a multiple of 4; otherwise the scalar loops process
  ; the remaining elements starting at %n.vec.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body.prol.loopexit, %for.body
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
  %10 = load float, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
  %11 = load float, ptr %arrayidx1, align 4
  %mul = fmul float %10, %11
  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
  store float %mul, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
  %12 = load float, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
  %13 = load float, ptr %arrayidx1.1, align 4
  %mul.1 = fmul float %12, %13
  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
  store float %mul.1, ptr %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
  %14 = load float, ptr %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
  %15 = load float, ptr %arrayidx1.2, align 4
  %mul.2 = fmul float %14, %15
  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
  store float %mul.2, ptr %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
  %16 = load float, ptr %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
  %17 = load float, ptr %arrayidx1.3, align 4
  %mul.3 = fmul float %16, %17
  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
  store float %mul.3, ptr %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}

; float_float_add: c[i] = a[i] + b[i] on float arrays.
; The basic blocks mirror float_float_mul (overlap check, 4-wide vector loop,
; N & 3 prologue loop, scalar loop unrolled by four); only the arithmetic
; instruction differs (fadd instead of fmul).
define arm_aapcs_vfpcc void @float_float_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB1_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_4
; CHECK-NEXT: .LBB1_3: @ %vector.memcheck
; CHECK-NEXT: add.w r7, r1, r3, lsl #2
; CHECK-NEXT: add.w r6, r2, r3, lsl #2
; CHECK-NEXT: cmp r7, r2
; CHECK-NEXT: add.w r5, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: csel r7, zr, r7, ls
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r2
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: tst r5, r6
; CHECK-NEXT: it eq
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB1_11
; CHECK-NEXT: .LBB1_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: add.w r8, r12, r7
; CHECK-NEXT: wls lr, r7, .LBB1_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r6, r0, r12, lsl #2
; CHECK-NEXT: add.w r7, r1, r12, lsl #2
; CHECK-NEXT: add.w r5, r2, r12, lsl #2
; CHECK-NEXT: mov r12, r8
; CHECK-NEXT: .LBB1_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r7!, {s0}
; CHECK-NEXT: vldmia r6!, {s2}
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstmia r5!, {s0}
; CHECK-NEXT: le lr, .LBB1_6
; CHECK-NEXT: .LBB1_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: blo .LBB1_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r8, r3
; CHECK-NEXT: movs r7, #1
; CHECK-NEXT: rsb r3, r3, r3, lsl #30
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: add.w lr, r7, r3, lsr #2
; CHECK-NEXT: lsl.w r3, r12, #2
; CHECK-NEXT: .LBB1_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r7, r1, r3
; CHECK-NEXT: adds r6, r0, r3
; CHECK-NEXT: adds r5, r2, r3
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: le lr, .LBB1_9
; CHECK-NEXT: .LBB1_10: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
; CHECK-NEXT: .LBB1_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB1_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB1_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB1_4
; CHECK-NEXT: b .LBB1_10
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
  %2 = load float, ptr %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
  %3 = load float, ptr %arrayidx1.prol, align 4
  %add.prol = fadd float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
  store float %add.prol, ptr %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck: ; preds = %for.body.preheader
  %scevgep = getelementptr float, ptr %c, i32 %N
  %scevgep13 = getelementptr float, ptr %a, i32 %N
  %scevgep16 = getelementptr float, ptr %b, i32 %N
  %bound0 = icmp ugt ptr %scevgep13, %c
  %bound1 = icmp ugt ptr %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt ptr %scevgep16, %c
  %bound119 = icmp ugt ptr %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

vector.ph: ; preds = %vector.memcheck
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, ptr %a, i32 %index
  %wide.load = load <4 x float>, ptr %5, align 4
  %6 = getelementptr inbounds float, ptr %b, i32 %index
  %wide.load21 = load <4 x float>, ptr %6, align 4
  %7 = fadd <4 x float> %wide.load, %wide.load21
  %8 = getelementptr inbounds float, ptr %c, i32 %index
  store <4 x float> %7, ptr %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body.prol.loopexit, %for.body
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
  %10 = load float, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
  %11 = load float, ptr %arrayidx1, align 4
  %add = fadd float %10, %11
  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
  store float %add, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
  %12 = load float, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
  %13 = load float, ptr %arrayidx1.1, align 4
  %add.1 = fadd float %12, %13
  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
  store float %add.1, ptr %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
  %14 = load float, ptr %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
  %15 = load float, ptr %arrayidx1.2, align 4
  %add.2 = fadd float %14, %15
  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
  store float %add.2, ptr %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
  %16 = load float, ptr %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
  %17 = load float, ptr %arrayidx1.3, align 4
  %add.3 = fadd float %16, %17
  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
  store float %add.3, ptr %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}

; float_float_sub: c[i] = a[i] - b[i] on float arrays.
; The basic blocks mirror float_float_mul; only the arithmetic instruction
; differs (fsub, with a as the minuend and b as the subtrahend).
define arm_aapcs_vfpcc void @float_float_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
; CHECK-LABEL: float_float_sub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq .LBB2_10
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhi .LBB2_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB2_4
; CHECK-NEXT: .LBB2_3: @ %vector.memcheck
; CHECK-NEXT: add.w r7, r1, r3, lsl #2
; CHECK-NEXT: add.w r6, r2, r3, lsl #2
; CHECK-NEXT: cmp r7, r2
; CHECK-NEXT: add.w r5, r0, r3, lsl #2
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: csel r7, zr, r7, ls
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r2
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: tst r5, r6
; CHECK-NEXT: it eq
; CHECK-NEXT: cmpeq r7, #0
; CHECK-NEXT: beq .LBB2_11
; CHECK-NEXT: .LBB2_4: @ %for.body.preheader22
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: adds r4, r7, r3
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: add.w r8, r12, r7
; CHECK-NEXT: wls lr, r7, .LBB2_7
; CHECK-NEXT: @ %bb.5: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r6, r0, r12, lsl #2
; CHECK-NEXT: add.w r7, r1, r12, lsl #2
; CHECK-NEXT: add.w r5, r2, r12, lsl #2
; CHECK-NEXT: mov r12, r8
; CHECK-NEXT: .LBB2_6: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r7!, {s0}
; CHECK-NEXT: vldmia r6!, {s2}
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstmia r5!, {s0}
; CHECK-NEXT: le lr, .LBB2_6
; CHECK-NEXT: .LBB2_7: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: blo .LBB2_10
; CHECK-NEXT: @ %bb.8: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r8, r3
; CHECK-NEXT: movs r7, #1
; CHECK-NEXT: rsb r3, r3, r3, lsl #30
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: add.w lr, r7, r3, lsr #2
; CHECK-NEXT: lsl.w r3, r12, #2
; CHECK-NEXT: .LBB2_9: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r7, r1, r3
; CHECK-NEXT: adds r6, r0, r3
; CHECK-NEXT: adds r5, r2, r3
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vldr s0, [r7]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r6]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5]
; CHECK-NEXT: vldr s0, [r7, #4]
; CHECK-NEXT: vldr s2, [r6, #4]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #4]
; CHECK-NEXT: vldr s0, [r7, #8]
; CHECK-NEXT: vldr s2, [r6, #8]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #8]
; CHECK-NEXT: vldr s0, [r7, #12]
; CHECK-NEXT: vldr s2, [r6, #12]
; CHECK-NEXT: vsub.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r5, #12]
; CHECK-NEXT: le lr, .LBB2_9
; CHECK-NEXT: .LBB2_10: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
; CHECK-NEXT: .LBB2_11: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB2_12: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vsub.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB2_12
; CHECK-NEXT: @ %bb.13: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB2_4
; CHECK-NEXT: b .LBB2_10
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader22, label %vector.memcheck

for.body.preheader22: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.preheader22, %for.body.prol
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader22 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader22 ]
  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
  %2 = load float, ptr %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds float, ptr %b, i32 %i.09.prol
  %3 = load float, ptr %arrayidx1.prol, align 4
  %sub.prol = fsub float %2, %3
  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
  store float %sub.prol, ptr %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader22
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader22 ], [ %inc.prol, %for.body.prol ]
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck: ; preds = %for.body.preheader
  %scevgep = getelementptr float, ptr %c, i32 %N
  %scevgep13 = getelementptr float, ptr %a, i32 %N
  %scevgep16 = getelementptr float, ptr %b, i32 %N
  %bound0 = icmp ugt ptr %scevgep13, %c
  %bound1 = icmp ugt ptr %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  %bound018 = icmp ugt ptr %scevgep16, %c
  %bound119 = icmp ugt ptr %scevgep, %b
  %found.conflict20 = and i1 %bound018, %bound119
  %conflict.rdx = or i1 %found.conflict, %found.conflict20
  br i1 %conflict.rdx, label %for.body.preheader22, label %vector.ph

vector.ph: ; preds = %vector.memcheck
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, ptr %a, i32 %index
  %wide.load = load <4 x float>, ptr %5, align 4
  %6 = getelementptr inbounds float, ptr %b, i32 %index
  %wide.load21 = load <4 x float>, ptr %6, align 4
  %7 = fsub <4 x float> %wide.load, %wide.load21
  %8 = getelementptr inbounds float, ptr %c, i32 %index
  store <4 x float> %7, ptr %8, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader22

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body.prol.loopexit, %for.body
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
  %10 = load float, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.09
  %11 = load float, ptr %arrayidx1, align 4
  %sub = fsub float %10, %11
  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
  store float %sub, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
  %12 = load float, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds float, ptr %b, i32 %inc
  %13 = load float, ptr %arrayidx1.1, align 4
  %sub.1 = fsub float %12, %13
  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
  store float %sub.1, ptr %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
  %14 = load float, ptr %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds float, ptr %b, i32 %inc.1
  %15 = load float, ptr %arrayidx1.2, align 4
  %sub.2 = fsub float %14, %15
  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
  store float %sub.2, ptr %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
  %16 = load float, ptr %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds float, ptr %b, i32 %inc.2
  %17 = load float, ptr %arrayidx1.3, align 4
  %sub.3 = fsub float %16, %17
  %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2
  store float %sub.3, ptr %arrayidx2.3, align 4
  %inc.3 = add nuw i32 %i.09, 4
  %exitcond.3 = icmp eq i32 %inc.3, %N
  br i1 %exitcond.3, label %for.cond.cleanup, label %for.body
}

; float_int_mul: c[i] = a[i] * (float)b[i], where a and c are accessed as
; float and b is accessed as i32 and converted with sitofp before the multiply.
; Unlike the float/float functions, vector.memcheck here only compares the
; a and c ranges; b is not part of the overlap test.
define arm_aapcs_vfpcc void @float_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) {
; CHECK-LABEL: float_int_mul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: beq.w .LBB3_13
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bls .LBB3_6
; CHECK-NEXT: @ %bb.2: @ %vector.memcheck
; CHECK-NEXT: add.w r7, r0, r3, lsl #2
; CHECK-NEXT: cmp r7, r2
; CHECK-NEXT: itt hi
; CHECK-NEXT: addhi.w r7, r2, r3, lsl #2
; CHECK-NEXT: cmphi r7, r0
; CHECK-NEXT: bhi .LBB3_6
; CHECK-NEXT: @ %bb.3: @ %vector.ph
; CHECK-NEXT: bic r12, r3, #3
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: sub.w r7, r12, #4
; CHECK-NEXT: mov r4, r0
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: add.w lr, r6, r7, lsr #2
; CHECK-NEXT: mov r6, r2
; CHECK-NEXT: .LBB3_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r5], #16
; CHECK-NEXT: vldrw.u32 q1, [r4], #16
; CHECK-NEXT: vcvt.f32.s32 q0, q0
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: vstrb.8 q0, [r6], #16
; CHECK-NEXT: le lr, .LBB3_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: cmp r12, r3
; CHECK-NEXT: bne .LBB3_7
; CHECK-NEXT: b .LBB3_13
; CHECK-NEXT: .LBB3_6:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader16
; CHECK-NEXT: mvn.w r7, r12
; CHECK-NEXT: add.w r9, r7, r3
; CHECK-NEXT: and r7, r3, #3
; CHECK-NEXT: add.w r8, r12, r7
; CHECK-NEXT: wls lr, r7, .LBB3_10
; CHECK-NEXT: @ %bb.8: @ %for.body.prol.preheader
; CHECK-NEXT: add.w r6, r0, r12, lsl #2
; CHECK-NEXT: add.w r7, r1, r12, lsl #2
; CHECK-NEXT: add.w r5, r2, r12, lsl #2
; CHECK-NEXT: mov r12, r8
; CHECK-NEXT: .LBB3_9: @ %for.body.prol
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r4, [r7], #4
; CHECK-NEXT: vldmia r6!, {s2}
; CHECK-NEXT: vmov s0, r4
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstmia r5!, {s0}
; CHECK-NEXT: le lr, .LBB3_9
; CHECK-NEXT: .LBB3_10: @ %for.body.prol.loopexit
; CHECK-NEXT: cmp.w r9, #3
; CHECK-NEXT: blo .LBB3_13
; CHECK-NEXT: @ %bb.11: @ %for.body.preheader1
; CHECK-NEXT: sub.w r3, r8, r3
; CHECK-NEXT: add.w r1, r1, r12, lsl #2
; CHECK-NEXT: movs r7, #1
; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: rsb r3, r3, r3, lsl #30
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: add.w lr, r7, r3, lsr #2
; CHECK-NEXT: lsl.w r3, r12, #2
; CHECK-NEXT: .LBB3_12: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr s0, [r1, #-8]
; CHECK-NEXT: adds r7, r0, r3
; CHECK-NEXT: adds r6, r2, r3
; CHECK-NEXT: adds r0, #16
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vldr s2, [r7]
; CHECK-NEXT: adds r2, #16
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r6]
; CHECK-NEXT: vldr s0, [r1, #-4]
; CHECK-NEXT: vldr s2, [r7, #4]
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r6, #4]
; CHECK-NEXT: vldr s0, [r1]
; CHECK-NEXT: vldr s2, [r7, #8]
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r6, #8]
; CHECK-NEXT: vldr s0, [r1, #4]
; CHECK-NEXT: adds r1, #16
; CHECK-NEXT: vldr s2, [r7, #12]
; CHECK-NEXT: vcvt.f32.s32 s0, s0
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: vstr s0, [r6, #12]
; CHECK-NEXT: le lr, .LBB3_12
; CHECK-NEXT: .LBB3_13: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader16, label %vector.memcheck

for.body.preheader16: ; preds = %middle.block, %vector.memcheck, %for.body.preheader
  %i.09.ph = phi i32 [ 0, %vector.memcheck ], [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %0 = xor i32 %i.09.ph, -1
  %1 = add i32 %0, %N
  %xtraiter = and i32 %N, 3
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol: ; preds = %for.body.preheader16, %for.body.prol
  %i.09.prol = phi i32 [ %inc.prol, %for.body.prol ], [ %i.09.ph, %for.body.preheader16 ]
  %prol.iter = phi i32 [ %prol.iter.sub, %for.body.prol ], [ %xtraiter, %for.body.preheader16 ]
  %arrayidx.prol = getelementptr inbounds float, ptr %a, i32 %i.09.prol
  %2 = load float, ptr %arrayidx.prol, align 4
  %arrayidx1.prol = getelementptr inbounds i32, ptr %b, i32 %i.09.prol
  %3 = load i32, ptr %arrayidx1.prol, align 4
  %conv.prol = sitofp i32 %3 to float
  %mul.prol = fmul float %2, %conv.prol
  %arrayidx2.prol = getelementptr inbounds float, ptr %c, i32 %i.09.prol
  store float %mul.prol, ptr %arrayidx2.prol, align 4
  %inc.prol = add nuw i32 %i.09.prol, 1
  %prol.iter.sub = add i32 %prol.iter, -1
  %prol.iter.cmp = icmp eq i32 %prol.iter.sub, 0
  br i1 %prol.iter.cmp, label %for.body.prol.loopexit, label %for.body.prol

for.body.prol.loopexit: ; preds = %for.body.prol, %for.body.preheader16
  %i.09.unr = phi i32 [ %i.09.ph, %for.body.preheader16 ], [ %inc.prol, %for.body.prol ]
  %4 = icmp ult i32 %1, 3
  br i1 %4, label %for.cond.cleanup, label %for.body

vector.memcheck: ; preds = %for.body.preheader
  %scevgep = getelementptr float, ptr %c, i32 %N
  %scevgep13 = getelementptr float, ptr %a, i32 %N
  %bound0 = icmp ugt ptr %scevgep13, %c
  %bound1 = icmp ugt ptr %scevgep, %a
  %found.conflict = and i1 %bound0, %bound1
  br i1 %found.conflict, label %for.body.preheader16, label %vector.ph

vector.ph: ; preds = %vector.memcheck
  %n.vec = and i32 %N, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %5 = getelementptr inbounds float, ptr %a, i32 %index
  %wide.load = load <4 x float>, ptr %5, align 4
  %6 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.load15 = load <4 x i32>, ptr %6, align 4
  %7 = sitofp <4 x i32> %wide.load15 to <4 x float>
  %8 = fmul <4 x float> %wide.load, %7
  %9 = getelementptr inbounds float, ptr %c, i32 %index
  store <4 x float> %8, ptr %9, align 4
  %index.next = add i32 %index, 4
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader16

for.cond.cleanup: ; preds = %for.body.prol.loopexit, %for.body, %middle.block, %entry
  ret void

for.body: ; preds = %for.body.prol.loopexit, %for.body
  %i.09 = phi i32 [ %inc.3, %for.body ], [ %i.09.unr, %for.body.prol.loopexit ]
  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.09
  %11 = load float, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09
  %12 = load i32, ptr %arrayidx1, align 4
  %conv = sitofp i32 %12 to float
  %mul = fmul float %11, %conv
  %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09
  store float %mul, ptr %arrayidx2, align 4
  %inc = add nuw i32 %i.09, 1
  %arrayidx.1 = getelementptr inbounds float, ptr %a, i32 %inc
  %13 = load float, ptr %arrayidx.1, align 4
  %arrayidx1.1 = getelementptr inbounds i32, ptr %b, i32 %inc
  %14 = load i32, ptr %arrayidx1.1, align 4
  %conv.1 = sitofp i32 %14 to float
  %mul.1 = fmul float %13, %conv.1
  %arrayidx2.1 = getelementptr inbounds float, ptr %c, i32 %inc
  store float %mul.1, ptr %arrayidx2.1, align 4
  %inc.1 = add nuw i32 %i.09, 2
  %arrayidx.2 = getelementptr inbounds float, ptr %a, i32 %inc.1
  %15 = load float, ptr %arrayidx.2, align 4
  %arrayidx1.2 = getelementptr inbounds i32, ptr %b, i32 %inc.1
  %16 = load i32, ptr %arrayidx1.2, align 4
  %conv.2 = sitofp i32 %16 to float
  %mul.2 = fmul float %15, %conv.2
  %arrayidx2.2 = getelementptr inbounds float, ptr %c, i32 %inc.1
  store float %mul.2, ptr %arrayidx2.2, align 4
  %inc.2 = add nuw i32 %i.09, 3
  %arrayidx.3 = getelementptr inbounds float, ptr %a, i32 %inc.2
  %17 = load float, ptr %arrayidx.3, align 4
  %arrayidx1.3 = getelementptr inbounds i32, ptr %b, i32 %inc.2
  %18 = load i32, ptr %arrayidx1.3, align 4
  %conv.3 = sitofp i32 %18 to float
  %mul.3 = fmul float %17, %conv.3
; @float_int_int_mul -- its `define` sits on the line below, right after the
; closing `}` of the preceding function.
;   for (i = 0; i != N; ++i) c[i] = (float)(b[i] * a[i]);
; %a and %b are read as i32, the product is a `mul nsw`, and the result is
; converted with sitofp before the float store to %c.
; IR shape: N == 0 returns at once; N < 4 (unsigned) goes straight to the
; scalar for.body; otherwise vector.body handles N & -4 elements as <4 x i32>
; and for.body finishes the remainder. This function has no vector.memcheck
; block: vector.ph is entered directly from for.body.preheader.
; The generated assertions expect vmul.i32 followed by vcvt.f32.s32 inside an
; `le`-terminated loop, with post-incremented vldrw/vstrb accesses.
851 %arrayidx2.3 = getelementptr inbounds float, ptr %c, i32 %inc.2 852 store float %mul.3, ptr %arrayidx2.3, align 4 853 %inc.3 = add nuw i32 %i.09, 4 854 %exitcond.3 = icmp eq i32 %inc.3, %N 855 br i1 %exitcond.3, label %for.cond.cleanup, label %for.body 856} 857 858define arm_aapcs_vfpcc void @float_int_int_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { 859; CHECK-LABEL: float_int_int_mul: 860; CHECK: @ %bb.0: @ %entry 861; CHECK-NEXT: push {r4, r5, r6, lr} 862; CHECK-NEXT: cbz r3, .LBB4_8 863; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 864; CHECK-NEXT: cmp r3, #3 865; CHECK-NEXT: bhi .LBB4_3 866; CHECK-NEXT: @ %bb.2: 867; CHECK-NEXT: mov.w r12, #0 868; CHECK-NEXT: b .LBB4_6 869; CHECK-NEXT: .LBB4_3: @ %vector.ph 870; CHECK-NEXT: bic r12, r3, #3 871; CHECK-NEXT: movs r5, #1 872; CHECK-NEXT: sub.w r6, r12, #4 873; CHECK-NEXT: mov r4, r0 874; CHECK-NEXT: add.w lr, r5, r6, lsr #2 875; CHECK-NEXT: mov r5, r1 876; CHECK-NEXT: mov r6, r2 877; CHECK-NEXT: .LBB4_4: @ %vector.body 878; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 879; CHECK-NEXT: vldrw.u32 q0, [r4], #16 880; CHECK-NEXT: vldrw.u32 q1, [r5], #16 881; CHECK-NEXT: vmul.i32 q0, q1, q0 882; CHECK-NEXT: vcvt.f32.s32 q0, q0 883; CHECK-NEXT: vstrb.8 q0, [r6], #16 884; CHECK-NEXT: le lr, .LBB4_4 885; CHECK-NEXT: @ %bb.5: @ %middle.block 886; CHECK-NEXT: cmp r12, r3 887; CHECK-NEXT: it eq 888; CHECK-NEXT: popeq {r4, r5, r6, pc} 889; CHECK-NEXT: .LBB4_6: @ %for.body.preheader11 890; CHECK-NEXT: sub.w lr, r3, r12 891; CHECK-NEXT: add.w r0, r0, r12, lsl #2 892; CHECK-NEXT: add.w r1, r1, r12, lsl #2 893; CHECK-NEXT: add.w r2, r2, r12, lsl #2 894; CHECK-NEXT: .LBB4_7: @ %for.body 895; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 896; CHECK-NEXT: ldr r3, [r0], #4 897; CHECK-NEXT: ldr r6, [r1], #4 898; CHECK-NEXT: muls r3, r6, r3 899; CHECK-NEXT: vmov s0, r3 900; CHECK-NEXT: vcvt.f32.s32 s0, s0 901; CHECK-NEXT: vstmia r2!, {s0} 902; CHECK-NEXT: le lr, .LBB4_7 903; CHECK-NEXT: 
.LBB4_8: @ %for.cond.cleanup 904; CHECK-NEXT: pop {r4, r5, r6, pc} 905entry: 906 %cmp8 = icmp eq i32 %N, 0 907 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 908 909for.body.preheader: ; preds = %entry 910 %min.iters.check = icmp ult i32 %N, 4 911 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph 912 913for.body.preheader11: ; preds = %middle.block, %for.body.preheader 914 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 915 br label %for.body 916 917vector.ph: ; preds = %for.body.preheader 918 %n.vec = and i32 %N, -4 919 br label %vector.body 920 921vector.body: ; preds = %vector.body, %vector.ph 922 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 923 %0 = getelementptr inbounds i32, ptr %a, i32 %index 924 %wide.load = load <4 x i32>, ptr %0, align 4 925 %1 = getelementptr inbounds i32, ptr %b, i32 %index 926 %wide.load10 = load <4 x i32>, ptr %1, align 4 927 %2 = mul nsw <4 x i32> %wide.load10, %wide.load 928 %3 = sitofp <4 x i32> %2 to <4 x float> 929 %4 = getelementptr inbounds float, ptr %c, i32 %index 930 store <4 x float> %3, ptr %4, align 4 931 %index.next = add i32 %index, 4 932 %5 = icmp eq i32 %index.next, %n.vec 933 br i1 %5, label %middle.block, label %vector.body 934 935middle.block: ; preds = %vector.body 936 %cmp.n = icmp eq i32 %n.vec, %N 937 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11 938 939for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 940 ret void 941 942for.body: ; preds = %for.body.preheader11, %for.body 943 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ] 944 %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.09 945 %6 = load i32, ptr %arrayidx, align 4 946 %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.09 947 %7 = load i32, ptr %arrayidx1, align 4 948 %mul = mul nsw i32 %7, %6 949 %conv = sitofp i32 %mul to float 950 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09 951 store 
; @half_half_mul -- its `define` sits on the line below, after the last
; instructions and closing `}` of @float_int_int_mul.
;   for (i = 0; i != N; ++i) c[i] = (float)(a[i] * b[i]);
; %a and %b are read as half. The multiply is done in half precision
; (fmul half / fmul <4 x half>) and only the product is widened with fpext
; before the float store to %c.
; IR shape: N == 0 returns at once; N < 4 (unsigned) goes to the scalar
; for.body; otherwise vector.body handles N & -4 elements as <4 x half> and
; for.body finishes the remainder.
; The generated assertions expect the two <4 x half> operands to be assembled
; from 32-bit GPR loads (ldr + vmov.32 lane inserts), one vmul.f16, and then
; vcvtb/vcvtt.f32.f16 on the register pairs before a single vstrb.8 store.
float %conv, ptr %arrayidx2, align 4 952 %inc = add nuw i32 %i.09, 1 953 %exitcond = icmp eq i32 %inc, %N 954 br i1 %exitcond, label %for.cond.cleanup, label %for.body 955} 956 957define arm_aapcs_vfpcc void @half_half_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { 958; CHECK-LABEL: half_half_mul: 959; CHECK: @ %bb.0: @ %entry 960; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} 961; CHECK-NEXT: cmp r3, #0 962; CHECK-NEXT: beq .LBB5_8 963; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 964; CHECK-NEXT: cmp r3, #3 965; CHECK-NEXT: bhi .LBB5_3 966; CHECK-NEXT: @ %bb.2: 967; CHECK-NEXT: mov.w r12, #0 968; CHECK-NEXT: b .LBB5_6 969; CHECK-NEXT: .LBB5_3: @ %vector.ph 970; CHECK-NEXT: bic r12, r3, #3 971; CHECK-NEXT: movs r5, #1 972; CHECK-NEXT: sub.w r6, r12, #4 973; CHECK-NEXT: mov r4, r0 974; CHECK-NEXT: add.w lr, r5, r6, lsr #2 975; CHECK-NEXT: mov r5, r1 976; CHECK-NEXT: mov r6, r2 977; CHECK-NEXT: .LBB5_4: @ %vector.body 978; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 979; CHECK-NEXT: ldr.w r9, [r4] 980; CHECK-NEXT: ldr r7, [r5] 981; CHECK-NEXT: ldr.w r8, [r4, #4] 982; CHECK-NEXT: vmov.32 q0[0], r9 983; CHECK-NEXT: ldr.w r10, [r5, #4] 984; CHECK-NEXT: vmov.32 q1[0], r7 985; CHECK-NEXT: vmov.32 q0[1], r8 986; CHECK-NEXT: adds r4, #8 987; CHECK-NEXT: vmov.32 q1[1], r10 988; CHECK-NEXT: adds r5, #8 989; CHECK-NEXT: vmul.f16 q0, q0, q1 990; CHECK-NEXT: vcvtt.f32.f16 s3, s1 991; CHECK-NEXT: vcvtb.f32.f16 s2, s1 992; CHECK-NEXT: vcvtt.f32.f16 s1, s0 993; CHECK-NEXT: vcvtb.f32.f16 s0, s0 994; CHECK-NEXT: vstrb.8 q0, [r6], #16 995; CHECK-NEXT: le lr, .LBB5_4 996; CHECK-NEXT: @ %bb.5: @ %middle.block 997; CHECK-NEXT: cmp r12, r3 998; CHECK-NEXT: beq .LBB5_8 999; CHECK-NEXT: .LBB5_6: @ %for.body.preheader11 1000; CHECK-NEXT: sub.w lr, r3, r12 1001; CHECK-NEXT: add.w r0, r0, r12, lsl #1 1002; CHECK-NEXT: add.w r1, r1, r12, lsl #1 1003; CHECK-NEXT: add.w r2, r2, r12, lsl #2 1004; CHECK-NEXT: .LBB5_7: @ %for.body 1005; CHECK-NEXT: @ 
=>This Inner Loop Header: Depth=1 1006; CHECK-NEXT: vldr.16 s0, [r1] 1007; CHECK-NEXT: vldr.16 s2, [r0] 1008; CHECK-NEXT: adds r0, #2 1009; CHECK-NEXT: adds r1, #2 1010; CHECK-NEXT: vmul.f16 s0, s2, s0 1011; CHECK-NEXT: vcvtb.f32.f16 s0, s0 1012; CHECK-NEXT: vstmia r2!, {s0} 1013; CHECK-NEXT: le lr, .LBB5_7 1014; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup 1015; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} 1016entry: 1017 %cmp8 = icmp eq i32 %N, 0 1018 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 1019 1020for.body.preheader: ; preds = %entry 1021 %min.iters.check = icmp ult i32 %N, 4 1022 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph 1023 1024for.body.preheader11: ; preds = %middle.block, %for.body.preheader 1025 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1026 br label %for.body 1027 1028vector.ph: ; preds = %for.body.preheader 1029 %n.vec = and i32 %N, -4 1030 br label %vector.body 1031 1032vector.body: ; preds = %vector.body, %vector.ph 1033 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1034 %0 = getelementptr inbounds half, ptr %a, i32 %index 1035 %wide.load = load <4 x half>, ptr %0, align 2 1036 %1 = getelementptr inbounds half, ptr %b, i32 %index 1037 %wide.load10 = load <4 x half>, ptr %1, align 2 1038 %2 = fmul <4 x half> %wide.load, %wide.load10 1039 %3 = fpext <4 x half> %2 to <4 x float> 1040 %4 = getelementptr inbounds float, ptr %c, i32 %index 1041 store <4 x float> %3, ptr %4, align 4 1042 %index.next = add i32 %index, 4 1043 %5 = icmp eq i32 %index.next, %n.vec 1044 br i1 %5, label %middle.block, label %vector.body 1045 1046middle.block: ; preds = %vector.body 1047 %cmp.n = icmp eq i32 %n.vec, %N 1048 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11 1049 1050for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1051 ret void 1052 1053for.body: ; preds = %for.body.preheader11, %for.body 1054 %i.09 = phi i32 [ %inc, %for.body ], [ 
; @half_half_add -- its `define` sits on the line below, after the scalar
; for.body and closing `}` of @half_half_mul.
;   for (i = 0; i != N; ++i) c[i] = (float)(a[i] + b[i]);
; Same structure as @half_half_mul with the half-precision operation changed
; to fadd: %a and %b are read as half, the sum is formed as half / <4 x half>,
; then fpext widens it for the float store to %c.
; IR shape: N == 0 early exit, <4 x half> vector.body over N & -4 elements
; when N >= 4, scalar for.body for the remainder.
; The generated assertions expect vadd.f16 (q-register form in the vector
; loop, s-register form in the scalar loop) followed by vcvtb/vcvtt.f32.f16.
%i.09.ph, %for.body.preheader11 ] 1055 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09 1056 %6 = load half, ptr %arrayidx, align 2 1057 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09 1058 %7 = load half, ptr %arrayidx1, align 2 1059 %mul = fmul half %6, %7 1060 %conv = fpext half %mul to float 1061 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09 1062 store float %conv, ptr %arrayidx2, align 4 1063 %inc = add nuw i32 %i.09, 1 1064 %exitcond = icmp eq i32 %inc, %N 1065 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1066} 1067 1068define arm_aapcs_vfpcc void @half_half_add(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { 1069; CHECK-LABEL: half_half_add: 1070; CHECK: @ %bb.0: @ %entry 1071; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} 1072; CHECK-NEXT: cmp r3, #0 1073; CHECK-NEXT: beq .LBB6_8 1074; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1075; CHECK-NEXT: cmp r3, #3 1076; CHECK-NEXT: bhi .LBB6_3 1077; CHECK-NEXT: @ %bb.2: 1078; CHECK-NEXT: mov.w r12, #0 1079; CHECK-NEXT: b .LBB6_6 1080; CHECK-NEXT: .LBB6_3: @ %vector.ph 1081; CHECK-NEXT: bic r12, r3, #3 1082; CHECK-NEXT: movs r5, #1 1083; CHECK-NEXT: sub.w r6, r12, #4 1084; CHECK-NEXT: mov r4, r0 1085; CHECK-NEXT: add.w lr, r5, r6, lsr #2 1086; CHECK-NEXT: mov r5, r1 1087; CHECK-NEXT: mov r6, r2 1088; CHECK-NEXT: .LBB6_4: @ %vector.body 1089; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1090; CHECK-NEXT: ldr.w r9, [r4] 1091; CHECK-NEXT: ldr r7, [r5] 1092; CHECK-NEXT: ldr.w r8, [r4, #4] 1093; CHECK-NEXT: vmov.32 q0[0], r9 1094; CHECK-NEXT: ldr.w r10, [r5, #4] 1095; CHECK-NEXT: vmov.32 q1[0], r7 1096; CHECK-NEXT: vmov.32 q0[1], r8 1097; CHECK-NEXT: adds r4, #8 1098; CHECK-NEXT: vmov.32 q1[1], r10 1099; CHECK-NEXT: adds r5, #8 1100; CHECK-NEXT: vadd.f16 q0, q0, q1 1101; CHECK-NEXT: vcvtt.f32.f16 s3, s1 1102; CHECK-NEXT: vcvtb.f32.f16 s2, s1 1103; CHECK-NEXT: vcvtt.f32.f16 s1, s0 1104; CHECK-NEXT: vcvtb.f32.f16 s0, s0 1105; 
CHECK-NEXT: vstrb.8 q0, [r6], #16 1106; CHECK-NEXT: le lr, .LBB6_4 1107; CHECK-NEXT: @ %bb.5: @ %middle.block 1108; CHECK-NEXT: cmp r12, r3 1109; CHECK-NEXT: beq .LBB6_8 1110; CHECK-NEXT: .LBB6_6: @ %for.body.preheader11 1111; CHECK-NEXT: sub.w lr, r3, r12 1112; CHECK-NEXT: add.w r0, r0, r12, lsl #1 1113; CHECK-NEXT: add.w r1, r1, r12, lsl #1 1114; CHECK-NEXT: add.w r2, r2, r12, lsl #2 1115; CHECK-NEXT: .LBB6_7: @ %for.body 1116; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1117; CHECK-NEXT: vldr.16 s0, [r1] 1118; CHECK-NEXT: vldr.16 s2, [r0] 1119; CHECK-NEXT: adds r0, #2 1120; CHECK-NEXT: adds r1, #2 1121; CHECK-NEXT: vadd.f16 s0, s2, s0 1122; CHECK-NEXT: vcvtb.f32.f16 s0, s0 1123; CHECK-NEXT: vstmia r2!, {s0} 1124; CHECK-NEXT: le lr, .LBB6_7 1125; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup 1126; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} 1127entry: 1128 %cmp8 = icmp eq i32 %N, 0 1129 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 1130 1131for.body.preheader: ; preds = %entry 1132 %min.iters.check = icmp ult i32 %N, 4 1133 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph 1134 1135for.body.preheader11: ; preds = %middle.block, %for.body.preheader 1136 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1137 br label %for.body 1138 1139vector.ph: ; preds = %for.body.preheader 1140 %n.vec = and i32 %N, -4 1141 br label %vector.body 1142 1143vector.body: ; preds = %vector.body, %vector.ph 1144 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1145 %0 = getelementptr inbounds half, ptr %a, i32 %index 1146 %wide.load = load <4 x half>, ptr %0, align 2 1147 %1 = getelementptr inbounds half, ptr %b, i32 %index 1148 %wide.load10 = load <4 x half>, ptr %1, align 2 1149 %2 = fadd <4 x half> %wide.load, %wide.load10 1150 %3 = fpext <4 x half> %2 to <4 x float> 1151 %4 = getelementptr inbounds float, ptr %c, i32 %index 1152 store <4 x float> %3, ptr %4, align 4 1153 %index.next = add i32 
; @half_half_sub -- its `define` sits on the line below, after the remaining
; blocks and closing `}` of @half_half_add.
;   for (i = 0; i != N; ++i) c[i] = (float)(a[i] - b[i]);
; Operand order matters here: the value loaded from %a is the minuend and the
; value loaded from %b is the subtrahend (`fsub half %6, %7` in the scalar
; loop, `fsub <4 x half> %wide.load, %wide.load10` in the vector loop). The
; difference is computed in half precision and widened with fpext for the
; float store to %c.
; IR shape: N == 0 early exit, <4 x half> vector.body over N & -4 elements
; when N >= 4, scalar for.body for the remainder.
; The generated assertions expect vsub.f16 followed by vcvtb/vcvtt.f32.f16.
%index, 4 1154 %5 = icmp eq i32 %index.next, %n.vec 1155 br i1 %5, label %middle.block, label %vector.body 1156 1157middle.block: ; preds = %vector.body 1158 %cmp.n = icmp eq i32 %n.vec, %N 1159 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11 1160 1161for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1162 ret void 1163 1164for.body: ; preds = %for.body.preheader11, %for.body 1165 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ] 1166 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09 1167 %6 = load half, ptr %arrayidx, align 2 1168 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09 1169 %7 = load half, ptr %arrayidx1, align 2 1170 %add = fadd half %6, %7 1171 %conv = fpext half %add to float 1172 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09 1173 store float %conv, ptr %arrayidx2, align 4 1174 %inc = add nuw i32 %i.09, 1 1175 %exitcond = icmp eq i32 %inc, %N 1176 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1177} 1178 1179define arm_aapcs_vfpcc void @half_half_sub(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { 1180; CHECK-LABEL: half_half_sub: 1181; CHECK: @ %bb.0: @ %entry 1182; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} 1183; CHECK-NEXT: cmp r3, #0 1184; CHECK-NEXT: beq .LBB7_8 1185; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1186; CHECK-NEXT: cmp r3, #3 1187; CHECK-NEXT: bhi .LBB7_3 1188; CHECK-NEXT: @ %bb.2: 1189; CHECK-NEXT: mov.w r12, #0 1190; CHECK-NEXT: b .LBB7_6 1191; CHECK-NEXT: .LBB7_3: @ %vector.ph 1192; CHECK-NEXT: bic r12, r3, #3 1193; CHECK-NEXT: movs r5, #1 1194; CHECK-NEXT: sub.w r6, r12, #4 1195; CHECK-NEXT: mov r4, r0 1196; CHECK-NEXT: add.w lr, r5, r6, lsr #2 1197; CHECK-NEXT: mov r5, r1 1198; CHECK-NEXT: mov r6, r2 1199; CHECK-NEXT: .LBB7_4: @ %vector.body 1200; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1201; CHECK-NEXT: ldr.w r9, [r4] 1202; CHECK-NEXT: ldr r7, [r5] 1203; CHECK-NEXT: ldr.w r8, [r4, 
#4] 1204; CHECK-NEXT: vmov.32 q0[0], r9 1205; CHECK-NEXT: ldr.w r10, [r5, #4] 1206; CHECK-NEXT: vmov.32 q1[0], r7 1207; CHECK-NEXT: vmov.32 q0[1], r8 1208; CHECK-NEXT: adds r4, #8 1209; CHECK-NEXT: vmov.32 q1[1], r10 1210; CHECK-NEXT: adds r5, #8 1211; CHECK-NEXT: vsub.f16 q0, q0, q1 1212; CHECK-NEXT: vcvtt.f32.f16 s3, s1 1213; CHECK-NEXT: vcvtb.f32.f16 s2, s1 1214; CHECK-NEXT: vcvtt.f32.f16 s1, s0 1215; CHECK-NEXT: vcvtb.f32.f16 s0, s0 1216; CHECK-NEXT: vstrb.8 q0, [r6], #16 1217; CHECK-NEXT: le lr, .LBB7_4 1218; CHECK-NEXT: @ %bb.5: @ %middle.block 1219; CHECK-NEXT: cmp r12, r3 1220; CHECK-NEXT: beq .LBB7_8 1221; CHECK-NEXT: .LBB7_6: @ %for.body.preheader11 1222; CHECK-NEXT: sub.w lr, r3, r12 1223; CHECK-NEXT: add.w r0, r0, r12, lsl #1 1224; CHECK-NEXT: add.w r1, r1, r12, lsl #1 1225; CHECK-NEXT: add.w r2, r2, r12, lsl #2 1226; CHECK-NEXT: .LBB7_7: @ %for.body 1227; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1228; CHECK-NEXT: vldr.16 s0, [r1] 1229; CHECK-NEXT: vldr.16 s2, [r0] 1230; CHECK-NEXT: adds r0, #2 1231; CHECK-NEXT: adds r1, #2 1232; CHECK-NEXT: vsub.f16 s0, s2, s0 1233; CHECK-NEXT: vcvtb.f32.f16 s0, s0 1234; CHECK-NEXT: vstmia r2!, {s0} 1235; CHECK-NEXT: le lr, .LBB7_7 1236; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup 1237; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} 1238entry: 1239 %cmp8 = icmp eq i32 %N, 0 1240 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 1241 1242for.body.preheader: ; preds = %entry 1243 %min.iters.check = icmp ult i32 %N, 4 1244 br i1 %min.iters.check, label %for.body.preheader11, label %vector.ph 1245 1246for.body.preheader11: ; preds = %middle.block, %for.body.preheader 1247 %i.09.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1248 br label %for.body 1249 1250vector.ph: ; preds = %for.body.preheader 1251 %n.vec = and i32 %N, -4 1252 br label %vector.body 1253 1254vector.body: ; preds = %vector.body, %vector.ph 1255 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 
; @half_short_mul -- its `define` sits on the line below, after the remaining
; blocks and closing `}` of @half_half_sub.
;   for (i = 0; i != N; ++i) c[i] = (float)(a[i] * (half)b[i]);
; %a is read as half and %b as i16. Each i16 is converted with sitofp directly
; to half, the multiply is done in half precision, and the product is widened
; with fpext for the float store to %c.
; IR shape: N == 0 early exit, vector.body over N & -4 elements when N >= 4
; (<4 x half> and <4 x i16> loads), scalar for.body for the remainder.
; The generated assertions for the vector loop expect a widening vldrh.u32
; load of %b, a vstrh.32 / vldrw.u32 round trip through a stack slot addressed
; from sp (the prologue reserves 16 bytes with `sub sp, #16`), then
; vcvt.f16.s16 and vmul.f16. The scalar loop is expected to use ldrsh plus
; vcvt.f16.s32 instead.
1256 %0 = getelementptr inbounds half, ptr %a, i32 %index 1257 %wide.load = load <4 x half>, ptr %0, align 2 1258 %1 = getelementptr inbounds half, ptr %b, i32 %index 1259 %wide.load10 = load <4 x half>, ptr %1, align 2 1260 %2 = fsub <4 x half> %wide.load, %wide.load10 1261 %3 = fpext <4 x half> %2 to <4 x float> 1262 %4 = getelementptr inbounds float, ptr %c, i32 %index 1263 store <4 x float> %3, ptr %4, align 4 1264 %index.next = add i32 %index, 4 1265 %5 = icmp eq i32 %index.next, %n.vec 1266 br i1 %5, label %middle.block, label %vector.body 1267 1268middle.block: ; preds = %vector.body 1269 %cmp.n = icmp eq i32 %n.vec, %N 1270 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader11 1271 1272for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1273 ret void 1274 1275for.body: ; preds = %for.body.preheader11, %for.body 1276 %i.09 = phi i32 [ %inc, %for.body ], [ %i.09.ph, %for.body.preheader11 ] 1277 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.09 1278 %6 = load half, ptr %arrayidx, align 2 1279 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.09 1280 %7 = load half, ptr %arrayidx1, align 2 1281 %sub = fsub half %6, %7 1282 %conv = fpext half %sub to float 1283 %arrayidx2 = getelementptr inbounds float, ptr %c, i32 %i.09 1284 store float %conv, ptr %arrayidx2, align 4 1285 %inc = add nuw i32 %i.09, 1 1286 %exitcond = icmp eq i32 %inc, %N 1287 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1288} 1289 1290define arm_aapcs_vfpcc void @half_short_mul(ptr nocapture readonly %a, ptr nocapture readonly %b, ptr nocapture %c, i32 %N) { 1291; CHECK-LABEL: half_short_mul: 1292; CHECK: @ %bb.0: @ %entry 1293; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} 1294; CHECK-NEXT: sub sp, #16 1295; CHECK-NEXT: cmp r3, #0 1296; CHECK-NEXT: beq .LBB8_8 1297; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1298; CHECK-NEXT: mov r8, r2 1299; CHECK-NEXT: mov r9, r1 1300; CHECK-NEXT: cmp r3, #3 1301; CHECK-NEXT: bhi .LBB8_3 1302; CHECK-NEXT: @ 
%bb.2: 1303; CHECK-NEXT: mov.w r12, #0 1304; CHECK-NEXT: b .LBB8_6 1305; CHECK-NEXT: .LBB8_3: @ %vector.ph 1306; CHECK-NEXT: bic r12, r3, #3 1307; CHECK-NEXT: movs r6, #1 1308; CHECK-NEXT: sub.w r7, r12, #4 1309; CHECK-NEXT: mov r1, sp 1310; CHECK-NEXT: mov r5, r0 1311; CHECK-NEXT: add.w lr, r6, r7, lsr #2 1312; CHECK-NEXT: mov r6, r9 1313; CHECK-NEXT: mov r7, r8 1314; CHECK-NEXT: .LBB8_4: @ %vector.body 1315; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1316; CHECK-NEXT: vldrh.u32 q0, [r6], #8 1317; CHECK-NEXT: ldr r4, [r5] 1318; CHECK-NEXT: ldr r2, [r5, #4] 1319; CHECK-NEXT: adds r5, #8 1320; CHECK-NEXT: vstrh.32 q0, [r1] 1321; CHECK-NEXT: vmov.32 q1[0], r4 1322; CHECK-NEXT: vldrw.u32 q0, [r1] 1323; CHECK-NEXT: vmov.32 q1[1], r2 1324; CHECK-NEXT: vcvt.f16.s16 q0, q0 1325; CHECK-NEXT: vmul.f16 q0, q1, q0 1326; CHECK-NEXT: vcvtt.f32.f16 s3, s1 1327; CHECK-NEXT: vcvtb.f32.f16 s2, s1 1328; CHECK-NEXT: vcvtt.f32.f16 s1, s0 1329; CHECK-NEXT: vcvtb.f32.f16 s0, s0 1330; CHECK-NEXT: vstrb.8 q0, [r7], #16 1331; CHECK-NEXT: le lr, .LBB8_4 1332; CHECK-NEXT: @ %bb.5: @ %middle.block 1333; CHECK-NEXT: cmp r12, r3 1334; CHECK-NEXT: beq .LBB8_8 1335; CHECK-NEXT: .LBB8_6: @ %for.body.preheader13 1336; CHECK-NEXT: sub.w lr, r3, r12 1337; CHECK-NEXT: add.w r0, r0, r12, lsl #1 1338; CHECK-NEXT: add.w r1, r9, r12, lsl #1 1339; CHECK-NEXT: add.w r2, r8, r12, lsl #2 1340; CHECK-NEXT: .LBB8_7: @ %for.body 1341; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1342; CHECK-NEXT: ldrsh r3, [r1], #2 1343; CHECK-NEXT: vldr.16 s0, [r0] 1344; CHECK-NEXT: adds r0, #2 1345; CHECK-NEXT: vmov s2, r3 1346; CHECK-NEXT: vcvt.f16.s32 s2, s2 1347; CHECK-NEXT: vmul.f16 s0, s0, s2 1348; CHECK-NEXT: vcvtb.f32.f16 s0, s0 1349; CHECK-NEXT: vstmia r2!, {s0} 1350; CHECK-NEXT: le lr, .LBB8_7 1351; CHECK-NEXT: .LBB8_8: @ %for.cond.cleanup 1352; CHECK-NEXT: add sp, #16 1353; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc} 1354entry: 1355 %cmp10 = icmp eq i32 %N, 0 1356 br i1 %cmp10, label %for.cond.cleanup, 
label %for.body.preheader 1357 1358for.body.preheader: ; preds = %entry 1359 %min.iters.check = icmp ult i32 %N, 4 1360 br i1 %min.iters.check, label %for.body.preheader13, label %vector.ph 1361 1362for.body.preheader13: ; preds = %middle.block, %for.body.preheader 1363 %i.011.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1364 br label %for.body 1365 1366vector.ph: ; preds = %for.body.preheader 1367 %n.vec = and i32 %N, -4 1368 br label %vector.body 1369 1370vector.body: ; preds = %vector.body, %vector.ph 1371 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1372 %0 = getelementptr inbounds half, ptr %a, i32 %index 1373 %wide.load = load <4 x half>, ptr %0, align 2 1374 %1 = getelementptr inbounds i16, ptr %b, i32 %index 1375 %wide.load12 = load <4 x i16>, ptr %1, align 2 1376 %2 = sitofp <4 x i16> %wide.load12 to <4 x half> 1377 %3 = fmul <4 x half> %wide.load, %2 1378 %4 = fpext <4 x half> %3 to <4 x float> 1379 %5 = getelementptr inbounds float, ptr %c, i32 %index 1380 store <4 x float> %4, ptr %5, align 4 1381 %index.next = add i32 %index, 4 1382 %6 = icmp eq i32 %index.next, %n.vec 1383 br i1 %6, label %middle.block, label %vector.body 1384 1385middle.block: ; preds = %vector.body 1386 %cmp.n = icmp eq i32 %n.vec, %N 1387 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13 1388 1389for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1390 ret void 1391 1392for.body: ; preds = %for.body.preheader13, %for.body 1393 %i.011 = phi i32 [ %inc, %for.body ], [ %i.011.ph, %for.body.preheader13 ] 1394 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011 1395 %7 = load half, ptr %arrayidx, align 2 1396 %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.011 1397 %8 = load i16, ptr %arrayidx1, align 2 1398 %conv2 = sitofp i16 %8 to half 1399 %mul = fmul half %7, %conv2 1400 %conv3 = fpext half %mul to float 1401 %arrayidx4 = getelementptr inbounds float, ptr %c, i32 %i.011 1402 store float %conv3, ptr 
; @half_half_mac -- its `define` sits on the line below, after the last
; instructions and closing `}` of @half_short_mul.
;   res = 0.0f; for (i = 0; i != N; ++i) res += (float)(a[i] * b[i]);
;   return res;
; %a and %b are read as half. Each product is formed in half precision (fmul
; half), widened with fpext, and accumulated with a float fadd.
; Unlike the store-loop functions around it, this one returns float and its
; IR contains no vector types: for.body is a 4x-unrolled scalar loop covering
; N - (N & 3) elements and for.body.epil handles the remaining N & 3.
; N == 0 returns 0.0 from the entry block.
; %add.lcssa.ph has an `undef` incoming value from for.body.preheader. That
; edge is taken only when N - 1 < 3, i.e. N is 1..3, where N & 3 is non-zero,
; so control always continues into for.body.epil and the undef value never
; reaches the `ret`.
; The generated assertions expect the 0.0 seed to be loaded from a constant
; pool entry (.LCPI9_0), an `le` loop for the unrolled body, and a `wls`-guarded
; loop for the epilogue.
%arrayidx4, align 4 1403 %inc = add nuw i32 %i.011, 1 1404 %exitcond = icmp eq i32 %inc, %N 1405 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1406} 1407 1408define arm_aapcs_vfpcc float @half_half_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) { 1409; CHECK-LABEL: half_half_mac: 1410; CHECK: @ %bb.0: @ %entry 1411; CHECK-NEXT: push {r4, r5, r7, lr} 1412; CHECK-NEXT: cbz r2, .LBB9_3 1413; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1414; CHECK-NEXT: subs r3, r2, #1 1415; CHECK-NEXT: and r12, r2, #3 1416; CHECK-NEXT: cmp r3, #3 1417; CHECK-NEXT: bhs .LBB9_4 1418; CHECK-NEXT: @ %bb.2: 1419; CHECK-NEXT: vldr s0, .LCPI9_0 1420; CHECK-NEXT: movs r2, #0 1421; CHECK-NEXT: b .LBB9_6 1422; CHECK-NEXT: .LBB9_3: 1423; CHECK-NEXT: vldr s0, .LCPI9_0 1424; CHECK-NEXT: pop {r4, r5, r7, pc} 1425; CHECK-NEXT: .LBB9_4: @ %for.body.preheader.new 1426; CHECK-NEXT: bic r2, r2, #3 1427; CHECK-NEXT: movs r3, #1 1428; CHECK-NEXT: subs r2, #4 1429; CHECK-NEXT: vldr s0, .LCPI9_0 1430; CHECK-NEXT: add.w lr, r3, r2, lsr #2 1431; CHECK-NEXT: movs r3, #0 1432; CHECK-NEXT: movs r2, #0 1433; CHECK-NEXT: .LBB9_5: @ %for.body 1434; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1435; CHECK-NEXT: adds r5, r0, r3 1436; CHECK-NEXT: adds r4, r1, r3 1437; CHECK-NEXT: vldr.16 s2, [r4, #6] 1438; CHECK-NEXT: vldr.16 s4, [r5, #6] 1439; CHECK-NEXT: vldr.16 s6, [r5, #4] 1440; CHECK-NEXT: vldr.16 s8, [r5, #2] 1441; CHECK-NEXT: vmul.f16 s2, s4, s2 1442; CHECK-NEXT: vldr.16 s4, [r4, #4] 1443; CHECK-NEXT: vldr.16 s10, [r5] 1444; CHECK-NEXT: vcvtb.f32.f16 s2, s2 1445; CHECK-NEXT: vmul.f16 s4, s6, s4 1446; CHECK-NEXT: vldr.16 s6, [r4, #2] 1447; CHECK-NEXT: vcvtb.f32.f16 s4, s4 1448; CHECK-NEXT: adds r3, #8 1449; CHECK-NEXT: vmul.f16 s6, s8, s6 1450; CHECK-NEXT: vldr.16 s8, [r4] 1451; CHECK-NEXT: vcvtb.f32.f16 s6, s6 1452; CHECK-NEXT: adds r2, #4 1453; CHECK-NEXT: vmul.f16 s8, s10, s8 1454; CHECK-NEXT: vcvtb.f32.f16 s8, s8 1455; CHECK-NEXT: vadd.f32 s0, s0, s8 1456; CHECK-NEXT: 
vadd.f32 s0, s0, s6 1457; CHECK-NEXT: vadd.f32 s0, s0, s4 1458; CHECK-NEXT: vadd.f32 s0, s0, s2 1459; CHECK-NEXT: le lr, .LBB9_5 1460; CHECK-NEXT: .LBB9_6: @ %for.cond.cleanup.loopexit.unr-lcssa 1461; CHECK-NEXT: wls lr, r12, .LBB9_9 1462; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader 1463; CHECK-NEXT: add.w r0, r0, r2, lsl #1 1464; CHECK-NEXT: add.w r1, r1, r2, lsl #1 1465; CHECK-NEXT: .LBB9_8: @ %for.body.epil 1466; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1467; CHECK-NEXT: vldr.16 s2, [r1] 1468; CHECK-NEXT: vldr.16 s4, [r0] 1469; CHECK-NEXT: adds r0, #2 1470; CHECK-NEXT: adds r1, #2 1471; CHECK-NEXT: vmul.f16 s2, s4, s2 1472; CHECK-NEXT: vcvtb.f32.f16 s2, s2 1473; CHECK-NEXT: vadd.f32 s0, s0, s2 1474; CHECK-NEXT: le lr, .LBB9_8 1475; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup 1476; CHECK-NEXT: pop {r4, r5, r7, pc} 1477; CHECK-NEXT: .p2align 2 1478; CHECK-NEXT: @ %bb.10: 1479; CHECK-NEXT: .LCPI9_0: 1480; CHECK-NEXT: .long 0x00000000 @ float 0 1481entry: 1482 %cmp8 = icmp eq i32 %N, 0 1483 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 1484 1485for.body.preheader: ; preds = %entry 1486 %0 = add i32 %N, -1 1487 %xtraiter = and i32 %N, 3 1488 %1 = icmp ult i32 %0, 3 1489 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new 1490 1491for.body.preheader.new: ; preds = %for.body.preheader 1492 %unroll_iter = sub i32 %N, %xtraiter 1493 br label %for.body 1494 1495for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader 1496 %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ] 1497 %i.010.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ] 1498 %res.09.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ] 1499 %lcmp.mod = icmp eq i32 %xtraiter, 0 1500 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil 1501 1502for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil 1503 %i.010.epil = phi i32 [ 
%inc.epil, %for.body.epil ], [ %i.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ] 1504 %res.09.epil = phi float [ %add.epil, %for.body.epil ], [ %res.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ] 1505 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ] 1506 %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.010.epil 1507 %2 = load half, ptr %arrayidx.epil, align 2 1508 %arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.010.epil 1509 %3 = load half, ptr %arrayidx1.epil, align 2 1510 %mul.epil = fmul half %2, %3 1511 %conv.epil = fpext half %mul.epil to float 1512 %add.epil = fadd float %res.09.epil, %conv.epil 1513 %inc.epil = add nuw i32 %i.010.epil, 1 1514 %epil.iter.sub = add i32 %epil.iter, -1 1515 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0 1516 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil 1517 1518for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry 1519 %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ] 1520 ret float %res.0.lcssa 1521 1522for.body: ; preds = %for.body, %for.body.preheader.new 1523 %i.010 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ] 1524 %res.09 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ] 1525 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ] 1526 %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.010 1527 %4 = load half, ptr %arrayidx, align 2 1528 %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.010 1529 %5 = load half, ptr %arrayidx1, align 2 1530 %mul = fmul half %4, %5 1531 %conv = fpext half %mul to float 1532 %add = fadd float %res.09, %conv 1533 %inc = or disjoint i32 %i.010, 1 1534 %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc 1535 %6 = load half, ptr %arrayidx.1, align 2 1536 %arrayidx1.1 = 
getelementptr inbounds half, ptr %b, i32 %inc
  %7 = load half, ptr %arrayidx1.1, align 2
  %mul.1 = fmul half %6, %7
  %conv.1 = fpext half %mul.1 to float
  %add.1 = fadd float %add, %conv.1
  %inc.1 = or disjoint i32 %i.010, 2
  %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
  %8 = load half, ptr %arrayidx.2, align 2
  %arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
  %9 = load half, ptr %arrayidx1.2, align 2
  %mul.2 = fmul half %8, %9
  %conv.2 = fpext half %mul.2 to float
  %add.2 = fadd float %add.1, %conv.2
  %inc.2 = or disjoint i32 %i.010, 3
  %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
  %10 = load half, ptr %arrayidx.3, align 2
  %arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
  %11 = load half, ptr %arrayidx1.3, align 2
  %mul.3 = fmul half %10, %11
  %conv.3 = fpext half %mul.3 to float
  %add.3 = fadd float %add.2, %conv.3
  %inc.3 = add nuw i32 %i.010, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; half_half_acc: float reduction over N half-precision element pairs.
;
; Equivalent scalar loop:
;   float res = 0.0; for (i = 0; i < N; i++) res += (float)(a[i] + b[i]);
; where %a and %b both point to half elements and the addition a[i] + b[i] is
; performed in half precision before being extended to float.
;
; The input IR is already unrolled by four (%for.body) and followed by a
; remainder loop (%for.body.epil) that runs N & 3 times.
;
; The FileCheck assertions directly below the define line are autogenerated
; (see the NOTE on the first line of this file). They expect the computation
; to stay scalar (vldr.16 / vadd.f16 / vcvtb.f32.f16 / vadd.f32) with both
; loops driven by wls/le on lr. Do not edit them by hand; regenerate them with
; utils/update_llc_test_checks.py after any change to the IR.
define arm_aapcs_vfpcc float @half_half_acc(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_half_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: cbz r2, .LBB10_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and r12, r2, #3
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB10_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI10_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB10_6
; CHECK-NEXT: .LBB10_3:
; CHECK-NEXT: vldr s0, .LCPI10_0
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .LBB10_4: @ %for.body.preheader.new
; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldr s0, .LCPI10_0
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: .LBB10_5: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r5, r0, r3
; CHECK-NEXT: adds r4, r1, r3
; CHECK-NEXT: vldr.16 s2, [r4, #6]
; CHECK-NEXT: vldr.16 s4, [r5, #6]
; CHECK-NEXT: vldr.16 s6, [r5, #4]
; CHECK-NEXT: vldr.16 s8, [r5, #2]
; CHECK-NEXT: vadd.f16 s2, s4, s2
; CHECK-NEXT: vldr.16 s4, [r4, #4]
; CHECK-NEXT: vldr.16 s10, [r5]
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vadd.f16 s4, s6, s4
; CHECK-NEXT: vldr.16 s6, [r4, #2]
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vadd.f16 s6, s8, s6
; CHECK-NEXT: vldr.16 s8, [r4]
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: vadd.f16 s8, s10, s8
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB10_5
; CHECK-NEXT: .LBB10_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, r12, .LBB10_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r0, r0, r2, lsl #1
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
; CHECK-NEXT: .LBB10_8: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldr.16 s2, [r1]
; CHECK-NEXT: vldr.16 s4, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: adds r1, #2
; CHECK-NEXT: vadd.f16 s2, s4, s2
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB10_8
; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI10_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
; N == 0: branch straight to the exit, which then returns 0.0.
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

; %xtraiter = N & 3 is the trip count of the remainder loop. When N - 1 < 3
; (N is 1, 2 or 3 here, since N != 0) the unrolled loop is skipped entirely.
for.body.preheader:                               ; preds = %entry
  %0 = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %1 = icmp ult i32 %0, 3
  br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

; %unroll_iter = N - (N & 3): the element count handled by the unrolled loop,
; always a multiple of four.
for.body.preheader.new:                           ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

; Join point between the unrolled loop and the remainder loop. It merges the
; next element index and the running sum, then skips the remainder loop when
; N & 3 == 0.
; NOTE(review): the undef incoming value of %add2.lcssa.ph is only consumed
; when this block branches directly to %for.cond.cleanup (%xtraiter == 0).
; Arriving from %for.body.preheader implies N is 1..3, so %xtraiter == N != 0
; and that path always continues into %for.body.epil; the undef is therefore
; never observed.
for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  %add2.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add2.3, %for.body ]
  %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %res.010.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add2.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

; Remainder loop: one element per iteration, with %epil.iter counting down
; from %xtraiter to zero.
for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
  %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %res.010.epil = phi float [ %add2.epil, %for.body.epil ], [ %res.010.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.011.epil
  %2 = load half, ptr %arrayidx.epil, align 2
  %arrayidx1.epil = getelementptr inbounds half, ptr %b, i32 %i.011.epil
  %3 = load half, ptr %arrayidx1.epil, align 2
  %add.epil = fadd half %2, %3
  %conv.epil = fpext half %add.epil to float
  %add2.epil = fadd float %res.010.epil, %conv.epil
  %inc.epil = add nuw i32 %i.011.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

; Single exit: returns 0.0 for N == 0, otherwise the sum produced by whichever
; loop ran last.
for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add2.epil, %for.body.epil ]
  ret float %res.0.lcssa

; Main loop, four elements per iteration; %niter counts down from %unroll_iter
; in steps of four. %i.011 starts at 0 and advances by 4, so its low two bits
; are always clear and the "or disjoint" with 1, 2 and 3 yields the indices
; i+1, i+2 and i+3. The four float additions form one sequential chain
; (%add2 -> %add2.1 -> %add2.2 -> %add2.3).
for.body:                                         ; preds = %for.body, %for.body.preheader.new
  %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %res.010 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add2.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.011
  %4 = load half, ptr %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds half, ptr %b, i32 %i.011
  %5 = load half, ptr %arrayidx1, align 2
  %add = fadd half %4, %5
  %conv = fpext half %add to float
  %add2 = fadd float %res.010, %conv
  %inc = or disjoint i32 %i.011, 1
  %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
  %6 = load half, ptr %arrayidx.1, align 2
  %arrayidx1.1 = getelementptr inbounds half, ptr %b, i32 %inc
  %7 = load half, ptr %arrayidx1.1, align 2
  %add.1 = fadd half %6, %7
  %conv.1 = fpext half %add.1 to float
  %add2.1 = fadd float %add2, %conv.1
  %inc.1 = or disjoint i32 %i.011, 2
  %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
  %8 = load half, ptr %arrayidx.2, align 2
  %arrayidx1.2 = getelementptr inbounds half, ptr %b, i32 %inc.1
  %9 = load half, ptr %arrayidx1.2, align 2
  %add.2 = fadd half %8, %9
  %conv.2 = fpext half %add.2 to float
  %add2.2 = fadd float %add2.1, %conv.2
  %inc.2 = or disjoint i32 %i.011, 3
  %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
  %10 = load half, ptr %arrayidx.3, align 2
  %arrayidx1.3 = getelementptr inbounds half, ptr %b, i32 %inc.2
  %11 = load half, ptr %arrayidx1.3, align 2
  %add.3 = fadd half %10, %11
  %conv.3 = fpext half %add.3 to float
  %add2.3 = fadd float %add2.2, %conv.3
  %inc.3 = add nuw i32 %i.011, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

; half_short_mac: float multiply-accumulate over N (half, i16) element pairs.
;
; Equivalent scalar loop:
;   float res = 0.0; for (i = 0; i < N; i++) res += (float)(a[i] * (half)b[i]);
; where %a points to half elements and %b to signed 16-bit integers. Each b[i]
; is converted with sitofp to half, multiplied in half precision, and the
; product is extended to float before being accumulated.
;
; The input IR is already unrolled by four (%for.body) and followed by a
; remainder loop (%for.body.epil) that runs N & 3 times.
;
; The FileCheck assertions directly below the define line are autogenerated
; (see the NOTE on the first line of this file). They expect scalar code:
; ldrsh + vmov + vcvt.f16.s32 for the integer operand, then vmul.f16,
; vcvtb.f32.f16 and vadd.f32, with both loops driven by wls/le on lr. Do not
; edit them by hand; regenerate them with utils/update_llc_test_checks.py
; after any change to the IR.
define arm_aapcs_vfpcc float @half_short_mac(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_short_mac:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cbz r2, .LBB11_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and r12, r2, #3
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB11_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB11_6
; CHECK-NEXT: .LBB11_3:
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: .LBB11_4: @ %for.body.preheader.new
; CHECK-NEXT: bic r2, r2, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: adds r4, r0, #4
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: adds r3, r1, #4
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: .LBB11_5: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh.w r5, [r3, #2]
; CHECK-NEXT: vldr.16 s2, [r4, #2]
; CHECK-NEXT: adds r2, #4
; CHECK-NEXT: vmov s4, r5
; CHECK-NEXT: ldrsh r5, [r3], #8
; CHECK-NEXT: vcvt.f16.s32 s4, s4
; CHECK-NEXT: ldrsh r6, [r3, #-10]
; CHECK-NEXT: vmul.f16 s2, s2, s4
; CHECK-NEXT: vmov s6, r5
; CHECK-NEXT: vldr.16 s4, [r4]
; CHECK-NEXT: vcvt.f16.s32 s6, s6
; CHECK-NEXT: ldrsh r5, [r3, #-12]
; CHECK-NEXT: vmul.f16 s4, s4, s6
; CHECK-NEXT: vmov s8, r6
; CHECK-NEXT: vldr.16 s6, [r4, #-2]
; CHECK-NEXT: vcvt.f16.s32 s8, s8
; CHECK-NEXT: vmov s10, r5
; CHECK-NEXT: vcvtb.f32.f16 s4, s4
; CHECK-NEXT: vmul.f16 s6, s6, s8
; CHECK-NEXT: vldr.16 s8, [r4, #-4]
; CHECK-NEXT: vcvt.f16.s32 s10, s10
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
; CHECK-NEXT: vmul.f16 s8, s8, s10
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
; CHECK-NEXT: adds r4, #8
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB11_5
; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, r12, .LBB11_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r0, r0, r2, lsl #1
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
; CHECK-NEXT: .LBB11_8: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh r2, [r1], #2
; CHECK-NEXT: vldr.16 s2, [r0]
; CHECK-NEXT: adds r0, #2
; CHECK-NEXT: vmov s4, r2
; CHECK-NEXT: vcvt.f16.s32 s4, s4
; CHECK-NEXT: vmul.f16 s2, s2, s4
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB11_8
; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI11_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
; N == 0: branch straight to the exit, which then returns 0.0.
entry:
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader

; %xtraiter = N & 3 is the trip count of the remainder loop. When N - 1 < 3
; (N is 1, 2 or 3 here, since N != 0) the unrolled loop is skipped entirely.
for.body.preheader:                               ; preds = %entry
  %0 = add i32 %N, -1
  %xtraiter = and i32 %N, 3
  %1 = icmp ult i32 %0, 3
  br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

; %unroll_iter = N - (N & 3): the element count handled by the unrolled loop,
; always a multiple of four.
for.body.preheader.new:                           ; preds = %for.body.preheader
  %unroll_iter = sub i32 %N, %xtraiter
  br label %for.body

; Join point between the unrolled loop and the remainder loop. It merges the
; next element index and the running sum, then skips the remainder loop when
; N & 3 == 0.
; NOTE(review): the undef incoming value of %add.lcssa.ph is only consumed
; when this block branches directly to %for.cond.cleanup (%xtraiter == 0).
; Arriving from %for.body.preheader implies N is 1..3, so %xtraiter == N != 0
; and that path always continues into %for.body.epil; the undef is therefore
; never observed.
for.cond.cleanup.loopexit.unr-lcssa:              ; preds = %for.body, %for.body.preheader
  %add.lcssa.ph = phi float [ undef, %for.body.preheader ], [ %add.3, %for.body ]
  %i.012.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
  %res.011.unr = phi float [ 0.000000e+00, %for.body.preheader ], [ %add.3, %for.body ]
  %lcmp.mod = icmp eq i32 %xtraiter, 0
  br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil

; Remainder loop: one element per iteration, with %epil.iter counting down
; from %xtraiter to zero.
for.body.epil:                                    ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
  %i.012.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.012.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %res.011.epil = phi float [ %add.epil, %for.body.epil ], [ %res.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %arrayidx.epil = getelementptr inbounds half, ptr %a, i32 %i.012.epil
  %2 = load half, ptr %arrayidx.epil, align 2
  %arrayidx1.epil = getelementptr inbounds i16, ptr %b, i32 %i.012.epil
  %3 = load i16, ptr %arrayidx1.epil, align 2
  %conv2.epil = sitofp i16 %3 to half
  %mul.epil = fmul half %2, %conv2.epil
  %conv3.epil = fpext half %mul.epil to float
  %add.epil = fadd float %res.011.epil, %conv3.epil
  %inc.epil = add nuw i32 %i.012.epil, 1
  %epil.iter.sub = add i32 %epil.iter, -1
  %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
  br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil

; Single exit: returns 0.0 for N == 0, otherwise the sum produced by whichever
; loop ran last.
for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
  %res.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa.ph, %for.cond.cleanup.loopexit.unr-lcssa ], [ %add.epil, %for.body.epil ]
  ret float %res.0.lcssa

; Main loop, four elements per iteration; %niter counts down from %unroll_iter
; in steps of four. %i.012 starts at 0 and advances by 4, so its low two bits
; are always clear and the "or disjoint" with 1, 2 and 3 yields the indices
; i+1, i+2 and i+3. The four float additions form one sequential chain
; (%add -> %add.1 -> %add.2 -> %add.3).
for.body:                                         ; preds = %for.body, %for.body.preheader.new
  %i.012 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
  %res.011 = phi float [ 0.000000e+00, %for.body.preheader.new ], [ %add.3, %for.body ]
  %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %arrayidx = getelementptr inbounds half, ptr %a, i32 %i.012
  %4 = load half, ptr %arrayidx, align 2
  %arrayidx1 = getelementptr inbounds i16, ptr %b, i32 %i.012
  %5 = load i16, ptr %arrayidx1, align 2
  %conv2 = sitofp i16 %5 to half
  %mul = fmul half %4, %conv2
  %conv3 = fpext half %mul to float
  %add = fadd float %res.011, %conv3
  %inc = or disjoint i32 %i.012, 1
  %arrayidx.1 = getelementptr inbounds half, ptr %a, i32 %inc
  %6 = load half, ptr %arrayidx.1, align 2
  %arrayidx1.1 = getelementptr inbounds i16, ptr %b, i32 %inc
  %7 = load i16, ptr %arrayidx1.1, align 2
  %conv2.1 = sitofp i16 %7 to half
  %mul.1 = fmul half %6, %conv2.1
  %conv3.1 = fpext half %mul.1 to float
  %add.1 = fadd float %add, %conv3.1
  %inc.1 = or disjoint i32 %i.012, 2
  %arrayidx.2 = getelementptr inbounds half, ptr %a, i32 %inc.1
  %8 = load half, ptr %arrayidx.2, align 2
  %arrayidx1.2 = getelementptr inbounds i16, ptr %b, i32 %inc.1
  %9 = load i16, ptr %arrayidx1.2, align 2
  %conv2.2 = sitofp i16 %9 to half
  %mul.2 = fmul half %8, %conv2.2
  %conv3.2 = fpext half %mul.2 to float
  %add.2 = fadd float %add.1, %conv3.2
  %inc.2 = or disjoint i32 %i.012, 3
  %arrayidx.3 = getelementptr inbounds half, ptr %a, i32 %inc.2
  %10 = load half, ptr %arrayidx.3, align 2
  %arrayidx1.3 = getelementptr inbounds i16, ptr %b, i32 %inc.2
  %11 = load i16, ptr %arrayidx1.3, align 2
  %conv2.3 = sitofp i16 %11 to half
  %mul.3 = fmul half %10, %conv2.3
  %conv3.3 = fpext half %mul.3 to float
  %add.3 = fadd float %add.2, %conv3.3
  %inc.3 = add nuw i32 %i.012, 4
  %niter.nsub.3 = add i32 %niter, -4
  %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
  br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}
