; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s

; @foo is a pre-vectorized two-level loop nest. In C-like pseudocode:
;
;   for (i = 0; i < limit; ++i)
;     for (j = 0; j < limit; ++j)
;       out[i * limit + j] += (i32)y[j] * (i32)y[i];
;
; where y holds i16 elements and out holds i32 elements. The inner loop is
; present twice: a vector body that handles 16 elements per iteration (as two
; <8 x i32> halves) and a scalar remainder loop. The assertions below pin the
; AArch64 lowering of the vector body's sign-extend + multiply + add chain to
; widening multiply-accumulate instructions (smlal / smlal2).
;
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.

define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
; CHECK-LABEL: foo:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT:    cmp w0, #1
; CHECK-NEXT:    b.lt .LBB0_10
; CHECK-NEXT:  // %bb.1: // %for.cond1.preheader.us.preheader
; CHECK-NEXT:    mov w10, w0
; CHECK-NEXT:    ubfiz x11, x0, #2, #32
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    mov x9, xzr
; CHECK-NEXT:    and x12, x10, #0xfffffff0
; CHECK-NEXT:    add x13, x1, #32
; CHECK-NEXT:    add x14, x2, #16
; CHECK-NEXT:    b .LBB0_3
; CHECK-NEXT:  .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    add x9, x9, #1
; CHECK-NEXT:    add x13, x13, x11
; CHECK-NEXT:    add x8, x8, x10
; CHECK-NEXT:    cmp x9, x10
; CHECK-NEXT:    b.eq .LBB0_10
; CHECK-NEXT:  .LBB0_3: // %for.cond1.preheader.us
; CHECK-NEXT:    // =>This Loop Header: Depth=1
; CHECK-NEXT:    // Child Loop BB0_6 Depth 2
; CHECK-NEXT:    // Child Loop BB0_9 Depth 2
; CHECK-NEXT:    ldrsh w15, [x2, x9, lsl #1]
; CHECK-NEXT:    cmp w0, #16
; CHECK-NEXT:    b.hs .LBB0_5
; CHECK-NEXT:  // %bb.4: // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    mov x18, xzr
; CHECK-NEXT:    b .LBB0_8
; CHECK-NEXT:  .LBB0_5: // %vector.ph
; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    dup v0.8h, w15
; CHECK-NEXT:    mov x16, x14
; CHECK-NEXT:    mov x17, x13
; CHECK-NEXT:    mov x18, x12
; CHECK-NEXT:  .LBB0_6: // %vector.body
; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
; CHECK-NEXT:    ldp q1, q4, [x16, #-16]
; CHECK-NEXT:    subs x18, x18, #16
; CHECK-NEXT:    ldp q3, q2, [x17, #-32]
; CHECK-NEXT:    add x16, x16, #32
; CHECK-NEXT:    ldp q6, q5, [x17]
; CHECK-NEXT:    smlal2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT:    smlal v3.4s, v0.4h, v1.4h
; CHECK-NEXT:    smlal2 v5.4s, v0.8h, v4.8h
; CHECK-NEXT:    smlal v6.4s, v0.4h, v4.4h
; CHECK-NEXT:    stp q3, q2, [x17, #-32]
; CHECK-NEXT:    stp q6, q5, [x17], #64
; CHECK-NEXT:    b.ne .LBB0_6
; CHECK-NEXT:  // %bb.7: // %middle.block
; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    cmp x12, x10
; CHECK-NEXT:    mov x18, x12
; CHECK-NEXT:    b.eq .LBB0_2
; CHECK-NEXT:  .LBB0_8: // %for.body4.us.preheader
; CHECK-NEXT:    // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT:    add x16, x18, x8
; CHECK-NEXT:    add x17, x2, x18, lsl #1
; CHECK-NEXT:    sub x18, x10, x18
; CHECK-NEXT:    add x16, x1, x16, lsl #2
; CHECK-NEXT:  .LBB0_9: // %for.body4.us
; CHECK-NEXT:    // Parent Loop BB0_3 Depth=1
; CHECK-NEXT:    // => This Inner Loop Header: Depth=2
; CHECK-NEXT:    ldrsh w3, [x17], #2
; CHECK-NEXT:    ldr w4, [x16]
; CHECK-NEXT:    subs x18, x18, #1
; CHECK-NEXT:    madd w3, w3, w15, w4
; CHECK-NEXT:    str w3, [x16], #4
; CHECK-NEXT:    b.ne .LBB0_9
; CHECK-NEXT:    b .LBB0_2
; CHECK-NEXT:  .LBB0_10: // %for.cond.cleanup
; CHECK-NEXT:    ret
entry:
  ; Nothing to do unless limit > 0 (signed compare).
  %cmp26 = icmp sgt i32 %limit, 0
  br i1 %cmp26, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup

for.cond1.preheader.us.preheader:                 ; preds = %entry
  ; Loop-invariant values shared by every outer iteration.
  %0 = zext i32 %limit to i64
  %wide.trip.count34 = zext i32 %limit to i64
  ; Fewer than 16 iterations: bypass the vector body entirely.
  %min.iters.check = icmp ult i32 %limit, 16
  ; 4294967280 == 0xFFFFFFF0: trip count rounded down to a multiple of 16.
  %n.vec = and i64 %wide.trip.count34, 4294967280
  ; True when the vector body covers every iteration (no scalar remainder).
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count34
  br label %for.cond1.preheader.us

; Outer loop header: loads y[i], sign-extends it, and computes the row offset.
for.cond1.preheader.us:                           ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
  %indvars.iv30 = phi i64 [ 0, %for.cond1.preheader.us.preheader ], [ %indvars.iv.next31, %for.cond1.for.cond.cleanup3_crit_edge.us ]
  %arrayidx.us = getelementptr inbounds i16, ptr %y, i64 %indvars.iv30
  %1 = load i16, ptr %arrayidx.us, align 2
  %conv.us = sext i16 %1 to i32
  ; %2 = i * limit, the element offset of row i within %out.
  %2 = mul nsw i64 %indvars.iv30, %0
  br i1 %min.iters.check, label %for.body4.us.preheader, label %vector.ph

; Vector preheader: broadcasts the sign-extended y[i] across all 8 lanes
; (once per half of the 16-wide vector iteration).
vector.ph:                                        ; preds = %for.cond1.preheader.us
  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv.us, i64 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
  %broadcast.splatinsert37 = insertelement <8 x i32> poison, i32 %conv.us, i64 0
  %broadcast.splat38 = shufflevector <8 x i32> %broadcast.splatinsert37, <8 x i32> poison, <8 x i32> zeroinitializer
  br label %vector.body

; Vector body: 16 elements per iteration, processed as two <8 x i32> halves.
vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Load y[index .. index+15] as two <8 x i16> vectors and sign-extend to i32.
  %3 = getelementptr inbounds i16, ptr %y, i64 %index
  %wide.load = load <8 x i16>, ptr %3, align 2
  %4 = getelementptr inbounds i16, ptr %3, i64 8
  %wide.load36 = load <8 x i16>, ptr %4, align 2
  %5 = sext <8 x i16> %wide.load to <8 x i32>
  %6 = sext <8 x i16> %wide.load36 to <8 x i32>
  ; Multiply each lane by the broadcast y[i].
  %7 = mul nsw <8 x i32> %broadcast.splat, %5
  %8 = mul nsw <8 x i32> %broadcast.splat38, %6
  ; Accumulate into out[i * limit + index .. + 15].
  %9 = add nuw nsw i64 %index, %2
  %10 = getelementptr inbounds i32, ptr %out, i64 %9
  %wide.load39 = load <8 x i32>, ptr %10, align 4
  %11 = getelementptr inbounds i32, ptr %10, i64 8
  %wide.load40 = load <8 x i32>, ptr %11, align 4
  %12 = add nsw <8 x i32> %7, %wide.load39
  %13 = add nsw <8 x i32> %8, %wide.load40
  store <8 x i32> %12, ptr %10, align 4
  store <8 x i32> %13, ptr %11, align 4
  %index.next = add nuw i64 %index, 16
  %14 = icmp eq i64 %index.next, %n.vec
  br i1 %14, label %middle.block, label %vector.body

; After the vector body: skip the scalar loop when nothing remains.
middle.block:                                     ; preds = %vector.body
  br i1 %cmp.n, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.preheader

; Scalar loop preheader: start at 0 when the vector body was bypassed,
; otherwise resume at %n.vec.
for.body4.us.preheader:                           ; preds = %for.cond1.preheader.us, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.cond1.preheader.us ], [ %n.vec, %middle.block ]
  br label %for.body4.us

; Scalar inner loop: out[i * limit + j] += sext(y[j]) * sext(y[i]).
for.body4.us:                                     ; preds = %for.body4.us.preheader, %for.body4.us
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.us ], [ %indvars.iv.ph, %for.body4.us.preheader ]
  %arrayidx6.us = getelementptr inbounds i16, ptr %y, i64 %indvars.iv
  %15 = load i16, ptr %arrayidx6.us, align 2
  %conv7.us = sext i16 %15 to i32
  %mul.us = mul nsw i32 %conv7.us, %conv.us
  %16 = add nuw nsw i64 %indvars.iv, %2
  %arrayidx10.us = getelementptr inbounds i32, ptr %out, i64 %16
  %17 = load i32, ptr %arrayidx10.us, align 4
  %add11.us = add nsw i32 %mul.us, %17
  store i32 %add11.us, ptr %arrayidx10.us, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count34
  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us

; Outer loop latch: advance i and exit once i == limit.
for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us, %middle.block
  %indvars.iv.next31 = add nuw nsw i64 %indvars.iv30, 1
  %exitcond35.not = icmp eq i64 %indvars.iv.next31, %wide.trip.count34
  br i1 %exitcond35.not, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
  ret void
}