; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI

; matrix_mul_unsigned: each iteration loads two <4 x i16> vectors from %A,
; zero-extends them to <4 x i32>, multiplies each by a splat of zext(%val) and
; stores both products to %C. The element index is zext(%N + trunc(%index)) and
; %index advances by 8 until it equals %n.vec.
; The SelectionDAG assertions expect a 4h dup feeding umull; the GlobalISel
; assertions expect ushll followed by a 4s mul.
; %min.iters.check, %1, %2 and %cmp.n have no users in this function.
define void @matrix_mul_unsigned(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_unsigned:
; CHECK-SD:       // %bb.0: // %vector.header
; CHECK-SD-NEXT:    dup v0.4h, w3
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
; CHECK-SD-NEXT:  .LBB0_1: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT:    subs x8, x8, #8
; CHECK-SD-NEXT:    ldp d1, d2, [x9]
; CHECK-SD-NEXT:    add x9, x1, w0, uxtw #2
; CHECK-SD-NEXT:    add w0, w0, #8
; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT:    stp q1, q2, [x9]
; CHECK-SD-NEXT:    b.ne .LBB0_1
; CHECK-SD-NEXT:  // %bb.2: // %for.end12
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: matrix_mul_unsigned:
; CHECK-GI:       // %bb.0: // %vector.header
; CHECK-GI-NEXT:    and w8, w3, #0xffff
; CHECK-GI-NEXT:    dup v0.4s, w8
; CHECK-GI-NEXT:    mov w8, w0
; CHECK-GI-NEXT:    and x8, x8, #0xfffffff8
; CHECK-GI-NEXT:  .LBB0_1: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT:    subs x8, x8, #8
; CHECK-GI-NEXT:    ldp d1, d2, [x9]
; CHECK-GI-NEXT:    add x9, x1, w0, uxtw #2
; CHECK-GI-NEXT:    add w0, w0, #8
; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    stp q1, q2, [x9]
; CHECK-GI-NEXT:    b.ne .LBB0_1
; CHECK-GI-NEXT:  // %bb.2: // %for.end12
; CHECK-GI-NEXT:    ret
vector.header:
  %conv4 = zext i16 %val to i32
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967288
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <4 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <4 x i16>, ptr %9, align 2
  %10 = zext <4 x i16> %wide.load to <4 x i32>
  %11 = zext <4 x i16> %wide.load30 to <4 x i32>
  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <4 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 4
  %17 = bitcast ptr %16 to ptr
  store <4 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 8
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}

; matrix_mul_signed: signed counterpart of the loop above. %val, %N, the
; element index and both loaded <4 x i16> vectors are sign-extended, and the
; multiplies carry only nsw.
; The SelectionDAG assertions expect a 4h dup feeding smull; the GlobalISel
; assertions expect sshll followed by a 4s mul.
define void @matrix_mul_signed(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_signed:
; CHECK-SD:       // %bb.0: // %vector.header
; CHECK-SD-NEXT:    dup v0.4h, w3
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
; CHECK-SD-NEXT:  .LBB1_1: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    add x9, x2, w0, sxtw #1
; CHECK-SD-NEXT:    subs x8, x8, #8
; CHECK-SD-NEXT:    ldp d1, d2, [x9]
; CHECK-SD-NEXT:    add x9, x1, w0, sxtw #2
; CHECK-SD-NEXT:    add w0, w0, #8
; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    smull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT:    stp q1, q2, [x9]
; CHECK-SD-NEXT:    b.ne .LBB1_1
; CHECK-SD-NEXT:  // %bb.2: // %for.end12
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: matrix_mul_signed:
; CHECK-GI:       // %bb.0: // %vector.header
; CHECK-GI-NEXT:    sxth w9, w3
; CHECK-GI-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-GI-NEXT:    sxtw x8, w0
; CHECK-GI-NEXT:    dup v0.4s, w9
; CHECK-GI-NEXT:    and x8, x8, #0xfffffff8
; CHECK-GI-NEXT:  .LBB1_1: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    add x9, x2, w0, sxtw #1
; CHECK-GI-NEXT:    subs x8, x8, #8
; CHECK-GI-NEXT:    ldp d1, d2, [x9]
; CHECK-GI-NEXT:    add x9, x1, w0, sxtw #2
; CHECK-GI-NEXT:    add w0, w0, #8
; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    stp q1, q2, [x9]
; CHECK-GI-NEXT:    b.ne .LBB1_1
; CHECK-GI-NEXT:  // %bb.2: // %for.end12
; CHECK-GI-NEXT:    ret
vector.header:
  %conv4 = sext i16 %val to i32
  %wide.trip.count = sext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967288
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = sext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <4 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <4 x i16>, ptr %9, align 2
  %10 = sext <4 x i16> %wide.load to <4 x i32>
  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
  %12 = mul nsw <4 x i32> %broadcast.splat, %10
  %13 = mul nsw <4 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <4 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 4
  %17 = bitcast ptr %16 to ptr
  store <4 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 8
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}

; matrix_mul_double_shuffle: each iteration loads one i16 from %A at %index,
; zero-extends it, inserts it into lane 0 of an undef <4 x i32> and shuffles
; that vector with the loop-invariant splat of zext(%val). The shuffle result is
; multiplied by the same splat and stored to %C at zext(%N + trunc(%index)).
define void @matrix_mul_double_shuffle(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i16 %val) {
; CHECK-SD-LABEL: matrix_mul_double_shuffle:
; CHECK-SD:       // %bb.0: // %vector.header
; CHECK-SD-NEXT:    dup v0.4h, w3
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 killed $x0 def $x0
; CHECK-SD-NEXT:  .LBB2_1: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldrh w9, [x2], #16
; CHECK-SD-NEXT:    subs x8, x8, #8
; CHECK-SD-NEXT:    dup v1.4h, w9
; CHECK-SD-NEXT:    ubfiz x9, x0, #2, #32
; CHECK-SD-NEXT:    add w0, w0, #8
; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    str q1, [x1, x9]
; CHECK-SD-NEXT:    b.ne .LBB2_1
; CHECK-SD-NEXT:  // %bb.2: // %for.end12
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: matrix_mul_double_shuffle:
; CHECK-GI:       // %bb.0: // %vector.header
; CHECK-GI-NEXT:    and w9, w3, #0xffff
; CHECK-GI-NEXT:    adrp x8, .LCPI2_0
; CHECK-GI-NEXT:    dup v0.4s, w9
; CHECK-GI-NEXT:    mov w9, w0
; CHECK-GI-NEXT:    ldr q1, [x8, :lo12:.LCPI2_0]
; CHECK-GI-NEXT:    and x8, x9, #0xfffffff8
; CHECK-GI-NEXT:  .LBB2_1: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldrh w9, [x2], #16
; CHECK-GI-NEXT:    subs x8, x8, #8
; CHECK-GI-NEXT:    mov v2.s[0], w9
; CHECK-GI-NEXT:    mov w9, w0
; CHECK-GI-NEXT:    add w0, w0, #8
; CHECK-GI-NEXT:    lsl x9, x9, #2
; CHECK-GI-NEXT:    tbl v2.16b, { v2.16b, v3.16b }, v1.16b
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    str q2, [x1, x9]
; CHECK-GI-NEXT:    b.ne .LBB2_1
; CHECK-GI-NEXT:  // %bb.2: // %for.end12
; CHECK-GI-NEXT:    ret
vector.header:
  %conv4 = zext i16 %val to i32
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967288
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %g = getelementptr inbounds i16, ptr %A, i64 %index
  %val1 = load i16, ptr %g
  %splat.input.ext = zext i16 %val1 to i32
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %splat.input.ext, i32 0
  ; NOTE(review): the shuffle-mask constant after the final <4 x i32> appears to
  ; be missing from this copy, and the assertions above do not pin it down
  ; uniquely. Restore it from the canonical test before running.
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> %broadcast.splat, <4 x i32>
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = mul nuw nsw <4 x i32> %broadcast.splat, %broadcast.splat32
  %7 = getelementptr inbounds i32, ptr %C, i64 %5
  %8 = bitcast ptr %7 to ptr
  store <4 x i32> %6, ptr %8, align 4
  %index.next = add i64 %index, 8
  %9 = icmp eq i64 %index.next, %n.vec
  br i1 %9, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}

; larger_smull: returns immediately unless %n is greater than 0. When %n is at
; least 16, a vector loop processes 16 elements per iteration: two <8 x i16>
; loads from %x are sign-extended, multiplied by a splat of sext(%y) and stored
; to %s as <8 x i32>. A scalar loop then handles the elements from %n.vec (or
; from 0 when %n is below 16) up to %n.
; The SelectionDAG assertions expect smull/smull2 in the vector loop; the
; GlobalISel assertions expect sshll/sshll2 followed by 4s mul.
define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr noalias nocapture noundef writeonly %s, i32 noundef %n) {
; CHECK-SD-LABEL: larger_smull:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    cmp w3, #1
; CHECK-SD-NEXT:    b.lt .LBB3_8
; CHECK-SD-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-SD-NEXT:    cmp w3, #15
; CHECK-SD-NEXT:    mov w8, w3
; CHECK-SD-NEXT:    b.hi .LBB3_3
; CHECK-SD-NEXT:  // %bb.2:
; CHECK-SD-NEXT:    mov x9, xzr
; CHECK-SD-NEXT:    b .LBB3_6
; CHECK-SD-NEXT:  .LBB3_3: // %vector.ph
; CHECK-SD-NEXT:    dup v0.8h, w1
; CHECK-SD-NEXT:    and x9, x8, #0xfffffff0
; CHECK-SD-NEXT:    add x10, x2, #32
; CHECK-SD-NEXT:    add x11, x0, #16
; CHECK-SD-NEXT:    mov x12, x9
; CHECK-SD-NEXT:  .LBB3_4: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldp q1, q2, [x11, #-16]
; CHECK-SD-NEXT:    subs x12, x12, #16
; CHECK-SD-NEXT:    add x11, x11, #32
; CHECK-SD-NEXT:    smull2 v3.4s, v0.8h, v1.8h
; CHECK-SD-NEXT:    smull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    smull2 v4.4s, v0.8h, v2.8h
; CHECK-SD-NEXT:    smull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT:    stp q1, q3, [x10, #-32]
; CHECK-SD-NEXT:    stp q2, q4, [x10], #64
; CHECK-SD-NEXT:    b.ne .LBB3_4
; CHECK-SD-NEXT:  // %bb.5: // %middle.block
; CHECK-SD-NEXT:    cmp x9, x8
; CHECK-SD-NEXT:    b.eq .LBB3_8
; CHECK-SD-NEXT:  .LBB3_6: // %for.body.preheader1
; CHECK-SD-NEXT:    sxth w10, w1
; CHECK-SD-NEXT:    add x11, x2, x9, lsl #2
; CHECK-SD-NEXT:    add x12, x0, x9, lsl #1
; CHECK-SD-NEXT:    sub x8, x8, x9
; CHECK-SD-NEXT:  .LBB3_7: // %for.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldrsh w9, [x12], #2
; CHECK-SD-NEXT:    subs x8, x8, #1
; CHECK-SD-NEXT:    mul w9, w9, w10
; CHECK-SD-NEXT:    str w9, [x11], #4
; CHECK-SD-NEXT:    b.ne .LBB3_7
; CHECK-SD-NEXT:  .LBB3_8: // %for.cond.cleanup
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: larger_smull:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    cmp w3, #0
; CHECK-GI-NEXT:    b.le .LBB3_7
; CHECK-GI-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-GI-NEXT:    sxth w8, w1
; CHECK-GI-NEXT:    mov x9, xzr
; CHECK-GI-NEXT:    cmp w3, #16
; CHECK-GI-NEXT:    mov w10, w3
; CHECK-GI-NEXT:    b.lo .LBB3_5
; CHECK-GI-NEXT:  // %bb.2: // %vector.ph
; CHECK-GI-NEXT:    dup v0.4s, w8
; CHECK-GI-NEXT:    and x9, x10, #0xfffffff0
; CHECK-GI-NEXT:    add x11, x2, #32
; CHECK-GI-NEXT:    add x12, x0, #16
; CHECK-GI-NEXT:    mov x13, x9
; CHECK-GI-NEXT:  .LBB3_3: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldp q1, q2, [x12, #-16]
; CHECK-GI-NEXT:    mov x14, x11
; CHECK-GI-NEXT:    subs x13, x13, #16
; CHECK-GI-NEXT:    add x12, x12, #32
; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT:    mul v3.4s, v0.4s, v3.4s
; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v4.4s, v0.4s, v4.4s
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    stp q3, q1, [x14, #-32]!
; CHECK-GI-NEXT:    stp q4, q2, [x11], #64
; CHECK-GI-NEXT:    b.ne .LBB3_3
; CHECK-GI-NEXT:  // %bb.4: // %middle.block
; CHECK-GI-NEXT:    cmp x9, x10
; CHECK-GI-NEXT:    b.eq .LBB3_7
; CHECK-GI-NEXT:  .LBB3_5: // %for.body.preheader1
; CHECK-GI-NEXT:    add x11, x2, x9, lsl #2
; CHECK-GI-NEXT:    add x12, x0, x9, lsl #1
; CHECK-GI-NEXT:    sub x9, x10, x9
; CHECK-GI-NEXT:  .LBB3_6: // %for.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldrsh w10, [x12], #2
; CHECK-GI-NEXT:    subs x9, x9, #1
; CHECK-GI-NEXT:    mul w10, w10, w8
; CHECK-GI-NEXT:    str w10, [x11], #4
; CHECK-GI-NEXT:    b.ne .LBB3_6
; CHECK-GI-NEXT:  .LBB3_7: // %for.cond.cleanup
; CHECK-GI-NEXT:    ret
entry:
  %conv1 = sext i16 %y to i32
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  %min.iters.check = icmp ult i32 %n, 16
  br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i64 %wide.trip.count, 4294967280
  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
  %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <8 x i16>, ptr %1, align 2
  %2 = getelementptr inbounds i16, ptr %0, i64 8
  %3 = bitcast ptr %2 to ptr
  %wide.load11 = load <8 x i16>, ptr %3, align 2
  %4 = sext <8 x i16> %wide.load to <8 x i32>
  %5 = sext <8 x i16> %wide.load11 to <8 x i32>
  %6 = mul nsw <8 x i32> %broadcast.splat, %4
  %7 = mul nsw <8 x i32> %broadcast.splat13, %5
  %8 = getelementptr inbounds i32, ptr %s, i64 %index
  %9 = bitcast ptr %8 to ptr
  store <8 x i32> %6, ptr %9, align 4
  %10 = getelementptr inbounds i32, ptr %8, i64 8
  %11 = bitcast ptr %10 to ptr
  store <8 x i32> %7, ptr %11, align 4
  %index.next = add nuw i64 %index, 16
  %12 = icmp eq i64 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14

for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader14, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
  %arrayidx = getelementptr inbounds i16, ptr %x, i64 %indvars.iv
  %13 = load i16, ptr %arrayidx, align 2
  %conv = sext i16 %13 to i32
  %mul = mul nsw i32 %conv, %conv1
  %arrayidx3 = getelementptr inbounds i32, ptr %s, i64 %indvars.iv
  store i32 %mul, ptr %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; larger_umull: unsigned counterpart of larger_smull. %y and the loaded i16
; elements are zero-extended and the multiplies carry nuw.
; The SelectionDAG assertions expect umull/umull2 in the vector loop; the
; GlobalISel assertions expect ushll/ushll2 followed by 4s mul.
define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr noalias nocapture noundef writeonly %s, i32 noundef %n) {
; CHECK-SD-LABEL: larger_umull:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    cmp w3, #1
; CHECK-SD-NEXT:    b.lt .LBB4_8
; CHECK-SD-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-SD-NEXT:    cmp w3, #15
; CHECK-SD-NEXT:    mov w8, w3
; CHECK-SD-NEXT:    b.hi .LBB4_3
; CHECK-SD-NEXT:  // %bb.2:
; CHECK-SD-NEXT:    mov x9, xzr
; CHECK-SD-NEXT:    b .LBB4_6
; CHECK-SD-NEXT:  .LBB4_3: // %vector.ph
; CHECK-SD-NEXT:    dup v0.8h, w1
; CHECK-SD-NEXT:    and x9, x8, #0xfffffff0
; CHECK-SD-NEXT:    add x10, x2, #32
; CHECK-SD-NEXT:    add x11, x0, #16
; CHECK-SD-NEXT:    mov x12, x9
; CHECK-SD-NEXT:  .LBB4_4: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldp q1, q2, [x11, #-16]
; CHECK-SD-NEXT:    subs x12, x12, #16
; CHECK-SD-NEXT:    add x11, x11, #32
; CHECK-SD-NEXT:    umull2 v3.4s, v0.8h, v1.8h
; CHECK-SD-NEXT:    umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT:    umull2 v4.4s, v0.8h, v2.8h
; CHECK-SD-NEXT:    umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT:    stp q1, q3, [x10, #-32]
; CHECK-SD-NEXT:    stp q2, q4, [x10], #64
; CHECK-SD-NEXT:    b.ne .LBB4_4
; CHECK-SD-NEXT:  // %bb.5: // %middle.block
; CHECK-SD-NEXT:    cmp x9, x8
; CHECK-SD-NEXT:    b.eq .LBB4_8
; CHECK-SD-NEXT:  .LBB4_6: // %for.body.preheader1
; CHECK-SD-NEXT:    add x10, x2, x9, lsl #2
; CHECK-SD-NEXT:    add x11, x0, x9, lsl #1
; CHECK-SD-NEXT:    and w12, w1, #0xffff
; CHECK-SD-NEXT:    sub x8, x8, x9
; CHECK-SD-NEXT:  .LBB4_7: // %for.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldrh w9, [x11], #2
; CHECK-SD-NEXT:    subs x8, x8, #1
; CHECK-SD-NEXT:    mul w9, w9, w12
; CHECK-SD-NEXT:    str w9, [x10], #4
; CHECK-SD-NEXT:    b.ne .LBB4_7
; CHECK-SD-NEXT:  .LBB4_8: // %for.cond.cleanup
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: larger_umull:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    cmp w3, #0
; CHECK-GI-NEXT:    b.le .LBB4_7
; CHECK-GI-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-GI-NEXT:    mov x8, xzr
; CHECK-GI-NEXT:    cmp w3, #16
; CHECK-GI-NEXT:    mov w9, w3
; CHECK-GI-NEXT:    b.lo .LBB4_5
; CHECK-GI-NEXT:  // %bb.2: // %vector.ph
; CHECK-GI-NEXT:    and x8, x9, #0xfffffff0
; CHECK-GI-NEXT:    add x10, x2, #32
; CHECK-GI-NEXT:    add x11, x0, #16
; CHECK-GI-NEXT:    mov x12, x8
; CHECK-GI-NEXT:  .LBB4_3: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldp q0, q1, [x11, #-16]
; CHECK-GI-NEXT:    and w13, w1, #0xffff
; CHECK-GI-NEXT:    dup v2.4s, w13
; CHECK-GI-NEXT:    mov x13, x10
; CHECK-GI-NEXT:    subs x12, x12, #16
; CHECK-GI-NEXT:    add x11, x11, #32
; CHECK-GI-NEXT:    ushll v3.4s, v0.4h, #0
; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
; CHECK-GI-NEXT:    ushll v4.4s, v1.4h, #0
; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    mul v3.4s, v2.4s, v3.4s
; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v0.4s
; CHECK-GI-NEXT:    mul v4.4s, v2.4s, v4.4s
; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v1.4s
; CHECK-GI-NEXT:    stp q3, q0, [x13, #-32]!
; CHECK-GI-NEXT:    stp q4, q1, [x10], #64
; CHECK-GI-NEXT:    b.ne .LBB4_3
; CHECK-GI-NEXT:  // %bb.4: // %middle.block
; CHECK-GI-NEXT:    cmp x8, x9
; CHECK-GI-NEXT:    b.eq .LBB4_7
; CHECK-GI-NEXT:  .LBB4_5: // %for.body.preheader1
; CHECK-GI-NEXT:    add x10, x2, x8, lsl #2
; CHECK-GI-NEXT:    add x11, x0, x8, lsl #1
; CHECK-GI-NEXT:    and w12, w1, #0xffff
; CHECK-GI-NEXT:    sub x8, x9, x8
; CHECK-GI-NEXT:  .LBB4_6: // %for.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldrh w9, [x11], #2
; CHECK-GI-NEXT:    subs x8, x8, #1
; CHECK-GI-NEXT:    mul w9, w9, w12
; CHECK-GI-NEXT:    str w9, [x10], #4
; CHECK-GI-NEXT:    b.ne .LBB4_6
; CHECK-GI-NEXT:  .LBB4_7: // %for.cond.cleanup
; CHECK-GI-NEXT:    ret
entry:
  %conv1 = zext i16 %y to i32
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  %min.iters.check = icmp ult i32 %n, 16
  br i1 %min.iters.check, label %for.body.preheader14, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i64 %wide.trip.count, 4294967280
  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1, i64 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
  %broadcast.splatinsert12 = insertelement <8 x i32> poison, i32 %conv1, i64 0
  %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %x, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <8 x i16>, ptr %1, align 2
  %2 = getelementptr inbounds i16, ptr %0, i64 8
  %3 = bitcast ptr %2 to ptr
  %wide.load11 = load <8 x i16>, ptr %3, align 2
  %4 = zext <8 x i16> %wide.load to <8 x i32>
  %5 = zext <8 x i16> %wide.load11 to <8 x i32>
  %6 = mul nuw <8 x i32> %broadcast.splat, %4
  %7 = mul nuw <8 x i32> %broadcast.splat13, %5
  %8 = getelementptr inbounds i32, ptr %s, i64 %index
  %9 = bitcast ptr %8 to ptr
  store <8 x i32> %6, ptr %9, align 4
  %10 = getelementptr inbounds i32, ptr %8, i64 8
  %11 = bitcast ptr %10 to ptr
  store <8 x i32> %7, ptr %11, align 4
  %index.next = add nuw i64 %index, 16
  %12 = icmp eq i64 %index.next, %n.vec
  br i1 %12, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader14

for.body.preheader14:                             ; preds = %for.body.preheader, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader14, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader14 ]
  %arrayidx = getelementptr inbounds i16, ptr %x, i64 %indvars.iv
  %13 = load i16, ptr %arrayidx, align 2
  %conv = zext i16 %13 to i32
  %mul = mul nuw i32 %conv, %conv1
  %arrayidx3 = getelementptr inbounds i32, ptr %s, i64 %indvars.iv
  store i32 %mul, ptr %arrayidx3, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; red_mla_dup_ext_u8_s8_s16: returns the i16 sum over i in [0, %n) of
; zext(%A[i]) * sext(%B), or 0 when %n is 0. When %n is at least 16, a vector
; loop keeps two <8 x i16> accumulators over two <8 x i8> loads per iteration;
; the middle block adds them and reduces with llvm.vector.reduce.add. A scalar
; loop handles the remaining elements.
; Both sets of assertions expect ushll on the loaded bytes followed by mla.
define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, i8 noundef %B, i32 noundef %n) {
; CHECK-SD-LABEL: red_mla_dup_ext_u8_s8_s16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    cbz w2, .LBB5_3
; CHECK-SD-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-SD-NEXT:    sxtb w9, w1
; CHECK-SD-NEXT:    cmp w2, #15
; CHECK-SD-NEXT:    mov w10, w2
; CHECK-SD-NEXT:    b.hi .LBB5_4
; CHECK-SD-NEXT:  // %bb.2:
; CHECK-SD-NEXT:    mov x11, xzr
; CHECK-SD-NEXT:    mov w8, wzr
; CHECK-SD-NEXT:    b .LBB5_7
; CHECK-SD-NEXT:  .LBB5_3:
; CHECK-SD-NEXT:    mov w8, wzr
; CHECK-SD-NEXT:    mov w0, w8
; CHECK-SD-NEXT:    ret
; CHECK-SD-NEXT:  .LBB5_4: // %vector.ph
; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
; CHECK-SD-NEXT:    and x11, x10, #0xfffffff0
; CHECK-SD-NEXT:    fmov s2, w9
; CHECK-SD-NEXT:    add x8, x0, #8
; CHECK-SD-NEXT:    mov x12, x11
; CHECK-SD-NEXT:  .LBB5_5: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldp d3, d4, [x8, #-8]
; CHECK-SD-NEXT:    subs x12, x12, #16
; CHECK-SD-NEXT:    add x8, x8, #16
; CHECK-SD-NEXT:    ushll v3.8h, v3.8b, #0
; CHECK-SD-NEXT:    ushll v4.8h, v4.8b, #0
; CHECK-SD-NEXT:    mla v0.8h, v3.8h, v2.h[0]
; CHECK-SD-NEXT:    mla v1.8h, v4.8h, v2.h[0]
; CHECK-SD-NEXT:    b.ne .LBB5_5
; CHECK-SD-NEXT:  // %bb.6: // %middle.block
; CHECK-SD-NEXT:    add v0.8h, v1.8h, v0.8h
; CHECK-SD-NEXT:    cmp x11, x10
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w8, s0
; CHECK-SD-NEXT:    b.eq .LBB5_9
; CHECK-SD-NEXT:  .LBB5_7: // %for.body.preheader1
; CHECK-SD-NEXT:    sub x10, x10, x11
; CHECK-SD-NEXT:    add x11, x0, x11
; CHECK-SD-NEXT:  .LBB5_8: // %for.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldrb w12, [x11], #1
; CHECK-SD-NEXT:    subs x10, x10, #1
; CHECK-SD-NEXT:    madd w8, w12, w9, w8
; CHECK-SD-NEXT:    b.ne .LBB5_8
; CHECK-SD-NEXT:  .LBB5_9: // %for.cond.cleanup
; CHECK-SD-NEXT:    mov w0, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    cbz w2, .LBB5_3
; CHECK-GI-NEXT:  // %bb.1: // %for.body.preheader
; CHECK-GI-NEXT:    cmp w2, #16
; CHECK-GI-NEXT:    mov w8, w2
; CHECK-GI-NEXT:    b.hs .LBB5_4
; CHECK-GI-NEXT:  // %bb.2:
; CHECK-GI-NEXT:    mov w10, #0 // =0x0
; CHECK-GI-NEXT:    mov x9, xzr
; CHECK-GI-NEXT:    fmov s0, w10
; CHECK-GI-NEXT:    b .LBB5_8
; CHECK-GI-NEXT:  .LBB5_3:
; CHECK-GI-NEXT:    mov w0, wzr
; CHECK-GI-NEXT:    ret
; CHECK-GI-NEXT:  .LBB5_4: // %vector.ph
; CHECK-GI-NEXT:    lsl w9, w1, #8
; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
; CHECK-GI-NEXT:    add x10, x0, #8
; CHECK-GI-NEXT:    sbfx w9, w9, #8, #8
; CHECK-GI-NEXT:    dup v2.8h, w9
; CHECK-GI-NEXT:    and x9, x8, #0xfffffff0
; CHECK-GI-NEXT:    mov x11, x9
; CHECK-GI-NEXT:  .LBB5_5: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldp d3, d4, [x10, #-8]
; CHECK-GI-NEXT:    subs x11, x11, #16
; CHECK-GI-NEXT:    add x10, x10, #16
; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
; CHECK-GI-NEXT:    ushll v4.8h, v4.8b, #0
; CHECK-GI-NEXT:    mla v0.8h, v2.8h, v3.8h
; CHECK-GI-NEXT:    mla v1.8h, v2.8h, v4.8h
; CHECK-GI-NEXT:    b.ne .LBB5_5
; CHECK-GI-NEXT:  // %bb.6: // %middle.block
; CHECK-GI-NEXT:    add v0.8h, v1.8h, v0.8h
; CHECK-GI-NEXT:    cmp x9, x8
; CHECK-GI-NEXT:    addv h0, v0.8h
; CHECK-GI-NEXT:    b.ne .LBB5_8
; CHECK-GI-NEXT:  // %bb.7:
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
; CHECK-GI-NEXT:  .LBB5_8: // %for.body.preheader1
; CHECK-GI-NEXT:    sxtb w10, w1
; CHECK-GI-NEXT:    sub x8, x8, x9
; CHECK-GI-NEXT:    add x9, x0, x9
; CHECK-GI-NEXT:  .LBB5_9: // %for.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldrb w11, [x9], #1
; CHECK-GI-NEXT:    fmov w12, s0
; CHECK-GI-NEXT:    subs x8, x8, #1
; CHECK-GI-NEXT:    mul w11, w11, w10
; CHECK-GI-NEXT:    add w0, w11, w12, uxth
; CHECK-GI-NEXT:    fmov s0, w0
; CHECK-GI-NEXT:    b.ne .LBB5_9
; CHECK-GI-NEXT:  // %bb.10: // %for.cond.cleanup
; CHECK-GI-NEXT:    ret
entry:
  %conv2 = sext i8 %B to i16
  %cmp10.not = icmp eq i32 %n, 0
  br i1 %cmp10.not, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %wide.trip.count = zext i32 %n to i64
  %min.iters.check = icmp ult i32 %n, 16
  br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i64 %wide.trip.count, 4294967280
  %broadcast.splatinsert = insertelement <8 x i16> poison, i16 %conv2, i64 0
  %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> poison, <8 x i32> zeroinitializer
  %broadcast.splatinsert15 = insertelement <8 x i16> poison, i16 %conv2, i64 0
  %broadcast.splat16 = shufflevector <8 x i16> %broadcast.splatinsert15, <8 x i16> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %8, %vector.body ]
  %vec.phi13 = phi <8 x i16> [ zeroinitializer, %vector.ph ], [ %9, %vector.body ]
  %0 = getelementptr inbounds i8, ptr %A, i64 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <8 x i8>, ptr %1, align 1
  %2 = getelementptr inbounds i8, ptr %0, i64 8
  %3 = bitcast ptr %2 to ptr
  %wide.load14 = load <8 x i8>, ptr %3, align 1
  %4 = zext <8 x i8> %wide.load to <8 x i16>
  %5 = zext <8 x i8> %wide.load14 to <8 x i16>
  %6 = mul nsw <8 x i16> %broadcast.splat, %4
  %7 = mul nsw <8 x i16> %broadcast.splat16, %5
  %8 = add <8 x i16> %6, %vec.phi
  %9 = add <8 x i16> %7, %vec.phi13
  %index.next = add nuw i64 %index, 16
  %10 = icmp eq i64 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %bin.rdx = add <8 x i16> %9, %8
  %11 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %bin.rdx)
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17

for.body.preheader17:                             ; preds = %for.body.preheader, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %s.011.ph = phi i16 [ 0, %for.body.preheader ], [ %11, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  %s.0.lcssa = phi i16 [ 0, %entry ], [ %11, %middle.block ], [ %add, %for.body ]
  ret i16 %s.0.lcssa

for.body:                                         ; preds = %for.body.preheader17, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
  %s.011 = phi i16 [ %add, %for.body ], [ %s.011.ph, %for.body.preheader17 ]
  %arrayidx = getelementptr inbounds i8, ptr %A, i64 %indvars.iv
  %12 = load i8, ptr %arrayidx, align 1
  %13 = zext i8 %12 to i16
  %mul = mul nsw i16 %13, %conv2
  %add = add i16 %mul, %s.011
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; sink_v2z64_1: the zero-extended <2 x i32> argument %a is shuffled into a
; <2 x i64> operand outside the loop. Each iteration loads <2 x i32> from %p at
; %index, zero-extends it, multiplies by that operand, arithmetic-shifts right,
; truncates to <2 x i32> and stores the result; %index advances by 8 until it
; equals %n.
; The store address %hb is derived from %g (based on %p), so %h (based on %d)
; has no users; the assertions store through x0 accordingly.
define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-LABEL: sink_v2z64_1:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    mov x8, xzr
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:  .LBB6_1: // %loop
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldr d1, [x0]
; CHECK-SD-NEXT:    subs x2, x2, #8
; CHECK-SD-NEXT:    add x8, x8, #8
; CHECK-SD-NEXT:    umull v1.2d, v1.2s, v0.s[1]
; CHECK-SD-NEXT:    shrn v1.2s, v1.2d, #15
; CHECK-SD-NEXT:    str d1, [x0], #32
; CHECK-SD-NEXT:    b.ne .LBB6_1
; CHECK-SD-NEXT:  // %bb.2: // %exit
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sink_v2z64_1:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    mov x8, xzr
; CHECK-GI-NEXT:    dup v0.2d, v0.d[1]
; CHECK-GI-NEXT:    mov x9, v0.d[1]
; CHECK-GI-NEXT:    fmov x10, d0
; CHECK-GI-NEXT:  .LBB6_1: // %loop
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldr d0, [x0]
; CHECK-GI-NEXT:    subs x2, x2, #8
; CHECK-GI-NEXT:    add x8, x8, #8
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    fmov x11, d0
; CHECK-GI-NEXT:    mov x12, v0.d[1]
; CHECK-GI-NEXT:    mul x11, x11, x10
; CHECK-GI-NEXT:    mul x12, x12, x9
; CHECK-GI-NEXT:    mov v0.d[0], x11
; CHECK-GI-NEXT:    mov v0.d[1], x12
; CHECK-GI-NEXT:    shrn v0.2s, v0.2d, #15
; CHECK-GI-NEXT:    str d0, [x0], #32
; CHECK-GI-NEXT:    b.ne .LBB6_1
; CHECK-GI-NEXT:  // %bb.2: // %exit
; CHECK-GI-NEXT:    ret
entry:
  %ext = zext <2 x i32> %a to <2 x i64>
  ; NOTE(review): the shuffle-mask constant after <2 x i32> appears to be
  ; missing from this copy. The assertions above use lane 1 (v0.s[1] and
  ; v0.d[1]), which suggests a lane-1 splat mask. TODO confirm against the
  ; canonical test.
  %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <2 x i32>
  br label %loop

loop:
  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
  %g = getelementptr inbounds i32, ptr %p, i64 %index
  %gb = bitcast ptr %g to ptr
  %l = load <2 x i32>, ptr %gb, align 4
  %e = zext <2 x i32> %l to <2 x i64>
  %m = mul <2 x i64> %e, %broadcast.splat
  ; NOTE(review): the shift-amount operand of this ashr appears to be missing
  ; from this copy. The assertions above narrow with "shrn ... #15", which
  ; suggests a splat of 15. TODO confirm against the canonical test.
  %s = ashr <2 x i64> %m,
  %t = trunc <2 x i64> %s to <2 x i32>
  %h = getelementptr inbounds i32, ptr %d, i64 %index
  %hb = bitcast ptr %g to ptr
  store <2 x i32> %t, ptr %hb, align 4
  %index.next = add nuw i64 %index, 8
  %c = icmp eq i64 %index.next, %n
  br i1 %c, label %exit, label %loop

exit:
  ret void
}

; sink_v4i64_1: signed, four-lane variant of the loop above. %a is
; sign-extended to <2 x i64> and shuffled into a <4 x i64> operand; each
; iteration loads <4 x i32> from %p, sign-extends, multiplies, shifts,
; truncates and stores <4 x i32>.
; As above, the store goes through %hb, which is derived from %g, and %h has no
; users.
define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-LABEL: sink_v4i64_1:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    mov x8, xzr
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:  .LBB7_1: // %loop
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    subs x2, x2, #8
; CHECK-SD-NEXT:    add x8, x8, #8
; CHECK-SD-NEXT:    smull v2.2d, v1.2s, v0.s[1]
; CHECK-SD-NEXT:    smull2 v1.2d, v1.4s, v0.s[1]
; CHECK-SD-NEXT:    shrn v2.2s, v2.2d, #15
; CHECK-SD-NEXT:    shrn2 v2.4s, v1.2d, #15
; CHECK-SD-NEXT:    str q2, [x0], #32
; CHECK-SD-NEXT:    b.ne .LBB7_1
; CHECK-SD-NEXT:  // %bb.2: // %exit
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sink_v4i64_1:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    mov x8, xzr
; CHECK-GI-NEXT:    dup v0.2d, v0.d[1]
; CHECK-GI-NEXT:    mov x9, v0.d[1]
; CHECK-GI-NEXT:    fmov x10, d0
; CHECK-GI-NEXT:  .LBB7_1: // %loop
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    subs x2, x2, #8
; CHECK-GI-NEXT:    add x8, x8, #8
; CHECK-GI-NEXT:    sshll v1.2d, v0.2s, #0
; CHECK-GI-NEXT:    sshll2 v0.2d, v0.4s, #0
; CHECK-GI-NEXT:    fmov x11, d1
; CHECK-GI-NEXT:    mov x12, v1.d[1]
; CHECK-GI-NEXT:    fmov x13, d0
; CHECK-GI-NEXT:    mov x14, v0.d[1]
; CHECK-GI-NEXT:    mul x11, x11, x10
; CHECK-GI-NEXT:    mul x13, x13, x10
; CHECK-GI-NEXT:    mul x12, x12, x9
; CHECK-GI-NEXT:    mov v0.d[0], x11
; CHECK-GI-NEXT:    mul x11, x14, x9
; CHECK-GI-NEXT:    mov v1.d[0], x13
; CHECK-GI-NEXT:    mov v0.d[1], x12
; CHECK-GI-NEXT:    mov v1.d[1], x11
; CHECK-GI-NEXT:    shrn v0.2s, v0.2d, #15
; CHECK-GI-NEXT:    shrn2 v0.4s, v1.2d, #15
; CHECK-GI-NEXT:    str q0, [x0], #32
; CHECK-GI-NEXT:    b.ne .LBB7_1
; CHECK-GI-NEXT:  // %bb.2: // %exit
; CHECK-GI-NEXT:    ret
entry:
  %ext = sext <2 x i32> %a to <2 x i64>
  ; NOTE(review): the shuffle-mask constant after <4 x i32> appears to be
  ; missing from this copy. The assertions above use lane 1 (v0.s[1] and
  ; v0.d[1]), which suggests a lane-1 splat mask. TODO confirm against the
  ; canonical test.
  %broadcast.splat = shufflevector <2 x i64> %ext, <2 x i64> poison, <4 x i32>
  br label %loop

loop:
  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
  %g = getelementptr inbounds i32, ptr %p, i64 %index
  %gb = bitcast ptr %g to ptr
  %l = load <4 x i32>, ptr %gb, align 4
  %e = sext <4 x i32> %l to <4 x i64>
  %m = mul <4 x i64> %e, %broadcast.splat
  ; NOTE(review): the shift-amount operand of this ashr appears to be missing
  ; from this copy. The assertions above narrow with "shrn/shrn2 ... #15",
  ; which suggests a splat of 15. TODO confirm against the canonical test.
  %s = ashr <4 x i64> %m,
  %t = trunc <4 x i64> %s to <4 x i32>
  %h = getelementptr inbounds i32, ptr %d, i64 %index
  %hb = bitcast ptr %g to ptr
  store <4 x i32> %t, ptr %hb, align 4
  %index.next = add nuw i64 %index, 8
  %c = icmp eq i64 %index.next, %n
  br i1 %c, label %exit, label %loop

exit:
  ret void
}

; sink_v8z16_0: the zero-extended <16 x i8> argument %a is shuffled into an
; <8 x i16> operand outside the loop. Each iteration loads <8 x i8> from %p,
; zero-extends it, multiplies by that operand, arithmetic-shifts right,
; truncates to <8 x i8> and stores the result.
; As above, the store goes through %hb, which is derived from %g, and %h has no
; users.
define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-LABEL: sink_v8z16_0:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    dup v0.8b, v0.b[0]
; CHECK-SD-NEXT:    mov x8, xzr
; CHECK-SD-NEXT:  .LBB8_1: // %loop
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldr d1, [x0]
; CHECK-SD-NEXT:    subs x2, x2, #8
; CHECK-SD-NEXT:    add x8, x8, #8
; CHECK-SD-NEXT:    umull v1.8h, v1.8b, v0.8b
; CHECK-SD-NEXT:    cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT:    xtn v1.8b, v1.8h
; CHECK-SD-NEXT:    str d1, [x0], #32
; CHECK-SD-NEXT:    b.ne .LBB8_1
; CHECK-SD-NEXT:  // %bb.2: // %exit
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sink_v8z16_0:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-GI-NEXT:    mov x8, xzr
; CHECK-GI-NEXT:  .LBB8_1: // %loop
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldr d1, [x0]
; CHECK-GI-NEXT:    subs x2, x2, #8
; CHECK-GI-NEXT:    add x8, x8, #8
; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-GI-NEXT:    mul v1.8h, v1.8h, v0.h[0]
; CHECK-GI-NEXT:    sshr v1.8h, v1.8h, #15
; CHECK-GI-NEXT:    xtn v1.8b, v1.8h
; CHECK-GI-NEXT:    str d1, [x0], #32
; CHECK-GI-NEXT:    b.ne .LBB8_1
; CHECK-GI-NEXT:  // %bb.2: // %exit
; CHECK-GI-NEXT:    ret
entry:
  %ext = zext <16 x i8> %a to <16 x i16>
  ; NOTE(review): the shuffle-mask constant after <8 x i32> appears to be
  ; missing from this copy. The assertions above use lane 0 (v0.b[0] and
  ; v0.h[0]), which suggests a lane-0 splat mask. TODO confirm against the
  ; canonical test.
  %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <8 x i32>
  br label %loop

loop:
  %index = phi i64 [ 0, %entry ], [ %index.next, %loop ]
  %g = getelementptr inbounds i32, ptr %p, i64 %index
  %gb = bitcast ptr %g to ptr
  %l = load <8 x i8>, ptr %gb, align 4
  %e = zext <8 x i8> %l to <8 x i16>
  %m = mul <8 x i16> %e, %broadcast.splat
  ; NOTE(review): the shift-amount operand of this ashr appears to be missing
  ; from this copy. The GlobalISel assertions above use "sshr ... #15", which
  ; suggests a splat of 15. TODO confirm against the canonical test.
  %s = ashr <8 x i16> %m,
  %t = trunc <8 x i16> %s to <8 x i8>
  %h = getelementptr inbounds i32, ptr %d, i64 %index
  %hb = bitcast ptr %g to ptr
  store <8 x i8> %t, ptr %hb, align 4
  %index.next = add nuw i64 %index, 8
  %c = icmp eq i64 %index.next, %n
  br i1 %c, label %exit, label %loop

exit:
  ret void
}

define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-LABEL: sink_v16s16_8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    dup v0.16b, v0.b[10]
; CHECK-SD-NEXT:    mov x8, xzr
; CHECK-SD-NEXT:  .LBB9_1: // %loop
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    subs x2, x2, #8
; CHECK-SD-NEXT:    add x8, x8, #8
; CHECK-SD-NEXT:    smull v2.8h, v1.8b, v0.8b
; CHECK-SD-NEXT:    smull2 v1.8h, v1.16b, v0.16b
; CHECK-SD-NEXT:    cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT:    cmlt v2.8h, v2.8h, #0
; CHECK-SD-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT:    str q1, [x0], #32
; CHECK-SD-NEXT:    b.ne .LBB9_1
; CHECK-SD-NEXT:  // %bb.2: // %exit
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: sink_v16s16_8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
; CHECK-GI-NEXT:    mov x8, xzr
; CHECK-GI-NEXT:  .LBB9_1: // %loop
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    subs x2, x2, #8
; CHECK-GI-NEXT:    add x8, x8, #8
; CHECK-GI-NEXT:    sshll v2.8h, v1.8b, #0
; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
; CHECK-GI-NEXT:    mul v2.8h, v2.8h, v0.h[2]
; CHECK-GI-NEXT:    mul v1.8h, v1.8h, v0.h[2]
; CHECK-GI-NEXT:
sshr v2.8h, v2.8h, #15 ; CHECK-GI-NEXT: sshr v1.8h, v1.8h, #15 ; CHECK-GI-NEXT: uzp1 v1.16b, v2.16b, v1.16b ; CHECK-GI-NEXT: str q1, [x0], #32 ; CHECK-GI-NEXT: b.ne .LBB9_1 ; CHECK-GI-NEXT: // %bb.2: // %exit ; CHECK-GI-NEXT: ret entry: %ext = sext <16 x i8> %a to <16 x i16> %broadcast.splat = shufflevector <16 x i16> %ext, <16 x i16> poison, <16 x i32> br label %loop loop: %index = phi i64 [ 0, %entry ], [ %index.next, %loop ] %g = getelementptr inbounds i32, ptr %p, i64 %index %gb = bitcast ptr %g to ptr %l = load <16 x i8>, ptr %gb, align 4 %e = sext <16 x i8> %l to <16 x i16> %m = mul <16 x i16> %e, %broadcast.splat %s = ashr <16 x i16> %m, %t = trunc <16 x i16> %s to <16 x i8> %h = getelementptr inbounds i32, ptr %d, i64 %index %hb = bitcast ptr %g to ptr store <16 x i8> %t, ptr %hb, align 4 %index.next = add nuw i64 %index, 8 %c = icmp eq i64 %index.next, %n br i1 %c, label %exit, label %loop exit: ret void } define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) { ; CHECK-SD-LABEL: matrix_mul_unsigned_and: ; CHECK-SD: // %bb.0: // %vector.header ; CHECK-SD-NEXT: dup v0.4h, w3 ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff8 ; CHECK-SD-NEXT: .LBB10_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-SD-NEXT: subs x8, x8, #8 ; CHECK-SD-NEXT: ldp d1, d2, [x9] ; CHECK-SD-NEXT: add x9, x1, w0, uxtw #2 ; CHECK-SD-NEXT: add w0, w0, #8 ; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h ; CHECK-SD-NEXT: stp q1, q2, [x9] ; CHECK-SD-NEXT: b.ne .LBB10_1 ; CHECK-SD-NEXT: // %bb.2: // %for.end12 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: matrix_mul_unsigned_and: ; CHECK-GI: // %bb.0: // %vector.header ; CHECK-GI-NEXT: and w8, w3, #0xffff ; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: mov w8, w0 ; CHECK-GI-NEXT: and x8, x8, #0xfffffff8 ; CHECK-GI-NEXT: .LBB10_1: // %vector.body ; 
CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-GI-NEXT: subs x8, x8, #8 ; CHECK-GI-NEXT: ldp d1, d2, [x9] ; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2 ; CHECK-GI-NEXT: add w0, w0, #8 ; CHECK-GI-NEXT: ushll v1.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: stp q1, q2, [x9] ; CHECK-GI-NEXT: b.ne .LBB10_1 ; CHECK-GI-NEXT: // %bb.2: // %for.end12 ; CHECK-GI-NEXT: ret vector.header: %conv4 = and i32 %val, 65535 %wide.trip.count = zext i32 %N to i64 %0 = add nsw i64 %wide.trip.count, -1 %min.iters.check = icmp ult i32 %N, 8 %1 = trunc i64 %0 to i32 %2 = icmp ugt i64 %0, 4294967295 %n.vec = and i64 %wide.trip.count, 4294967288 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0 %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer %cmp.n = icmp eq i64 %n.vec, %wide.trip.count br label %vector.body vector.body: ; preds = %vector.header, %vector.body %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ] %3 = trunc i64 %index to i32 %4 = add i32 %N, %3 %5 = zext i32 %4 to i64 %6 = getelementptr inbounds i16, ptr %A, i64 %5 %7 = bitcast ptr %6 to ptr %wide.load = load <4 x i16>, ptr %7, align 2 %8 = getelementptr inbounds i16, ptr %6, i64 4 %9 = bitcast ptr %8 to ptr %wide.load30 = load <4 x i16>, ptr %9, align 2 %10 = zext <4 x i16> %wide.load to <4 x i32> %11 = zext <4 x i16> %wide.load30 to <4 x i32> %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10 %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11 %14 = getelementptr inbounds i32, ptr %C, i64 %5 %15 = bitcast ptr %14 to ptr store <4 x i32> %12, ptr %15, align 4 %16 = getelementptr 
inbounds i32, ptr %14, i64 4 %17 = bitcast ptr %16 to ptr store <4 x i32> %13, ptr %17, align 4 %index.next = add i64 %index, 8 %18 = icmp eq i64 %index.next, %n.vec br i1 %18, label %for.end12, label %vector.body for.end12: ; preds = %vector.body ret void } define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) { ; CHECK-SD-LABEL: matrix_mul_unsigned_and_double: ; CHECK-SD: // %bb.0: // %vector.header ; CHECK-SD-NEXT: dup v0.8h, w3 ; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 ; CHECK-SD-NEXT: and x8, x0, #0xfffffff0 ; CHECK-SD-NEXT: .LBB11_1: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-SD-NEXT: subs x8, x8, #16 ; CHECK-SD-NEXT: ldr q1, [x9] ; CHECK-SD-NEXT: ldur q2, [x9, #8] ; CHECK-SD-NEXT: add x9, x1, w0, uxtw #2 ; CHECK-SD-NEXT: add w0, w0, #16 ; CHECK-SD-NEXT: umull2 v3.4s, v0.8h, v1.8h ; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h ; CHECK-SD-NEXT: umull2 v4.4s, v0.8h, v2.8h ; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h ; CHECK-SD-NEXT: stp q1, q3, [x9] ; CHECK-SD-NEXT: stp q2, q4, [x9, #32] ; CHECK-SD-NEXT: b.ne .LBB11_1 ; CHECK-SD-NEXT: // %bb.2: // %for.end12 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: matrix_mul_unsigned_and_double: ; CHECK-GI: // %bb.0: // %vector.header ; CHECK-GI-NEXT: and w8, w3, #0xffff ; CHECK-GI-NEXT: dup v0.4s, w8 ; CHECK-GI-NEXT: mov w8, w0 ; CHECK-GI-NEXT: and x8, x8, #0xfffffff0 ; CHECK-GI-NEXT: .LBB11_1: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1 ; CHECK-GI-NEXT: subs x8, x8, #16 ; CHECK-GI-NEXT: ldr q1, [x9] ; CHECK-GI-NEXT: ldur q2, [x9, #8] ; CHECK-GI-NEXT: add x9, x1, w0, uxtw #2 ; CHECK-GI-NEXT: add w0, w0, #16 ; CHECK-GI-NEXT: ushll v3.4s, v1.4h, #0 ; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ushll v4.4s, v2.4h, #0 ; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-GI-NEXT: mul v3.4s, v0.4s, v3.4s ; CHECK-GI-NEXT: mul 
v1.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: mul v4.4s, v0.4s, v4.4s ; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: stp q3, q1, [x9] ; CHECK-GI-NEXT: stp q4, q2, [x9, #32]! ; CHECK-GI-NEXT: b.ne .LBB11_1 ; CHECK-GI-NEXT: // %bb.2: // %for.end12 ; CHECK-GI-NEXT: ret vector.header: %conv4 = and i32 %val, 65535 %wide.trip.count = zext i32 %N to i64 %0 = add nsw i64 %wide.trip.count, -1 %min.iters.check = icmp ult i32 %N, 16 %1 = trunc i64 %0 to i32 %2 = icmp ugt i64 %0, 4294967295 %n.vec = and i64 %wide.trip.count, 4294967280 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0 %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer %cmp.n = icmp eq i64 %n.vec, %wide.trip.count br label %vector.body vector.body: ; preds = %vector.header, %vector.body %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ] %3 = trunc i64 %index to i32 %4 = add i32 %N, %3 %5 = zext i32 %4 to i64 %6 = getelementptr inbounds i16, ptr %A, i64 %5 %7 = bitcast ptr %6 to ptr %wide.load = load <8 x i16>, ptr %7, align 2 %8 = getelementptr inbounds i16, ptr %6, i64 4 %9 = bitcast ptr %8 to ptr %wide.load30 = load <8 x i16>, ptr %9, align 2 %10 = zext <8 x i16> %wide.load to <8 x i32> %11 = zext <8 x i16> %wide.load30 to <8 x i32> %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10 %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11 %14 = getelementptr inbounds i32, ptr %C, i64 %5 %15 = bitcast ptr %14 to ptr store <8 x i32> %12, ptr %15, align 4 %16 = getelementptr inbounds i32, ptr %14, i64 8 %17 = bitcast ptr %16 to ptr store <8 x i32> %13, ptr %17, align 4 %index.next = add i64 %index, 16 %18 = icmp eq i64 %index.next, %n.vec br i1 %18, label %for.end12, label %vector.body for.end12: ; preds = %vector.body ret void } 
; Same loop shape as matrix_mul_unsigned_and, but the two <4 x i16> loads are
; sign-extended while the splat scalar is still %val masked with 65535.
; Both sets of expectations keep a separate sshll followed by a 32-bit vector
; mul (by-element v0.s[0] for SelectionDAG, against `dup v0.4s` for GlobalISel);
; no smull/umull is expected here.
; NOTE(review): presumably this is a negative test, because the masked scalar is
; zero-extended while the loaded operands are sign-extended - confirm intent.
define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_signed_and:
; CHECK-SD:       // %bb.0: // %vector.header
; CHECK-SD-NEXT:    and w9, w3, #0xffff
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT:    and x8, x0, #0xfffffff8
; CHECK-SD-NEXT:    fmov s0, w9
; CHECK-SD-NEXT:  .LBB12_1: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT:    subs x8, x8, #8
; CHECK-SD-NEXT:    ldp d1, d2, [x9]
; CHECK-SD-NEXT:    add x9, x1, w0, uxtw #2
; CHECK-SD-NEXT:    add w0, w0, #8
; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-SD-NEXT:    mul v1.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT:    mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT:    stp q1, q2, [x9]
; CHECK-SD-NEXT:    b.ne .LBB12_1
; CHECK-SD-NEXT:  // %bb.2: // %for.end12
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: matrix_mul_signed_and:
; CHECK-GI:       // %bb.0: // %vector.header
; CHECK-GI-NEXT:    and w8, w3, #0xffff
; CHECK-GI-NEXT:    dup v0.4s, w8
; CHECK-GI-NEXT:    mov w8, w0
; CHECK-GI-NEXT:    and x8, x8, #0xfffffff8
; CHECK-GI-NEXT:  .LBB12_1: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT:    subs x8, x8, #8
; CHECK-GI-NEXT:    ldp d1, d2, [x9]
; CHECK-GI-NEXT:    add x9, x1, w0, uxtw #2
; CHECK-GI-NEXT:    add w0, w0, #8
; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    stp q1, q2, [x9]
; CHECK-GI-NEXT:    b.ne .LBB12_1
; CHECK-GI-NEXT:  // %bb.2: // %for.end12
; CHECK-GI-NEXT:    ret
vector.header:
  %conv4 = and i32 %val, 65535
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 8
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967288
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <4 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <4 x i32> %broadcast.splatinsert31, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <4 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <4 x i16>, ptr %9, align 2
  %10 = sext <4 x i16> %wide.load to <4 x i32>
  %11 = sext <4 x i16> %wide.load30 to <4 x i32>
  %12 = mul nuw nsw <4 x i32> %broadcast.splat, %10
  %13 = mul nuw nsw <4 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <4 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 4
  %17 = bitcast ptr %16 to ptr
  store <4 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 8
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}

; Eight-element ("double") variant of matrix_mul_signed_and: <8 x i16> loads
; that are sign-extended, <8 x i32> products, 16 elements per iteration.
; The second load is taken at an i16 offset of 4 (8 bytes) from the first, so
; the two <8 x i16> loads overlap; the expected assembly reflects this with
; `ldur q2, [x9, #8]`. The second store is at an i32 offset of 8 (32 bytes).
; As in the four-element variant, both sets of expectations keep sshll/sshll2
; followed by 32-bit vector muls rather than a widening multiply.
define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocapture readonly %A, i32 %val) {
; CHECK-SD-LABEL: matrix_mul_signed_and_double:
; CHECK-SD:       // %bb.0: // %vector.header
; CHECK-SD-NEXT:    and w9, w3, #0xffff
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT:    and x8, x0, #0xfffffff0
; CHECK-SD-NEXT:    fmov s0, w9
; CHECK-SD-NEXT:  .LBB13_1: // %vector.body
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT:    subs x8, x8, #16
; CHECK-SD-NEXT:    ldr q1, [x9]
; CHECK-SD-NEXT:    ldur q2, [x9, #8]
; CHECK-SD-NEXT:    add x9, x1, w0, uxtw #2
; CHECK-SD-NEXT:    add w0, w0, #16
; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    sshll2 v4.4s, v2.8h, #0
; CHECK-SD-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-SD-NEXT:    mul v3.4s, v3.4s, v0.s[0]
; CHECK-SD-NEXT:    mul v1.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT:    mul v4.4s, v4.4s, v0.s[0]
; CHECK-SD-NEXT:    mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT:    stp q1, q3, [x9]
; CHECK-SD-NEXT:    stp q2, q4, [x9, #32]
; CHECK-SD-NEXT:    b.ne .LBB13_1
; CHECK-SD-NEXT:  // %bb.2: // %for.end12
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: matrix_mul_signed_and_double:
; CHECK-GI:       // %bb.0: // %vector.header
; CHECK-GI-NEXT:    and w8, w3, #0xffff
; CHECK-GI-NEXT:    dup v0.4s, w8
; CHECK-GI-NEXT:    mov w8, w0
; CHECK-GI-NEXT:    and x8, x8, #0xfffffff0
; CHECK-GI-NEXT:  .LBB13_1: // %vector.body
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT:    subs x8, x8, #16
; CHECK-GI-NEXT:    ldr q1, [x9]
; CHECK-GI-NEXT:    ldur q2, [x9, #8]
; CHECK-GI-NEXT:    add x9, x1, w0, uxtw #2
; CHECK-GI-NEXT:    add w0, w0, #16
; CHECK-GI-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-GI-NEXT:    sshll v4.4s, v2.4h, #0
; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-GI-NEXT:    mul v3.4s, v0.4s, v3.4s
; CHECK-GI-NEXT:    mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mul v4.4s, v0.4s, v4.4s
; CHECK-GI-NEXT:    mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT:    stp q3, q1, [x9]
; CHECK-GI-NEXT:    stp q4, q2, [x9, #32]!
; CHECK-GI-NEXT:    b.ne .LBB13_1
; CHECK-GI-NEXT:  // %bb.2: // %for.end12
; CHECK-GI-NEXT:    ret
vector.header:
  %conv4 = and i32 %val, 65535
  %wide.trip.count = zext i32 %N to i64
  %0 = add nsw i64 %wide.trip.count, -1
  %min.iters.check = icmp ult i32 %N, 16
  %1 = trunc i64 %0 to i32
  %2 = icmp ugt i64 %0, 4294967295
  %n.vec = and i64 %wide.trip.count, 4294967280
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %broadcast.splatinsert31 = insertelement <8 x i32> undef, i32 %conv4, i32 0
  %broadcast.splat32 = shufflevector <8 x i32> %broadcast.splatinsert31, <8 x i32> undef, <8 x i32> zeroinitializer
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br label %vector.body

vector.body:                                      ; preds = %vector.header, %vector.body
  %index = phi i64 [ %index.next, %vector.body ], [ 0, %vector.header ]
  %3 = trunc i64 %index to i32
  %4 = add i32 %N, %3
  %5 = zext i32 %4 to i64
  %6 = getelementptr inbounds i16, ptr %A, i64 %5
  %7 = bitcast ptr %6 to ptr
  %wide.load = load <8 x i16>, ptr %7, align 2
  %8 = getelementptr inbounds i16, ptr %6, i64 4
  %9 = bitcast ptr %8 to ptr
  %wide.load30 = load <8 x i16>, ptr %9, align 2
  %10 = sext <8 x i16> %wide.load to <8 x i32>
  %11 = sext <8 x i16> %wide.load30 to <8 x i32>
  %12 = mul nuw nsw <8 x i32> %broadcast.splat, %10
  %13 = mul nuw nsw <8 x i32> %broadcast.splat32, %11
  %14 = getelementptr inbounds i32, ptr %C, i64 %5
  %15 = bitcast ptr %14 to ptr
  store <8 x i32> %12, ptr %15, align 4
  %16 = getelementptr inbounds i32, ptr %14, i64 8
  %17 = bitcast ptr %16 to ptr
  store <8 x i32> %13, ptr %17, align 4
  %index.next = add i64 %index, 16
  %18 = icmp eq i64 %index.next, %n.vec
  br i1 %18, label %for.end12, label %vector.body

for.end12:                                        ; preds = %vector.body
  ret void
}

declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}