; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; vmovl_s32: the shl-by-16/ashr-by-16 pair on <4 x i32> lanes is an
; in-register sign extension of each bottom halfword, so the tail-predicated
; loop (dlstp.32/letp) should select a single vmovlb.s16 instead of two shifts.
define void @vmovl_s32(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n) {
; CHECK-LABEL: vmovl_s32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    dlstp.32 lr, r2
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vstrw.32 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, ptr %s, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
  %2 = shl <4 x i32> %wide.masked.load, <i32 16, i32 16, i32 16, i32 16>
  %3 = ashr exact <4 x i32> %2, <i32 16, i32 16, i32 16, i32 16>
  %4 = getelementptr inbounds i32, ptr %d, i32 %index
  %5 = bitcast ptr %4 to ptr
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %3, ptr %5, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %6 = icmp eq i32 %index.next, %n.vec
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

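; vmovl_u16: masking each <8 x i16> lane with 255 zero-extends its bottom
; byte, which should be selected as vmovlb.u8 inside a dlstp.16/letp
; tail-predicated loop.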
define void @vmovl_u16(ptr noalias nocapture %d, ptr nocapture readonly %s, i32 %n) {
; CHECK-LABEL: vmovl_u16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    dlstp.16 lr, r2
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r1], #16
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vstrh.16 q0, [r0], #16
; CHECK-NEXT:    letp lr, .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %n, 7
  %n.vec = and i32 %n.rnd.up, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i16, ptr %s, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> poison)
  %2 = and <8 x i16> %wide.masked.load, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %3 = getelementptr inbounds i16, ptr %d, i32 %index
  %4 = bitcast ptr %3 to ptr
  call void @llvm.masked.store.v8i16.p0(<8 x i16> %2, ptr %4, i32 2, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; vmovl_16to32: a vctp16-controlled loop that sign-extends the even halfword
; lanes to i32 via shufflevector+sext and stores the result back through the
; same <8 x i1> mask. The extension should still select vmovlb.s16, but the
; loop stays a dls/le low-overhead loop with explicit VPT blocks instead of
; being fully tail-predicated.
define void @vmovl_16to32(ptr %d, ptr %s, i32 %n) {
; CHECK-LABEL: vmovl_16to32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    poplt {r7, pc}
; CHECK-NEXT:  .LBB2_1: @ %for.body.preheader
; CHECK-NEXT:    mov r3, r2
; CHECK-NEXT:    cmp r2, #8
; CHECK-NEXT:    it ge
; CHECK-NEXT:    movge r3, #8
; CHECK-NEXT:    subs r3, r2, r3
; CHECK-NEXT:    add.w r12, r3, #7
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w r3, r3, r12, lsr #3
; CHECK-NEXT:    dls lr, r3
; CHECK-NEXT:  .LBB2_2: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vctp.16 r2
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrht.u16 q0, [r1], #16
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrht.16 q0, [r0], #16
; CHECK-NEXT:    le lr, .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r7, pc}
entry:
  %cmp13 = icmp sgt i32 %n, 0
  br i1 %cmp13, label %for.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %entry, %for.body
  %d.addr.016 = phi ptr [ %add.ptr3, %for.body ], [ %d, %entry ]
  %s.addr.015 = phi ptr [ %add.ptr, %for.body ], [ %s, %entry ]
  %i.014 = phi i32 [ %sub, %for.body ], [ %n, %entry ]
  %0 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %i.014)
  %1 = bitcast ptr %s.addr.015 to ptr
  %2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %0, <8 x i16> <i16 0, i16 poison, i16 0, i16 poison, i16 0, i16 poison, i16 0, i16 poison>)
  %add.ptr = getelementptr inbounds i16, ptr %s.addr.015, i32 8
  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %4 = sext <4 x i16> %3 to <4 x i32>
  %5 = bitcast <4 x i32> %4 to <8 x i16>
  %6 = bitcast ptr %d.addr.016 to ptr
  tail call void @llvm.masked.store.v8i16.p0(<8 x i16> %5, ptr %6, i32 2, <8 x i1> %0)
  %add.ptr3 = getelementptr inbounds i16, ptr %d.addr.016, i32 8
  %sub = add nsw i32 %i.014, -8
  %cmp = icmp sgt i32 %i.014, 8
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}

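; sunken_vmovl: an alpha-blending kernel whose i8->i16 zero-extensions of the
; loop-carried <8 x i8> values should be materialised as vmovlb.u8 inside the
; tail-predicated (dlstp.16/letp) loop body.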
define void @sunken_vmovl(ptr noalias %pTarget, i16 signext %iTargetStride, ptr noalias %pchAlpha, i16 signext %iAlphaStride, i16 %0, i8 zeroext %Colour) {
; CHECK-LABEL: sunken_vmovl:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    ldrsh.w r1, [sp, #8]
; CHECK-NEXT:    vmov.i16 q0, #0x100
; CHECK-NEXT:    vldrb.u16 q1, [r2], #8
; CHECK-NEXT:    vldrb.u16 q2, [r0], #8
; CHECK-NEXT:    ldr r3, [sp, #12]
; CHECK-NEXT:    dlstp.16 lr, r1
; CHECK-NEXT:  .LBB3_1: @ %do.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vsub.i16 q3, q0, q1
; CHECK-NEXT:    vmovlb.u8 q2, q2
; CHECK-NEXT:    vmul.i16 q3, q2, q3
; CHECK-NEXT:    vldrb.u16 q2, [r0], #8
; CHECK-NEXT:    vmla.i16 q3, q1, r3
; CHECK-NEXT:    vldrb.u16 q1, [r2], #8
; CHECK-NEXT:    vshr.u16 q3, q3, #8
; CHECK-NEXT:    vstrb.16 q3, [r0, #-16]
; CHECK-NEXT:    letp lr, .LBB3_1
; CHECK-NEXT:  @ %bb.2: @ %do.end
; CHECK-NEXT:    pop {r7, pc}
entry:
  %conv3 = sext i16 %0 to i32
  %1 = zext i8 %Colour to i32
  %2 = bitcast ptr %pTarget to ptr
  %3 = load <8 x i8>, ptr %2, align 1
  %4 = bitcast ptr %pchAlpha to ptr
  %5 = load <8 x i8>, ptr %4, align 1
  br label %do.body

do.body:                                          ; preds = %do.body, %entry
  %pchAlpha.addr.0.pn = phi ptr [ %pchAlpha, %entry ], [ %pAlpha.0, %do.body ]
  %pTarget8.0 = phi ptr [ %pTarget, %entry ], [ %add.ptr5, %do.body ]
  %blkCnt.0 = phi i32 [ %conv3, %entry ], [ %sub, %do.body ]
  %vecTarget.0.in = phi <8 x i8> [ %3, %entry ], [ %10, %do.body ]
  %vecTransp.0.in = phi <8 x i8> [ %5, %entry ], [ %13, %do.body ]
  %vecTransp.0 = zext <8 x i8> %vecTransp.0.in to <8 x i16>
  %vecTarget.0 = zext <8 x i8> %vecTarget.0.in to <8 x i16>
  %pAlpha.0 = getelementptr inbounds i8, ptr %pchAlpha.addr.0.pn, i32 8
  %6 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %blkCnt.0)
  %7 = tail call <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16> <i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256, i16 256>, <8 x i16> %vecTransp.0, <8 x i1> %6, <8 x i16> undef)
  %8 = tail call <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16> %vecTarget.0, <8 x i16> %7, <8 x i1> %6, <8 x i16> undef)
  %add.ptr5 = getelementptr inbounds i8, ptr %pTarget8.0, i32 8
  %9 = bitcast ptr %add.ptr5 to ptr
  %10 = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr nonnull %9, i32 1, <8 x i1> %6, <8 x i8> zeroinitializer)
  %11 = tail call <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16> %8, <8 x i16> %vecTransp.0, i32 %1, <8 x i1> %6)
  %12 = bitcast ptr %pAlpha.0 to ptr
  %13 = tail call <8 x i8> @llvm.masked.load.v8i8.p0(ptr nonnull %12, i32 1, <8 x i1> %6, <8 x i8> zeroinitializer)
  %14 = tail call <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16> %11, i32 8, i32 1, <8 x i1> %6, <8 x i16> %11)
  %15 = trunc <8 x i16> %14 to <8 x i8>
  %16 = bitcast ptr %pTarget8.0 to ptr
  tail call void @llvm.masked.store.v8i8.p0(<8 x i8> %15, ptr %16, i32 1, <8 x i1> %6)
  %sub = add nsw i32 %blkCnt.0, -8
  %cmp9 = icmp sgt i32 %blkCnt.0, 8
  br i1 %cmp9, label %do.body, label %do.end

do.end:                                           ; preds = %do.body
  ret void
}

declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
declare <8 x i16> @llvm.arm.mve.sub.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>)
declare <8 x i16> @llvm.arm.mve.mul.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>)
declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32 immarg, <8 x i1>, <8 x i8>)
declare <8 x i16> @llvm.arm.mve.vmla.n.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>)
declare <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32 immarg, <8 x i1>)

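; A minimal hand-written sketch, not autogenerated and deliberately left
; without CHECK lines: a plain zext from <8 x i8> to <8 x i16> is the simplest
; straight-line pattern from which an MVE bottom-lane widening move such as
; vmovlb.u8 could be selected. The function name is illustrative only and this
; function asserts nothing about the emitted code.
define <8 x i16> @vmovl_u8_sketch(<8 x i8> %src) {
entry:
  %ext = zext <8 x i8> %src to <8 x i16>
  ret <8 x i16> %ext
}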