; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

; Tests that vector reductions fed by a shift (and, for @correlate, a whole
; correlation loop nest) lower to efficient MVE code.  Do not edit the CHECK
; lines by hand; regenerate with update_llc_test_checks.py after any change.

; zext-mul / ashr / reduce.add over <16 x i16>: expected to split into a
; vmullb/vmullt pair, per-half vshr, and a vaddv + vaddva reduction.
define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_mul(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: reduce_v16i16_shift_mul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullt.u8 q2, q0, q1
; CHECK-NEXT:    vmullb.u8 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q2, q2, #14
; CHECK-NEXT:    vshr.s16 q0, q0, #14
; CHECK-NEXT:    vaddv.u16 r0, q2
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %m = mul <16 x i16> %s0s, %s1s
  %sh = ashr <16 x i16> %m, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %sh)
  ret i16 %result
}

; Same pattern at <8 x i16>: fits one 128-bit vector, so a single
; vmullb + vshr + vaddv suffices.
define arm_aapcs_vfpcc i16 @reduce_v8i16_shift_mul(<8 x i8> %s0, <8 x i8> %s1) {
; CHECK-LABEL: reduce_v8i16_shift_mul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmullb.u8 q0, q0, q1
; CHECK-NEXT:    vshr.s16 q0, q0, #14
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <8 x i8> %s0 to <8 x i16>
  %s1s = zext <8 x i8> %s1 to <8 x i16>
  %m = mul <8 x i16> %s0s, %s1s
  %sh = ashr <8 x i16> %m, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
  %result = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sh)
  ret i16 %result
}

; sub instead of mul: no vmull form exists for subtraction, so the operands
; are widened with vmovlb/vmovlt and subtracted with vsub before the
; shift/reduction.
define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_sub(<16 x i8> %s0, <16 x i8> %s1) {
; CHECK-LABEL: reduce_v16i16_shift_sub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlt.u8 q2, q1
; CHECK-NEXT:    vmovlt.u8 q3, q0
; CHECK-NEXT:    vsub.i16 q2, q3, q2
; CHECK-NEXT:    vmovlb.u8 q1, q1
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vshr.s16 q2, q2, #14
; CHECK-NEXT:    vsub.i16 q0, q0, q1
; CHECK-NEXT:    vaddv.u16 r0, q2
; CHECK-NEXT:    vshr.s16 q0, q0, #14
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %s0s = zext <16 x i8> %s0 to <16 x i16>
  %s1s = zext <16 x i8> %s1 to <16 x i16>
  %m = sub <16 x i16> %s0s, %s1s
  %sh = ashr <16 x i16> %m, <i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14, i16 14>
  %result = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %sh)
  ret i16 %result
}

; Masked multiply-accumulate reduction: the <8 x i1> mask from the two
; compares is materialized via vpsel through a stack slot and then consumed
; by a predicated vmlavt (vpt + vmlavt.u16).
define arm_aapcs_vfpcc i32 @mlapred_v4i32_v4i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %a, <8 x i16> %b) {
; CHECK-LABEL: mlapred_v4i32_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    .pad #32
; CHECK-NEXT:    sub sp, #32
; CHECK-NEXT:    vorr q2, q2, q3
; CHECK-NEXT:    mov r0, sp
; CHECK-NEXT:    vstrw.32 q2, [r0]
; CHECK-NEXT:    vmov.i8 q3, #0xff
; CHECK-NEXT:    vldrh.u32 q2, [r0, #8]
; CHECK-NEXT:    vldrh.u32 q5, [r0]
; CHECK-NEXT:    add r0, sp, #16
; CHECK-NEXT:    vcmp.i32 eq, q2, zr
; CHECK-NEXT:    vmov.i8 q2, #0x0
; CHECK-NEXT:    vpsel q4, q3, q2
; CHECK-NEXT:    vcmp.i32 eq, q5, zr
; CHECK-NEXT:    vpsel q2, q3, q2
; CHECK-NEXT:    vstrh.32 q4, [r0, #8]
; CHECK-NEXT:    vstrh.32 q2, [r0]
; CHECK-NEXT:    vldrw.u32 q2, [r0]
; CHECK-NEXT:    vpt.i16 ne, q2, zr
; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
; CHECK-NEXT:    add sp, #32
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    bx lr
entry:
  %aa = zext <8 x i16> %a to <8 x i32>
  %bb = zext <8 x i16> %b to <8 x i32>
  %c1 = icmp eq <8 x i32> %aa, zeroinitializer
  %c2 = icmp eq <8 x i32> %bb, zeroinitializer
  %c = and <8 x i1> %c1, %c2
  %xx = zext <8 x i16> %x to <8 x i32>
  %yy = zext <8 x i16> %y to <8 x i32>
  %m = mul <8 x i32> %xx, %yy
  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
  ret i32 %z
}

; Full correlation loop nest (pre-vectorized IR, main <8 x i32> body plus a
; masked <4 x i32> epilogue).  The inner loops are expected to become a
; low-overhead-loop vmull/vshl/vaddva body (.LBB4_8, le) and a
; tail-predicated epilogue (dlstp.32/letp, .LBB4_11).
define void @correlate(ptr nocapture noundef readonly %ID, ptr nocapture noundef writeonly %ACD, i16 noundef signext %DS, i16 noundef signext %Ls, i16 noundef signext %S) {
; CHECK-LABEL: correlate:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #12
; CHECK-NEXT:    sub sp, #12
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    stm.w sp, {r0, r1, r3} @ 12-byte Folded Spill
; CHECK-NEXT:    blt .LBB4_12
; CHECK-NEXT:  @ %bb.1: @ %for.body.lr.ph
; CHECK-NEXT:    ldr r1, [sp, #48]
; CHECK-NEXT:    add.w r12, r2, #3
; CHECK-NEXT:    ldr.w r11, [sp] @ 4-byte Reload
; CHECK-NEXT:    mov.w r10, #0
; CHECK-NEXT:    mov r8, r2
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    uxth r3, r1
; CHECK-NEXT:    b .LBB4_4
; CHECK-NEXT:  .LBB4_2: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT:    movs r6, #0
; CHECK-NEXT:  .LBB4_3: @ %for.end
; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT:    ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:    lsrs r2, r6, #16
; CHECK-NEXT:    sub.w r12, r12, #1
; CHECK-NEXT:    add.w r11, r11, #2
; CHECK-NEXT:    sub.w r8, r8, #1
; CHECK-NEXT:    strh.w r2, [r7, r10, lsl #1]
; CHECK-NEXT:    add.w r10, r10, #1
; CHECK-NEXT:    ldr r2, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    cmp r10, r2
; CHECK-NEXT:    mov r2, r0
; CHECK-NEXT:    beq .LBB4_12
; CHECK-NEXT:  .LBB4_4: @ %for.body
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB4_8 Depth 2
; CHECK-NEXT:    @ Child Loop BB4_11 Depth 2
; CHECK-NEXT:    cmp r2, r10
; CHECK-NEXT:    ble .LBB4_2
; CHECK-NEXT:  @ %bb.5: @ %vector.main.loop.iter.check
; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT:    sub.w r4, r2, r10
; CHECK-NEXT:    cmp r4, #8
; CHECK-NEXT:    bhs .LBB4_7
; CHECK-NEXT:  @ %bb.6: @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT:    movs r6, #0
; CHECK-NEXT:    mov.w r9, #0
; CHECK-NEXT:    b .LBB4_10
; CHECK-NEXT:  .LBB4_7: @ %vector.ph
; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT:    bic r2, r8, #7
; CHECK-NEXT:    movs r7, #1
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    bic r9, r4, #7
; CHECK-NEXT:    movs r6, #0
; CHECK-NEXT:    mov r5, r11
; CHECK-NEXT:    add.w lr, r7, r2, lsr #3
; CHECK-NEXT:    ldr r2, [sp] @ 4-byte Reload
; CHECK-NEXT:  .LBB4_8: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB4_4 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrh.u16 q0, [r2], #16
; CHECK-NEXT:    vldrh.u16 q1, [r5], #16
; CHECK-NEXT:    rsbs r7, r3, #0
; CHECK-NEXT:    vmullb.s16 q2, q1, q0
; CHECK-NEXT:    vmullt.s16 q0, q1, q0
; CHECK-NEXT:    vshl.s32 q2, r7
; CHECK-NEXT:    vshl.s32 q0, r7
; CHECK-NEXT:    vaddva.u32 r6, q2
; CHECK-NEXT:    vaddva.u32 r6, q0
; CHECK-NEXT:    le lr, .LBB4_8
; CHECK-NEXT:  @ %bb.9: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT:    cmp r4, r9
; CHECK-NEXT:    beq .LBB4_3
; CHECK-NEXT:  .LBB4_10: @ %vec.epilog.ph
; CHECK-NEXT:    @ in Loop: Header=BB4_4 Depth=1
; CHECK-NEXT:    ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT:    add.w r2, r9, r10
; CHECK-NEXT:    sub.w r5, r8, r9
; CHECK-NEXT:    add.w r7, r1, r9, lsl #1
; CHECK-NEXT:    add.w r2, r1, r2, lsl #1
; CHECK-NEXT:    dlstp.32 lr, r5
; CHECK-NEXT:  .LBB4_11: @ %vec.epilog.vector.body
; CHECK-NEXT:    @ Parent Loop BB4_4 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    rsbs r4, r3, #0
; CHECK-NEXT:    vldrh.s32 q0, [r7], #8
; CHECK-NEXT:    vldrh.s32 q1, [r2], #8
; CHECK-NEXT:    vmul.i32 q0, q1, q0
; CHECK-NEXT:    vshl.s32 q0, r4
; CHECK-NEXT:    vaddva.u32 r6, q0
; CHECK-NEXT:    letp lr, .LBB4_11
; CHECK-NEXT:    b .LBB4_3
; CHECK-NEXT:  .LBB4_12: @ %for.end17
; CHECK-NEXT:    add sp, #12
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %conv = sext i16 %Ls to i32
  %cmp31 = icmp sgt i16 %Ls, 0
  br i1 %cmp31, label %for.body.lr.ph, label %for.end17

for.body.lr.ph:                                   ; preds = %entry
  %conv2 = sext i16 %DS to i32
  %conv1027 = zext i16 %S to i32
  %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %conv1027, i64 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer
  %broadcast.splatinsert40 = insertelement <4 x i32> poison, i32 %conv1027, i64 0
  %broadcast.splat41 = shufflevector <4 x i32> %broadcast.splatinsert40, <4 x i32> poison, <4 x i32> zeroinitializer
  br label %for.body

for.body:                                         ; preds = %for.body.lr.ph, %for.end
  %lag.032 = phi i32 [ 0, %for.body.lr.ph ], [ %inc16, %for.end ]
  %0 = sub i32 %conv2, %lag.032
  %cmp428 = icmp slt i32 %lag.032, %conv2
  br i1 %cmp428, label %vector.main.loop.iter.check, label %for.end

vector.main.loop.iter.check:                      ; preds = %for.body
  %min.iters.check = icmp ult i32 %0, 8
  br i1 %min.iters.check, label %vec.epilog.ph, label %vector.ph

vector.ph:                                        ; preds = %vector.main.loop.iter.check
  %n.vec = and i32 %0, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
  %1 = getelementptr inbounds i16, ptr %ID, i32 %index
  %wide.load = load <8 x i16>, ptr %1, align 2
  %2 = sext <8 x i16> %wide.load to <8 x i32>
  %3 = add nuw nsw i32 %index, %lag.032
  %4 = getelementptr inbounds i16, ptr %ID, i32 %3
  %wide.load34 = load <8 x i16>, ptr %4, align 2
  %5 = sext <8 x i16> %wide.load34 to <8 x i32>
  %6 = mul nsw <8 x i32> %5, %2
  %7 = ashr <8 x i32> %6, %broadcast.splat
  %8 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
  %9 = add i32 %8, %vec.phi
  %index.next = add nuw i32 %index, 8
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %0, %n.vec
  br i1 %cmp.n, label %for.end, label %vec.epilog.ph

vec.epilog.ph:                                    ; preds = %middle.block, %vector.main.loop.iter.check
  %bc.merge.rdx = phi i32 [ 0, %vector.main.loop.iter.check ], [ %9, %middle.block ]
  %vec.epilog.resume.val = phi i32 [ 0, %vector.main.loop.iter.check ], [ %n.vec, %middle.block ]
  %n.rnd.up = add i32 %0, 3
  %n.vec36 = and i32 %n.rnd.up, -4
  br label %vec.epilog.vector.body

vec.epilog.vector.body:                           ; preds = %vec.epilog.vector.body, %vec.epilog.ph
  %index37 = phi i32 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next42, %vec.epilog.vector.body ]
  %vec.phi38 = phi i32 [ %bc.merge.rdx, %vec.epilog.ph ], [ %20, %vec.epilog.vector.body ]
  %active.lane.mask = tail call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index37, i32 %0)
  %11 = getelementptr inbounds i16, ptr %ID, i32 %index37
  %wide.masked.load = tail call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %11, i32 2, <4 x i1> %active.lane.mask, <4 x i16> poison)
  %12 = sext <4 x i16> %wide.masked.load to <4 x i32>
  %13 = add nuw nsw i32 %index37, %lag.032
  %14 = getelementptr inbounds i16, ptr %ID, i32 %13
  %wide.masked.load39 = tail call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %14, i32 2, <4 x i1> %active.lane.mask, <4 x i16> poison)
  %15 = sext <4 x i16> %wide.masked.load39 to <4 x i32>
  %16 = mul nsw <4 x i32> %15, %12
  %17 = ashr <4 x i32> %16, %broadcast.splat41
  %18 = select <4 x i1> %active.lane.mask, <4 x i32> %17, <4 x i32> zeroinitializer
  %19 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %18)
  %20 = add i32 %19, %vec.phi38
  %index.next42 = add i32 %index37, 4
  %21 = icmp eq i32 %index.next42, %n.vec36
  br i1 %21, label %for.end, label %vec.epilog.vector.body

for.end:                                          ; preds = %vec.epilog.vector.body, %middle.block, %for.body
  %Accumulator.0.lcssa = phi i32 [ 0, %for.body ], [ %9, %middle.block ], [ %20, %vec.epilog.vector.body ]
  %22 = lshr i32 %Accumulator.0.lcssa, 16
  %conv13 = trunc i32 %22 to i16
  %arrayidx14 = getelementptr inbounds i16, ptr %ACD, i32 %lag.032
  store i16 %conv13, ptr %arrayidx14, align 2
  %inc16 = add nuw nsw i32 %lag.032, 1
  %exitcond33.not = icmp eq i32 %inc16, %conv
  br i1 %exitcond33.not, label %for.end17, label %for.body

for.end17:                                        ; preds = %for.end, %entry
  ret void
}

declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %sh)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %sh)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr nocapture, i32 immarg, <4 x i1>, <4 x i16>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)