; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s

; Signed saturating multiply with a <2 x i32> vector body and a scalar
; remainder loop. Each result is
;   trunc(clamp((sext(b) * sext(a)) >> 31, -2147483648, 2147483647))
; stored to pDst. N == 0 returns immediately; N == 1 skips the vector body and
; goes straight to the scalar loop.
define arm_aapcs_vfpcc void @ssatmul_s_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_s_q31:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #8
; CHECK-NEXT:    sub sp, #8
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    beq.w .LBB0_8
; CHECK-NEXT:  @ %bb.1: @ %entry
; CHECK-NEXT:    mov r11, r2
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    bne .LBB0_3
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    mov r8, r1
; CHECK-NEXT:    mov r10, r11
; CHECK-NEXT:    b .LBB0_6
; CHECK-NEXT:  .LBB0_3: @ %vector.ph
; CHECK-NEXT:    bic r2, r3, #1
; CHECK-NEXT:    adr r4, .LCPI0_0
; CHECK-NEXT:    subs r7, r2, #2
; CHECK-NEXT:    movs r6, #1
; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    add.w r10, r11, r2, lsl #2
; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
; CHECK-NEXT:    str r2, [sp] @ 4-byte Spill
; CHECK-NEXT:    add.w r8, r1, r2, lsl #2
; CHECK-NEXT:    add.w r12, r0, r2, lsl #2
; CHECK-NEXT:    vldrw.u32 q0, [r4]
; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
; CHECK-NEXT:  .LBB0_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrd r4, r2, [r0], #8
; CHECK-NEXT:    movs r5, #0
; CHECK-NEXT:    ldrd r7, r6, [r1], #8
; CHECK-NEXT:    smull r4, r7, r7, r4
; CHECK-NEXT:    asrl r4, r7, #31
; CHECK-NEXT:    rsbs.w r9, r4, #-2147483648
; CHECK-NEXT:    mov.w r9, #-1
; CHECK-NEXT:    sbcs.w r3, r9, r7
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r5, r3, #0, #8
; CHECK-NEXT:    smull r2, r3, r6, r2
; CHECK-NEXT:    asrl r2, r3, #31
; CHECK-NEXT:    rsbs.w r6, r2, #-2147483648
; CHECK-NEXT:    vmov q2[2], q2[0], r4, r2
; CHECK-NEXT:    sbcs.w r6, r9, r3
; CHECK-NEXT:    vmov q2[3], q2[1], r7, r3
; CHECK-NEXT:    csetm r6, lt
; CHECK-NEXT:    bfi r5, r6, #8, #8
; CHECK-NEXT:    vmsr p0, r5
; CHECK-NEXT:    mvn r5, #-2147483648
; CHECK-NEXT:    vpsel q2, q2, q0
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    subs r2, r2, r5
; CHECK-NEXT:    sbcs r2, r3, #0
; CHECK-NEXT:    mov.w r3, #0
; CHECK-NEXT:    csetm r2, lt
; CHECK-NEXT:    bfi r3, r2, #0, #8
; CHECK-NEXT:    vmov r2, r4, d5
; CHECK-NEXT:    subs r2, r2, r5
; CHECK-NEXT:    sbcs r2, r4, #0
; CHECK-NEXT:    csetm r2, lt
; CHECK-NEXT:    bfi r3, r2, #8, #8
; CHECK-NEXT:    vmsr p0, r3
; CHECK-NEXT:    vpsel q2, q2, q1
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    strd r3, r2, [r11], #8
; CHECK-NEXT:    le lr, .LBB0_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    ldrd r2, r3, [sp] @ 8-byte Folded Reload
; CHECK-NEXT:    cmp r2, r3
; CHECK-NEXT:    beq .LBB0_8
; CHECK-NEXT:  .LBB0_6: @ %for.body.preheader
; CHECK-NEXT:    sub.w lr, r3, r2
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    mov.w r1, #-2147483648
; CHECK-NEXT:    mvn r3, #-2147483648
; CHECK-NEXT:  .LBB0_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r2, [r12], #4
; CHECK-NEXT:    ldr r4, [r8], #4
; CHECK-NEXT:    smull r2, r5, r4, r2
; CHECK-NEXT:    asrl r2, r5, #31
; CHECK-NEXT:    subs r4, r1, r2
; CHECK-NEXT:    sbcs.w r4, r0, r5
; CHECK-NEXT:    csel r2, r2, r1, lt
; CHECK-NEXT:    csel r4, r5, r0, lt
; CHECK-NEXT:    subs r5, r2, r3
; CHECK-NEXT:    sbcs r4, r4, #0
; CHECK-NEXT:    csel r2, r2, r3, lt
; CHECK-NEXT:    str r2, [r10], #4
; CHECK-NEXT:    le lr, .LBB0_7
; CHECK-NEXT:  .LBB0_8: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #8
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.9:
; CHECK-NEXT:  .LCPI0_0:
; CHECK-NEXT:    .long 2147483648 @ 0x80000000
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 2147483648 @ 0x80000000
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
entry:
  switch i32 %N, label %vector.ph [
    i32 0, label %for.cond.cleanup
    i32 1, label %for.body.preheader
  ]

vector.ph:                                        ; preds = %entry
  %n.vec = and i32 %N, -2
  %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
  %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
  %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
  %wide.load = load <2 x i32>, ptr %next.gep, align 4
  %0 = sext <2 x i32> %wide.load to <2 x i64>
  %wide.load20 = load <2 x i32>, ptr %next.gep18, align 4
  %1 = sext <2 x i32> %wide.load20 to <2 x i64>
  %2 = mul nsw <2 x i64> %1, %0
  %3 = ashr <2 x i64> %2, <i64 31, i64 31>
  %4 = icmp sgt <2 x i64> %3, <i64 -2147483648, i64 -2147483648>
  %5 = select <2 x i1> %4, <2 x i64> %3, <2 x i64> <i64 -2147483648, i64 -2147483648>
  %6 = icmp slt <2 x i64> %5, <i64 2147483647, i64 2147483647>
  %7 = select <2 x i1> %6, <2 x i64> %5, <2 x i64> <i64 2147483647, i64 2147483647>
  %8 = trunc <2 x i64> %7 to <2 x i32>
  store <2 x i32> %8, ptr %next.gep19, align 4
  %index.next = add i32 %index, 2
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry, %middle.block
  %i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
  %pDst.addr.09.ph = phi ptr [ %pDst, %entry ], [ %ind.end17, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
  %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
  %10 = load i32, ptr %pSrcA.addr.011, align 4
  %conv = sext i32 %10 to i64
  %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
  %11 = load i32, ptr %pSrcB.addr.010, align 4
  %conv2 = sext i32 %11 to i64
  %mul = mul nsw i64 %conv2, %conv
  %shr = ashr i64 %mul, 31
  %12 = icmp sgt i64 %shr, -2147483648
  %.val.i = select i1 %12, i64 %shr, i64 -2147483648
  %13 = icmp slt i64 %.val.i, 2147483647
  %retval.0.i = select i1 %13, i64 %.val.i, i64 2147483647
  %conv3 = trunc i64 %retval.0.i to i32
  %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
  store i32 %conv3, ptr %pDst.addr.09, align 4
  %inc = add nuw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Signed saturating multiply with a <4 x i32> vector body and a scalar
; remainder loop. The per-element computation is sext, mul nsw, ashr 31, clamp
; to [-2147483648, 2147483647], trunc. N == 0 returns immediately; N < 4 skips
; the vector body and runs only the scalar loop.
define arm_aapcs_vfpcc void @ssatmul_4_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_4_q31:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    .pad #16
; CHECK-NEXT:    sub sp, #16
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    beq.w .LBB1_8
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    mov r5, r1
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    cmp r3, #3
; CHECK-NEXT:    bhi .LBB1_3
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    mov r9, r5
; CHECK-NEXT:    mov r11, r2
; CHECK-NEXT:    b .LBB1_6
; CHECK-NEXT:  .LBB1_3: @ %vector.ph
; CHECK-NEXT:    bic r1, r3, #3
; CHECK-NEXT:    adr r4, .LCPI1_0
; CHECK-NEXT:    subs r7, r1, #4
; CHECK-NEXT:    movs r6, #1
; CHECK-NEXT:    vldrw.u32 q0, [r4]
; CHECK-NEXT:    adr r4, .LCPI1_1
; CHECK-NEXT:    str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
; CHECK-NEXT:    add.w r11, r2, r1, lsl #2
; CHECK-NEXT:    add.w r9, r5, r1, lsl #2
; CHECK-NEXT:    add.w r12, r0, r1, lsl #2
; CHECK-NEXT:    vldrw.u32 q1, [r4]
; CHECK-NEXT:  .LBB1_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q3, [r5], #16
; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
; CHECK-NEXT:    str r2, [sp, #12] @ 4-byte Spill
; CHECK-NEXT:    mov.w r2, #-1
; CHECK-NEXT:    vmov.f32 s16, s10
; CHECK-NEXT:    str r5, [sp, #8] @ 4-byte Spill
; CHECK-NEXT:    vmov.f32 s20, s14
; CHECK-NEXT:    mov.w r8, #0
; CHECK-NEXT:    vmov.f32 s18, s11
; CHECK-NEXT:    vmov.f32 s22, s15
; CHECK-NEXT:    vmullb.s32 q6, q5, q4
; CHECK-NEXT:    vmov.f32 s14, s13
; CHECK-NEXT:    vmov r4, r7, d12
; CHECK-NEXT:    asrl r4, r7, #31
; CHECK-NEXT:    vmov.f32 s10, s9
; CHECK-NEXT:    rsbs.w r5, r4, #-2147483648
; CHECK-NEXT:    sbcs.w r5, r2, r7
; CHECK-NEXT:    csetm r5, lt
; CHECK-NEXT:    bfi r8, r5, #0, #8
; CHECK-NEXT:    vmov r10, r5, d13
; CHECK-NEXT:    asrl r10, r5, #31
; CHECK-NEXT:    vmov r6, s14
; CHECK-NEXT:    rsbs.w r3, r10, #-2147483648
; CHECK-NEXT:    vmov q4[2], q4[0], r4, r10
; CHECK-NEXT:    sbcs.w r3, r2, r5
; CHECK-NEXT:    vmov q4[3], q4[1], r7, r5
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r8, r3, #8, #8
; CHECK-NEXT:    vmsr p0, r8
; CHECK-NEXT:    mvn r8, #-2147483648
; CHECK-NEXT:    vpsel q4, q4, q0
; CHECK-NEXT:    vmov r3, r4, d8
; CHECK-NEXT:    subs.w r3, r3, r8
; CHECK-NEXT:    sbcs r3, r4, #0
; CHECK-NEXT:    mov.w r4, #0
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r4, r3, #0, #8
; CHECK-NEXT:    vmov r3, r5, d9
; CHECK-NEXT:    subs.w r3, r3, r8
; CHECK-NEXT:    sbcs r3, r5, #0
; CHECK-NEXT:    mov.w r5, #0
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r4, r3, #8, #8
; CHECK-NEXT:    vmov r3, s8
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vmov r4, s12
; CHECK-NEXT:    vpsel q4, q4, q1
; CHECK-NEXT:    smull r4, r7, r4, r3
; CHECK-NEXT:    asrl r4, r7, #31
; CHECK-NEXT:    rsbs.w r3, r4, #-2147483648
; CHECK-NEXT:    sbcs.w r3, r2, r7
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r5, r3, #0, #8
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    smull r6, r3, r6, r3
; CHECK-NEXT:    asrl r6, r3, #31
; CHECK-NEXT:    rsbs.w r1, r6, #-2147483648
; CHECK-NEXT:    vmov q2[2], q2[0], r4, r6
; CHECK-NEXT:    sbcs.w r1, r2, r3
; CHECK-NEXT:    vmov q2[3], q2[1], r7, r3
; CHECK-NEXT:    csetm r1, lt
; CHECK-NEXT:    bfi r5, r1, #8, #8
; CHECK-NEXT:    vmsr p0, r5
; CHECK-NEXT:    ldrd r5, r2, [sp, #8] @ 8-byte Folded Reload
; CHECK-NEXT:    vpsel q2, q2, q0
; CHECK-NEXT:    vmov r1, r3, d4
; CHECK-NEXT:    subs.w r1, r1, r8
; CHECK-NEXT:    sbcs r1, r3, #0
; CHECK-NEXT:    mov.w r3, #0
; CHECK-NEXT:    csetm r1, lt
; CHECK-NEXT:    bfi r3, r1, #0, #8
; CHECK-NEXT:    vmov r1, r4, d5
; CHECK-NEXT:    subs.w r1, r1, r8
; CHECK-NEXT:    sbcs r1, r4, #0
; CHECK-NEXT:    csetm r1, lt
; CHECK-NEXT:    bfi r3, r1, #8, #8
; CHECK-NEXT:    vmsr p0, r3
; CHECK-NEXT:    vpsel q2, q2, q1
; CHECK-NEXT:    vmov.f32 s9, s10
; CHECK-NEXT:    vmov.f32 s10, s16
; CHECK-NEXT:    vmov.f32 s11, s18
; CHECK-NEXT:    vstrb.8 q2, [r2], #16
; CHECK-NEXT:    le lr, .LBB1_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    ldrd r1, r3, [sp] @ 8-byte Folded Reload
; CHECK-NEXT:    cmp r1, r3
; CHECK-NEXT:    beq .LBB1_8
; CHECK-NEXT:  .LBB1_6: @ %for.body.preheader21
; CHECK-NEXT:    sub.w lr, r3, r1
; CHECK-NEXT:    mov.w r0, #-1
; CHECK-NEXT:    mov.w r3, #-2147483648
; CHECK-NEXT:    mvn r2, #-2147483648
; CHECK-NEXT:  .LBB1_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r1, [r12], #4
; CHECK-NEXT:    ldr r4, [r9], #4
; CHECK-NEXT:    smull r4, r1, r4, r1
; CHECK-NEXT:    asrl r4, r1, #31
; CHECK-NEXT:    subs r5, r3, r4
; CHECK-NEXT:    sbcs.w r5, r0, r1
; CHECK-NEXT:    csel r4, r4, r3, lt
; CHECK-NEXT:    csel r1, r1, r0, lt
; CHECK-NEXT:    subs r5, r4, r2
; CHECK-NEXT:    sbcs r1, r1, #0
; CHECK-NEXT:    csel r1, r4, r2, lt
; CHECK-NEXT:    str r1, [r11], #4
; CHECK-NEXT:    le lr, .LBB1_7
; CHECK-NEXT:  .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #16
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.9:
; CHECK-NEXT:  .LCPI1_0:
; CHECK-NEXT:    .long 2147483648 @ 0x80000000
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 2147483648 @ 0x80000000
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:  .LCPI1_1:
; CHECK-NEXT:    .long 2147483647 @ 0x7fffffff
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 2147483647 @ 0x7fffffff
; CHECK-NEXT:    .long 0 @ 0x0
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph

for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
  br label %for.body

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %N, -4
  %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
  %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
  %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
  %wide.load = load <4 x i32>, ptr %next.gep, align 4
  %0 = sext <4 x i32> %wide.load to <4 x i64>
  %wide.load20 = load <4 x i32>, ptr %next.gep18, align 4
  %1 = sext <4 x i32> %wide.load20 to <4 x i64>
  %2 = mul nsw <4 x i64> %1, %0
  %3 = ashr <4 x i64> %2, <i64 31, i64 31, i64 31, i64 31>
  %4 = icmp sgt <4 x i64> %3, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
  %5 = select <4 x i1> %4, <4 x i64> %3, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
  %6 = icmp slt <4 x i64> %5, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %7 = select <4 x i1> %6, <4 x i64> %5, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %8 = trunc <4 x i64> %7 to <4 x i32>
  store <4 x i32> %8, ptr %next.gep19, align 4
  %index.next = add i32 %index, 4
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader21, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
  %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
  %10 = load i32, ptr %pSrcA.addr.011, align 4
  %conv = sext i32 %10 to i64
  %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
  %11 = load i32, ptr %pSrcB.addr.010, align 4
  %conv2 = sext i32 %11 to i64
  %mul = mul nsw i64 %conv2, %conv
  %shr = ashr i64 %mul, 31
  %12 = icmp sgt i64 %shr, -2147483648
  %.val.i = select i1 %12, i64 %shr, i64 -2147483648
  %13 = icmp slt i64 %.val.i, 2147483647
  %retval.0.i = select i1 %13, i64 %.val.i, i64 2147483647
  %conv3 = trunc i64 %retval.0.i to i32
  %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
  store i32 %conv3, ptr %pDst.addr.09, align 4
  %inc = add nuw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Signed saturating multiply with a predicated <4 x i32> vector body and no
; scalar loop. The loads and the store are masked by
; `%induction ule (N - 1)`, and the loop exits once the index reaches N + 3
; rounded down to a multiple of 4 (%n.vec).
define arm_aapcs_vfpcc void @ssatmul_4t_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_4t_q31:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #24
; CHECK-NEXT:    sub sp, #24
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    beq.w .LBB2_3
; CHECK-NEXT:  @ %bb.1: @ %vector.ph
; CHECK-NEXT:    adds r6, r3, #3
; CHECK-NEXT:    movs r5, #1
; CHECK-NEXT:    bic r6, r6, #3
; CHECK-NEXT:    adr r4, .LCPI2_1
; CHECK-NEXT:    subs r6, #4
; CHECK-NEXT:    vldrw.u32 q2, [r4]
; CHECK-NEXT:    mov.w r9, #0
; CHECK-NEXT:    mov.w r12, #-1
; CHECK-NEXT:    add.w lr, r5, r6, lsr #2
; CHECK-NEXT:    adr r5, .LCPI2_0
; CHECK-NEXT:    vldrw.u32 q0, [r5]
; CHECK-NEXT:    adr r5, .LCPI2_2
; CHECK-NEXT:    subs r6, r3, #1
; CHECK-NEXT:    vldrw.u32 q3, [r5]
; CHECK-NEXT:    vdup.32 q1, r6
; CHECK-NEXT:    mvn r8, #-2147483648
; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT:    vdup.32 q4, r9
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    add.w r9, r9, #4
; CHECK-NEXT:    vorr q4, q4, q0
; CHECK-NEXT:    vcmp.u32 cs, q1, q4
; CHECK-NEXT:    vstr p0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT:    vpstt
; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
; CHECK-NEXT:    vldrwt.u32 q5, [r1], #16
; CHECK-NEXT:    vmov.f32 s24, s18
; CHECK-NEXT:    vmov.f32 s26, s19
; CHECK-NEXT:    vmov.f32 s28, s22
; CHECK-NEXT:    vmov.f32 s30, s23
; CHECK-NEXT:    vmullb.s32 q0, q7, q6
; CHECK-NEXT:    vmov.f32 s18, s21
; CHECK-NEXT:    vmov r10, r5, d0
; CHECK-NEXT:    asrl r10, r5, #31
; CHECK-NEXT:    rsbs.w r7, r10, #-2147483648
; CHECK-NEXT:    sbcs.w r7, r12, r5
; CHECK-NEXT:    csetm r7, lt
; CHECK-NEXT:    bfi r4, r7, #0, #8
; CHECK-NEXT:    vmov r6, r7, d1
; CHECK-NEXT:    asrl r6, r7, #31
; CHECK-NEXT:    rsbs.w r3, r6, #-2147483648
; CHECK-NEXT:    vmov q0[2], q0[0], r10, r6
; CHECK-NEXT:    sbcs.w r3, r12, r7
; CHECK-NEXT:    vmov q0[3], q0[1], r5, r7
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    vmov r7, s18
; CHECK-NEXT:    bfi r4, r3, #8, #8
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vpsel q0, q0, q2
; CHECK-NEXT:    vmov r3, r4, d0
; CHECK-NEXT:    subs.w r3, r3, r8
; CHECK-NEXT:    sbcs r3, r4, #0
; CHECK-NEXT:    mov.w r4, #0
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r4, r3, #0, #8
; CHECK-NEXT:    vmov r3, r5, d1
; CHECK-NEXT:    subs.w r3, r3, r8
; CHECK-NEXT:    sbcs r3, r5, #0
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r4, r3, #8, #8
; CHECK-NEXT:    vmov r3, s16
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vmov r4, s20
; CHECK-NEXT:    vpsel q6, q0, q3
; CHECK-NEXT:    vmov.f32 s2, s17
; CHECK-NEXT:    smull r10, r5, r4, r3
; CHECK-NEXT:    movs r4, #0
; CHECK-NEXT:    asrl r10, r5, #31
; CHECK-NEXT:    rsbs.w r3, r10, #-2147483648
; CHECK-NEXT:    sbcs.w r3, r12, r5
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r4, r3, #0, #8
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    smull r6, r3, r7, r3
; CHECK-NEXT:    asrl r6, r3, #31
; CHECK-NEXT:    rsbs.w r7, r6, #-2147483648
; CHECK-NEXT:    vmov q0[2], q0[0], r10, r6
; CHECK-NEXT:    sbcs.w r7, r12, r3
; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
; CHECK-NEXT:    csetm r7, lt
; CHECK-NEXT:    bfi r4, r7, #8, #8
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vpsel q0, q0, q2
; CHECK-NEXT:    vmov r3, r4, d0
; CHECK-NEXT:    subs.w r3, r3, r8
; CHECK-NEXT:    sbcs r3, r4, #0
; CHECK-NEXT:    mov.w r4, #0
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r4, r3, #0, #8
; CHECK-NEXT:    vmov r3, r5, d1
; CHECK-NEXT:    subs.w r3, r3, r8
; CHECK-NEXT:    sbcs r3, r5, #0
; CHECK-NEXT:    csetm r3, lt
; CHECK-NEXT:    bfi r4, r3, #8, #8
; CHECK-NEXT:    vmsr p0, r4
; CHECK-NEXT:    vpsel q0, q0, q3
; CHECK-NEXT:    vldr p0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT:    vmov.f32 s1, s2
; CHECK-NEXT:    vmov.f32 s2, s24
; CHECK-NEXT:    vmov.f32 s3, s26
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vstrwt.32 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB2_2
; CHECK-NEXT:  .LBB2_3: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #24
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.4:
; CHECK-NEXT:  .LCPI2_0:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 3 @ 0x3
; CHECK-NEXT:  .LCPI2_1:
; CHECK-NEXT:    .long 2147483648 @ 0x80000000
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:    .long 2147483648 @ 0x80000000
; CHECK-NEXT:    .long 4294967295 @ 0xffffffff
; CHECK-NEXT:  .LCPI2_2:
; CHECK-NEXT:    .long 2147483647 @ 0x7fffffff
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 2147483647 @ 0x7fffffff
; CHECK-NEXT:    .long 0 @ 0x0
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %n.rnd.up = add i32 %N, 3
  %n.vec = and i32 %n.rnd.up, -4
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
  %0 = icmp ule <4 x i32> %induction, %broadcast.splat21
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %next.gep, i32 4, <4 x i1> %0, <4 x i32> undef)
  %1 = sext <4 x i32> %wide.masked.load to <4 x i64>
  %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %next.gep18, i32 4, <4 x i1> %0, <4 x i32> undef)
  %2 = sext <4 x i32> %wide.masked.load22 to <4 x i64>
  %3 = mul nsw <4 x i64> %2, %1
  %4 = ashr <4 x i64> %3, <i64 31, i64 31, i64 31, i64 31>
  %5 = icmp sgt <4 x i64> %4, <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
  %6 = select <4 x i1> %5, <4 x i64> %4, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>
  %7 = icmp slt <4 x i64> %6, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %8 = select <4 x i1> %7, <4 x i64> %6, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
  %9 = trunc <4 x i64> %8 to <4 x i32>
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %9, ptr %next.gep19, i32 4, <4 x i1> %0)
  %index.next = add i32 %index, 4
  %10 = icmp eq i32 %index.next, %n.vec
  br i1 %10, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; Unsigned saturating multiply with a <2 x i32> vector body and a scalar
; remainder loop. Each result is
;   trunc(umin((zext(b) * zext(a)) >> 31, 4294967295))
; stored to pDst. N == 0 returns immediately; N == 1 skips the vector body and
; goes straight to the scalar loop.
define arm_aapcs_vfpcc void @usatmul_2_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: usatmul_2_q31:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    beq .LBB3_8
; CHECK-NEXT:  @ %bb.1: @ %entry
; CHECK-NEXT:    mov r8, r2
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    bne .LBB3_3
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r7, #0
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    mov r11, r1
; CHECK-NEXT:    mov r2, r8
; CHECK-NEXT:    b .LBB3_6
; CHECK-NEXT:  .LBB3_3: @ %vector.ph
; CHECK-NEXT:    bic r5, r3, #1
; CHECK-NEXT:    movs r6, #1
; CHECK-NEXT:    subs r7, r5, #2
; CHECK-NEXT:    str r5, [sp] @ 4-byte Spill
; CHECK-NEXT:    add.w r2, r8, r5, lsl #2
; CHECK-NEXT:    add.w r11, r1, r5, lsl #2
; CHECK-NEXT:    add.w lr, r6, r7, lsr #1
; CHECK-NEXT:    add.w r12, r0, r5, lsl #2
; CHECK-NEXT:    vmov.i8 q0, #0xff
; CHECK-NEXT:  .LBB3_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrd r4, r9, [r0], #8
; CHECK-NEXT:    ldrd r5, r10, [r1], #8
; CHECK-NEXT:    umull r4, r5, r5, r4
; CHECK-NEXT:    lsrl r4, r5, #31
; CHECK-NEXT:    subs.w r6, r4, #-1
; CHECK-NEXT:    sbcs r5, r5, #0
; CHECK-NEXT:    mov.w r6, #0
; CHECK-NEXT:    csetm r5, lo
; CHECK-NEXT:    bfi r6, r5, #0, #8
; CHECK-NEXT:    umull r10, r5, r10, r9
; CHECK-NEXT:    lsrl r10, r5, #31
; CHECK-NEXT:    subs.w r7, r10, #-1
; CHECK-NEXT:    vmov q1[2], q1[0], r4, r10
; CHECK-NEXT:    sbcs r5, r5, #0
; CHECK-NEXT:    csetm r5, lo
; CHECK-NEXT:    bfi r6, r5, #8, #8
; CHECK-NEXT:    vmsr p0, r6
; CHECK-NEXT:    vpsel q1, q1, q0
; CHECK-NEXT:    vmov r4, s6
; CHECK-NEXT:    vmov r5, s4
; CHECK-NEXT:    strd r5, r4, [r8], #8
; CHECK-NEXT:    le lr, .LBB3_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    ldr r7, [sp] @ 4-byte Reload
; CHECK-NEXT:    cmp r7, r3
; CHECK-NEXT:    beq .LBB3_8
; CHECK-NEXT:  .LBB3_6: @ %for.body.preheader
; CHECK-NEXT:    sub.w lr, r3, r7
; CHECK-NEXT:  .LBB3_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r0, [r12], #4
; CHECK-NEXT:    ldr r1, [r11], #4
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    lsrl r0, r1, #31
; CHECK-NEXT:    subs.w r3, r0, #-1
; CHECK-NEXT:    sbcs r1, r1, #0
; CHECK-NEXT:    it hs
; CHECK-NEXT:    movhs.w r0, #-1
; CHECK-NEXT:    str r0, [r2], #4
; CHECK-NEXT:    le lr, .LBB3_7
; CHECK-NEXT:  .LBB3_8: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  switch i32 %N, label %vector.ph [
    i32 0, label %for.cond.cleanup
    i32 1, label %for.body.preheader
  ]

vector.ph:                                        ; preds = %entry
  %n.vec = and i32 %N, -2
  %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
  %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
  %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
  %wide.load = load <2 x i32>, ptr %next.gep, align 4
  %0 = zext <2 x i32> %wide.load to <2 x i64>
  %wide.load20 = load <2 x i32>, ptr %next.gep18, align 4
  %1 = zext <2 x i32> %wide.load20 to <2 x i64>
  %2 = mul nuw <2 x i64> %1, %0
  %3 = lshr <2 x i64> %2, <i64 31, i64 31>
  %4 = icmp ult <2 x i64> %3, <i64 4294967295, i64 4294967295>
  %5 = select <2 x i1> %4, <2 x i64> %3, <2 x i64> <i64 4294967295, i64 4294967295>
  %6 = trunc <2 x i64> %5 to <2 x i32>
  store <2 x i32> %6, ptr %next.gep19, align 4
  %index.next = add i32 %index, 2
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry, %middle.block
  %i.012.ph = phi i32 [ 0, %entry ], [ %n.vec, %middle.block ]
  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %entry ], [ %ind.end, %middle.block ]
  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %entry ], [ %ind.end15, %middle.block ]
  %pDst.addr.09.ph = phi ptr [ %pDst, %entry ], [ %ind.end17, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader ]
  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader ]
  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader ]
  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader ]
  %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
  %8 = load i32, ptr %pSrcA.addr.011, align 4
  %conv = zext i32 %8 to i64
  %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
  %9 = load i32, ptr %pSrcB.addr.010, align 4
  %conv2 = zext i32 %9 to i64
  %mul = mul nuw i64 %conv2, %conv
  %shr = lshr i64 %mul, 31
  %10 = icmp ult i64 %shr, 4294967295
  %retval.0.i = select i1 %10, i64 %shr, i64 4294967295
  %conv3 = trunc i64 %retval.0.i to i32
  %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
  store i32 %conv3, ptr %pDst.addr.09, align 4
  %inc = add nuw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Unsigned saturating multiply with a <4 x i32> vector body and a scalar
; remainder loop. The per-element computation is zext, mul nuw, lshr 31, clamp
; to at most 4294967295, trunc. N == 0 returns immediately; N < 4 skips the
; vector body and runs only the scalar loop.
define arm_aapcs_vfpcc void @usatmul_4_q31(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: usatmul_4_q31:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
; CHECK-NEXT:    vpush {d8, d9, d10, d11}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    beq.w .LBB4_8
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    mov.w r8, #0
; CHECK-NEXT:    cmp r3, #3
; CHECK-NEXT:    bhi .LBB4_3
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    mov r9, r1
; CHECK-NEXT:    mov r11, r2
; CHECK-NEXT:    b .LBB4_6
; CHECK-NEXT:  .LBB4_3: @ %vector.ph
; CHECK-NEXT:    bic r8, r3, #3
; CHECK-NEXT:    movs r6, #1
; CHECK-NEXT:    sub.w r7, r8, #4
; CHECK-NEXT:    vmov.i64 q0, #0xffffffff
; CHECK-NEXT:    add.w r11, r2, r8, lsl #2
; CHECK-NEXT:    add.w r9, r1, r8, lsl #2
; CHECK-NEXT:    add.w lr, r6, r7, lsr #2
; CHECK-NEXT:    add.w r12, r0, r8, lsl #2
; CHECK-NEXT:  .LBB4_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov.f32 s12, s6
; CHECK-NEXT:    vmov.f32 s14, s7
; CHECK-NEXT:    vmov.f32 s16, s10
; CHECK-NEXT:    vmov.f32 s18, s11
; CHECK-NEXT:    vmullb.u32 q5, q4, q3
; CHECK-NEXT:    vmov.f32 s6, s5
; CHECK-NEXT:    vmov r10, r5, d10
; CHECK-NEXT:    lsrl r10, r5, #31
; CHECK-NEXT:    vmov.f32 s10, s9
; CHECK-NEXT:    subs.w r6, r10, #-1
; CHECK-NEXT:    sbcs r5, r5, #0
; CHECK-NEXT:    mov.w r6, #0
; CHECK-NEXT:    csetm r5, lo
; CHECK-NEXT:    vmullb.u32 q4, q2, q1
; CHECK-NEXT:    bfi r6, r5, #0, #8
; CHECK-NEXT:    vmov r4, r5, d11
; CHECK-NEXT:    lsrl r4, r5, #31
; CHECK-NEXT:    subs.w r7, r4, #-1
; CHECK-NEXT:    vmov q3[2], q3[0], r10, r4
; CHECK-NEXT:    sbcs r5, r5, #0
; CHECK-NEXT:    csetm r5, lo
; CHECK-NEXT:    bfi r6, r5, #8, #8
; CHECK-NEXT:    vmov r10, r5, d8
; CHECK-NEXT:    lsrl r10, r5, #31
; CHECK-NEXT:    vmsr p0, r6
; CHECK-NEXT:    subs.w r6, r10, #-1
; CHECK-NEXT:    vpsel q3, q3, q0
; CHECK-NEXT:    sbcs r5, r5, #0
; CHECK-NEXT:    mov.w r6, #0
; CHECK-NEXT:    csetm r5, lo
; CHECK-NEXT:    bfi r6, r5, #0, #8
; CHECK-NEXT:    vmov r4, r5, d9
; CHECK-NEXT:    lsrl r4, r5, #31
; CHECK-NEXT:    subs.w r7, r4, #-1
; CHECK-NEXT:    vmov q1[2], q1[0], r10, r4
; CHECK-NEXT:    sbcs r5, r5, #0
; CHECK-NEXT:    csetm r5, lo
; CHECK-NEXT:    bfi r6, r5, #8, #8
; CHECK-NEXT:    vmsr p0, r6
; CHECK-NEXT:    vpsel q1, q1, q0
; CHECK-NEXT:    vmov.f32 s5, s6
; CHECK-NEXT:    vmov.f32 s6, s12
; CHECK-NEXT:    vmov.f32 s7, s14
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    le lr, .LBB4_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r8, r3
; CHECK-NEXT:    beq .LBB4_8
; CHECK-NEXT:  .LBB4_6: @ %for.body.preheader21
; CHECK-NEXT:    sub.w lr, r3, r8
; CHECK-NEXT:  .LBB4_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldr r0, [r12], #4
; CHECK-NEXT:    ldr r1, [r9], #4
; CHECK-NEXT:    umull r0, r1, r1, r0
; CHECK-NEXT:    lsrl r0, r1, #31
; CHECK-NEXT:    subs.w r2, r0, #-1
; CHECK-NEXT:    sbcs r1, r1, #0
; CHECK-NEXT:    it hs
; CHECK-NEXT:    movhs.w r0, #-1
; CHECK-NEXT:    str r0, [r11], #4
; CHECK-NEXT:    le lr, .LBB4_7
; CHECK-NEXT:  .LBB4_8: @ %for.cond.cleanup
; CHECK-NEXT:    vpop {d8, d9, d10, d11}
; CHECK-NEXT:    add sp, #4
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  %min.iters.check = icmp ult i32 %N, 4
  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph

for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
  br label %for.body

vector.ph:                                        ; preds = %for.body.preheader
  %n.vec = and i32 %N, -4
  %ind.end = getelementptr i32, ptr %pSrcA, i32 %n.vec
  %ind.end15 = getelementptr i32, ptr %pSrcB, i32 %n.vec
  %ind.end17 = getelementptr i32, ptr %pDst, i32 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i32, ptr %pSrcA, i32 %index
  %next.gep18 = getelementptr i32, ptr %pSrcB, i32 %index
  %next.gep19 = getelementptr i32, ptr %pDst, i32 %index
  %wide.load = load <4 x i32>, ptr %next.gep, align 4
  %0 = zext <4 x i32> %wide.load to <4 x i64>
  %wide.load20 = load <4 x i32>, ptr %next.gep18, align 4
  %1 = zext <4 x i32> %wide.load20 to <4 x i64>
  %2 = mul nuw <4 x i64> %1, %0
  %3 = lshr <4 x i64> %2, <i64 31, i64 31, i64 31, i64 31>
  %4 = icmp ult <4 x i64> %3, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %5 = select <4 x i1> %4, <4 x i64> %3, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %6 = trunc <4 x i64> %5 to <4 x i32>
  store <4 x i32> %6, ptr %next.gep19, align 4
  %index.next = add i32 %index, 4
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader21, %for.body
  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
  %incdec.ptr = getelementptr inbounds i32, ptr %pSrcA.addr.011, i32 1
  %8 = load i32, ptr %pSrcA.addr.011, align 4
  %conv = zext i32 %8 to i64
  %incdec.ptr1 = getelementptr inbounds i32, ptr %pSrcB.addr.010, i32 1
  %9 = load i32, ptr %pSrcB.addr.010, align 4
  %conv2 = zext i32 %9 to i64
  %mul = mul nuw i64 %conv2, %conv
  %shr = lshr i64 %mul, 31
  %10 = icmp ult i64 %shr, 4294967295
  %retval.0.i = select i1 %10, i64 %shr, i64 4294967295
  %conv3 = trunc i64 %retval.0.i to i32
  %incdec.ptr4 = getelementptr inbounds i32, ptr %pDst.addr.09, i32 1
  store i32 %conv3, ptr %pDst.addr.09, align 4
  %inc = add nuw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label
; Documentation for @ssatmul_4_q15, whose `define` appears on the following line after the end of @usatmul_4_q31.
; Test case: signed saturating multiply over two i16 arrays, written to %pDst, for %N elements.
; In the IR each pair of elements is sign-extended to i32, multiplied (mul nsw), arithmetically shifted right by 15,
; clamped to [-32768, 32767] with two icmp + select pairs, truncated to i16 and stored.
; vector.body processes n.vec = N & -4 elements as <4 x i16>; the scalar for.body handles the remainder
; (or everything when N < 4).
; The autogenerated CHECK lines expect the clamp to fold into a saturating narrowing shift:
; vldrh.s32 / vmul.i32 / vqshrnb.s32 #15 / vstrh.32 in the vector loop and muls + ssat #16, asr #15 in the scalar loop.
%for.body 912} 913 914 915; i16 916 917define arm_aapcs_vfpcc void @ssatmul_4_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 918; CHECK-LABEL: ssatmul_4_q15: 919; CHECK: @ %bb.0: @ %entry 920; CHECK-NEXT: .save {r4, r5, r6, lr} 921; CHECK-NEXT: push {r4, r5, r6, lr} 922; CHECK-NEXT: cbz r3, .LBB5_8 923; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 924; CHECK-NEXT: cmp r3, #3 925; CHECK-NEXT: bhi .LBB5_3 926; CHECK-NEXT: @ %bb.2: 927; CHECK-NEXT: movs r5, #0 928; CHECK-NEXT: mov r12, r0 929; CHECK-NEXT: mov r6, r1 930; CHECK-NEXT: mov r4, r2 931; CHECK-NEXT: b .LBB5_6 932; CHECK-NEXT: .LBB5_3: @ %vector.ph 933; CHECK-NEXT: bic r5, r3, #3 934; CHECK-NEXT: movs r4, #1 935; CHECK-NEXT: subs r6, r5, #4 936; CHECK-NEXT: add.w r12, r0, r5, lsl #1 937; CHECK-NEXT: add.w lr, r4, r6, lsr #2 938; CHECK-NEXT: add.w r4, r2, r5, lsl #1 939; CHECK-NEXT: add.w r6, r1, r5, lsl #1 940; CHECK-NEXT: .LBB5_4: @ %vector.body 941; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 942; CHECK-NEXT: vldrh.s32 q0, [r0], #8 943; CHECK-NEXT: vldrh.s32 q1, [r1], #8 944; CHECK-NEXT: vmul.i32 q0, q1, q0 945; CHECK-NEXT: vqshrnb.s32 q0, q0, #15 946; CHECK-NEXT: vstrh.32 q0, [r2], #8 947; CHECK-NEXT: le lr, .LBB5_4 948; CHECK-NEXT: @ %bb.5: @ %middle.block 949; CHECK-NEXT: cmp r5, r3 950; CHECK-NEXT: it eq 951; CHECK-NEXT: popeq {r4, r5, r6, pc} 952; CHECK-NEXT: .LBB5_6: @ %for.body.preheader21 953; CHECK-NEXT: sub.w lr, r3, r5 954; CHECK-NEXT: .LBB5_7: @ %for.body 955; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 956; CHECK-NEXT: ldrsh r0, [r12], #2 957; CHECK-NEXT: ldrsh r1, [r6], #2 958; CHECK-NEXT: muls r0, r1, r0 959; CHECK-NEXT: ssat r0, #16, r0, asr #15 960; CHECK-NEXT: strh r0, [r4], #2 961; CHECK-NEXT: le lr, .LBB5_7 962; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup 963; CHECK-NEXT: pop {r4, r5, r6, pc} 964entry: 965 %cmp8 = icmp eq i32 %N, 0 966 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 967 968for.body.preheader: ; 
preds = %entry 969 %min.iters.check = icmp ult i32 %N, 4 970 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph 971 972for.body.preheader21: ; preds = %middle.block, %for.body.preheader 973 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 974 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ] 975 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ] 976 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ] 977 br label %for.body 978 979vector.ph: ; preds = %for.body.preheader 980 %n.vec = and i32 %N, -4 981 %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec 982 %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec 983 %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec 984 br label %vector.body 985 986vector.body: ; preds = %vector.body, %vector.ph 987 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 988 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index 989 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index 990 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index 991 %wide.load = load <4 x i16>, ptr %next.gep, align 2 992 %0 = sext <4 x i16> %wide.load to <4 x i32> 993 %wide.load20 = load <4 x i16>, ptr %next.gep18, align 2 994 %1 = sext <4 x i16> %wide.load20 to <4 x i32> 995 %2 = mul nsw <4 x i32> %1, %0 996 %3 = ashr <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15> 997 %4 = icmp sgt <4 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 998 %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 999 %6 = icmp slt <4 x i32> %5, <i32 32767, i32 32767, i32 32767, i32 32767> 1000 %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767> 1001 %8 = trunc <4 x i32> %7 to <4 x i16> 1002 store <4 x i16> %8, ptr %next.gep19, align 2 1003 %index.next = add i32 %index, 4 1004 %9 = icmp eq i32 %index.next, 
; Documentation for @ssatmul_8_q15, whose `define` appears on the following line after the tail of @ssatmul_4_q15.
; Test case: the same signed i16 saturating multiply as @ssatmul_4_q15 (sext to i32, mul nsw, ashr 15,
; clamp to [-32768, 32767], trunc to i16), but with an 8-lane vector body: <8 x i16> loads, n.vec = N & -8,
; and the scalar for.body taken when N < 8 or for the remainder.
; The autogenerated CHECK lines expect the <8 x i32> arithmetic to be split into top/bottom halves:
; vmullt.s16 + vmullb.s16 for the products, then vqshrnb.s32 / vqshrnt.s32 #15 to saturate and narrow into one q register.
%n.vec 1005 br i1 %9, label %middle.block, label %vector.body 1006 1007middle.block: ; preds = %vector.body 1008 %cmp.n = icmp eq i32 %n.vec, %N 1009 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21 1010 1011for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1012 ret void 1013 1014for.body: ; preds = %for.body.preheader21, %for.body 1015 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ] 1016 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ] 1017 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ] 1018 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ] 1019 %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1 1020 %10 = load i16, ptr %pSrcA.addr.011, align 2 1021 %conv = sext i16 %10 to i32 1022 %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1 1023 %11 = load i16, ptr %pSrcB.addr.010, align 2 1024 %conv2 = sext i16 %11 to i32 1025 %mul = mul nsw i32 %conv2, %conv 1026 %shr = ashr i32 %mul, 15 1027 %12 = icmp sgt i32 %shr, -32768 1028 %.val.i = select i1 %12, i32 %shr, i32 -32768 1029 %13 = icmp slt i32 %.val.i, 32767 1030 %retval.0.i = select i1 %13, i32 %.val.i, i32 32767 1031 %conv3 = trunc i32 %retval.0.i to i16 1032 %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1 1033 store i16 %conv3, ptr %pDst.addr.09, align 2 1034 %inc = add nuw i32 %i.012, 1 1035 %exitcond = icmp eq i32 %inc, %N 1036 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1037} 1038 1039define arm_aapcs_vfpcc void @ssatmul_8_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 1040; CHECK-LABEL: ssatmul_8_q15: 1041; CHECK: @ %bb.0: @ %entry 1042; CHECK-NEXT: .save {r4, r5, r6, lr} 1043; CHECK-NEXT: push {r4, r5, r6, lr} 1044; CHECK-NEXT: cbz r3, .LBB6_8 1045; CHECK-NEXT: @ 
%for.body.preheader 1046; CHECK-NEXT: cmp r3, #7 1047; CHECK-NEXT: bhi .LBB6_3 1048; CHECK-NEXT: @ %bb.2: 1049; CHECK-NEXT: movs r5, #0 1050; CHECK-NEXT: mov r12, r0 1051; CHECK-NEXT: mov r6, r1 1052; CHECK-NEXT: mov r4, r2 1053; CHECK-NEXT: b .LBB6_6 1054; CHECK-NEXT: .LBB6_3: @ %vector.ph 1055; CHECK-NEXT: bic r5, r3, #7 1056; CHECK-NEXT: movs r4, #1 1057; CHECK-NEXT: sub.w r6, r5, #8 1058; CHECK-NEXT: add.w r12, r0, r5, lsl #1 1059; CHECK-NEXT: add.w lr, r4, r6, lsr #3 1060; CHECK-NEXT: add.w r4, r2, r5, lsl #1 1061; CHECK-NEXT: add.w r6, r1, r5, lsl #1 1062; CHECK-NEXT: .LBB6_4: @ %vector.body 1063; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1064; CHECK-NEXT: vldrh.u16 q0, [r0], #16 1065; CHECK-NEXT: vldrh.u16 q1, [r1], #16 1066; CHECK-NEXT: vmullt.s16 q2, q1, q0 1067; CHECK-NEXT: vmullb.s16 q0, q1, q0 1068; CHECK-NEXT: vqshrnb.s32 q0, q0, #15 1069; CHECK-NEXT: vqshrnt.s32 q0, q2, #15 1070; CHECK-NEXT: vstrb.8 q0, [r2], #16 1071; CHECK-NEXT: le lr, .LBB6_4 1072; CHECK-NEXT: @ %bb.5: @ %middle.block 1073; CHECK-NEXT: cmp r5, r3 1074; CHECK-NEXT: it eq 1075; CHECK-NEXT: popeq {r4, r5, r6, pc} 1076; CHECK-NEXT: .LBB6_6: @ %for.body.preheader21 1077; CHECK-NEXT: sub.w lr, r3, r5 1078; CHECK-NEXT: .LBB6_7: @ %for.body 1079; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1080; CHECK-NEXT: ldrsh r0, [r12], #2 1081; CHECK-NEXT: ldrsh r1, [r6], #2 1082; CHECK-NEXT: muls r0, r1, r0 1083; CHECK-NEXT: ssat r0, #16, r0, asr #15 1084; CHECK-NEXT: strh r0, [r4], #2 1085; CHECK-NEXT: le lr, .LBB6_7 1086; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup 1087; CHECK-NEXT: pop {r4, r5, r6, pc} 1088entry: 1089 %cmp8 = icmp eq i32 %N, 0 1090 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 1091 1092for.body.preheader: ; preds = %entry 1093 %min.iters.check = icmp ult i32 %N, 8 1094 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph 1095 1096for.body.preheader21: ; preds = %middle.block, %for.body.preheader 1097 %i.012.ph = phi i32 [ 0, 
%for.body.preheader ], [ %n.vec, %middle.block ] 1098 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ] 1099 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ] 1100 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ] 1101 br label %for.body 1102 1103vector.ph: ; preds = %for.body.preheader 1104 %n.vec = and i32 %N, -8 1105 %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec 1106 %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec 1107 %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec 1108 br label %vector.body 1109 1110vector.body: ; preds = %vector.body, %vector.ph 1111 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1112 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index 1113 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index 1114 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index 1115 %wide.load = load <8 x i16>, ptr %next.gep, align 2 1116 %0 = sext <8 x i16> %wide.load to <8 x i32> 1117 %wide.load20 = load <8 x i16>, ptr %next.gep18, align 2 1118 %1 = sext <8 x i16> %wide.load20 to <8 x i32> 1119 %2 = mul nsw <8 x i32> %1, %0 1120 %3 = ashr <8 x i32> %2, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> 1121 %4 = icmp sgt <8 x i32> %3, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1122 %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1123 %6 = icmp slt <8 x i32> %5, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 1124 %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 1125 %8 = trunc <8 x i32> %7 to <8 x i16> 1126 store <8 x i16> %8, ptr %next.gep19, align 2 1127 %index.next = add i32 %index, 8 1128 %9 = icmp eq i32 
; Documentation for @ssatmul_8i_q15, whose `define` appears on the following line after the tail of @ssatmul_8_q15.
; Test case: "interleaved" variant of the 8-lane signed i16 saturating multiply.
; The vector body de-interleaves each <8 x i16> load into even lanes <0,2,4,6> and odd lanes <1,3,5,7> with
; shufflevector, sign-extends each half to <4 x i32>, multiplies (plain mul, no nsw), shifts right by 15 and clamps
; to [-32768, 32767]; the two halves are then re-interleaved with mask <0,4,1,5,2,6,3,7> and truncated to <8 x i16>.
; The scalar for.body is the usual sext / mul nsw / ashr 15 / clamp / trunc remainder loop.
; The autogenerated CHECK lines expect the same vmullt.s16 / vmullb.s16 / vqshrnb.s32 / vqshrnt.s32 sequence
; that is expected for @ssatmul_8_q15, with no explicit shuffle instructions in the vector loop.
%index.next, %n.vec 1129 br i1 %9, label %middle.block, label %vector.body 1130 1131middle.block: ; preds = %vector.body 1132 %cmp.n = icmp eq i32 %n.vec, %N 1133 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21 1134 1135for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1136 ret void 1137 1138for.body: ; preds = %for.body.preheader21, %for.body 1139 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ] 1140 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ] 1141 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ] 1142 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ] 1143 %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1 1144 %10 = load i16, ptr %pSrcA.addr.011, align 2 1145 %conv = sext i16 %10 to i32 1146 %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1 1147 %11 = load i16, ptr %pSrcB.addr.010, align 2 1148 %conv2 = sext i16 %11 to i32 1149 %mul = mul nsw i32 %conv2, %conv 1150 %shr = ashr i32 %mul, 15 1151 %12 = icmp sgt i32 %shr, -32768 1152 %.val.i = select i1 %12, i32 %shr, i32 -32768 1153 %13 = icmp slt i32 %.val.i, 32767 1154 %retval.0.i = select i1 %13, i32 %.val.i, i32 32767 1155 %conv3 = trunc i32 %retval.0.i to i16 1156 %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1 1157 store i16 %conv3, ptr %pDst.addr.09, align 2 1158 %inc = add nuw i32 %i.012, 1 1159 %exitcond = icmp eq i32 %inc, %N 1160 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1161} 1162 1163define arm_aapcs_vfpcc void @ssatmul_8i_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 1164; CHECK-LABEL: ssatmul_8i_q15: 1165; CHECK: @ %bb.0: @ %entry 1166; CHECK-NEXT: .save {r4, r5, r6, lr} 1167; CHECK-NEXT: push {r4, r5, r6, lr} 1168; CHECK-NEXT: cbz r3, .LBB7_8 1169; CHECK-NEXT: @ 
%bb.1: @ %for.body.preheader 1170; CHECK-NEXT: cmp r3, #7 1171; CHECK-NEXT: bhi .LBB7_3 1172; CHECK-NEXT: @ %bb.2: 1173; CHECK-NEXT: movs r5, #0 1174; CHECK-NEXT: mov r12, r0 1175; CHECK-NEXT: mov r6, r1 1176; CHECK-NEXT: mov r4, r2 1177; CHECK-NEXT: b .LBB7_6 1178; CHECK-NEXT: .LBB7_3: @ %vector.ph 1179; CHECK-NEXT: bic r5, r3, #7 1180; CHECK-NEXT: movs r4, #1 1181; CHECK-NEXT: sub.w r6, r5, #8 1182; CHECK-NEXT: add.w r12, r0, r5, lsl #1 1183; CHECK-NEXT: add.w lr, r4, r6, lsr #3 1184; CHECK-NEXT: add.w r4, r2, r5, lsl #1 1185; CHECK-NEXT: add.w r6, r1, r5, lsl #1 1186; CHECK-NEXT: .LBB7_4: @ %vector.body 1187; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1188; CHECK-NEXT: vldrh.u16 q0, [r0], #16 1189; CHECK-NEXT: vldrh.u16 q1, [r1], #16 1190; CHECK-NEXT: vmullt.s16 q2, q1, q0 1191; CHECK-NEXT: vmullb.s16 q0, q1, q0 1192; CHECK-NEXT: vqshrnb.s32 q0, q0, #15 1193; CHECK-NEXT: vqshrnt.s32 q0, q2, #15 1194; CHECK-NEXT: vstrb.8 q0, [r2], #16 1195; CHECK-NEXT: le lr, .LBB7_4 1196; CHECK-NEXT: @ %bb.5: @ %middle.block 1197; CHECK-NEXT: cmp r5, r3 1198; CHECK-NEXT: it eq 1199; CHECK-NEXT: popeq {r4, r5, r6, pc} 1200; CHECK-NEXT: .LBB7_6: @ %for.body.preheader21 1201; CHECK-NEXT: sub.w lr, r3, r5 1202; CHECK-NEXT: .LBB7_7: @ %for.body 1203; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1204; CHECK-NEXT: ldrsh r0, [r12], #2 1205; CHECK-NEXT: ldrsh r1, [r6], #2 1206; CHECK-NEXT: muls r0, r1, r0 1207; CHECK-NEXT: ssat r0, #16, r0, asr #15 1208; CHECK-NEXT: strh r0, [r4], #2 1209; CHECK-NEXT: le lr, .LBB7_7 1210; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup 1211; CHECK-NEXT: pop {r4, r5, r6, pc} 1212entry: 1213 %cmp8 = icmp eq i32 %N, 0 1214 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 1215 1216for.body.preheader: ; preds = %entry 1217 %min.iters.check = icmp ult i32 %N, 8 1218 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph 1219 1220for.body.preheader21: ; preds = %middle.block, %for.body.preheader 1221 %i.012.ph = phi i32 [ 0, 
%for.body.preheader ], [ %n.vec, %middle.block ] 1222 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ] 1223 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ] 1224 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ] 1225 br label %for.body 1226 1227vector.ph: ; preds = %for.body.preheader 1228 %n.vec = and i32 %N, -8 1229 %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec 1230 %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec 1231 %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec 1232 br label %vector.body 1233 1234vector.body: ; preds = %vector.body, %vector.ph 1235 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1236 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index 1237 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index 1238 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index 1239 %wide.load = load <8 x i16>, ptr %next.gep, align 2 1240 %0 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1241 %1 = shufflevector <8 x i16> %wide.load, <8 x i16> %wide.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1242 %2 = sext <4 x i16> %0 to <4 x i32> 1243 %3 = sext <4 x i16> %1 to <4 x i32> 1244 %wide.load20 = load <8 x i16>, ptr %next.gep18, align 2 1245 %4 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1246 %5 = shufflevector <8 x i16> %wide.load20, <8 x i16> %wide.load20, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1247 %6 = sext <4 x i16> %4 to <4 x i32> 1248 %7 = sext <4 x i16> %5 to <4 x i32> 1249 %8 = mul <4 x i32> %6, %2 1250 %9 = mul <4 x i32> %7, %3 1251 %10 = ashr <4 x i32> %8, <i32 15, i32 15, i32 15, i32 15> 1252 %11 = ashr <4 x i32> %9, <i32 15, i32 15, i32 15, i32 15> 1253 %12 = icmp sgt <4 x i32> %10, <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1254 %13 = icmp sgt <4 x i32> %11, <i32 -32768, i32 -32768, 
i32 -32768, i32 -32768> 1255 %14 = select <4 x i1> %12, <4 x i32> %10, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1256 %15 = select <4 x i1> %13, <4 x i32> %11, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1257 %16 = icmp slt <4 x i32> %14, <i32 32767, i32 32767, i32 32767, i32 32767> 1258 %17 = icmp slt <4 x i32> %15, <i32 32767, i32 32767, i32 32767, i32 32767> 1259 %18 = select <4 x i1> %16, <4 x i32> %14, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767> 1260 %19 = select <4 x i1> %17, <4 x i32> %15, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767> 1261 %20 = shufflevector <4 x i32> %18, <4 x i32> %19, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 1262 %21 = trunc <8 x i32> %20 to <8 x i16> 1263 store <8 x i16> %21, ptr %next.gep19, align 2 1264 %index.next = add i32 %index, 8 1265 %22 = icmp eq i32 %index.next, %n.vec 1266 br i1 %22, label %middle.block, label %vector.body 1267 1268middle.block: ; preds = %vector.body 1269 %cmp.n = icmp eq i32 %n.vec, %N 1270 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21 1271 1272for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1273 ret void 1274 1275for.body: ; preds = %for.body, %for.body.preheader21 1276 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ] 1277 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ] 1278 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ] 1279 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ] 1280 %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1 1281 %23 = load i16, ptr %pSrcA.addr.011, align 2 1282 %conv = sext i16 %23 to i32 1283 %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1 1284 %24 = load i16, ptr %pSrcB.addr.010, align 2 1285 %conv2 = sext i16 %24 to i32 1286 %mul = mul nsw i32 %conv2, 
; Documentation for @ssatmul_s4t_q15, whose `define` appears on the following line after the tail of @ssatmul_8i_q15.
; Test case: 4-lane signed i16 saturating multiply written with masked loads/stores instead of a scalar remainder loop.
; The IR rounds the trip count up (n.vec = (N + 3) & -4) and, on every iteration, builds a lane mask
; icmp ule (splat(index) | <0,1,2,3>), splat(N - 1), which is passed to llvm.masked.load.v4i16 for both inputs
; and to llvm.masked.store.v4i16 for the result. The arithmetic between them is sext to i32, mul nsw, ashr 15,
; clamp to [-32768, 32767], trunc to i16. The only other block is the N == 0 early exit.
; The autogenerated CHECK lines expect the mask to become a VPT block (vptt.u32 cs, q1, q2) predicating two
; vldrht.s32 loads, then vmul.i32 / vqshrnb.s32 #15, and a vpst-predicated vstrht.32; the <0,1,2,3> constant
; is loaded from the literal pool entry .LCPI8_0.
%conv 1287 %shr = ashr i32 %mul, 15 1288 %25 = icmp sgt i32 %shr, -32768 1289 %.val.i = select i1 %25, i32 %shr, i32 -32768 1290 %26 = icmp slt i32 %.val.i, 32767 1291 %retval.0.i = select i1 %26, i32 %.val.i, i32 32767 1292 %conv3 = trunc i32 %retval.0.i to i16 1293 %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1 1294 store i16 %conv3, ptr %pDst.addr.09, align 2 1295 %inc = add nuw i32 %i.012, 1 1296 %exitcond = icmp eq i32 %inc, %N 1297 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1298} 1299 1300define arm_aapcs_vfpcc void @ssatmul_s4t_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 1301; CHECK-LABEL: ssatmul_s4t_q15: 1302; CHECK: @ %bb.0: @ %entry 1303; CHECK-NEXT: .save {r4, lr} 1304; CHECK-NEXT: push {r4, lr} 1305; CHECK-NEXT: cmp r3, #0 1306; CHECK-NEXT: it eq 1307; CHECK-NEXT: popeq {r4, pc} 1308; CHECK-NEXT: .LBB8_1: @ %vector.ph 1309; CHECK-NEXT: add.w r12, r3, #3 1310; CHECK-NEXT: mov.w lr, #1 1311; CHECK-NEXT: bic r12, r12, #3 1312; CHECK-NEXT: adr r4, .LCPI8_0 1313; CHECK-NEXT: sub.w r12, r12, #4 1314; CHECK-NEXT: vldrw.u32 q0, [r4] 1315; CHECK-NEXT: add.w lr, lr, r12, lsr #2 1316; CHECK-NEXT: sub.w r12, r3, #1 1317; CHECK-NEXT: movs r3, #0 1318; CHECK-NEXT: vdup.32 q1, r12 1319; CHECK-NEXT: .LBB8_2: @ %vector.body 1320; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1321; CHECK-NEXT: vdup.32 q2, r3 1322; CHECK-NEXT: adds r3, #4 1323; CHECK-NEXT: vorr q2, q2, q0 1324; CHECK-NEXT: vptt.u32 cs, q1, q2 1325; CHECK-NEXT: vldrht.s32 q2, [r0], #8 1326; CHECK-NEXT: vldrht.s32 q3, [r1], #8 1327; CHECK-NEXT: vmul.i32 q2, q3, q2 1328; CHECK-NEXT: vqshrnb.s32 q2, q2, #15 1329; CHECK-NEXT: vpst 1330; CHECK-NEXT: vstrht.32 q2, [r2], #8 1331; CHECK-NEXT: le lr, .LBB8_2 1332; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1333; CHECK-NEXT: pop {r4, pc} 1334; CHECK-NEXT: .p2align 4 1335; CHECK-NEXT: @ %bb.4: 1336; CHECK-NEXT: .LCPI8_0: 1337; CHECK-NEXT: .long 0 @ 0x0 1338; CHECK-NEXT: 
.long 1 @ 0x1 1339; CHECK-NEXT: .long 2 @ 0x2 1340; CHECK-NEXT: .long 3 @ 0x3 1341entry: 1342 %cmp8 = icmp eq i32 %N, 0 1343 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph 1344 1345vector.ph: ; preds = %entry 1346 %n.rnd.up = add i32 %N, 3 1347 %n.vec = and i32 %n.rnd.up, -4 1348 %trip.count.minus.1 = add i32 %N, -1 1349 %broadcast.splatinsert20 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 1350 %broadcast.splat21 = shufflevector <4 x i32> %broadcast.splatinsert20, <4 x i32> undef, <4 x i32> zeroinitializer 1351 br label %vector.body 1352 1353vector.body: ; preds = %vector.body, %vector.ph 1354 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1355 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 1356 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer 1357 %induction = or <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> 1358 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index 1359 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index 1360 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index 1361 %0 = icmp ule <4 x i32> %induction, %broadcast.splat21 1362 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %next.gep, i32 2, <4 x i1> %0, <4 x i16> undef) 1363 %1 = sext <4 x i16> %wide.masked.load to <4 x i32> 1364 %wide.masked.load22 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr %next.gep18, i32 2, <4 x i1> %0, <4 x i16> undef) 1365 %2 = sext <4 x i16> %wide.masked.load22 to <4 x i32> 1366 %3 = mul nsw <4 x i32> %2, %1 1367 %4 = ashr <4 x i32> %3, <i32 15, i32 15, i32 15, i32 15> 1368 %5 = icmp sgt <4 x i32> %4, <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1369 %6 = select <4 x i1> %5, <4 x i32> %4, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1370 %7 = icmp slt <4 x i32> %6, <i32 32767, i32 32767, i32 32767, i32 32767> 1371 %8 = select <4 x i1> %7, <4 x i32> %6, <4 x i32> <i32 32767, i32 32767, 
; Documentation for @ssatmul_8t_q15, whose `define` appears on the following line after the tail of @ssatmul_s4t_q15.
; Test case: 8-lane version of the masked signed i16 saturating multiply.
; The IR uses n.vec = (N + 7) & -8 and an <8 x i1> mask icmp ule (splat(index) | <0..7>), splat(N - 1) that
; feeds llvm.masked.load.v8i16 / llvm.masked.store.v8i16; the arithmetic is sext to <8 x i32>, mul nsw, ashr 15,
; clamp to [-32768, 32767], trunc to <8 x i16>. There is no scalar remainder loop.
; The autogenerated CHECK lines show the 8-lane mask being produced from two 4-lane vcmp.u32 + vpsel results that
; are narrowed with vstrh.32 into a 16-byte stack slot (.pad #16, r5 = sp), reloaded with vldrw.u32 and turned
; into a predicate by vptt.i16 ne, q5, zr. The multiply itself is vmullt.s16 / vmullb.s16 + vqshrnb / vqshrnt #15,
; with a vpst-predicated vstrht.16 for the store.
i32 32767, i32 32767> 1372 %9 = trunc <4 x i32> %8 to <4 x i16> 1373 call void @llvm.masked.store.v4i16.p0(<4 x i16> %9, ptr %next.gep19, i32 2, <4 x i1> %0) 1374 %index.next = add i32 %index, 4 1375 %10 = icmp eq i32 %index.next, %n.vec 1376 br i1 %10, label %for.cond.cleanup, label %vector.body 1377 1378for.cond.cleanup: ; preds = %vector.body, %entry 1379 ret void 1380} 1381 1382define arm_aapcs_vfpcc void @ssatmul_8t_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 1383; CHECK-LABEL: ssatmul_8t_q15: 1384; CHECK: @ %bb.0: @ %entry 1385; CHECK-NEXT: .save {r4, r5, r7, lr} 1386; CHECK-NEXT: push {r4, r5, r7, lr} 1387; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1388; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1389; CHECK-NEXT: .pad #16 1390; CHECK-NEXT: sub sp, #16 1391; CHECK-NEXT: cmp r3, #0 1392; CHECK-NEXT: beq .LBB9_3 1393; CHECK-NEXT: @ %bb.1: @ %vector.ph 1394; CHECK-NEXT: adds r4, r3, #7 1395; CHECK-NEXT: vmov.i8 q2, #0x0 1396; CHECK-NEXT: bic r4, r4, #7 1397; CHECK-NEXT: vmov.i8 q3, #0xff 1398; CHECK-NEXT: sub.w r12, r4, #8 1399; CHECK-NEXT: movs r4, #1 1400; CHECK-NEXT: mov r5, sp 1401; CHECK-NEXT: add.w lr, r4, r12, lsr #3 1402; CHECK-NEXT: adr r4, .LCPI9_0 1403; CHECK-NEXT: vldrw.u32 q0, [r4] 1404; CHECK-NEXT: adr r4, .LCPI9_1 1405; CHECK-NEXT: sub.w r12, r3, #1 1406; CHECK-NEXT: vldrw.u32 q4, [r4] 1407; CHECK-NEXT: movs r3, #0 1408; CHECK-NEXT: vdup.32 q1, r12 1409; CHECK-NEXT: .LBB9_2: @ %vector.body 1410; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1411; CHECK-NEXT: vdup.32 q5, r3 1412; CHECK-NEXT: adds r3, #8 1413; CHECK-NEXT: vorr q6, q5, q0 1414; CHECK-NEXT: vorr q5, q5, q4 1415; CHECK-NEXT: vcmp.u32 cs, q1, q6 1416; CHECK-NEXT: vpsel q6, q3, q2 1417; CHECK-NEXT: vcmp.u32 cs, q1, q5 1418; CHECK-NEXT: vpsel q5, q3, q2 1419; CHECK-NEXT: vstrh.32 q6, [r5, #8] 1420; CHECK-NEXT: vstrh.32 q5, [r5] 1421; CHECK-NEXT: vldrw.u32 q5, [r5] 1422; CHECK-NEXT: vptt.i16 ne, 
q5, zr 1423; CHECK-NEXT: vldrht.u16 q5, [r0], #16 1424; CHECK-NEXT: vldrht.u16 q6, [r1], #16 1425; CHECK-NEXT: vmullt.s16 q7, q6, q5 1426; CHECK-NEXT: vmullb.s16 q5, q6, q5 1427; CHECK-NEXT: vqshrnb.s32 q5, q5, #15 1428; CHECK-NEXT: vqshrnt.s32 q5, q7, #15 1429; CHECK-NEXT: vpst 1430; CHECK-NEXT: vstrht.16 q5, [r2], #16 1431; CHECK-NEXT: le lr, .LBB9_2 1432; CHECK-NEXT: .LBB9_3: @ %for.cond.cleanup 1433; CHECK-NEXT: add sp, #16 1434; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1435; CHECK-NEXT: pop {r4, r5, r7, pc} 1436; CHECK-NEXT: .p2align 4 1437; CHECK-NEXT: @ %bb.4: 1438; CHECK-NEXT: .LCPI9_0: 1439; CHECK-NEXT: .long 4 @ 0x4 1440; CHECK-NEXT: .long 5 @ 0x5 1441; CHECK-NEXT: .long 6 @ 0x6 1442; CHECK-NEXT: .long 7 @ 0x7 1443; CHECK-NEXT: .LCPI9_1: 1444; CHECK-NEXT: .long 0 @ 0x0 1445; CHECK-NEXT: .long 1 @ 0x1 1446; CHECK-NEXT: .long 2 @ 0x2 1447; CHECK-NEXT: .long 3 @ 0x3 1448entry: 1449 %cmp8 = icmp eq i32 %N, 0 1450 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph 1451 1452vector.ph: ; preds = %entry 1453 %n.rnd.up = add i32 %N, 7 1454 %n.vec = and i32 %n.rnd.up, -8 1455 %trip.count.minus.1 = add i32 %N, -1 1456 %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 1457 %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer 1458 br label %vector.body 1459 1460vector.body: ; preds = %vector.body, %vector.ph 1461 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1462 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 1463 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer 1464 %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1465 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index 1466 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index 1467 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index 
; Documentation for @ssatmul_8ti_q15, whose `define` appears on the following line after the tail of @ssatmul_8t_q15.
; Test case: masked ("t") and interleaved ("i") 8-lane signed i16 saturating multiply.
; As in @ssatmul_8t_q15, the loads and the store go through llvm.masked.load.v8i16 / llvm.masked.store.v8i16 with the
; mask icmp ule (splat(index) | <0..7>), splat(N - 1) and n.vec = (N + 7) & -8. As in @ssatmul_8i_q15, each loaded
; vector is split into even <0,2,4,6> and odd <1,3,5,7> lanes, each half is sign-extended, multiplied (plain mul),
; shifted right by 15 and clamped to [-32768, 32767], and the halves are re-interleaved with
; <0,4,1,5,2,6,3,7> before the trunc to <8 x i16>.
; The autogenerated CHECK lines expect the same instruction sequence as @ssatmul_8t_q15: predicate built via the
; stack slot and vptt.i16, vmullt.s16 / vmullb.s16, vqshrnb.s32 / vqshrnt.s32 #15, and a predicated vstrht.16.
1468 %0 = icmp ule <8 x i32> %induction, %broadcast.splat21 1469 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %0, <8 x i16> undef) 1470 %1 = sext <8 x i16> %wide.masked.load to <8 x i32> 1471 %wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep18, i32 2, <8 x i1> %0, <8 x i16> undef) 1472 %2 = sext <8 x i16> %wide.masked.load22 to <8 x i32> 1473 %3 = mul nsw <8 x i32> %2, %1 1474 %4 = ashr <8 x i32> %3, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15> 1475 %5 = icmp sgt <8 x i32> %4, <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1476 %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1477 %7 = icmp slt <8 x i32> %6, <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 1478 %8 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> <i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767, i32 32767> 1479 %9 = trunc <8 x i32> %8 to <8 x i16> 1480 call void @llvm.masked.store.v8i16.p0(<8 x i16> %9, ptr %next.gep19, i32 2, <8 x i1> %0) 1481 %index.next = add i32 %index, 8 1482 %10 = icmp eq i32 %index.next, %n.vec 1483 br i1 %10, label %for.cond.cleanup, label %vector.body 1484 1485for.cond.cleanup: ; preds = %vector.body, %entry 1486 ret void 1487} 1488 1489define arm_aapcs_vfpcc void @ssatmul_8ti_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 1490; CHECK-LABEL: ssatmul_8ti_q15: 1491; CHECK: @ %bb.0: @ %entry 1492; CHECK-NEXT: .save {r4, r5, r7, lr} 1493; CHECK-NEXT: push {r4, r5, r7, lr} 1494; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1495; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1496; CHECK-NEXT: .pad #16 1497; CHECK-NEXT: sub sp, #16 1498; CHECK-NEXT: cmp r3, #0 1499; CHECK-NEXT: beq .LBB10_3 1500; 
CHECK-NEXT: @ %bb.1: @ %vector.ph 1501; CHECK-NEXT: adds r4, r3, #7 1502; CHECK-NEXT: vmov.i8 q2, #0x0 1503; CHECK-NEXT: bic r4, r4, #7 1504; CHECK-NEXT: vmov.i8 q3, #0xff 1505; CHECK-NEXT: sub.w r12, r4, #8 1506; CHECK-NEXT: movs r4, #1 1507; CHECK-NEXT: mov r5, sp 1508; CHECK-NEXT: add.w lr, r4, r12, lsr #3 1509; CHECK-NEXT: adr r4, .LCPI10_0 1510; CHECK-NEXT: vldrw.u32 q0, [r4] 1511; CHECK-NEXT: adr r4, .LCPI10_1 1512; CHECK-NEXT: sub.w r12, r3, #1 1513; CHECK-NEXT: vldrw.u32 q4, [r4] 1514; CHECK-NEXT: movs r3, #0 1515; CHECK-NEXT: vdup.32 q1, r12 1516; CHECK-NEXT: .LBB10_2: @ %vector.body 1517; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1518; CHECK-NEXT: vdup.32 q5, r3 1519; CHECK-NEXT: adds r3, #8 1520; CHECK-NEXT: vorr q6, q5, q0 1521; CHECK-NEXT: vorr q5, q5, q4 1522; CHECK-NEXT: vcmp.u32 cs, q1, q6 1523; CHECK-NEXT: vpsel q6, q3, q2 1524; CHECK-NEXT: vcmp.u32 cs, q1, q5 1525; CHECK-NEXT: vpsel q5, q3, q2 1526; CHECK-NEXT: vstrh.32 q6, [r5, #8] 1527; CHECK-NEXT: vstrh.32 q5, [r5] 1528; CHECK-NEXT: vldrw.u32 q5, [r5] 1529; CHECK-NEXT: vptt.i16 ne, q5, zr 1530; CHECK-NEXT: vldrht.u16 q5, [r0], #16 1531; CHECK-NEXT: vldrht.u16 q6, [r1], #16 1532; CHECK-NEXT: vmullt.s16 q7, q6, q5 1533; CHECK-NEXT: vmullb.s16 q5, q6, q5 1534; CHECK-NEXT: vqshrnb.s32 q5, q5, #15 1535; CHECK-NEXT: vqshrnt.s32 q5, q7, #15 1536; CHECK-NEXT: vpst 1537; CHECK-NEXT: vstrht.16 q5, [r2], #16 1538; CHECK-NEXT: le lr, .LBB10_2 1539; CHECK-NEXT: .LBB10_3: @ %for.cond.cleanup 1540; CHECK-NEXT: add sp, #16 1541; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1542; CHECK-NEXT: pop {r4, r5, r7, pc} 1543; CHECK-NEXT: .p2align 4 1544; CHECK-NEXT: @ %bb.4: 1545; CHECK-NEXT: .LCPI10_0: 1546; CHECK-NEXT: .long 4 @ 0x4 1547; CHECK-NEXT: .long 5 @ 0x5 1548; CHECK-NEXT: .long 6 @ 0x6 1549; CHECK-NEXT: .long 7 @ 0x7 1550; CHECK-NEXT: .LCPI10_1: 1551; CHECK-NEXT: .long 0 @ 0x0 1552; CHECK-NEXT: .long 1 @ 0x1 1553; CHECK-NEXT: .long 2 @ 0x2 1554; CHECK-NEXT: .long 3 @ 0x3 1555entry: 1556 
%cmp8 = icmp eq i32 %N, 0 1557 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph 1558 1559vector.ph: ; preds = %entry 1560 %n.rnd.up = add i32 %N, 7 1561 %n.vec = and i32 %n.rnd.up, -8 1562 %trip.count.minus.1 = add i32 %N, -1 1563 %broadcast.splatinsert20 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 1564 %broadcast.splat21 = shufflevector <8 x i32> %broadcast.splatinsert20, <8 x i32> undef, <8 x i32> zeroinitializer 1565 br label %vector.body 1566 1567vector.body: ; preds = %vector.body, %vector.ph 1568 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1569 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 1570 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer 1571 %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 1572 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index 1573 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index 1574 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index 1575 %0 = icmp ule <8 x i32> %induction, %broadcast.splat21 1576 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep, i32 2, <8 x i1> %0, <8 x i16> undef) 1577 %1 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1578 %2 = shufflevector <8 x i16> %wide.masked.load, <8 x i16> %wide.masked.load, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1579 %3 = sext <4 x i16> %1 to <4 x i32> 1580 %4 = sext <4 x i16> %2 to <4 x i32> 1581 %wide.masked.load22 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %next.gep18, i32 2, <8 x i1> %0, <8 x i16> undef) 1582 %5 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 0, i32 2, i32 4, i32 6> 1583 %6 = shufflevector <8 x i16> %wide.masked.load22, <8 x i16> %wide.masked.load22, <4 x i32> <i32 1, i32 3, i32 5, i32 7> 1584 %7 = sext <4 x i16> %5 to <4 x i32> 
1585 %8 = sext <4 x i16> %6 to <4 x i32> 1586 %9 = mul <4 x i32> %7, %3 1587 %10 = mul <4 x i32> %8, %4 1588 %11 = ashr <4 x i32> %9, <i32 15, i32 15, i32 15, i32 15> 1589 %12 = ashr <4 x i32> %10, <i32 15, i32 15, i32 15, i32 15> 1590 %13 = icmp sgt <4 x i32> %11, <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1591 %14 = icmp sgt <4 x i32> %12, <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1592 %15 = select <4 x i1> %13, <4 x i32> %11, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1593 %16 = select <4 x i1> %14, <4 x i32> %12, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768> 1594 %17 = icmp slt <4 x i32> %15, <i32 32767, i32 32767, i32 32767, i32 32767> 1595 %18 = icmp slt <4 x i32> %16, <i32 32767, i32 32767, i32 32767, i32 32767> 1596 %19 = select <4 x i1> %17, <4 x i32> %15, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767> 1597 %20 = select <4 x i1> %18, <4 x i32> %16, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767> 1598 %21 = shufflevector <4 x i32> %19, <4 x i32> %20, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 1599 %22 = trunc <8 x i32> %21 to <8 x i16> 1600 call void @llvm.masked.store.v8i16.p0(<8 x i16> %22, ptr %next.gep19, i32 2, <8 x i1> %0) 1601 %index.next = add i32 %index, 8 1602 %23 = icmp eq i32 %index.next, %n.vec 1603 br i1 %23, label %for.cond.cleanup, label %vector.body 1604 1605for.cond.cleanup: ; preds = %vector.body, %entry 1606 ret void 1607} 1608 1609define arm_aapcs_vfpcc void @usatmul_4_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 1610; CHECK-LABEL: usatmul_4_q15: 1611; CHECK: @ %bb.0: @ %entry 1612; CHECK-NEXT: .save {r4, r5, r6, lr} 1613; CHECK-NEXT: push {r4, r5, r6, lr} 1614; CHECK-NEXT: cmp r3, #0 1615; CHECK-NEXT: beq .LBB11_8 1616; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1617; CHECK-NEXT: cmp r3, #3 1618; CHECK-NEXT: bhi .LBB11_3 1619; CHECK-NEXT: @ %bb.2: 1620; CHECK-NEXT: movs r5, #0 1621; CHECK-NEXT: 
mov r12, r0 1622; CHECK-NEXT: mov r6, r1 1623; CHECK-NEXT: mov r4, r2 1624; CHECK-NEXT: b .LBB11_6 1625; CHECK-NEXT: .LBB11_3: @ %vector.ph 1626; CHECK-NEXT: bic r5, r3, #3 1627; CHECK-NEXT: movs r4, #1 1628; CHECK-NEXT: subs r6, r5, #4 1629; CHECK-NEXT: add.w r12, r0, r5, lsl #1 1630; CHECK-NEXT: add.w lr, r4, r6, lsr #2 1631; CHECK-NEXT: add.w r4, r2, r5, lsl #1 1632; CHECK-NEXT: add.w r6, r1, r5, lsl #1 1633; CHECK-NEXT: .LBB11_4: @ %vector.body 1634; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1635; CHECK-NEXT: vldrh.u32 q0, [r0], #8 1636; CHECK-NEXT: vldrh.u32 q1, [r1], #8 1637; CHECK-NEXT: vmul.i32 q0, q1, q0 1638; CHECK-NEXT: vqshrnb.u32 q0, q0, #15 1639; CHECK-NEXT: vstrh.32 q0, [r2], #8 1640; CHECK-NEXT: le lr, .LBB11_4 1641; CHECK-NEXT: @ %bb.5: @ %middle.block 1642; CHECK-NEXT: cmp r5, r3 1643; CHECK-NEXT: it eq 1644; CHECK-NEXT: popeq {r4, r5, r6, pc} 1645; CHECK-NEXT: .LBB11_6: @ %for.body.preheader21 1646; CHECK-NEXT: sub.w lr, r3, r5 1647; CHECK-NEXT: movw r0, #65535 1648; CHECK-NEXT: .LBB11_7: @ %for.body 1649; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1650; CHECK-NEXT: ldrh r1, [r12], #2 1651; CHECK-NEXT: ldrh r2, [r6], #2 1652; CHECK-NEXT: muls r1, r2, r1 1653; CHECK-NEXT: lsrs r2, r1, #15 1654; CHECK-NEXT: cmp r2, r0 1655; CHECK-NEXT: movw r2, #65535 1656; CHECK-NEXT: it lo 1657; CHECK-NEXT: lsrlo r2, r1, #15 1658; CHECK-NEXT: strh r2, [r4], #2 1659; CHECK-NEXT: le lr, .LBB11_7 1660; CHECK-NEXT: .LBB11_8: @ %for.cond.cleanup 1661; CHECK-NEXT: pop {r4, r5, r6, pc} 1662entry: 1663 %cmp8 = icmp eq i32 %N, 0 1664 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 1665 1666for.body.preheader: ; preds = %entry 1667 %min.iters.check = icmp ult i32 %N, 4 1668 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph 1669 1670for.body.preheader21: ; preds = %middle.block, %for.body.preheader 1671 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1672 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, 
%for.body.preheader ], [ %ind.end, %middle.block ] 1673 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ] 1674 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ] 1675 br label %for.body 1676 1677vector.ph: ; preds = %for.body.preheader 1678 %n.vec = and i32 %N, -4 1679 %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec 1680 %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec 1681 %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec 1682 br label %vector.body 1683 1684vector.body: ; preds = %vector.body, %vector.ph 1685 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1686 %next.gep = getelementptr i16, ptr %pSrcA, i32 %index 1687 %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index 1688 %next.gep19 = getelementptr i16, ptr %pDst, i32 %index 1689 %wide.load = load <4 x i16>, ptr %next.gep, align 2 1690 %0 = zext <4 x i16> %wide.load to <4 x i32> 1691 %wide.load20 = load <4 x i16>, ptr %next.gep18, align 2 1692 %1 = zext <4 x i16> %wide.load20 to <4 x i32> 1693 %2 = mul nuw <4 x i32> %1, %0 1694 %3 = lshr <4 x i32> %2, <i32 15, i32 15, i32 15, i32 15> 1695 %4 = icmp ult <4 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535> 1696 %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535> 1697 %6 = trunc <4 x i32> %5 to <4 x i16> 1698 store <4 x i16> %6, ptr %next.gep19, align 2 1699 %index.next = add i32 %index, 4 1700 %7 = icmp eq i32 %index.next, %n.vec 1701 br i1 %7, label %middle.block, label %vector.body 1702 1703middle.block: ; preds = %vector.body 1704 %cmp.n = icmp eq i32 %n.vec, %N 1705 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21 1706 1707for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1708 ret void 1709 1710for.body: ; preds = %for.body.preheader21, %for.body 1711 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ] 1712 %pSrcA.addr.011 = phi ptr [ 
%incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ] 1713 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ] 1714 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ] 1715 %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1 1716 %8 = load i16, ptr %pSrcA.addr.011, align 2 1717 %conv = zext i16 %8 to i32 1718 %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1 1719 %9 = load i16, ptr %pSrcB.addr.010, align 2 1720 %conv2 = zext i16 %9 to i32 1721 %mul = mul nuw i32 %conv2, %conv 1722 %shr = lshr i32 %mul, 15 1723 %10 = icmp ult i32 %shr, 65535 1724 %retval.0.i = select i1 %10, i32 %shr, i32 65535 1725 %conv3 = trunc i32 %retval.0.i to i16 1726 %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1 1727 store i16 %conv3, ptr %pDst.addr.09, align 2 1728 %inc = add nuw i32 %i.012, 1 1729 %exitcond = icmp eq i32 %inc, %N 1730 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1731} 1732
; usatmul_8_q15: unsigned saturating i16 multiply, 8 elements per vector step.
;
; For each element: zero-extend both i16 inputs to i32, multiply, logically
; shift right by 15, take the unsigned minimum with 65535 and truncate to i16.
; The vector loop covers the first (N & -8) elements; a scalar loop handles
; the remainder, and all of N when N < 8.
;
; The assertions below expect the vector body to become vmullb.u16 /
; vmullt.u16 followed by vqshrnb.u32 / vqshrnt.u32 #15.
; They are autogenerated (see the NOTE at the top of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
define arm_aapcs_vfpcc void @usatmul_8_q15(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: usatmul_8_q15:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    beq .LBB12_8
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r3, #7
; CHECK-NEXT:    bhi .LBB12_3
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r5, #0
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    mov r6, r1
; CHECK-NEXT:    mov r4, r2
; CHECK-NEXT:    b .LBB12_6
; CHECK-NEXT:  .LBB12_3: @ %vector.ph
; CHECK-NEXT:    bic r5, r3, #7
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    sub.w r6, r5, #8
; CHECK-NEXT:    add.w r12, r0, r5, lsl #1
; CHECK-NEXT:    add.w lr, r4, r6, lsr #3
; CHECK-NEXT:    add.w r4, r2, r5, lsl #1
; CHECK-NEXT:    add.w r6, r1, r5, lsl #1
; CHECK-NEXT:  .LBB12_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrh.u16 q0, [r0], #16
; CHECK-NEXT:    vldrh.u16 q1, [r1], #16
; CHECK-NEXT:    vmullt.u16 q2, q1, q0
; CHECK-NEXT:    vmullb.u16 q0, q1, q0
; CHECK-NEXT:    vqshrnb.u32 q0, q0, #15
; CHECK-NEXT:    vqshrnt.u32 q0, q2, #15
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB12_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r5, r3
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r6, pc}
; CHECK-NEXT:  .LBB12_6: @ %for.body.preheader21
; CHECK-NEXT:    sub.w lr, r3, r5
; CHECK-NEXT:    movw r0, #65535
; CHECK-NEXT:  .LBB12_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrh r1, [r12], #2
; CHECK-NEXT:    ldrh r2, [r6], #2
; CHECK-NEXT:    muls r1, r2, r1
; CHECK-NEXT:    lsrs r2, r1, #15
; CHECK-NEXT:    cmp r2, r0
; CHECK-NEXT:    movw r2, #65535
; CHECK-NEXT:    it lo
; CHECK-NEXT:    lsrlo r2, r1, #15
; CHECK-NEXT:    strh r2, [r4], #2
; CHECK-NEXT:    le lr, .LBB12_7
; CHECK-NEXT:  .LBB12_8: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  ; Nothing to do for an empty input.
  %cmp8 = icmp eq i32 %N, 0
  br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; Fewer than 8 elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %N, 8
  br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph

for.body.preheader21:                             ; preds = %middle.block, %for.body.preheader
  ; Scalar loop start state: either the untouched arguments or the point
  ; where the vector loop stopped.
  %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
  %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ]
  %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ]
  br label %for.body

vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = N rounded down to a multiple of 8; %ind.end* are the matching end pointers.
  %n.vec = and i32 %N, -8
  %ind.end = getelementptr i16, ptr %pSrcA, i32 %n.vec
  %ind.end15 = getelementptr i16, ptr %pSrcB, i32 %n.vec
  %ind.end17 = getelementptr i16, ptr %pDst, i32 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i16, ptr %pSrcA, i32 %index
  %next.gep18 = getelementptr i16, ptr %pSrcB, i32 %index
  %next.gep19 = getelementptr i16, ptr %pDst, i32 %index
  %wide.load = load <8 x i16>, ptr %next.gep, align 2
  %0 = zext <8 x i16> %wide.load to <8 x i32>
  %wide.load20 = load <8 x i16>, ptr %next.gep18, align 2
  %1 = zext <8 x i16> %wide.load20 to <8 x i32>
  %2 = mul nuw <8 x i32> %1, %0
  %3 = lshr <8 x i32> %2, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  ; Unsigned clamp to 65535 before narrowing.
  %4 = icmp ult <8 x i32> %3, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %5 = select <8 x i1> %4, <8 x i32> %3, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
  %6 = trunc <8 x i32> %5 to <8 x i16>
  store <8 x i16> %6, ptr %next.gep19, align 2
  %index.next = add i32 %index, 8
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Done if N was an exact multiple of 8, otherwise finish in the scalar loop.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader21, %for.body
  ; Scalar remainder: same computation, one element per iteration.
  %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ]
  %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ]
  %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ]
  %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ]
  %incdec.ptr = getelementptr inbounds i16, ptr %pSrcA.addr.011, i32 1
  %8 = load i16, ptr %pSrcA.addr.011, align 2
  %conv = zext i16 %8 to i32
  %incdec.ptr1 = getelementptr inbounds i16, ptr %pSrcB.addr.010, i32 1
  %9 = load i16, ptr %pSrcB.addr.010, align 2
  %conv2 = zext i16 %9 to i32
  %mul = mul nuw i32 %conv2, %conv
  %shr = lshr i32 %mul, 15
  %10 = icmp ult i32 %shr, 65535
  %retval.0.i = select i1 %10, i32 %shr, i32 65535
  %conv3 = trunc i32 %retval.0.i to i16
  %incdec.ptr4 = getelementptr inbounds i16, ptr %pDst.addr.09, i32 1
  store i16 %conv3, ptr %pDst.addr.09, align 2
  %inc = add nuw i32 %i.012, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
1858 1859 1860; i8 1861 1862define arm_aapcs_vfpcc void @ssatmul_4_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 1863; CHECK-LABEL: ssatmul_4_q7: 1864; CHECK: @ %bb.0: @ %entry 1865; CHECK-NEXT: .save {r4, r5, r6, lr} 1866; CHECK-NEXT: push {r4, r5, r6, lr} 1867; CHECK-NEXT: cmp r3, #0 1868; CHECK-NEXT: beq .LBB13_8 1869; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1870; CHECK-NEXT: cmp r3, #3 1871; CHECK-NEXT: bhi .LBB13_3 1872; CHECK-NEXT: @ %bb.2: 1873; CHECK-NEXT: movs r5, #0 1874; CHECK-NEXT: mov r12, r0 1875; CHECK-NEXT: mov r6, r1 1876; CHECK-NEXT: mov r4, r2 1877; CHECK-NEXT: b .LBB13_6 1878; CHECK-NEXT: .LBB13_3: @ %vector.ph 1879; CHECK-NEXT: bic r5, r3, #3 1880; CHECK-NEXT: movs r4, #1 1881; CHECK-NEXT: subs r6, r5, #4 1882; CHECK-NEXT: add.w r12, r0, r5 1883; CHECK-NEXT: vmvn.i32 q0, #0x7f 1884; CHECK-NEXT: vmov.i32 q1, #0x7f 1885; CHECK-NEXT: add.w lr, r4, r6, lsr #2 1886; CHECK-NEXT: adds r4, r2, r5 1887; CHECK-NEXT: adds r6, r1, r5 1888; CHECK-NEXT: .LBB13_4: @ %vector.body 1889; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1890; CHECK-NEXT: vldrb.s32 q2,
[r0], #4 1891; CHECK-NEXT: vldrb.s32 q3, [r1], #4 1892; CHECK-NEXT: vmul.i32 q2, q3, q2 1893; CHECK-NEXT: vshr.s32 q2, q2, #7 1894; CHECK-NEXT: vmax.s32 q2, q2, q0 1895; CHECK-NEXT: vmin.s32 q2, q2, q1 1896; CHECK-NEXT: vstrb.32 q2, [r2], #4 1897; CHECK-NEXT: le lr, .LBB13_4 1898; CHECK-NEXT: @ %bb.5: @ %middle.block 1899; CHECK-NEXT: cmp r5, r3 1900; CHECK-NEXT: it eq 1901; CHECK-NEXT: popeq {r4, r5, r6, pc} 1902; CHECK-NEXT: .LBB13_6: @ %for.body.preheader21 1903; CHECK-NEXT: sub.w lr, r3, r5 1904; CHECK-NEXT: .LBB13_7: @ %for.body 1905; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1906; CHECK-NEXT: ldrsb r0, [r12], #1 1907; CHECK-NEXT: ldrsb r1, [r6], #1 1908; CHECK-NEXT: muls r0, r1, r0 1909; CHECK-NEXT: ssat r0, #8, r0, asr #7 1910; CHECK-NEXT: strb r0, [r4], #1 1911; CHECK-NEXT: le lr, .LBB13_7 1912; CHECK-NEXT: .LBB13_8: @ %for.cond.cleanup 1913; CHECK-NEXT: pop {r4, r5, r6, pc} 1914entry: 1915 %cmp8 = icmp eq i32 %N, 0 1916 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader 1917 1918for.body.preheader: ; preds = %entry 1919 %min.iters.check = icmp ult i32 %N, 4 1920 br i1 %min.iters.check, label %for.body.preheader21, label %vector.ph 1921 1922for.body.preheader21: ; preds = %middle.block, %for.body.preheader 1923 %i.012.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1924 %pSrcA.addr.011.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ] 1925 %pSrcB.addr.010.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end15, %middle.block ] 1926 %pDst.addr.09.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end17, %middle.block ] 1927 br label %for.body 1928 1929vector.ph: ; preds = %for.body.preheader 1930 %n.vec = and i32 %N, -4 1931 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec 1932 %ind.end15 = getelementptr i8, ptr %pSrcB, i32 %n.vec 1933 %ind.end17 = getelementptr i8, ptr %pDst, i32 %n.vec 1934 br label %vector.body 1935 1936vector.body: ; preds = %vector.body, %vector.ph 1937 %index = phi 
i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1938 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index 1939 %next.gep18 = getelementptr i8, ptr %pSrcB, i32 %index 1940 %next.gep19 = getelementptr i8, ptr %pDst, i32 %index 1941 %wide.load = load <4 x i8>, ptr %next.gep, align 1 1942 %0 = sext <4 x i8> %wide.load to <4 x i32> 1943 %wide.load20 = load <4 x i8>, ptr %next.gep18, align 1 1944 %1 = sext <4 x i8> %wide.load20 to <4 x i32> 1945 %2 = mul nsw <4 x i32> %1, %0 1946 %3 = ashr <4 x i32> %2, <i32 7, i32 7, i32 7, i32 7> 1947 %4 = icmp sgt <4 x i32> %3, <i32 -128, i32 -128, i32 -128, i32 -128> 1948 %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> <i32 -128, i32 -128, i32 -128, i32 -128> 1949 %6 = icmp slt <4 x i32> %5, <i32 127, i32 127, i32 127, i32 127> 1950 %7 = select <4 x i1> %6, <4 x i32> %5, <4 x i32> <i32 127, i32 127, i32 127, i32 127> 1951 %8 = trunc <4 x i32> %7 to <4 x i8> 1952 store <4 x i8> %8, ptr %next.gep19, align 1 1953 %index.next = add i32 %index, 4 1954 %9 = icmp eq i32 %index.next, %n.vec 1955 br i1 %9, label %middle.block, label %vector.body 1956 1957middle.block: ; preds = %vector.body 1958 %cmp.n = icmp eq i32 %n.vec, %N 1959 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader21 1960 1961for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1962 ret void 1963 1964for.body: ; preds = %for.body.preheader21, %for.body 1965 %i.012 = phi i32 [ %inc, %for.body ], [ %i.012.ph, %for.body.preheader21 ] 1966 %pSrcA.addr.011 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.011.ph, %for.body.preheader21 ] 1967 %pSrcB.addr.010 = phi ptr [ %incdec.ptr1, %for.body ], [ %pSrcB.addr.010.ph, %for.body.preheader21 ] 1968 %pDst.addr.09 = phi ptr [ %incdec.ptr4, %for.body ], [ %pDst.addr.09.ph, %for.body.preheader21 ] 1969 %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.011, i32 1 1970 %10 = load i8, ptr %pSrcA.addr.011, align 1 1971 %conv = sext i8 %10 to i32 1972 %incdec.ptr1 = getelementptr inbounds i8, ptr 
%pSrcB.addr.010, i32 1 1973 %11 = load i8, ptr %pSrcB.addr.010, align 1 1974 %conv2 = sext i8 %11 to i32 1975 %mul = mul nsw i32 %conv2, %conv 1976 %shr = ashr i32 %mul, 7 1977 %12 = icmp sgt i32 %shr, -128 1978 %.val.i = select i1 %12, i32 %shr, i32 -128 1979 %13 = icmp slt i32 %.val.i, 127 1980 %retval.0.i = select i1 %13, i32 %.val.i, i32 127 1981 %conv3 = trunc i32 %retval.0.i to i8 1982 %incdec.ptr4 = getelementptr inbounds i8, ptr %pDst.addr.09, i32 1 1983 store i8 %conv3, ptr %pDst.addr.09, align 1 1984 %inc = add nuw i32 %i.012, 1 1985 %exitcond = icmp eq i32 %inc, %N 1986 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1987} 1988
; ssatmul_8_q7: signed saturating i8 multiply, 8 elements per vector step.
;
; For each element: sign-extend both i8 inputs to i16, multiply,
; arithmetically shift right by 7, clamp to [-128, 127] and truncate to i8.
; The vector loop covers the first (N & -8) elements; a scalar loop handles
; the remainder, and all of N when N < 8.
;
; The assertions below expect the vector body to become vmul.i16 followed by
; vqshrnb.s16 #7, and the scalar body to use ssat #8 with an asr #7 operand.
; They are autogenerated (see the NOTE at the top of the file); regenerate
; them with utils/update_llc_test_checks.py instead of editing by hand.
define arm_aapcs_vfpcc void @ssatmul_8_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: ssatmul_8_q7:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    cbz r3, .LBB14_8
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r3, #7
; CHECK-NEXT:    bhi .LBB14_3
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r5, #0
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    mov r6, r1
; CHECK-NEXT:    mov r4, r2
; CHECK-NEXT:    b .LBB14_6
; CHECK-NEXT:  .LBB14_3: @ %vector.ph
; CHECK-NEXT:    bic r5, r3, #7
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    sub.w r6, r5, #8
; CHECK-NEXT:    add.w r12, r0, r5
; CHECK-NEXT:    add.w lr, r4, r6, lsr #3
; CHECK-NEXT:    adds r4, r2, r5
; CHECK-NEXT:    adds r6, r1, r5
; CHECK-NEXT:  .LBB14_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.s16 q0, [r0], #8
; CHECK-NEXT:    vldrb.s16 q1, [r1], #8
; CHECK-NEXT:    vmul.i16 q0, q1, q0
; CHECK-NEXT:    vqshrnb.s16 q0, q0, #7
; CHECK-NEXT:    vstrb.16 q0, [r2], #8
; CHECK-NEXT:    le lr, .LBB14_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r5, r3
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r6, pc}
; CHECK-NEXT:  .LBB14_6: @ %for.body.preheader23
; CHECK-NEXT:    sub.w lr, r3, r5
; CHECK-NEXT:  .LBB14_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrsb r0, [r12], #1
; CHECK-NEXT:    ldrsb r1, [r6], #1
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    ssat r0, #8, r0, asr #7
; CHECK-NEXT:    strb r0, [r4], #1
; CHECK-NEXT:    le lr, .LBB14_7
; CHECK-NEXT:  .LBB14_8: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  ; Nothing to do for an empty input.
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; Fewer than 8 elements: skip the vector loop entirely.
  %min.iters.check = icmp ult i32 %N, 8
  br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph

for.body.preheader23:                             ; preds = %middle.block, %for.body.preheader
  ; Scalar loop start state: either the untouched arguments or the point
  ; where the vector loop stopped.
  %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
  %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
  %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
  br label %for.body

vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = N rounded down to a multiple of 8; %ind.end* are the matching end pointers.
  %n.vec = and i32 %N, -8
  %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
  %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
  %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
  %wide.load = load <8 x i8>, ptr %next.gep, align 1
  %0 = sext <8 x i8> %wide.load to <8 x i16>
  %wide.load22 = load <8 x i8>, ptr %next.gep20, align 1
  %1 = sext <8 x i8> %wide.load22 to <8 x i16>
  %2 = mul nsw <8 x i16> %1, %0
  %3 = ashr <8 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ; Clamp from below at -128 ...
  %4 = icmp sgt <8 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>
  ; ... and from above at 127, then narrow to i8.
  %6 = icmp slt <8 x i16> %5, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  %7 = select <8 x i1> %6, <8 x i16> %5, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>
  %8 = trunc <8 x i16> %7 to <8 x i8>
  store <8 x i8> %8, ptr %next.gep21, align 1
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Done if N was an exact multiple of 8, otherwise finish in the scalar loop.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader23, %for.body
  ; Scalar remainder: same computation, one element per iteration.
  %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
  %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
  %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
  %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
  %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
  %10 = load i8, ptr %pSrcA.addr.013, align 1
  %conv1 = sext i8 %10 to i16
  %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
  %11 = load i8, ptr %pSrcB.addr.012, align 1
  %conv3 = sext i8 %11 to i16
  %mul = mul nsw i16 %conv3, %conv1
  %shr = ashr i16 %mul, 7
  %12 = icmp sgt i16 %shr, -128
  %.val.i = select i1 %12, i16 %shr, i16 -128
  %13 = icmp slt i16 %.val.i, 127
  %retval.0.i = select i1 %13, i16 %.val.i, i16 127
  %conv5 = trunc i16 %retval.0.i to i8
  %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
  store i8 %conv5, ptr %pDst.addr.011, align 1
  %inc = add nuw i32 %i.014, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}
2110 2111define arm_aapcs_vfpcc void @ssatmul_16_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 2112; CHECK-LABEL: ssatmul_16_q7: 2113; CHECK: @ %bb.0: @ %entry 2114; CHECK-NEXT: .save {r4, r5, r6, lr} 2115; CHECK-NEXT: push {r4, r5, r6, lr} 2116; CHECK-NEXT: cbz r3, .LBB15_8 2117; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 2118; CHECK-NEXT: cmp r3, #15 2119; CHECK-NEXT: bhi .LBB15_3 2120; CHECK-NEXT: @ %bb.2: 2121; CHECK-NEXT: movs r5, #0 2122; CHECK-NEXT: mov r12, r0 2123; CHECK-NEXT: mov r6, r1 2124; CHECK-NEXT: mov r4, r2 2125; CHECK-NEXT: b .LBB15_6 2126; CHECK-NEXT: .LBB15_3: @ %vector.ph 2127; CHECK-NEXT: bic r5, r3, #15 2128; CHECK-NEXT: movs r4, #1 2129; CHECK-NEXT: sub.w r6, r5, #16 2130; CHECK-NEXT: add.w r12, r0, r5 2131; CHECK-NEXT: add.w lr, r4, r6, lsr #4 2132; CHECK-NEXT: adds r4, r2, r5 2133; CHECK-NEXT: adds r6, r1, r5 2134; CHECK-NEXT: .LBB15_4: @ %vector.body 2135; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2136; CHECK-NEXT: vldrb.u8 q0, [r0], #16 2137; CHECK-NEXT: vldrb.u8 q1, [r1], #16 2138; CHECK-NEXT: vmullt.s8 q2, q1, q0 2139; CHECK-NEXT: vmullb.s8 q0, q1, q0 2140; CHECK-NEXT: vqshrnb.s16 q0, q0, #7 2141; CHECK-NEXT: vqshrnt.s16 q0, q2, #7 2142; CHECK-NEXT: vstrb.8 q0, [r2], #16 2143; CHECK-NEXT: le lr, .LBB15_4 2144; CHECK-NEXT: @ %bb.5: @ %middle.block 2145; CHECK-NEXT: cmp r5, r3 2146; CHECK-NEXT: it eq 2147; CHECK-NEXT: popeq {r4, r5, r6, pc} 2148; CHECK-NEXT: .LBB15_6: @ %for.body.preheader23 2149; CHECK-NEXT: sub.w lr, r3, r5 2150;
CHECK-NEXT: .LBB15_7: @ %for.body 2151; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2152; CHECK-NEXT: ldrsb r0, [r12], #1 2153; CHECK-NEXT: ldrsb r1, [r6], #1 2154; CHECK-NEXT: muls r0, r1, r0 2155; CHECK-NEXT: ssat r0, #8, r0, asr #7 2156; CHECK-NEXT: strb r0, [r4], #1 2157; CHECK-NEXT: le lr, .LBB15_7 2158; CHECK-NEXT: .LBB15_8: @ %for.cond.cleanup 2159; CHECK-NEXT: pop {r4, r5, r6, pc} 2160entry: 2161 %cmp10 = icmp eq i32 %N, 0 2162 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader 2163 2164for.body.preheader: ; preds = %entry 2165 %min.iters.check = icmp ult i32 %N, 16 2166 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph 2167 2168for.body.preheader23: ; preds = %middle.block, %for.body.preheader 2169 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 2170 %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ] 2171 %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ] 2172 %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ] 2173 br label %for.body 2174 2175vector.ph: ; preds = %for.body.preheader 2176 %n.vec = and i32 %N, -16 2177 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec 2178 %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec 2179 %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec 2180 br label %vector.body 2181 2182vector.body: ; preds = %vector.body, %vector.ph 2183 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2184 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index 2185 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index 2186 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index 2187 %wide.load = load <16 x i8>, ptr %next.gep, align 1 2188 %0 = sext <16 x i8> %wide.load to <16 x i16> 2189 %wide.load22 = load <16 x i8>, ptr %next.gep20, align 1 2190 %1 = sext <16 x i8> %wide.load22 to <16 x i16> 2191 %2 = mul nsw <16 x i16> %1, %0 2192 %3 = ashr 
<16 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 2193 %4 = icmp sgt <16 x i16> %3, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2194 %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2195 %6 = icmp slt <16 x i16> %5, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2196 %7 = select <16 x i1> %6, <16 x i16> %5, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2197 %8 = trunc <16 x i16> %7 to <16 x i8> 2198 store <16 x i8> %8, ptr %next.gep21, align 1 2199 %index.next = add i32 %index, 16 2200 %9 = icmp eq i32 %index.next, %n.vec 2201 br i1 %9, label %middle.block, label %vector.body 2202 2203middle.block: ; preds = %vector.body 2204 %cmp.n = icmp eq i32 %n.vec, %N 2205 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23 2206 2207for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 2208 ret void 2209 2210for.body: ; preds = %for.body.preheader23, %for.body 2211 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ] 2212 %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ] 2213 %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ] 2214 %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ] 2215 %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1 2216 %10 = load i8, ptr %pSrcA.addr.013, align 1 2217 %conv1 = sext i8 %10 to 
i16 2218 %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1 2219 %11 = load i8, ptr %pSrcB.addr.012, align 1 2220 %conv3 = sext i8 %11 to i16 2221 %mul = mul nsw i16 %conv3, %conv1 2222 %shr = ashr i16 %mul, 7 2223 %12 = icmp sgt i16 %shr, -128 2224 %.val.i = select i1 %12, i16 %shr, i16 -128 2225 %13 = icmp slt i16 %.val.i, 127 2226 %retval.0.i = select i1 %13, i16 %.val.i, i16 127 2227 %conv5 = trunc i16 %retval.0.i to i8 2228 %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1 2229 store i8 %conv5, ptr %pDst.addr.011, align 1 2230 %inc = add nuw i32 %i.014, 1 2231 %exitcond = icmp eq i32 %inc, %N 2232 br i1 %exitcond, label %for.cond.cleanup, label %for.body 2233} 2234 2235define arm_aapcs_vfpcc void @ssatmul_16i_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 2236; CHECK-LABEL: ssatmul_16i_q7: 2237; CHECK: @ %bb.0: @ %entry 2238; CHECK-NEXT: .save {r4, r5, r6, lr} 2239; CHECK-NEXT: push {r4, r5, r6, lr} 2240; CHECK-NEXT: cbz r3, .LBB16_8 2241; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 2242; CHECK-NEXT: cmp r3, #15 2243; CHECK-NEXT: bhi .LBB16_3 2244; CHECK-NEXT: @ %bb.2: 2245; CHECK-NEXT: movs r5, #0 2246; CHECK-NEXT: mov r12, r0 2247; CHECK-NEXT: mov r6, r1 2248; CHECK-NEXT: mov r4, r2 2249; CHECK-NEXT: b .LBB16_6 2250; CHECK-NEXT: .LBB16_3: @ %vector.ph 2251; CHECK-NEXT: bic r5, r3, #15 2252; CHECK-NEXT: movs r4, #1 2253; CHECK-NEXT: sub.w r6, r5, #16 2254; CHECK-NEXT: add.w r12, r0, r5 2255; CHECK-NEXT: add.w lr, r4, r6, lsr #4 2256; CHECK-NEXT: adds r4, r2, r5 2257; CHECK-NEXT: adds r6, r1, r5 2258; CHECK-NEXT: .LBB16_4: @ %vector.body 2259; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2260; CHECK-NEXT: vldrb.u8 q0, [r0], #16 2261; CHECK-NEXT: vldrb.u8 q1, [r1], #16 2262; CHECK-NEXT: vmullt.s8 q2, q1, q0 2263; CHECK-NEXT: vmullb.s8 q0, q1, q0 2264; CHECK-NEXT: vqshrnb.s16 q0, q0, #7 2265; CHECK-NEXT: vqshrnt.s16 q0, q2, #7 2266; CHECK-NEXT: vstrb.8 q0, [r2], #16 
2267; CHECK-NEXT: le lr, .LBB16_4 2268; CHECK-NEXT: @ %bb.5: @ %middle.block 2269; CHECK-NEXT: cmp r5, r3 2270; CHECK-NEXT: it eq 2271; CHECK-NEXT: popeq {r4, r5, r6, pc} 2272; CHECK-NEXT: .LBB16_6: @ %for.body.preheader23 2273; CHECK-NEXT: sub.w lr, r3, r5 2274; CHECK-NEXT: .LBB16_7: @ %for.body 2275; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2276; CHECK-NEXT: ldrsb r0, [r12], #1 2277; CHECK-NEXT: ldrsb r1, [r6], #1 2278; CHECK-NEXT: muls r0, r1, r0 2279; CHECK-NEXT: ssat r0, #8, r0, asr #7 2280; CHECK-NEXT: strb r0, [r4], #1 2281; CHECK-NEXT: le lr, .LBB16_7 2282; CHECK-NEXT: .LBB16_8: @ %for.cond.cleanup 2283; CHECK-NEXT: pop {r4, r5, r6, pc} 2284entry: 2285 %cmp10 = icmp eq i32 %N, 0 2286 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader 2287 2288for.body.preheader: ; preds = %entry 2289 %min.iters.check = icmp ult i32 %N, 16 2290 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph 2291 2292for.body.preheader23: ; preds = %middle.block, %for.body.preheader 2293 %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 2294 %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ] 2295 %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ] 2296 %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ] 2297 br label %for.body 2298 2299vector.ph: ; preds = %for.body.preheader 2300 %n.vec = and i32 %N, -16 2301 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec 2302 %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec 2303 %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec 2304 br label %vector.body 2305 2306vector.body: ; preds = %vector.body, %vector.ph 2307 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2308 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index 2309 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index 2310 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index 2311 
%wide.load = load <16 x i8>, ptr %next.gep, align 1 2312 %0 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2313 %1 = shufflevector <16 x i8> %wide.load, <16 x i8> %wide.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2314 %2 = sext <8 x i8> %0 to <8 x i16> 2315 %3 = sext <8 x i8> %1 to <8 x i16> 2316 %wide.load22 = load <16 x i8>, ptr %next.gep20, align 1 2317 %4 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2318 %5 = shufflevector <16 x i8> %wide.load22, <16 x i8> %wide.load22, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2319 %6 = sext <8 x i8> %4 to <8 x i16> 2320 %7 = sext <8 x i8> %5 to <8 x i16> 2321 %8 = mul <8 x i16> %6, %2 2322 %9 = mul <8 x i16> %7, %3 2323 %10 = ashr <8 x i16> %8, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 2324 %11 = ashr <8 x i16> %9, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 2325 %12 = icmp sgt <8 x i16> %10, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2326 %13 = icmp sgt <8 x i16> %11, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2327 %14 = select <8 x i1> %12, <8 x i16> %10, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2328 %15 = select <8 x i1> %13, <8 x i16> %11, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2329 %16 = icmp slt <8 x i16> %14, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2330 %17 = icmp slt <8 x i16> %15, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2331 %18 = select <8 x i1> %16, <8 x i16> %14, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2332 %19 = select <8 x i1> %17, <8 x i16> %15, <8 x i16> <i16 127, i16 127, 
i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2333 %20 = shufflevector <8 x i16> %18, <8 x i16> %19, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> 2334 %21 = trunc <16 x i16> %20 to <16 x i8> 2335 store <16 x i8> %21, ptr %next.gep21, align 1 2336 %index.next = add i32 %index, 16 2337 %22 = icmp eq i32 %index.next, %n.vec 2338 br i1 %22, label %middle.block, label %vector.body 2339 2340middle.block: ; preds = %vector.body 2341 %cmp.n = icmp eq i32 %n.vec, %N 2342 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23 2343 2344for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 2345 ret void 2346 2347for.body: ; preds = %for.body, %for.body.preheader23 2348 %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ] 2349 %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ] 2350 %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ] 2351 %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ] 2352 %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1 2353 %23 = load i8, ptr %pSrcA.addr.013, align 1 2354 %conv1 = sext i8 %23 to i16 2355 %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1 2356 %24 = load i8, ptr %pSrcB.addr.012, align 1 2357 %conv3 = sext i8 %24 to i16 2358 %mul = mul nsw i16 %conv3, %conv1 2359 %shr = ashr i16 %mul, 7 2360 %25 = icmp sgt i16 %shr, -128 2361 %.val.i = select i1 %25, i16 %shr, i16 -128 2362 %26 = icmp slt i16 %.val.i, 127 2363 %retval.0.i = select i1 %26, i16 %.val.i, i16 127 2364 %conv5 = trunc i16 %retval.0.i to i8 2365 %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1 2366 store i8 %conv5, ptr %pDst.addr.011, align 1 2367 %inc = add nuw i32 %i.014, 1 2368 %exitcond = icmp eq i32 %inc, %N 2369 br i1 %exitcond, label 
%for.cond.cleanup, label %for.body 2370} 2371 2372define arm_aapcs_vfpcc void @ssatmul_8t_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 2373; CHECK-LABEL: ssatmul_8t_q7: 2374; CHECK: @ %bb.0: @ %entry 2375; CHECK-NEXT: .save {r4, r5, r7, lr} 2376; CHECK-NEXT: push {r4, r5, r7, lr} 2377; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} 2378; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 2379; CHECK-NEXT: .pad #16 2380; CHECK-NEXT: sub sp, #16 2381; CHECK-NEXT: cmp r3, #0 2382; CHECK-NEXT: beq .LBB17_3 2383; CHECK-NEXT: @ %bb.1: @ %vector.ph 2384; CHECK-NEXT: adds r4, r3, #7 2385; CHECK-NEXT: vmov.i8 q2, #0x0 2386; CHECK-NEXT: bic r4, r4, #7 2387; CHECK-NEXT: vmov.i8 q3, #0xff 2388; CHECK-NEXT: sub.w r12, r4, #8 2389; CHECK-NEXT: movs r4, #1 2390; CHECK-NEXT: mov r5, sp 2391; CHECK-NEXT: add.w lr, r4, r12, lsr #3 2392; CHECK-NEXT: adr r4, .LCPI17_0 2393; CHECK-NEXT: vldrw.u32 q0, [r4] 2394; CHECK-NEXT: adr r4, .LCPI17_1 2395; CHECK-NEXT: sub.w r12, r3, #1 2396; CHECK-NEXT: vldrw.u32 q4, [r4] 2397; CHECK-NEXT: movs r3, #0 2398; CHECK-NEXT: vdup.32 q1, r12 2399; CHECK-NEXT: .LBB17_2: @ %vector.body 2400; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2401; CHECK-NEXT: vdup.32 q5, r3 2402; CHECK-NEXT: adds r3, #8 2403; CHECK-NEXT: vorr q6, q5, q0 2404; CHECK-NEXT: vorr q5, q5, q4 2405; CHECK-NEXT: vcmp.u32 cs, q1, q6 2406; CHECK-NEXT: vpsel q6, q3, q2 2407; CHECK-NEXT: vcmp.u32 cs, q1, q5 2408; CHECK-NEXT: vpsel q5, q3, q2 2409; CHECK-NEXT: vstrh.32 q6, [r5, #8] 2410; CHECK-NEXT: vstrh.32 q5, [r5] 2411; CHECK-NEXT: vldrw.u32 q5, [r5] 2412; CHECK-NEXT: vptt.i16 ne, q5, zr 2413; CHECK-NEXT: vldrbt.s16 q5, [r0], #8 2414; CHECK-NEXT: vldrbt.s16 q6, [r1], #8 2415; CHECK-NEXT: vmul.i16 q5, q6, q5 2416; CHECK-NEXT: vqshrnb.s16 q5, q5, #7 2417; CHECK-NEXT: vpst 2418; CHECK-NEXT: vstrbt.16 q5, [r2], #8 2419; CHECK-NEXT: le lr, .LBB17_2 2420; CHECK-NEXT: .LBB17_3: @ %for.cond.cleanup 2421; CHECK-NEXT: add sp, #16 2422; 
CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 2423; CHECK-NEXT: pop {r4, r5, r7, pc} 2424; CHECK-NEXT: .p2align 4 2425; CHECK-NEXT: @ %bb.4: 2426; CHECK-NEXT: .LCPI17_0: 2427; CHECK-NEXT: .long 4 @ 0x4 2428; CHECK-NEXT: .long 5 @ 0x5 2429; CHECK-NEXT: .long 6 @ 0x6 2430; CHECK-NEXT: .long 7 @ 0x7 2431; CHECK-NEXT: .LCPI17_1: 2432; CHECK-NEXT: .long 0 @ 0x0 2433; CHECK-NEXT: .long 1 @ 0x1 2434; CHECK-NEXT: .long 2 @ 0x2 2435; CHECK-NEXT: .long 3 @ 0x3 2436entry: 2437 %cmp10 = icmp eq i32 %N, 0 2438 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph 2439 2440vector.ph: ; preds = %entry 2441 %n.rnd.up = add i32 %N, 7 2442 %n.vec = and i32 %n.rnd.up, -8 2443 %trip.count.minus.1 = add i32 %N, -1 2444 %broadcast.splatinsert22 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 2445 %broadcast.splat23 = shufflevector <8 x i32> %broadcast.splatinsert22, <8 x i32> undef, <8 x i32> zeroinitializer 2446 br label %vector.body 2447 2448vector.body: ; preds = %vector.body, %vector.ph 2449 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2450 %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 2451 %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer 2452 %induction = or <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 2453 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index 2454 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index 2455 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index 2456 %0 = icmp ule <8 x i32> %induction, %broadcast.splat23 2457 %wide.masked.load = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %next.gep, i32 1, <8 x i1> %0, <8 x i8> undef) 2458 %1 = sext <8 x i8> %wide.masked.load to <8 x i16> 2459 %wide.masked.load24 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr %next.gep20, i32 1, <8 x i1> %0, <8 x i8> undef) 2460 %2 = sext <8 x i8> %wide.masked.load24 to <8 x i16> 2461 %3 = mul nsw <8 x i16> %2, %1 2462 
%4 = ashr <8 x i16> %3, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 2463 %5 = icmp sgt <8 x i16> %4, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2464 %6 = select <8 x i1> %5, <8 x i16> %4, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2465 %7 = icmp slt <8 x i16> %6, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2466 %8 = select <8 x i1> %7, <8 x i16> %6, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2467 %9 = trunc <8 x i16> %8 to <8 x i8> 2468 call void @llvm.masked.store.v8i8.p0(<8 x i8> %9, ptr %next.gep21, i32 1, <8 x i1> %0) 2469 %index.next = add i32 %index, 8 2470 %10 = icmp eq i32 %index.next, %n.vec 2471 br i1 %10, label %for.cond.cleanup, label %vector.body 2472 2473for.cond.cleanup: ; preds = %vector.body, %entry 2474 ret void 2475} 2476 2477define arm_aapcs_vfpcc void @ssatmul_16t_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 2478; CHECK-LABEL: ssatmul_16t_q7: 2479; CHECK: @ %bb.0: @ %entry 2480; CHECK-NEXT: .save {r4, r5, r6, lr} 2481; CHECK-NEXT: push {r4, r5, r6, lr} 2482; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 2483; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 2484; CHECK-NEXT: .pad #80 2485; CHECK-NEXT: sub sp, #80 2486; CHECK-NEXT: cmp r3, #0 2487; CHECK-NEXT: beq .LBB18_3 2488; CHECK-NEXT: @ %bb.1: @ %vector.ph 2489; CHECK-NEXT: add.w r6, r3, #15 2490; CHECK-NEXT: movs r5, #1 2491; CHECK-NEXT: bic r6, r6, #15 2492; CHECK-NEXT: add r4, sp, #48 2493; CHECK-NEXT: subs r6, #16 2494; CHECK-NEXT: vmov.i8 q2, #0x0 2495; CHECK-NEXT: vmov.i8 q3, #0xff 2496; CHECK-NEXT: add.w lr, r5, r6, lsr #4 2497; CHECK-NEXT: adr r5, .LCPI18_0 2498; CHECK-NEXT: subs r6, r3, #1 2499; CHECK-NEXT: vldrw.u32 q0, [r5] 2500; CHECK-NEXT: vdup.32 q1, r6 2501; CHECK-NEXT: adr r6, .LCPI18_1 2502; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 
16-byte Spill 2503; CHECK-NEXT: vldrw.u32 q0, [r6] 2504; CHECK-NEXT: adr r6, .LCPI18_2 2505; CHECK-NEXT: vldrw.u32 q5, [r6] 2506; CHECK-NEXT: adr r6, .LCPI18_3 2507; CHECK-NEXT: vldrw.u32 q6, [r6] 2508; CHECK-NEXT: add r5, sp, #32 2509; CHECK-NEXT: add r6, sp, #64 2510; CHECK-NEXT: movs r3, #0 2511; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 2512; CHECK-NEXT: .LBB18_2: @ %vector.body 2513; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2514; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload 2515; CHECK-NEXT: vdup.32 q7, r3 2516; CHECK-NEXT: adds r3, #16 2517; CHECK-NEXT: vorr q0, q7, q0 2518; CHECK-NEXT: vcmp.u32 cs, q1, q0 2519; CHECK-NEXT: vpsel q0, q3, q2 2520; CHECK-NEXT: vstrh.32 q0, [r4, #8] 2521; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload 2522; CHECK-NEXT: vorr q0, q7, q0 2523; CHECK-NEXT: vcmp.u32 cs, q1, q0 2524; CHECK-NEXT: vpsel q0, q3, q2 2525; CHECK-NEXT: vstrh.32 q0, [r4] 2526; CHECK-NEXT: vorr q0, q7, q5 2527; CHECK-NEXT: vcmp.u32 cs, q1, q0 2528; CHECK-NEXT: vpsel q0, q3, q2 2529; CHECK-NEXT: vstrh.32 q0, [r5, #8] 2530; CHECK-NEXT: vorr q0, q7, q6 2531; CHECK-NEXT: vcmp.u32 cs, q1, q0 2532; CHECK-NEXT: vpsel q0, q3, q2 2533; CHECK-NEXT: vstrh.32 q0, [r5] 2534; CHECK-NEXT: vldrw.u32 q0, [r4] 2535; CHECK-NEXT: vcmp.i16 ne, q0, zr 2536; CHECK-NEXT: vpsel q0, q3, q2 2537; CHECK-NEXT: vstrb.16 q0, [r6, #8] 2538; CHECK-NEXT: vldrw.u32 q0, [r5] 2539; CHECK-NEXT: vcmp.i16 ne, q0, zr 2540; CHECK-NEXT: vpsel q0, q3, q2 2541; CHECK-NEXT: vstrb.16 q0, [r6] 2542; CHECK-NEXT: vldrw.u32 q0, [r6] 2543; CHECK-NEXT: vptt.i8 ne, q0, zr 2544; CHECK-NEXT: vldrbt.u8 q0, [r0], #16 2545; CHECK-NEXT: vldrbt.u8 q7, [r1], #16 2546; CHECK-NEXT: vmullt.s8 q4, q7, q0 2547; CHECK-NEXT: vmullb.s8 q0, q7, q0 2548; CHECK-NEXT: vqshrnb.s16 q0, q0, #7 2549; CHECK-NEXT: vqshrnt.s16 q0, q4, #7 2550; CHECK-NEXT: vpst 2551; CHECK-NEXT: vstrbt.8 q0, [r2], #16 2552; CHECK-NEXT: le lr, .LBB18_2 2553; CHECK-NEXT: .LBB18_3: @ %for.cond.cleanup 2554; CHECK-NEXT: add sp, #80 2555; 
CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 2556; CHECK-NEXT: pop {r4, r5, r6, pc} 2557; CHECK-NEXT: .p2align 4 2558; CHECK-NEXT: @ %bb.4: 2559; CHECK-NEXT: .LCPI18_0: 2560; CHECK-NEXT: .long 12 @ 0xc 2561; CHECK-NEXT: .long 13 @ 0xd 2562; CHECK-NEXT: .long 14 @ 0xe 2563; CHECK-NEXT: .long 15 @ 0xf 2564; CHECK-NEXT: .LCPI18_1: 2565; CHECK-NEXT: .long 8 @ 0x8 2566; CHECK-NEXT: .long 9 @ 0x9 2567; CHECK-NEXT: .long 10 @ 0xa 2568; CHECK-NEXT: .long 11 @ 0xb 2569; CHECK-NEXT: .LCPI18_2: 2570; CHECK-NEXT: .long 4 @ 0x4 2571; CHECK-NEXT: .long 5 @ 0x5 2572; CHECK-NEXT: .long 6 @ 0x6 2573; CHECK-NEXT: .long 7 @ 0x7 2574; CHECK-NEXT: .LCPI18_3: 2575; CHECK-NEXT: .long 0 @ 0x0 2576; CHECK-NEXT: .long 1 @ 0x1 2577; CHECK-NEXT: .long 2 @ 0x2 2578; CHECK-NEXT: .long 3 @ 0x3 2579entry: 2580 %cmp10 = icmp eq i32 %N, 0 2581 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph 2582 2583vector.ph: ; preds = %entry 2584 %n.rnd.up = add i32 %N, 15 2585 %n.vec = and i32 %n.rnd.up, -16 2586 %trip.count.minus.1 = add i32 %N, -1 2587 %broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0 2588 %broadcast.splat23 = shufflevector <16 x i32> %broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer 2589 br label %vector.body 2590 2591vector.body: ; preds = %vector.body, %vector.ph 2592 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2593 %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0 2594 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer 2595 %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2596 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index 2597 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index 2598 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index 2599 %0 = icmp ule <16 x i32> %induction, 
%broadcast.splat23 2600 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep, i32 1, <16 x i1> %0, <16 x i8> undef) 2601 %1 = sext <16 x i8> %wide.masked.load to <16 x i16> 2602 %wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep20, i32 1, <16 x i1> %0, <16 x i8> undef) 2603 %2 = sext <16 x i8> %wide.masked.load24 to <16 x i16> 2604 %3 = mul nsw <16 x i16> %2, %1 2605 %4 = ashr <16 x i16> %3, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 2606 %5 = icmp sgt <16 x i16> %4, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2607 %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2608 %7 = icmp slt <16 x i16> %6, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2609 %8 = select <16 x i1> %7, <16 x i16> %6, <16 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2610 %9 = trunc <16 x i16> %8 to <16 x i8> 2611 call void @llvm.masked.store.v16i8.p0(<16 x i8> %9, ptr %next.gep21, i32 1, <16 x i1> %0) 2612 %index.next = add i32 %index, 16 2613 %10 = icmp eq i32 %index.next, %n.vec 2614 br i1 %10, label %for.cond.cleanup, label %vector.body 2615 2616for.cond.cleanup: ; preds = %vector.body, %entry 2617 ret void 2618} 2619 2620define arm_aapcs_vfpcc void @ssatmul_16ti_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 2621; CHECK-LABEL: ssatmul_16ti_q7: 2622; CHECK: @ %bb.0: @ %entry 2623; CHECK-NEXT: .save {r4, r5, r6, lr} 2624; CHECK-NEXT: push {r4, 
r5, r6, lr} 2625; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 2626; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 2627; CHECK-NEXT: .pad #80 2628; CHECK-NEXT: sub sp, #80 2629; CHECK-NEXT: cmp r3, #0 2630; CHECK-NEXT: beq .LBB19_3 2631; CHECK-NEXT: @ %bb.1: @ %vector.ph 2632; CHECK-NEXT: add.w r6, r3, #15 2633; CHECK-NEXT: movs r5, #1 2634; CHECK-NEXT: bic r6, r6, #15 2635; CHECK-NEXT: add r4, sp, #48 2636; CHECK-NEXT: subs r6, #16 2637; CHECK-NEXT: vmov.i8 q2, #0x0 2638; CHECK-NEXT: vmov.i8 q3, #0xff 2639; CHECK-NEXT: add.w lr, r5, r6, lsr #4 2640; CHECK-NEXT: adr r5, .LCPI19_0 2641; CHECK-NEXT: subs r6, r3, #1 2642; CHECK-NEXT: vldrw.u32 q0, [r5] 2643; CHECK-NEXT: vdup.32 q1, r6 2644; CHECK-NEXT: adr r6, .LCPI19_1 2645; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill 2646; CHECK-NEXT: vldrw.u32 q0, [r6] 2647; CHECK-NEXT: adr r6, .LCPI19_2 2648; CHECK-NEXT: vldrw.u32 q5, [r6] 2649; CHECK-NEXT: adr r6, .LCPI19_3 2650; CHECK-NEXT: vldrw.u32 q6, [r6] 2651; CHECK-NEXT: add r5, sp, #32 2652; CHECK-NEXT: add r6, sp, #64 2653; CHECK-NEXT: movs r3, #0 2654; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 2655; CHECK-NEXT: .LBB19_2: @ %vector.body 2656; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2657; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload 2658; CHECK-NEXT: vdup.32 q7, r3 2659; CHECK-NEXT: adds r3, #16 2660; CHECK-NEXT: vorr q0, q7, q0 2661; CHECK-NEXT: vcmp.u32 cs, q1, q0 2662; CHECK-NEXT: vpsel q0, q3, q2 2663; CHECK-NEXT: vstrh.32 q0, [r4, #8] 2664; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload 2665; CHECK-NEXT: vorr q0, q7, q0 2666; CHECK-NEXT: vcmp.u32 cs, q1, q0 2667; CHECK-NEXT: vpsel q0, q3, q2 2668; CHECK-NEXT: vstrh.32 q0, [r4] 2669; CHECK-NEXT: vorr q0, q7, q5 2670; CHECK-NEXT: vcmp.u32 cs, q1, q0 2671; CHECK-NEXT: vpsel q0, q3, q2 2672; CHECK-NEXT: vstrh.32 q0, [r5, #8] 2673; CHECK-NEXT: vorr q0, q7, q6 2674; CHECK-NEXT: vcmp.u32 cs, q1, q0 2675; CHECK-NEXT: vpsel q0, q3, q2 2676; CHECK-NEXT: vstrh.32 q0, [r5] 
2677; CHECK-NEXT: vldrw.u32 q0, [r4] 2678; CHECK-NEXT: vcmp.i16 ne, q0, zr 2679; CHECK-NEXT: vpsel q0, q3, q2 2680; CHECK-NEXT: vstrb.16 q0, [r6, #8] 2681; CHECK-NEXT: vldrw.u32 q0, [r5] 2682; CHECK-NEXT: vcmp.i16 ne, q0, zr 2683; CHECK-NEXT: vpsel q0, q3, q2 2684; CHECK-NEXT: vstrb.16 q0, [r6] 2685; CHECK-NEXT: vldrw.u32 q0, [r6] 2686; CHECK-NEXT: vptt.i8 ne, q0, zr 2687; CHECK-NEXT: vldrbt.u8 q0, [r0], #16 2688; CHECK-NEXT: vldrbt.u8 q7, [r1], #16 2689; CHECK-NEXT: vmullt.s8 q4, q7, q0 2690; CHECK-NEXT: vmullb.s8 q0, q7, q0 2691; CHECK-NEXT: vqshrnb.s16 q0, q0, #7 2692; CHECK-NEXT: vqshrnt.s16 q0, q4, #7 2693; CHECK-NEXT: vpst 2694; CHECK-NEXT: vstrbt.8 q0, [r2], #16 2695; CHECK-NEXT: le lr, .LBB19_2 2696; CHECK-NEXT: .LBB19_3: @ %for.cond.cleanup 2697; CHECK-NEXT: add sp, #80 2698; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 2699; CHECK-NEXT: pop {r4, r5, r6, pc} 2700; CHECK-NEXT: .p2align 4 2701; CHECK-NEXT: @ %bb.4: 2702; CHECK-NEXT: .LCPI19_0: 2703; CHECK-NEXT: .long 12 @ 0xc 2704; CHECK-NEXT: .long 13 @ 0xd 2705; CHECK-NEXT: .long 14 @ 0xe 2706; CHECK-NEXT: .long 15 @ 0xf 2707; CHECK-NEXT: .LCPI19_1: 2708; CHECK-NEXT: .long 8 @ 0x8 2709; CHECK-NEXT: .long 9 @ 0x9 2710; CHECK-NEXT: .long 10 @ 0xa 2711; CHECK-NEXT: .long 11 @ 0xb 2712; CHECK-NEXT: .LCPI19_2: 2713; CHECK-NEXT: .long 4 @ 0x4 2714; CHECK-NEXT: .long 5 @ 0x5 2715; CHECK-NEXT: .long 6 @ 0x6 2716; CHECK-NEXT: .long 7 @ 0x7 2717; CHECK-NEXT: .LCPI19_3: 2718; CHECK-NEXT: .long 0 @ 0x0 2719; CHECK-NEXT: .long 1 @ 0x1 2720; CHECK-NEXT: .long 2 @ 0x2 2721; CHECK-NEXT: .long 3 @ 0x3 2722entry: 2723 %cmp10 = icmp eq i32 %N, 0 2724 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph 2725 2726vector.ph: ; preds = %entry 2727 %n.rnd.up = add i32 %N, 15 2728 %n.vec = and i32 %n.rnd.up, -16 2729 %trip.count.minus.1 = add i32 %N, -1 2730 %broadcast.splatinsert22 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0 2731 %broadcast.splat23 = shufflevector <16 x i32> 
%broadcast.splatinsert22, <16 x i32> undef, <16 x i32> zeroinitializer 2732 br label %vector.body 2733 2734vector.body: ; preds = %vector.body, %vector.ph 2735 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2736 %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0 2737 %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer 2738 %induction = or <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 2739 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index 2740 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index 2741 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index 2742 %0 = icmp ule <16 x i32> %induction, %broadcast.splat23 2743 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep, i32 1, <16 x i1> %0, <16 x i8> undef) 2744 %1 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2745 %2 = shufflevector <16 x i8> %wide.masked.load, <16 x i8> %wide.masked.load, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2746 %3 = sext <8 x i8> %1 to <8 x i16> 2747 %4 = sext <8 x i8> %2 to <8 x i16> 2748 %wide.masked.load24 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %next.gep20, i32 1, <16 x i1> %0, <16 x i8> undef) 2749 %5 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> 2750 %6 = shufflevector <16 x i8> %wide.masked.load24, <16 x i8> %wide.masked.load24, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> 2751 %7 = sext <8 x i8> %5 to <8 x i16> 2752 %8 = sext <8 x i8> %6 to <8 x i16> 2753 %9 = mul <8 x i16> %7, %3 2754 %10 = mul <8 x i16> %8, %4 2755 %11 = ashr <8 x i16> %9, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 
7> 2756 %12 = ashr <8 x i16> %10, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 2757 %13 = icmp sgt <8 x i16> %11, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2758 %14 = icmp sgt <8 x i16> %12, <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2759 %15 = select <8 x i1> %13, <8 x i16> %11, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2760 %16 = select <8 x i1> %14, <8 x i16> %12, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128> 2761 %17 = icmp slt <8 x i16> %15, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2762 %18 = icmp slt <8 x i16> %16, <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2763 %19 = select <8 x i1> %17, <8 x i16> %15, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2764 %20 = select <8 x i1> %18, <8 x i16> %16, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127> 2765 %21 = shufflevector <8 x i16> %19, <8 x i16> %20, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> 2766 %22 = trunc <16 x i16> %21 to <16 x i8> 2767 call void @llvm.masked.store.v16i8.p0(<16 x i8> %22, ptr %next.gep21, i32 1, <16 x i1> %0) 2768 %index.next = add i32 %index, 16 2769 %23 = icmp eq i32 %index.next, %n.vec 2770 br i1 %23, label %for.cond.cleanup, label %vector.body 2771 2772for.cond.cleanup: ; preds = %vector.body, %entry 2773 ret void 2774} 2775 2776define arm_aapcs_vfpcc void @usatmul_8_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) { 2777; CHECK-LABEL: usatmul_8_q7: 2778; CHECK: @ %bb.0: @ %entry 2779; CHECK-NEXT: .save {r4, r5, r6, lr} 2780; CHECK-NEXT: push {r4, r5, r6, lr} 2781; CHECK-NEXT: cbz r3, .LBB20_8 2782; CHECK-NEXT: @ %bb.1: @ 
%for.body.preheader 2783; CHECK-NEXT: cmp r3, #7 2784; CHECK-NEXT: bhi .LBB20_3 2785; CHECK-NEXT: @ %bb.2: 2786; CHECK-NEXT: movs r5, #0 2787; CHECK-NEXT: mov r12, r0 2788; CHECK-NEXT: mov r6, r1 2789; CHECK-NEXT: mov r4, r2 2790; CHECK-NEXT: b .LBB20_6 2791; CHECK-NEXT: .LBB20_3: @ %vector.ph 2792; CHECK-NEXT: bic r5, r3, #7 2793; CHECK-NEXT: movs r4, #1 2794; CHECK-NEXT: sub.w r6, r5, #8 2795; CHECK-NEXT: add.w r12, r0, r5 2796; CHECK-NEXT: add.w lr, r4, r6, lsr #3 2797; CHECK-NEXT: adds r4, r2, r5 2798; CHECK-NEXT: adds r6, r1, r5 2799; CHECK-NEXT: .LBB20_4: @ %vector.body 2800; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2801; CHECK-NEXT: vldrb.u16 q0, [r0], #8 2802; CHECK-NEXT: vldrb.u16 q1, [r1], #8 2803; CHECK-NEXT: vmul.i16 q0, q1, q0 2804; CHECK-NEXT: vqshrnb.u16 q0, q0, #7 2805; CHECK-NEXT: vstrb.16 q0, [r2], #8 2806; CHECK-NEXT: le lr, .LBB20_4 2807; CHECK-NEXT: @ %bb.5: @ %middle.block 2808; CHECK-NEXT: cmp r5, r3 2809; CHECK-NEXT: it eq 2810; CHECK-NEXT: popeq {r4, r5, r6, pc} 2811; CHECK-NEXT: .LBB20_6: @ %for.body.preheader23 2812; CHECK-NEXT: sub.w lr, r3, r5 2813; CHECK-NEXT: .LBB20_7: @ %for.body 2814; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2815; CHECK-NEXT: ldrb r0, [r12], #1 2816; CHECK-NEXT: ldrb r1, [r6], #1 2817; CHECK-NEXT: muls r0, r1, r0 2818; CHECK-NEXT: lsrs r1, r0, #7 2819; CHECK-NEXT: cmp r1, #255 2820; CHECK-NEXT: mov.w r1, #255 2821; CHECK-NEXT: it lo 2822; CHECK-NEXT: lsrlo r1, r0, #7 2823; CHECK-NEXT: strb r1, [r4], #1 2824; CHECK-NEXT: le lr, .LBB20_7 2825; CHECK-NEXT: .LBB20_8: @ %for.cond.cleanup 2826; CHECK-NEXT: pop {r4, r5, r6, pc} 2827entry: 2828 %cmp10 = icmp eq i32 %N, 0 2829 br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader 2830 2831for.body.preheader: ; preds = %entry 2832 %min.iters.check = icmp ult i32 %N, 8 2833 br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph 2834 2835for.body.preheader23: ; preds = %middle.block, %for.body.preheader 2836 %i.014.ph = phi i32 [ 0, 
%for.body.preheader ], [ %n.vec, %middle.block ] 2837 %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ] 2838 %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ] 2839 %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ] 2840 br label %for.body 2841 2842vector.ph: ; preds = %for.body.preheader 2843 %n.vec = and i32 %N, -8 2844 %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec 2845 %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec 2846 %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec 2847 br label %vector.body 2848 2849vector.body: ; preds = %vector.body, %vector.ph 2850 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2851 %next.gep = getelementptr i8, ptr %pSrcA, i32 %index 2852 %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index 2853 %next.gep21 = getelementptr i8, ptr %pDst, i32 %index 2854 %wide.load = load <8 x i8>, ptr %next.gep, align 1 2855 %0 = zext <8 x i8> %wide.load to <8 x i16> 2856 %wide.load22 = load <8 x i8>, ptr %next.gep20, align 1 2857 %1 = zext <8 x i8> %wide.load22 to <8 x i16> 2858 %2 = mul nuw <8 x i16> %1, %0 2859 %3 = lshr <8 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7> 2860 %4 = icmp ult <8 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> 2861 %5 = select <8 x i1> %4, <8 x i16> %3, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> 2862 %6 = trunc <8 x i16> %5 to <8 x i8> 2863 store <8 x i8> %6, ptr %next.gep21, align 1 2864 %index.next = add i32 %index, 8 2865 %7 = icmp eq i32 %index.next, %n.vec 2866 br i1 %7, label %middle.block, label %vector.body 2867 2868middle.block: ; preds = %vector.body 2869 %cmp.n = icmp eq i32 %n.vec, %N 2870 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23 2871 2872for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 2873 ret void 2874 2875for.body: ; 
; preds = %for.body.preheader23, %for.body
  %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
  %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
  %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
  %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
  %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
  %8 = load i8, ptr %pSrcA.addr.013, align 1
  %conv1 = zext i8 %8 to i16
  %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
  %9 = load i8, ptr %pSrcB.addr.012, align 1
  %conv3 = zext i8 %9 to i16
  %mul = mul nuw i16 %conv3, %conv1
  %10 = lshr i16 %mul, 7
  %11 = icmp ult i16 %10, 255
  %retval.0.i = select i1 %11, i16 %10, i16 255
  %conv5 = trunc i16 %retval.0.i to i8
  %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
  store i8 %conv5, ptr %pDst.addr.011, align 1
  %inc = add nuw i32 %i.014, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; usatmul_16_q7: element-wise unsigned multiply of the i8 arrays %pSrcA and
; %pSrcB. Each product is formed in i16, shifted right by 7, clamped to 255
; and truncated back to i8 before being stored through %pDst.
;
; The input IR is already vectorized: vector.body processes 16 lanes per
; iteration over %N rounded down to a multiple of 16, and the scalar for.body
; loop handles whatever is left (or everything when %N < 16).
;
; The expectations below require the vector body to select
; vmullb.u8/vmullt.u8 followed by vqshrnb.u16/vqshrnt.u16 #7, i.e. the
; shift + clamp + truncate sequence folds into saturating narrowing shifts.
; They are autogenerated; regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
define arm_aapcs_vfpcc void @usatmul_16_q7(ptr nocapture readonly %pSrcA, ptr nocapture readonly %pSrcB, ptr noalias nocapture %pDst, i32 %N) {
; CHECK-LABEL: usatmul_16_q7:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    cmp r3, #0
; CHECK-NEXT:    beq .LBB21_8
; CHECK-NEXT:  @ %bb.1: @ %for.body.preheader
; CHECK-NEXT:    cmp r3, #15
; CHECK-NEXT:    bhi .LBB21_3
; CHECK-NEXT:  @ %bb.2:
; CHECK-NEXT:    movs r5, #0
; CHECK-NEXT:    mov r12, r0
; CHECK-NEXT:    mov r6, r1
; CHECK-NEXT:    mov r4, r2
; CHECK-NEXT:    b .LBB21_6
; CHECK-NEXT:  .LBB21_3: @ %vector.ph
; CHECK-NEXT:    bic r5, r3, #15
; CHECK-NEXT:    movs r4, #1
; CHECK-NEXT:    sub.w r6, r5, #16
; CHECK-NEXT:    add.w r12, r0, r5
; CHECK-NEXT:    add.w lr, r4, r6, lsr #4
; CHECK-NEXT:    adds r4, r2, r5
; CHECK-NEXT:    adds r6, r1, r5
; CHECK-NEXT:  .LBB21_4: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrb.u8 q0, [r0], #16
; CHECK-NEXT:    vldrb.u8 q1, [r1], #16
; CHECK-NEXT:    vmullt.u8 q2, q1, q0
; CHECK-NEXT:    vmullb.u8 q0, q1, q0
; CHECK-NEXT:    vqshrnb.u16 q0, q0, #7
; CHECK-NEXT:    vqshrnt.u16 q0, q2, #7
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    le lr, .LBB21_4
; CHECK-NEXT:  @ %bb.5: @ %middle.block
; CHECK-NEXT:    cmp r5, r3
; CHECK-NEXT:    it eq
; CHECK-NEXT:    popeq {r4, r5, r6, pc}
; CHECK-NEXT:  .LBB21_6: @ %for.body.preheader23
; CHECK-NEXT:    sub.w lr, r3, r5
; CHECK-NEXT:  .LBB21_7: @ %for.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldrb r0, [r12], #1
; CHECK-NEXT:    ldrb r1, [r6], #1
; CHECK-NEXT:    muls r0, r1, r0
; CHECK-NEXT:    lsrs r1, r0, #7
; CHECK-NEXT:    cmp r1, #255
; CHECK-NEXT:    mov.w r1, #255
; CHECK-NEXT:    it lo
; CHECK-NEXT:    lsrlo r1, r0, #7
; CHECK-NEXT:    strb r1, [r4], #1
; CHECK-NEXT:    le lr, .LBB21_7
; CHECK-NEXT:  .LBB21_8: @ %for.cond.cleanup
; CHECK-NEXT:    pop {r4, r5, r6, pc}
entry:
  ; Nothing to do for an empty input.
  %cmp10 = icmp eq i32 %N, 0
  br i1 %cmp10, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  ; Fewer than 16 elements: skip the vector loop and go straight to the scalar one.
  %min.iters.check = icmp ult i32 %N, 16
  br i1 %min.iters.check, label %for.body.preheader23, label %vector.ph

for.body.preheader23:                             ; preds = %middle.block, %for.body.preheader
  ; Scalar loop start state: either the untouched arguments or the positions
  ; reached by the vector loop.
  %i.014.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %pSrcA.addr.013.ph = phi ptr [ %pSrcA, %for.body.preheader ], [ %ind.end, %middle.block ]
  %pSrcB.addr.012.ph = phi ptr [ %pSrcB, %for.body.preheader ], [ %ind.end17, %middle.block ]
  %pDst.addr.011.ph = phi ptr [ %pDst, %for.body.preheader ], [ %ind.end19, %middle.block ]
  br label %for.body

vector.ph:                                        ; preds = %for.body.preheader
  ; %n.vec = %N with the low four bits cleared (a multiple of 16).
  %n.vec = and i32 %N, -16
  %ind.end = getelementptr i8, ptr %pSrcA, i32 %n.vec
  %ind.end17 = getelementptr i8, ptr %pSrcB, i32 %n.vec
  %ind.end19 = getelementptr i8, ptr %pDst, i32 %n.vec
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %next.gep = getelementptr i8, ptr %pSrcA, i32 %index
  %next.gep20 = getelementptr i8, ptr %pSrcB, i32 %index
  %next.gep21 = getelementptr i8, ptr %pDst, i32 %index
  %wide.load = load <16 x i8>, ptr %next.gep, align 1
  %0 = zext <16 x i8> %wide.load to <16 x i16>
  %wide.load22 = load <16 x i8>, ptr %next.gep20, align 1
  %1 = zext <16 x i8> %wide.load22 to <16 x i16>
  %2 = mul nuw <16 x i16> %1, %0
  %3 = lshr <16 x i16> %2, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ; icmp ult + select: per-lane unsigned minimum with 255.
  %4 = icmp ult <16 x i16> %3, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %5 = select <16 x i1> %4, <16 x i16> %3, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
  %6 = trunc <16 x i16> %5 to <16 x i8>
  store <16 x i8> %6, ptr %next.gep21, align 1
  %index.next = add i32 %index, 16
  %7 = icmp eq i32 %index.next, %n.vec
  br i1 %7, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  ; Done if %N was an exact multiple of 16; otherwise finish in the scalar loop.
  %cmp.n = icmp eq i32 %n.vec, %N
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader23

for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader23, %for.body
  %i.014 = phi i32 [ %inc, %for.body ], [ %i.014.ph, %for.body.preheader23 ]
  %pSrcA.addr.013 = phi ptr [ %incdec.ptr, %for.body ], [ %pSrcA.addr.013.ph, %for.body.preheader23 ]
  %pSrcB.addr.012 = phi ptr [ %incdec.ptr2, %for.body ], [ %pSrcB.addr.012.ph, %for.body.preheader23 ]
  %pDst.addr.011 = phi ptr [ %incdec.ptr6, %for.body ], [ %pDst.addr.011.ph, %for.body.preheader23 ]
  %incdec.ptr = getelementptr inbounds i8, ptr %pSrcA.addr.013, i32 1
  %8 = load i8, ptr %pSrcA.addr.013, align 1
  %conv1 = zext i8 %8 to i16
  %incdec.ptr2 = getelementptr inbounds i8, ptr %pSrcB.addr.012, i32 1
  %9 = load i8, ptr %pSrcB.addr.012, align 1
  %conv3 = zext i8 %9 to i16
  %mul = mul nuw i16 %conv3, %conv1
  %10 = lshr i16 %mul, 7
  ; Scalar form of the same clamp: min(%10, 255), unsigned.
  %11 = icmp ult i16 %10, 255
  %retval.0.i = select i1 %11, i16 %10, i16 255
  %conv5 = trunc i16 %retval.0.i to i8
  %incdec.ptr6 = getelementptr inbounds i8, ptr %pDst.addr.011, i32 1
  store i8 %conv5, ptr %pDst.addr.011, align 1
  %inc = add nuw i32 %i.014, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32, <4 x i1>, <4 x i32>)
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32, <8 x i1>, <8 x i16>)
declare <4 x i16> @llvm.masked.load.v4i16.p0(ptr, i32, <4 x i1>, <4 x i16>)
declare <8 x i8> @llvm.masked.load.v8i8.p0(ptr, i32, <8 x i1>, <8 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)
declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v4i16.p0(<4 x i16>, ptr, i32, <4 x i1>)
declare void @llvm.masked.store.v8i8.p0(<8 x i8>, ptr, i32, <8 x i1>)
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32, <16 x i1>)