; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=enabled -verify-machineinstrs %s -o - | FileCheck %s

; Every function below is a reduction over the first %n elements at %x,
; written as an already-vectorized loop nest:
;   * entry guards on %n > 0 and otherwise returns the reduction's start value;
;   * vector.body handles 4 elements per iteration and is entered only when
;     %n >= 4, iterating until the index reaches %n & -4;
;   * for.body is the scalar loop for the elements the vector loop did not
;     cover (or for all of them when %n < 4).
; The assertions pin the Thumb2/MVE code llc emits for each reduction kind.

; add_i32: integer sum with the reduction performed inside the vector loop.
; Each loaded <4 x i32> is reduced with llvm.vector.reduce.add and added to a
; scalar accumulator; the expected code keeps the running sum in r0 using
; vaddva.u32. The result is 0 when %n < 1.
define i32 @add_i32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB0_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB0_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB0_7
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB0_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB0_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: le lr, .LBB0_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB0_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB0_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: add r0, r1
; CHECK-NEXT: le lr, .LBB0_8
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
  %3 = add i32 %2, %vec.phi
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %5 = load i32, ptr %arrayidx, align 4
  %add = add nsw i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; mul_i32: integer product. The vector loop multiplies lane-wise into a
; <4 x i32> accumulator that starts as all ones; llvm.vector.reduce.mul is
; called once in middle.block. The expected code moves the four lanes to core
; registers with vmov and combines them with three scalar mul instructions.
; The result is 1 when %n < 1.
define i32 @mul_i32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: mul_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB1_8
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB1_3
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB1_6
; CHECK-NEXT: .LBB1_3: @ %vector.ph
; CHECK-NEXT: bic r12, r1, #3
; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: sub.w r3, r12, #4
; CHECK-NEXT: add.w lr, r2, r3, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmul.i32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %middle.block
; CHECK-NEXT: vmov lr, r3, d1
; CHECK-NEXT: cmp r12, r1
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: mul r3, lr, r3
; CHECK-NEXT: mul r2, r4, r2
; CHECK-NEXT: mul r2, r3, r2
; CHECK-NEXT: beq .LBB1_8
; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r12
; CHECK-NEXT: add.w r0, r0, r12, lsl #2
; CHECK-NEXT: .LBB1_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: muls r2, r1, r2
; CHECK-NEXT: le lr, .LBB1_7
; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %2 = mul <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %5 = load i32, ptr %arrayidx, align 4
  %add = mul nsw i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; and_i32: bitwise AND reduction. The <4 x i32> accumulator starts as all
; ones (-1 per lane) and llvm.vector.reduce.and is called once in
; middle.block; the expected code combines the four lanes with and.w after the
; vand loop. The result is -1 when %n < 1.
define i32 @and_i32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: and_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB2_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB2_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB2_7
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: mov.w r2, #-1
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB2_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB2_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vand q0, q1, q0
; CHECK-NEXT: le lr, .LBB2_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: and.w r12, r12, lr
; CHECK-NEXT: and.w r2, r2, r4
; CHECK-NEXT: and.w r2, r2, r12
; CHECK-NEXT: beq .LBB2_9
; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB2_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: ands r2, r1
; CHECK-NEXT: le lr, .LBB2_8
; CHECK-NEXT: .LBB2_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %2 = and <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %5 = load i32, ptr %arrayidx, align 4
  %add = and i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; or_i32: bitwise OR reduction. Same shape as and_i32 with a zero-initialized
; accumulator and llvm.vector.reduce.or in middle.block; the expected code uses
; vorr in the loop and orr.w to combine lanes. The result is 0 when %n < 1.
define i32 @or_i32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: or_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB3_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB3_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB3_7
; CHECK-NEXT: .LBB3_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB3_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB3_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vorr q0, q1, q0
; CHECK-NEXT: le lr, .LBB3_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: orr.w r12, r12, lr
; CHECK-NEXT: orr.w r2, r2, r4
; CHECK-NEXT: orr.w r2, r2, r12
; CHECK-NEXT: beq .LBB3_9
; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB3_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: orrs r2, r1
; CHECK-NEXT: le lr, .LBB3_8
; CHECK-NEXT: .LBB3_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %2 = or <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %5 = load i32, ptr %arrayidx, align 4
  %add = or i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; xor_i32: bitwise XOR reduction. Zero-initialized accumulator and
; llvm.vector.reduce.xor in middle.block; the expected code uses veor in the
; loop and eor.w to combine lanes. The result is 0 when %n < 1.
define i32 @xor_i32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: xor_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB4_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB4_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB4_7
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: .LBB4_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB4_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: veor q0, q1, q0
; CHECK-NEXT: le lr, .LBB4_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmov r2, r4, d0
; CHECK-NEXT: eor.w r12, r12, lr
; CHECK-NEXT: eor.w r2, r2, r4
; CHECK-NEXT: eor.w r2, r2, r12
; CHECK-NEXT: beq .LBB4_9
; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB4_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: eors r2, r1
; CHECK-NEXT: le lr, .LBB4_8
; CHECK-NEXT: .LBB4_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r4, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %2 = xor <4 x i32> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %5 = load i32, ptr %arrayidx, align 4
  %add = xor i32 %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; fadd_f32: float sum using `fast` fadd. The <4 x float> accumulator starts at
; zero and llvm.vector.reduce.fadd (start value 0.0) is called once in
; middle.block; the expected code combines s0..s3 with three scalar vadd.f32
; instructions and loads the 0.0 start value from the constant pool entry
; .LCPI5_0 on the paths that skip the vector loop.
define float @fadd_f32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fadd_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB5_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB5_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB5_7
; CHECK-NEXT: .LBB5_3:
; CHECK-NEXT: vldr s0, .LCPI5_0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB5_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB5_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vadd.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB5_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: beq .LBB5_9
; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB5_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB5_8
; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x float>, ptr %1, align 4
  %2 = fadd fast <4 x float> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
  %5 = load float, ptr %arrayidx, align 4
  %add = fadd fast float %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}

; fmul_f32: float product using `fast` fmul. The <4 x float> accumulator
; starts as all 1.0 and llvm.vector.reduce.fmul (start value 1.0) is called
; once in middle.block; the expected code combines s0..s3 with three scalar
; vmul.f32 instructions and materializes 1.0 with a vmov.f32 immediate.
define float @fmul_f32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmul_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB6_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB6_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB6_7
; CHECK-NEXT: .LBB6_3:
; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB6_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.f32 q0, #1.000000e+00
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB6_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vmul.f32 q0, q1, q0
; CHECK-NEXT: le lr, .LBB6_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmul.f32 s2, s2, s3
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: vmul.f32 s0, s0, s1
; CHECK-NEXT: vmul.f32 s0, s0, s2
; CHECK-NEXT: beq .LBB6_9
; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB6_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vmul.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB6_8
; CHECK-NEXT: .LBB6_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x float>, ptr %1, align 4
  %2 = fmul fast <4 x float> %wide.load, %vec.phi
  %index.next = add i32 %index, 4
  %3 = icmp eq i32 %index.next, %n.vec
  br i1 %3, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
  %5 = load float, ptr %arrayidx, align 4
  %add = fmul fast float %5, %r.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}

; smin_i32: signed minimum expressed as icmp slt + select. The <4 x i32>
; accumulator starts at 2147483647 in every lane and llvm.vector.reduce.smin
; is called once in middle.block; the expected code uses vmin.s32 inside the
; loop and a single vminv.s32 afterwards, with cmp + csel in the scalar loop.
; The result is 2147483647 when %n < 1.
define i32 @smin_i32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB7_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB7_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB7_7
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB7_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmvn.i32 q0, #0x80000000
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: .LBB7_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmin.s32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB7_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: mvn r2, #-2147483648
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vminv.s32 r2, q0
; CHECK-NEXT: beq .LBB7_9
; CHECK-NEXT: .LBB7_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB7_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, lt
; CHECK-NEXT: le lr, .LBB7_8
; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %2 = icmp slt <4 x i32> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %6 = load i32, ptr %arrayidx, align 4
  %c = icmp slt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; smin_i32_inloop: signed minimum with the reduction performed inside the
; vector loop. Each loaded <4 x i32> is reduced with llvm.vector.reduce.smin
; and then folded into a scalar accumulator via icmp slt + select; the
; expected code keeps the running minimum in r0 and issues vminv.s32 r0, q0
; on every vector iteration. The result is 2147483647 when %n < 1.
define i32 @smin_i32_inloop(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smin_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB8_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB8_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB8_7
; CHECK-NEXT: .LBB8_3:
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB8_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: mvn r0, #-2147483648
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB8_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vminv.s32 r0, q0
; CHECK-NEXT: le lr, .LBB8_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB8_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB8_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, lt
; CHECK-NEXT: le lr, .LBB8_8
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 2147483647, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
  %2 = icmp slt i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %6 = load i32, ptr %arrayidx, align 4
  %c = icmp slt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

define i32 @smax_i32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: smax_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB9_3
; CHECK-NEXT: @ %bb.1: @
%for.body.preheader 874; CHECK-NEXT: cmp r1, #4 875; CHECK-NEXT: bhs .LBB9_4 876; CHECK-NEXT: @ %bb.2: 877; CHECK-NEXT: mov.w r2, #-2147483648 878; CHECK-NEXT: movs r3, #0 879; CHECK-NEXT: b .LBB9_7 880; CHECK-NEXT: .LBB9_3: 881; CHECK-NEXT: mov.w r2, #-2147483648 882; CHECK-NEXT: mov r0, r2 883; CHECK-NEXT: pop {r7, pc} 884; CHECK-NEXT: .LBB9_4: @ %vector.ph 885; CHECK-NEXT: bic r3, r1, #3 886; CHECK-NEXT: movs r2, #1 887; CHECK-NEXT: sub.w r12, r3, #4 888; CHECK-NEXT: vmov.i32 q0, #0x80000000 889; CHECK-NEXT: add.w lr, r2, r12, lsr #2 890; CHECK-NEXT: mov r2, r0 891; CHECK-NEXT: .LBB9_5: @ %vector.body 892; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 893; CHECK-NEXT: vldrw.u32 q1, [r2], #16 894; CHECK-NEXT: vmax.s32 q0, q0, q1 895; CHECK-NEXT: le lr, .LBB9_5 896; CHECK-NEXT: @ %bb.6: @ %middle.block 897; CHECK-NEXT: mov.w r2, #-2147483648 898; CHECK-NEXT: cmp r3, r1 899; CHECK-NEXT: vmaxv.s32 r2, q0 900; CHECK-NEXT: beq .LBB9_9 901; CHECK-NEXT: .LBB9_7: @ %for.body.preheader1 902; CHECK-NEXT: sub.w lr, r1, r3 903; CHECK-NEXT: add.w r0, r0, r3, lsl #2 904; CHECK-NEXT: .LBB9_8: @ %for.body 905; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 906; CHECK-NEXT: ldr r1, [r0], #4 907; CHECK-NEXT: cmp r2, r1 908; CHECK-NEXT: csel r2, r2, r1, gt 909; CHECK-NEXT: le lr, .LBB9_8 910; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup 911; CHECK-NEXT: mov r0, r2 912; CHECK-NEXT: pop {r7, pc} 913entry: 914 %cmp6 = icmp sgt i32 %n, 0 915 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup 916 917for.body.preheader: ; preds = %entry 918 %min.iters.check = icmp ult i32 %n, 4 919 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph 920 921vector.ph: ; preds = %for.body.preheader 922 %n.vec = and i32 %n, -4 923 br label %vector.body 924 925vector.body: ; preds = %vector.body, %vector.ph 926 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 927 %vec.phi = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, 
%vector.ph ], [ %3, %vector.body ] 928 %0 = getelementptr inbounds i32, ptr %x, i32 %index 929 %1 = bitcast ptr %0 to ptr 930 %wide.load = load <4 x i32>, ptr %1, align 4 931 %2 = icmp sgt <4 x i32> %vec.phi, %wide.load 932 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load 933 %index.next = add i32 %index, 4 934 %4 = icmp eq i32 %index.next, %n.vec 935 br i1 %4, label %middle.block, label %vector.body 936 937middle.block: ; preds = %vector.body 938 %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3) 939 %cmp.n = icmp eq i32 %n.vec, %n 940 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 941 942for.body.preheader1: ; preds = %middle.block, %for.body.preheader 943 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 944 %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ] 945 br label %for.body 946 947for.body: ; preds = %for.body.preheader1, %for.body 948 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ] 949 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ] 950 %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 951 %6 = load i32, ptr %arrayidx, align 4 952 %c = icmp sgt i32 %r.07, %6 953 %add = select i1 %c, i32 %r.07, i32 %6 954 %inc = add nuw nsw i32 %i.08, 1 955 %exitcond = icmp eq i32 %inc, %n 956 br i1 %exitcond, label %for.cond.cleanup, label %for.body 957 958for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 959 %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ] 960 ret i32 %r.0.lcssa 961} 962 963define i32 @smax_i32_inloop(ptr nocapture readonly %x, i32 %n) { 964; CHECK-LABEL: smax_i32_inloop: 965; CHECK: @ %bb.0: @ %entry 966; CHECK-NEXT: .save {r7, lr} 967; CHECK-NEXT: push {r7, lr} 968; CHECK-NEXT: cmp r1, #1 969; CHECK-NEXT: blt .LBB10_3 970; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 971; CHECK-NEXT: mov r12, r0 972; CHECK-NEXT: cmp r1, #4 973; CHECK-NEXT: bhs .LBB10_4 
974; CHECK-NEXT: @ %bb.2: 975; CHECK-NEXT: mov.w r0, #-2147483648 976; CHECK-NEXT: movs r3, #0 977; CHECK-NEXT: b .LBB10_7 978; CHECK-NEXT: .LBB10_3: 979; CHECK-NEXT: mov.w r0, #-2147483648 980; CHECK-NEXT: pop {r7, pc} 981; CHECK-NEXT: .LBB10_4: @ %vector.ph 982; CHECK-NEXT: bic r3, r1, #3 983; CHECK-NEXT: movs r2, #1 984; CHECK-NEXT: subs r0, r3, #4 985; CHECK-NEXT: add.w lr, r2, r0, lsr #2 986; CHECK-NEXT: mov.w r0, #-2147483648 987; CHECK-NEXT: mov r2, r12 988; CHECK-NEXT: .LBB10_5: @ %vector.body 989; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 990; CHECK-NEXT: vldrw.u32 q0, [r2], #16 991; CHECK-NEXT: vmaxv.s32 r0, q0 992; CHECK-NEXT: le lr, .LBB10_5 993; CHECK-NEXT: @ %bb.6: @ %middle.block 994; CHECK-NEXT: cmp r3, r1 995; CHECK-NEXT: it eq 996; CHECK-NEXT: popeq {r7, pc} 997; CHECK-NEXT: .LBB10_7: @ %for.body.preheader1 998; CHECK-NEXT: sub.w lr, r1, r3 999; CHECK-NEXT: add.w r2, r12, r3, lsl #2 1000; CHECK-NEXT: .LBB10_8: @ %for.body 1001; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1002; CHECK-NEXT: ldr r1, [r2], #4 1003; CHECK-NEXT: cmp r0, r1 1004; CHECK-NEXT: csel r0, r0, r1, gt 1005; CHECK-NEXT: le lr, .LBB10_8 1006; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup 1007; CHECK-NEXT: pop {r7, pc} 1008entry: 1009 %cmp6 = icmp sgt i32 %n, 0 1010 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup 1011 1012for.body.preheader: ; preds = %entry 1013 %min.iters.check = icmp ult i32 %n, 4 1014 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph 1015 1016vector.ph: ; preds = %for.body.preheader 1017 %n.vec = and i32 %n, -4 1018 br label %vector.body 1019 1020vector.body: ; preds = %vector.body, %vector.ph 1021 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1022 %vec.phi = phi i32 [ -2147483648, %vector.ph ], [ %3, %vector.body ] 1023 %0 = getelementptr inbounds i32, ptr %x, i32 %index 1024 %1 = bitcast ptr %0 to ptr 1025 %wide.load = load <4 x i32>, ptr %1, align 4 1026 %l5 = call i32 
@llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load) 1027 %2 = icmp sgt i32 %vec.phi, %l5 1028 %3 = select i1 %2, i32 %vec.phi, i32 %l5 1029 %index.next = add i32 %index, 4 1030 %4 = icmp eq i32 %index.next, %n.vec 1031 br i1 %4, label %middle.block, label %vector.body 1032 1033middle.block: ; preds = %vector.body 1034 %5 = phi i32 [ %3, %vector.body ] 1035 %cmp.n = icmp eq i32 %n.vec, %n 1036 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1 1037 1038for.body.preheader1: ; preds = %middle.block, %for.body.preheader 1039 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] 1040 %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ] 1041 br label %for.body 1042 1043for.body: ; preds = %for.body.preheader1, %for.body 1044 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ] 1045 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ] 1046 %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08 1047 %6 = load i32, ptr %arrayidx, align 4 1048 %c = icmp sgt i32 %r.07, %6 1049 %add = select i1 %c, i32 %r.07, i32 %6 1050 %inc = add nuw nsw i32 %i.08, 1 1051 %exitcond = icmp eq i32 %inc, %n 1052 br i1 %exitcond, label %for.cond.cleanup, label %for.body 1053 1054for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1055 %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ] 1056 ret i32 %r.0.lcssa 1057} 1058 1059define i32 @umin_i32(ptr nocapture readonly %x, i32 %n) { 1060; CHECK-LABEL: umin_i32: 1061; CHECK: @ %bb.0: @ %entry 1062; CHECK-NEXT: .save {r7, lr} 1063; CHECK-NEXT: push {r7, lr} 1064; CHECK-NEXT: cmp r1, #1 1065; CHECK-NEXT: blt .LBB11_3 1066; CHECK-NEXT: @ %bb.1: @ %for.body.preheader 1067; CHECK-NEXT: cmp r1, #4 1068; CHECK-NEXT: bhs .LBB11_4 1069; CHECK-NEXT: @ %bb.2: 1070; CHECK-NEXT: mov.w r2, #-1 1071; CHECK-NEXT: movs r3, #0 1072; CHECK-NEXT: b .LBB11_7 1073; CHECK-NEXT: .LBB11_3: 1074; CHECK-NEXT: mov.w r2, #-1 1075; 
CHECK-NEXT: mov r0, r2 1076; CHECK-NEXT: pop {r7, pc} 1077; CHECK-NEXT: .LBB11_4: @ %vector.ph 1078; CHECK-NEXT: bic r3, r1, #3 1079; CHECK-NEXT: movs r2, #1 1080; CHECK-NEXT: sub.w r12, r3, #4 1081; CHECK-NEXT: vmov.i8 q0, #0xff 1082; CHECK-NEXT: add.w lr, r2, r12, lsr #2 1083; CHECK-NEXT: mov r2, r0 1084; CHECK-NEXT: .LBB11_5: @ %vector.body 1085; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1086; CHECK-NEXT: vldrw.u32 q1, [r2], #16 1087; CHECK-NEXT: vmin.u32 q0, q0, q1 1088; CHECK-NEXT: le lr, .LBB11_5 1089; CHECK-NEXT: @ %bb.6: @ %middle.block 1090; CHECK-NEXT: mov.w r2, #-1 1091; CHECK-NEXT: cmp r3, r1 1092; CHECK-NEXT: vminv.u32 r2, q0 1093; CHECK-NEXT: beq .LBB11_9 1094; CHECK-NEXT: .LBB11_7: @ %for.body.preheader1 1095; CHECK-NEXT: sub.w lr, r1, r3 1096; CHECK-NEXT: add.w r0, r0, r3, lsl #2 1097; CHECK-NEXT: .LBB11_8: @ %for.body 1098; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1099; CHECK-NEXT: ldr r1, [r0], #4 1100; CHECK-NEXT: cmp r2, r1 1101; CHECK-NEXT: csel r2, r2, r1, lo 1102; CHECK-NEXT: le lr, .LBB11_8 1103; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup 1104; CHECK-NEXT: mov r0, r2 1105; CHECK-NEXT: pop {r7, pc} 1106entry: 1107 %cmp6 = icmp sgt i32 %n, 0 1108 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup 1109 1110for.body.preheader: ; preds = %entry 1111 %min.iters.check = icmp ult i32 %n, 4 1112 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph 1113 1114vector.ph: ; preds = %for.body.preheader 1115 %n.vec = and i32 %n, -4 1116 br label %vector.body 1117 1118vector.body: ; preds = %vector.body, %vector.ph 1119 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1120 %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %3, %vector.body ] 1121 %0 = getelementptr inbounds i32, ptr %x, i32 %index 1122 %1 = bitcast ptr %0 to ptr 1123 %wide.load = load <4 x i32>, ptr %1, align 4 1124 %2 = icmp ult <4 x i32> %vec.phi, %wide.load 1125 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, 
<4 x i32> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %6 = load i32, ptr %arrayidx, align 4
  %c = icmp ult i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Unsigned-minimum reduction with a scalar accumulator: every vector.body
; iteration applies llvm.vector.reduce.umin to the freshly loaded vector and
; folds the result into an i32 phi (seeded with -1, i.e. all bits set) via
; icmp ult + select. The assertions expect the loop body to be a load plus one
; accumulating `vminv.u32 r0, q0`.
; The scalar for.body loop covers the elements from %n.vec up to %n and uses
; the same unsigned less-than compare as the vector loop, so both loops
; compute a minimum (hence the `lo` condition on the csel).
define i32 @umin_i32_inloop(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umin_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB12_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB12_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: b .LBB12_7
; CHECK-NEXT: .LBB12_3:
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB12_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: mov.w r0, #-1
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB12_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vminv.u32 r0, q0
; CHECK-NEXT: le lr, .LBB12_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB12_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB12_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, lo
; CHECK-NEXT: le lr, .LBB12_8
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
  %2 = icmp ult i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %6 = load i32, ptr %arrayidx, align 4
  %c = icmp ult i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Unsigned-maximum reduction over the %n i32 values at %x.
; vector.body keeps a <4 x i32> running maximum (icmp ugt + select, seeded
; with zeroinitializer) and middle.block collapses it once with
; llvm.vector.reduce.umax. for.body is the scalar loop for the elements from
; %n.vec (= %n & -4) up to %n, or for all of them when %n < 4.
; Returns 0 when %n <= 0.
define i32 @umax_i32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB13_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB13_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB13_7
; CHECK-NEXT: .LBB13_3:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB13_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: sub.w r12, r3, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r2, r12, lsr #2
; CHECK-NEXT: mov
r2, r0
; CHECK-NEXT: .LBB13_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r2], #16
; CHECK-NEXT: vmax.u32 q0, q0, q1
; CHECK-NEXT: le lr, .LBB13_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: vmaxv.u32 r2, q0
; CHECK-NEXT: beq .LBB13_9
; CHECK-NEXT: .LBB13_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r0, r0, r3, lsl #2
; CHECK-NEXT: .LBB13_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r0], #4
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: csel r2, r2, r1, hi
; CHECK-NEXT: le lr, .LBB13_8
; CHECK-NEXT: .LBB13_9: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %2 = icmp ugt <4 x i32> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %6 = load i32, ptr %arrayidx, align 4
  %c = icmp ugt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Unsigned-maximum reduction with a scalar accumulator: every vector.body
; iteration applies llvm.vector.reduce.umax to the freshly loaded vector and
; folds the result into an i32 phi (seeded with 0) via icmp ugt + select. The
; assertions expect the loop body to be a load plus one accumulating
; `vmaxv.u32 r0, q0`. The scalar for.body loop covers the elements from
; %n.vec up to %n.
define i32 @umax_i32_inloop(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: umax_i32_inloop:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB14_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: mov r12, r0
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB14_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: b .LBB14_7
; CHECK-NEXT: .LBB14_3:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB14_4: @ %vector.ph
; CHECK-NEXT: bic r3, r1, #3
; CHECK-NEXT: movs r2, #1
; CHECK-NEXT: subs r0, r3, #4
; CHECK-NEXT: add.w lr, r2, r0, lsr #2
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: mov r2, r12
; CHECK-NEXT: .LBB14_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r2], #16
; CHECK-NEXT: vmaxv.u32 r0, q0
; CHECK-NEXT: le lr, .LBB14_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: cmp r3, r1
; CHECK-NEXT: it eq
; CHECK-NEXT: popeq {r7, pc}
; CHECK-NEXT: .LBB14_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r3
; CHECK-NEXT: add.w r2, r12, r3, lsl #2
; CHECK-NEXT: .LBB14_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldr r1, [r2], #4
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: csel r0, r0, r1, hi
; CHECK-NEXT: le lr, .LBB14_8
; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x i32>, ptr %1, align 4
  %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
  %2 = icmp ugt i32 %vec.phi, %l5
  %3 = select i1 %2, i32 %vec.phi, i32 %l5
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = phi i32 [ %3, %vector.body ]
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds i32, ptr %x, i32 %i.08
  %6 = load i32, ptr %arrayidx, align 4
  %c = icmp ugt i32 %r.07, %6
  %add = select i1 %c, i32 %r.07, i32 %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret i32 %r.0.lcssa
}

; Float minimum reduction over the %n floats at %x.
; vector.body keeps a <4 x float> accumulator updated with fcmp ult + select,
; and middle.block collapses it with llvm.vector.reduce.fmin. The vector
; accumulator and both scalar seeds start at 0.0 (zeroinitializer / 0.0), and
; 0.0 is also what is returned when %n <= 0.
; The assertions expect vcmp.f32 + vpsel in the vector loop, three scalar
; vminnm.f32 for the horizontal step, and vcmp/vmrs/vselge in the scalar loop.
define float @fmin_f32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmin_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB15_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB15_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI15_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB15_7
; CHECK-NEXT: .LBB15_3:
; CHECK-NEXT: vldr s0, .LCPI15_0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB15_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB15_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vcmp.f32 lt, q0, q1
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: le lr, .LBB15_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vminnm.f32 s2, s2, s3
; CHECK-NEXT: vminnm.f32 s0, s0, s1
; CHECK-NEXT: vminnm.f32 s0, s0, s2
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: beq .LBB15_9
; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB15_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vcmp.f32 s0, s2
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselge.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB15_8
; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI15_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x float>, ptr %1, align 4
  %2 = fcmp ult <4 x float> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
  %6 = load float, ptr %arrayidx, align 4
  %c = fcmp ult float %r.07, %6
  %add = select i1 %c, float %r.07, float %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}

; Float maximum reduction over the %n floats at %x.
; vector.body keeps a <4 x float> accumulator updated with fcmp ugt + select,
; and middle.block collapses it with llvm.vector.reduce.fmax. The vector
; accumulator and both scalar seeds start at 0.0 (zeroinitializer / 0.0), and
; 0.0 is also what is returned when %n <= 0.
; The assertions expect vcmp.f32 + vpsel in the vector loop, three scalar
; vmaxnm.f32 for the horizontal step, and vcmp/vmrs/vselge in the scalar loop.
define float @fmax_f32(ptr nocapture readonly %x, i32 %n) {
; CHECK-LABEL: fmax_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cmp r1, #1
; CHECK-NEXT: blt .LBB16_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: cmp r1, #4
; CHECK-NEXT: bhs .LBB16_4
; CHECK-NEXT: @ %bb.2:
; CHECK-NEXT: vldr s0, .LCPI16_0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB16_7
; CHECK-NEXT: .LBB16_3:
; CHECK-NEXT: vldr s0, .LCPI16_0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB16_4: @ %vector.ph
; CHECK-NEXT: bic r2, r1, #3
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: sub.w r12, r2, #4
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w lr, r3, r12, lsr #2
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: .LBB16_5: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r3], #16
; CHECK-NEXT: vcmp.f32 lt, q1, q0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: le lr, .LBB16_5
; CHECK-NEXT: @ %bb.6: @ %middle.block
; CHECK-NEXT: vmaxnm.f32 s2, s2, s3
; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
; CHECK-NEXT: vmaxnm.f32 s0, s0, s2
; CHECK-NEXT: cmp r2, r1
; CHECK-NEXT: beq .LBB16_9
; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1
; CHECK-NEXT: sub.w lr, r1, r2
; CHECK-NEXT: add.w r0, r0, r2, lsl #2
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldmia r0!, {s2}
; CHECK-NEXT: vcmp.f32 s2, s0
; CHECK-NEXT: vmrs APSR_nzcv, fpscr
; CHECK-NEXT: vselge.f32 s0, s2, s0
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: .LBB16_9: @ %for.cond.cleanup
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI16_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %min.iters.check = icmp ult i32 %n, 4
  br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph

vector.ph: ; preds = %for.body.preheader
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
  %0 = getelementptr inbounds float, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.load = load <4 x float>, ptr %1, align 4
  %2 = fcmp ugt <4 x float> %vec.phi, %wide.load
  %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
  %index.next = add i32 %index, 4
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
  %5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1

for.body.preheader1: ; preds = %middle.block, %for.body.preheader
  %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
  %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
  br label %for.body

for.body: ; preds = %for.body.preheader1, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
  %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
  %arrayidx = getelementptr inbounds float, ptr %x, i32 %i.08
  %6 = load float, ptr %arrayidx, align 4
  %c = fcmp ugt float %r.07, %6
  %add = select i1 %c, float %r.07, float %6
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup, label %for.body

for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
  %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
  ret float %r.0.lcssa
}

; Predicated add reduction with no scalar remainder loop.
; %n.vec rounds %n up to a multiple of 4; each vector.body iteration builds
; llvm.get.active.lane.mask(%index, %n), does a masked load under that mask,
; zeroes the inactive lanes with a select, and adds llvm.vector.reduce.add of
; the result to the scalar accumulator. Returns 0 when %n == 0.
; The assertions expect a dlstp.32 / letp loop around a single accumulating
; `vaddva.u32 r2, q0`.
define i32 @add4i32(ptr noalias nocapture readonly %x, i32 %n) {
; CHECK-LABEL: add4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r1, .LBB17_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.32 lr, r1
; CHECK-NEXT: .LBB17_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vaddva.u32 r2, q0
; CHECK-NEXT: letp lr, .LBB17_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: .LBB17_4:
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: pop {r7, pc}
entry:
  %cmp6.not = icmp eq i32 %n, 0
  br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %n.rnd.up = add i32 %n, 3
  %n.vec = and i32 %n.rnd.up, -4
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
  %0 = getelementptr inbounds i32, ptr %x, i32 %index
  %1 = bitcast ptr %0 to ptr
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
  %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
  %4 = add i32 %3, %vec.phi
  %index.next = add i32 %index, 4
  %5 = icmp eq i32 %index.next, %n.vec
  br i1 %5, label %for.cond.cleanup, label %vector.body

for.cond.cleanup: ; preds = %vector.body, %entry
  %s.0.lcssa = phi i32 [ 0, %entry ], [ %4, %vector.body ]
  ret i32 %s.0.lcssa
}

define i32 @mla4i32(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) {
; CHECK-LABEL: mla4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: cbz r2, .LBB18_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB18_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vmlava.u32 r12, q1, q0
; CHECK-NEXT: letp lr, .LBB18_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT:
.LBB18_4: 1721; CHECK-NEXT: mov.w r12, #0 1722; CHECK-NEXT: mov r0, r12 1723; CHECK-NEXT: pop {r7, pc} 1724entry: 1725 %cmp8.not = icmp eq i32 %n, 0 1726 br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph 1727 1728vector.ph: ; preds = %entry 1729 %n.rnd.up = add i32 %n, 3 1730 %n.vec = and i32 %n.rnd.up, -4 1731 br label %vector.body 1732 1733vector.body: ; preds = %vector.body, %vector.ph 1734 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1735 %vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ] 1736 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) 1737 %0 = getelementptr inbounds i32, ptr %x, i32 %index 1738 %1 = bitcast ptr %0 to ptr 1739 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 1740 %2 = getelementptr inbounds i32, ptr %y, i32 %index 1741 %3 = bitcast ptr %2 to ptr 1742 %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 1743 %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load 1744 %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer 1745 %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5) 1746 %7 = add i32 %6, %vec.phi 1747 %index.next = add i32 %index, 4 1748 %8 = icmp eq i32 %index.next, %n.vec 1749 br i1 %8, label %for.cond.cleanup, label %vector.body 1750 1751for.cond.cleanup: ; preds = %vector.body, %entry 1752 %s.0.lcssa = phi i32 [ 0, %entry ], [ %7, %vector.body ] 1753 ret i32 %s.0.lcssa 1754} 1755 1756define i32 @add8i32(ptr noalias nocapture readonly %x, i32 %n) { 1757; CHECK-LABEL: add8i32: 1758; CHECK: @ %bb.0: @ %entry 1759; CHECK-NEXT: .save {r7, lr} 1760; CHECK-NEXT: push {r7, lr} 1761; CHECK-NEXT: cbz r1, .LBB19_4 1762; CHECK-NEXT: @ %bb.1: @ %vector.ph 1763; CHECK-NEXT: movs r2, #0 1764; CHECK-NEXT: dlstp.16 lr, r1 1765; CHECK-NEXT: .LBB19_2: @ %vector.body 1766; CHECK-NEXT: @ 
=>This Inner Loop Header: Depth=1 1767; CHECK-NEXT: vldrh.u16 q0, [r0], #16 1768; CHECK-NEXT: vaddva.s16 r2, q0 1769; CHECK-NEXT: letp lr, .LBB19_2 1770; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1771; CHECK-NEXT: mov r0, r2 1772; CHECK-NEXT: pop {r7, pc} 1773; CHECK-NEXT: .LBB19_4: 1774; CHECK-NEXT: movs r2, #0 1775; CHECK-NEXT: mov r0, r2 1776; CHECK-NEXT: pop {r7, pc} 1777entry: 1778 %cmp6.not = icmp eq i32 %n, 0 1779 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph 1780 1781vector.ph: ; preds = %entry 1782 %n.rnd.up = add i32 %n, 7 1783 %n.vec = and i32 %n.rnd.up, -8 1784 br label %vector.body 1785 1786vector.body: ; preds = %vector.body, %vector.ph 1787 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1788 %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ] 1789 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n) 1790 %0 = getelementptr inbounds i16, ptr %x, i32 %index 1791 %1 = bitcast ptr %0 to ptr 1792 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) 1793 %2 = sext <8 x i16> %wide.masked.load to <8 x i32> 1794 %3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer 1795 %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3) 1796 %5 = add i32 %4, %vec.phi 1797 %index.next = add i32 %index, 8 1798 %6 = icmp eq i32 %index.next, %n.vec 1799 br i1 %6, label %for.cond.cleanup, label %vector.body 1800 1801for.cond.cleanup: ; preds = %vector.body, %entry 1802 %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ] 1803 ret i32 %s.0.lcssa 1804} 1805 1806define i32 @mla8i32(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) { 1807; CHECK-LABEL: mla8i32: 1808; CHECK: @ %bb.0: @ %entry 1809; CHECK-NEXT: .save {r7, lr} 1810; CHECK-NEXT: push {r7, lr} 1811; CHECK-NEXT: cbz r2, .LBB20_4 1812; CHECK-NEXT: @ %bb.1: @ %vector.ph 1813; CHECK-NEXT: mov.w r12, #0 1814; CHECK-NEXT: 
dlstp.16 lr, r2 1815; CHECK-NEXT: .LBB20_2: @ %vector.body 1816; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1817; CHECK-NEXT: vldrh.u16 q0, [r0], #16 1818; CHECK-NEXT: vldrh.u16 q1, [r1], #16 1819; CHECK-NEXT: vmlava.s16 r12, q1, q0 1820; CHECK-NEXT: letp lr, .LBB20_2 1821; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1822; CHECK-NEXT: mov r0, r12 1823; CHECK-NEXT: pop {r7, pc} 1824; CHECK-NEXT: .LBB20_4: 1825; CHECK-NEXT: mov.w r12, #0 1826; CHECK-NEXT: mov r0, r12 1827; CHECK-NEXT: pop {r7, pc} 1828entry: 1829 %cmp9.not = icmp eq i32 %n, 0 1830 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph 1831 1832vector.ph: ; preds = %entry 1833 %n.rnd.up = add i32 %n, 7 1834 %n.vec = and i32 %n.rnd.up, -8 1835 br label %vector.body 1836 1837vector.body: ; preds = %vector.body, %vector.ph 1838 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1839 %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ] 1840 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n) 1841 %0 = getelementptr inbounds i16, ptr %x, i32 %index 1842 %1 = bitcast ptr %0 to ptr 1843 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) 1844 %2 = sext <8 x i16> %wide.masked.load to <8 x i32> 1845 %3 = getelementptr inbounds i16, ptr %y, i32 %index 1846 %4 = bitcast ptr %3 to ptr 1847 %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) 1848 %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32> 1849 %6 = mul nsw <8 x i32> %5, %2 1850 %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer 1851 %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7) 1852 %9 = add i32 %8, %vec.phi 1853 %index.next = add i32 %index, 8 1854 %10 = icmp eq i32 %index.next, %n.vec 1855 br i1 %10, label %for.cond.cleanup, label %vector.body 1856 1857for.cond.cleanup: ; preds = %vector.body, %entry 1858 
%s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ] 1859 ret i32 %s.0.lcssa 1860} 1861 1862define i32 @add16i32(ptr noalias nocapture readonly %x, i32 %n) { 1863; CHECK-LABEL: add16i32: 1864; CHECK: @ %bb.0: @ %entry 1865; CHECK-NEXT: .save {r7, lr} 1866; CHECK-NEXT: push {r7, lr} 1867; CHECK-NEXT: cbz r1, .LBB21_4 1868; CHECK-NEXT: @ %bb.1: @ %vector.ph 1869; CHECK-NEXT: movs r2, #0 1870; CHECK-NEXT: dlstp.8 lr, r1 1871; CHECK-NEXT: .LBB21_2: @ %vector.body 1872; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1873; CHECK-NEXT: vldrb.u8 q0, [r0], #16 1874; CHECK-NEXT: vaddva.u8 r2, q0 1875; CHECK-NEXT: letp lr, .LBB21_2 1876; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1877; CHECK-NEXT: mov r0, r2 1878; CHECK-NEXT: pop {r7, pc} 1879; CHECK-NEXT: .LBB21_4: 1880; CHECK-NEXT: movs r2, #0 1881; CHECK-NEXT: mov r0, r2 1882; CHECK-NEXT: pop {r7, pc} 1883entry: 1884 %cmp6.not = icmp eq i32 %n, 0 1885 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph 1886 1887vector.ph: ; preds = %entry 1888 %n.rnd.up = add i32 %n, 15 1889 %n.vec = and i32 %n.rnd.up, -16 1890 br label %vector.body 1891 1892vector.body: ; preds = %vector.body, %vector.ph 1893 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1894 %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ] 1895 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) 1896 %0 = getelementptr inbounds i8, ptr %x, i32 %index 1897 %1 = bitcast ptr %0 to ptr 1898 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) 1899 %2 = zext <16 x i8> %wide.masked.load to <16 x i32> 1900 %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer 1901 %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3) 1902 %5 = add i32 %4, %vec.phi 1903 %index.next = add i32 %index, 16 1904 %6 = icmp eq i32 %index.next, %n.vec 1905 br i1 %6, label %for.cond.cleanup, label %vector.body 1906 
1907for.cond.cleanup: ; preds = %vector.body, %entry 1908 %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ] 1909 ret i32 %s.0.lcssa 1910} 1911 1912define i32 @mla16i32(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) { 1913; CHECK-LABEL: mla16i32: 1914; CHECK: @ %bb.0: @ %entry 1915; CHECK-NEXT: .save {r7, lr} 1916; CHECK-NEXT: push {r7, lr} 1917; CHECK-NEXT: cbz r2, .LBB22_4 1918; CHECK-NEXT: @ %bb.1: @ %vector.ph 1919; CHECK-NEXT: mov.w r12, #0 1920; CHECK-NEXT: dlstp.8 lr, r2 1921; CHECK-NEXT: .LBB22_2: @ %vector.body 1922; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1923; CHECK-NEXT: vldrb.u8 q0, [r0], #16 1924; CHECK-NEXT: vldrb.u8 q1, [r1], #16 1925; CHECK-NEXT: vmlava.u8 r12, q1, q0 1926; CHECK-NEXT: letp lr, .LBB22_2 1927; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1928; CHECK-NEXT: mov r0, r12 1929; CHECK-NEXT: pop {r7, pc} 1930; CHECK-NEXT: .LBB22_4: 1931; CHECK-NEXT: mov.w r12, #0 1932; CHECK-NEXT: mov r0, r12 1933; CHECK-NEXT: pop {r7, pc} 1934entry: 1935 %cmp9.not = icmp eq i32 %n, 0 1936 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph 1937 1938vector.ph: ; preds = %entry 1939 %n.rnd.up = add i32 %n, 15 1940 %n.vec = and i32 %n.rnd.up, -16 1941 br label %vector.body 1942 1943vector.body: ; preds = %vector.body, %vector.ph 1944 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1945 %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ] 1946 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) 1947 %0 = getelementptr inbounds i8, ptr %x, i32 %index 1948 %1 = bitcast ptr %0 to ptr 1949 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) 1950 %2 = zext <16 x i8> %wide.masked.load to <16 x i32> 1951 %3 = getelementptr inbounds i8, ptr %y, i32 %index 1952 %4 = bitcast ptr %3 to ptr 1953 %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %4, i32 1, <16 x i1> 
%active.lane.mask, <16 x i8> undef) 1954 %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32> 1955 %6 = mul nuw nsw <16 x i32> %5, %2 1956 %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer 1957 %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7) 1958 %9 = add i32 %8, %vec.phi 1959 %index.next = add i32 %index, 16 1960 %10 = icmp eq i32 %index.next, %n.vec 1961 br i1 %10, label %for.cond.cleanup, label %vector.body 1962 1963for.cond.cleanup: ; preds = %vector.body, %entry 1964 %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ] 1965 ret i32 %s.0.lcssa 1966} 1967 1968define signext i16 @add8i16(ptr noalias nocapture readonly %x, i32 %n) { 1969; CHECK-LABEL: add8i16: 1970; CHECK: @ %bb.0: @ %entry 1971; CHECK-NEXT: .save {r7, lr} 1972; CHECK-NEXT: push {r7, lr} 1973; CHECK-NEXT: cbz r1, .LBB23_4 1974; CHECK-NEXT: @ %bb.1: @ %vector.ph 1975; CHECK-NEXT: movs r2, #0 1976; CHECK-NEXT: dlstp.16 lr, r1 1977; CHECK-NEXT: .LBB23_2: @ %vector.body 1978; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1979; CHECK-NEXT: vldrh.u16 q0, [r0], #16 1980; CHECK-NEXT: vaddva.u16 r2, q0 1981; CHECK-NEXT: letp lr, .LBB23_2 1982; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1983; CHECK-NEXT: sxth r0, r2 1984; CHECK-NEXT: pop {r7, pc} 1985; CHECK-NEXT: .LBB23_4: 1986; CHECK-NEXT: movs r2, #0 1987; CHECK-NEXT: sxth r0, r2 1988; CHECK-NEXT: pop {r7, pc} 1989entry: 1990 %cmp8.not = icmp eq i32 %n, 0 1991 br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph 1992 1993vector.ph: ; preds = %entry 1994 %n.rnd.up = add i32 %n, 7 1995 %n.vec = and i32 %n.rnd.up, -8 1996 br label %vector.body 1997 1998vector.body: ; preds = %vector.body, %vector.ph 1999 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2000 %vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ] 2001 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n) 2002 %0 = getelementptr inbounds i16, ptr %x, i32 %index 2003 %1 = bitcast ptr 
%0 to ptr 2004 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) 2005 %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer 2006 %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2) 2007 %4 = add i16 %3, %vec.phi 2008 %index.next = add i32 %index, 8 2009 %5 = icmp eq i32 %index.next, %n.vec 2010 br i1 %5, label %for.cond.cleanup, label %vector.body 2011 2012for.cond.cleanup: ; preds = %vector.body, %entry 2013 %s.0.lcssa = phi i16 [ 0, %entry ], [ %4, %vector.body ] 2014 ret i16 %s.0.lcssa 2015} 2016 2017define signext i16 @mla8i16(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) { 2018; CHECK-LABEL: mla8i16: 2019; CHECK: @ %bb.0: @ %entry 2020; CHECK-NEXT: .save {r7, lr} 2021; CHECK-NEXT: push {r7, lr} 2022; CHECK-NEXT: cbz r2, .LBB24_4 2023; CHECK-NEXT: @ %bb.1: @ %vector.ph 2024; CHECK-NEXT: mov.w r12, #0 2025; CHECK-NEXT: dlstp.16 lr, r2 2026; CHECK-NEXT: .LBB24_2: @ %vector.body 2027; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2028; CHECK-NEXT: vldrh.u16 q0, [r0], #16 2029; CHECK-NEXT: vldrh.u16 q1, [r1], #16 2030; CHECK-NEXT: vmlava.u16 r12, q1, q0 2031; CHECK-NEXT: letp lr, .LBB24_2 2032; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 2033; CHECK-NEXT: sxth.w r0, r12 2034; CHECK-NEXT: pop {r7, pc} 2035; CHECK-NEXT: .LBB24_4: 2036; CHECK-NEXT: mov.w r12, #0 2037; CHECK-NEXT: sxth.w r0, r12 2038; CHECK-NEXT: pop {r7, pc} 2039entry: 2040 %cmp11.not = icmp eq i32 %n, 0 2041 br i1 %cmp11.not, label %for.cond.cleanup, label %vector.ph 2042 2043vector.ph: ; preds = %entry 2044 %n.rnd.up = add i32 %n, 7 2045 %n.vec = and i32 %n.rnd.up, -8 2046 br label %vector.body 2047 2048vector.body: ; preds = %vector.body, %vector.ph 2049 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2050 %vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ] 2051 %active.lane.mask = call <8 x i1> 
@llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n) 2052 %0 = getelementptr inbounds i16, ptr %x, i32 %index 2053 %1 = bitcast ptr %0 to ptr 2054 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) 2055 %2 = getelementptr inbounds i16, ptr %y, i32 %index 2056 %3 = bitcast ptr %2 to ptr 2057 %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) 2058 %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load 2059 %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer 2060 %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5) 2061 %7 = add i16 %6, %vec.phi 2062 %index.next = add i32 %index, 8 2063 %8 = icmp eq i32 %index.next, %n.vec 2064 br i1 %8, label %for.cond.cleanup, label %vector.body 2065 2066for.cond.cleanup: ; preds = %vector.body, %entry 2067 %s.0.lcssa = phi i16 [ 0, %entry ], [ %7, %vector.body ] 2068 ret i16 %s.0.lcssa 2069} 2070 2071define signext i16 @add16i16(ptr noalias nocapture readonly %x, i32 %n) { 2072; CHECK-LABEL: add16i16: 2073; CHECK: @ %bb.0: @ %entry 2074; CHECK-NEXT: .save {r7, lr} 2075; CHECK-NEXT: push {r7, lr} 2076; CHECK-NEXT: cbz r1, .LBB25_4 2077; CHECK-NEXT: @ %bb.1: @ %vector.ph 2078; CHECK-NEXT: movs r2, #0 2079; CHECK-NEXT: dlstp.8 lr, r1 2080; CHECK-NEXT: .LBB25_2: @ %vector.body 2081; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2082; CHECK-NEXT: vldrb.u8 q0, [r0], #16 2083; CHECK-NEXT: vaddva.u8 r2, q0 2084; CHECK-NEXT: letp lr, .LBB25_2 2085; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 2086; CHECK-NEXT: sxth r0, r2 2087; CHECK-NEXT: pop {r7, pc} 2088; CHECK-NEXT: .LBB25_4: 2089; CHECK-NEXT: movs r2, #0 2090; CHECK-NEXT: sxth r0, r2 2091; CHECK-NEXT: pop {r7, pc} 2092entry: 2093 %cmp8.not = icmp eq i32 %n, 0 2094 br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph 2095 2096vector.ph: ; preds = %entry 2097 %n.rnd.up = add i32 %n, 15 2098 %n.vec = and 
i32 %n.rnd.up, -16 2099 br label %vector.body 2100 2101vector.body: ; preds = %vector.body, %vector.ph 2102 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2103 %vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ] 2104 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) 2105 %0 = getelementptr inbounds i8, ptr %x, i32 %index 2106 %1 = bitcast ptr %0 to ptr 2107 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) 2108 %2 = zext <16 x i8> %wide.masked.load to <16 x i16> 2109 %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer 2110 %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3) 2111 %5 = add i16 %4, %vec.phi 2112 %index.next = add i32 %index, 16 2113 %6 = icmp eq i32 %index.next, %n.vec 2114 br i1 %6, label %for.cond.cleanup, label %vector.body 2115 2116for.cond.cleanup: ; preds = %vector.body, %entry 2117 %s.0.lcssa = phi i16 [ 0, %entry ], [ %5, %vector.body ] 2118 ret i16 %s.0.lcssa 2119} 2120 2121define signext i16 @mla16i16(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) { 2122; CHECK-LABEL: mla16i16: 2123; CHECK: @ %bb.0: @ %entry 2124; CHECK-NEXT: .save {r7, lr} 2125; CHECK-NEXT: push {r7, lr} 2126; CHECK-NEXT: cbz r2, .LBB26_4 2127; CHECK-NEXT: @ %bb.1: @ %vector.ph 2128; CHECK-NEXT: mov.w r12, #0 2129; CHECK-NEXT: dlstp.8 lr, r2 2130; CHECK-NEXT: .LBB26_2: @ %vector.body 2131; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2132; CHECK-NEXT: vldrb.u8 q0, [r0], #16 2133; CHECK-NEXT: vldrb.u8 q1, [r1], #16 2134; CHECK-NEXT: vmlava.u8 r12, q1, q0 2135; CHECK-NEXT: letp lr, .LBB26_2 2136; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 2137; CHECK-NEXT: sxth.w r0, r12 2138; CHECK-NEXT: pop {r7, pc} 2139; CHECK-NEXT: .LBB26_4: 2140; CHECK-NEXT: mov.w r12, #0 2141; CHECK-NEXT: sxth.w r0, r12 2142; CHECK-NEXT: pop {r7, pc} 2143entry: 2144 %cmp13.not = icmp eq i32 %n, 
0 2145 br i1 %cmp13.not, label %for.cond.cleanup, label %vector.ph 2146 2147vector.ph: ; preds = %entry 2148 %n.rnd.up = add i32 %n, 15 2149 %n.vec = and i32 %n.rnd.up, -16 2150 br label %vector.body 2151 2152vector.body: ; preds = %vector.body, %vector.ph 2153 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2154 %vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ] 2155 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) 2156 %0 = getelementptr inbounds i8, ptr %x, i32 %index 2157 %1 = bitcast ptr %0 to ptr 2158 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) 2159 %2 = zext <16 x i8> %wide.masked.load to <16 x i16> 2160 %3 = getelementptr inbounds i8, ptr %y, i32 %index 2161 %4 = bitcast ptr %3 to ptr 2162 %wide.masked.load18 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) 2163 %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16> 2164 %6 = mul nuw <16 x i16> %5, %2 2165 %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer 2166 %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7) 2167 %9 = add i16 %8, %vec.phi 2168 %index.next = add i32 %index, 16 2169 %10 = icmp eq i32 %index.next, %n.vec 2170 br i1 %10, label %for.cond.cleanup, label %vector.body 2171 2172for.cond.cleanup: ; preds = %vector.body, %entry 2173 %s.0.lcssa = phi i16 [ 0, %entry ], [ %9, %vector.body ] 2174 ret i16 %s.0.lcssa 2175} 2176 2177define zeroext i8 @add16i8(ptr noalias nocapture readonly %x, i32 %n) { 2178; CHECK-LABEL: add16i8: 2179; CHECK: @ %bb.0: @ %entry 2180; CHECK-NEXT: .save {r7, lr} 2181; CHECK-NEXT: push {r7, lr} 2182; CHECK-NEXT: cbz r1, .LBB27_4 2183; CHECK-NEXT: @ %bb.1: @ %vector.ph 2184; CHECK-NEXT: movs r2, #0 2185; CHECK-NEXT: dlstp.8 lr, r1 2186; CHECK-NEXT: .LBB27_2: @ %vector.body 2187; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2188; 
CHECK-NEXT: vldrb.u8 q0, [r0], #16 2189; CHECK-NEXT: vaddva.u8 r2, q0 2190; CHECK-NEXT: letp lr, .LBB27_2 2191; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 2192; CHECK-NEXT: uxtb r0, r2 2193; CHECK-NEXT: pop {r7, pc} 2194; CHECK-NEXT: .LBB27_4: 2195; CHECK-NEXT: movs r2, #0 2196; CHECK-NEXT: uxtb r0, r2 2197; CHECK-NEXT: pop {r7, pc} 2198entry: 2199 %cmp7.not = icmp eq i32 %n, 0 2200 br i1 %cmp7.not, label %for.cond.cleanup, label %vector.ph 2201 2202vector.ph: ; preds = %entry 2203 %n.rnd.up = add i32 %n, 15 2204 %n.vec = and i32 %n.rnd.up, -16 2205 br label %vector.body 2206 2207vector.body: ; preds = %vector.body, %vector.ph 2208 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2209 %vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ] 2210 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) 2211 %0 = getelementptr inbounds i8, ptr %x, i32 %index 2212 %1 = bitcast ptr %0 to ptr 2213 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) 2214 %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer 2215 %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2) 2216 %4 = add i8 %3, %vec.phi 2217 %index.next = add i32 %index, 16 2218 %5 = icmp eq i32 %index.next, %n.vec 2219 br i1 %5, label %for.cond.cleanup, label %vector.body 2220 2221for.cond.cleanup: ; preds = %vector.body, %entry 2222 %s.0.lcssa = phi i8 [ 0, %entry ], [ %4, %vector.body ] 2223 ret i8 %s.0.lcssa 2224} 2225 2226define zeroext i8 @mla16i8(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) { 2227; CHECK-LABEL: mla16i8: 2228; CHECK: @ %bb.0: @ %entry 2229; CHECK-NEXT: .save {r7, lr} 2230; CHECK-NEXT: push {r7, lr} 2231; CHECK-NEXT: cbz r2, .LBB28_4 2232; CHECK-NEXT: @ %bb.1: @ %vector.ph 2233; CHECK-NEXT: mov.w r12, #0 2234; CHECK-NEXT: dlstp.8 lr, r2 2235; CHECK-NEXT: .LBB28_2: @ %vector.body 2236; 
CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2237; CHECK-NEXT: vldrb.u8 q0, [r0], #16 2238; CHECK-NEXT: vldrb.u8 q1, [r1], #16 2239; CHECK-NEXT: vmlava.u8 r12, q1, q0 2240; CHECK-NEXT: letp lr, .LBB28_2 2241; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 2242; CHECK-NEXT: uxtb.w r0, r12 2243; CHECK-NEXT: pop {r7, pc} 2244; CHECK-NEXT: .LBB28_4: 2245; CHECK-NEXT: mov.w r12, #0 2246; CHECK-NEXT: uxtb.w r0, r12 2247; CHECK-NEXT: pop {r7, pc} 2248entry: 2249 %cmp10.not = icmp eq i32 %n, 0 2250 br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph 2251 2252vector.ph: ; preds = %entry 2253 %n.rnd.up = add i32 %n, 15 2254 %n.vec = and i32 %n.rnd.up, -16 2255 br label %vector.body 2256 2257vector.body: ; preds = %vector.body, %vector.ph 2258 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2259 %vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ] 2260 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n) 2261 %0 = getelementptr inbounds i8, ptr %x, i32 %index 2262 %1 = bitcast ptr %0 to ptr 2263 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) 2264 %2 = getelementptr inbounds i8, ptr %y, i32 %index 2265 %3 = bitcast ptr %2 to ptr 2266 %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef) 2267 %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load 2268 %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer 2269 %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5) 2270 %7 = add i8 %6, %vec.phi 2271 %index.next = add i32 %index, 16 2272 %8 = icmp eq i32 %index.next, %n.vec 2273 br i1 %8, label %for.cond.cleanup, label %vector.body 2274 2275for.cond.cleanup: ; preds = %vector.body, %entry 2276 %s.0.lcssa = phi i8 [ 0, %entry ], [ %7, %vector.body ] 2277 ret i8 %s.0.lcssa 2278} 2279 2280define i64 @add4i64(ptr noalias nocapture readonly %x, 
i32 %n) { 2281; CHECK-LABEL: add4i64: 2282; CHECK: @ %bb.0: @ %entry 2283; CHECK-NEXT: .save {r7, lr} 2284; CHECK-NEXT: push {r7, lr} 2285; CHECK-NEXT: cbz r1, .LBB29_3 2286; CHECK-NEXT: @ %bb.1: @ %vector.ph 2287; CHECK-NEXT: movs r2, #0 2288; CHECK-NEXT: mov r3, r2 2289; CHECK-NEXT: dlstp.32 lr, r1 2290; CHECK-NEXT: .LBB29_2: @ %vector.body 2291; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2292; CHECK-NEXT: vldrw.u32 q0, [r0], #16 2293; CHECK-NEXT: vaddlva.s32 r2, r3, q0 2294; CHECK-NEXT: letp lr, .LBB29_2 2295; CHECK-NEXT: b .LBB29_4 2296; CHECK-NEXT: .LBB29_3: 2297; CHECK-NEXT: movs r2, #0 2298; CHECK-NEXT: mov r3, r2 2299; CHECK-NEXT: .LBB29_4: @ %for.cond.cleanup 2300; CHECK-NEXT: mov r0, r2 2301; CHECK-NEXT: mov r1, r3 2302; CHECK-NEXT: pop {r7, pc} 2303entry: 2304 %cmp6.not = icmp eq i32 %n, 0 2305 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph 2306 2307vector.ph: ; preds = %entry 2308 %n.rnd.up = add i32 %n, 3 2309 %n.vec = and i32 %n.rnd.up, -4 2310 br label %vector.body 2311 2312vector.body: ; preds = %vector.body, %vector.ph 2313 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2314 %vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ] 2315 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) 2316 %0 = getelementptr inbounds i32, ptr %x, i32 %index 2317 %1 = bitcast ptr %0 to ptr 2318 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 2319 %2 = sext <4 x i32> %wide.masked.load to <4 x i64> 2320 %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer 2321 %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3) 2322 %5 = add i64 %4, %vec.phi 2323 %index.next = add i32 %index, 4 2324 %6 = icmp eq i32 %index.next, %n.vec 2325 br i1 %6, label %for.cond.cleanup, label %vector.body 2326 2327for.cond.cleanup: ; preds = %vector.body, %entry 2328 %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, 
%vector.body ] 2329 ret i64 %s.0.lcssa 2330} 2331 2332define i64 @mla4i64(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) { 2333; CHECK-LABEL: mla4i64: 2334; CHECK: @ %bb.0: @ %entry 2335; CHECK-NEXT: .save {r7, lr} 2336; CHECK-NEXT: push {r7, lr} 2337; CHECK-NEXT: cbz r2, .LBB30_3 2338; CHECK-NEXT: @ %bb.1: @ %vector.ph 2339; CHECK-NEXT: mov.w r12, #0 2340; CHECK-NEXT: mov r3, r12 2341; CHECK-NEXT: dlstp.32 lr, r2 2342; CHECK-NEXT: .LBB30_2: @ %vector.body 2343; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2344; CHECK-NEXT: vldrw.u32 q0, [r0], #16 2345; CHECK-NEXT: vldrw.u32 q1, [r1], #16 2346; CHECK-NEXT: vmlalva.s32 r12, r3, q1, q0 2347; CHECK-NEXT: letp lr, .LBB30_2 2348; CHECK-NEXT: b .LBB30_4 2349; CHECK-NEXT: .LBB30_3: 2350; CHECK-NEXT: mov.w r12, #0 2351; CHECK-NEXT: mov r3, r12 2352; CHECK-NEXT: .LBB30_4: @ %for.cond.cleanup 2353; CHECK-NEXT: mov r0, r12 2354; CHECK-NEXT: mov r1, r3 2355; CHECK-NEXT: pop {r7, pc} 2356entry: 2357 %cmp9.not = icmp eq i32 %n, 0 2358 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph 2359 2360vector.ph: ; preds = %entry 2361 %n.rnd.up = add i32 %n, 3 2362 %n.vec = and i32 %n.rnd.up, -4 2363 br label %vector.body 2364 2365vector.body: ; preds = %vector.body, %vector.ph 2366 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2367 %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ] 2368 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) 2369 %0 = getelementptr inbounds i32, ptr %x, i32 %index 2370 %1 = bitcast ptr %0 to ptr 2371 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 2372 %2 = sext <4 x i32> %wide.masked.load to <4 x i64> 2373 %3 = getelementptr inbounds i32, ptr %y, i32 %index 2374 %4 = bitcast ptr %3 to ptr 2375 %wide.masked.load14 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 2376 %5 = 
sext <4 x i32> %wide.masked.load14 to <4 x i64> 2377 %6 = mul nsw <4 x i64> %5, %2 2378 %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer 2379 %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7) 2380 %9 = add i64 %8, %vec.phi 2381 %index.next = add i32 %index, 4 2382 %10 = icmp eq i32 %index.next, %n.vec 2383 br i1 %10, label %for.cond.cleanup, label %vector.body 2384 2385for.cond.cleanup: ; preds = %vector.body, %entry 2386 %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ] 2387 ret i64 %s.0.lcssa 2388} 2389 2390define i64 @mla8i64(ptr noalias nocapture readonly %x, ptr noalias nocapture readonly %y, i32 %n) { 2391; CHECK-LABEL: mla8i64: 2392; CHECK: @ %bb.0: @ %entry 2393; CHECK-NEXT: .save {r7, lr} 2394; CHECK-NEXT: push {r7, lr} 2395; CHECK-NEXT: cbz r2, .LBB31_3 2396; CHECK-NEXT: @ %bb.1: @ %vector.ph 2397; CHECK-NEXT: mov.w r12, #0 2398; CHECK-NEXT: mov r3, r12 2399; CHECK-NEXT: dlstp.16 lr, r2 2400; CHECK-NEXT: .LBB31_2: @ %vector.body 2401; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 2402; CHECK-NEXT: vldrh.u16 q0, [r0], #16 2403; CHECK-NEXT: vldrh.u16 q1, [r1], #16 2404; CHECK-NEXT: vmlalva.s16 r12, r3, q1, q0 2405; CHECK-NEXT: letp lr, .LBB31_2 2406; CHECK-NEXT: b .LBB31_4 2407; CHECK-NEXT: .LBB31_3: 2408; CHECK-NEXT: mov.w r12, #0 2409; CHECK-NEXT: mov r3, r12 2410; CHECK-NEXT: .LBB31_4: @ %for.cond.cleanup 2411; CHECK-NEXT: mov r0, r12 2412; CHECK-NEXT: mov r1, r3 2413; CHECK-NEXT: pop {r7, pc} 2414entry: 2415 %cmp9.not = icmp eq i32 %n, 0 2416 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph 2417 2418vector.ph: ; preds = %entry 2419 %n.rnd.up = add i32 %n, 7 2420 %n.vec = and i32 %n.rnd.up, -8 2421 br label %vector.body 2422 2423vector.body: ; preds = %vector.body, %vector.ph 2424 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 2425 %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ] 2426 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 
%n) 2427 %0 = getelementptr inbounds i16, ptr %x, i32 %index 2428 %1 = bitcast ptr %0 to ptr 2429 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) 2430 %2 = sext <8 x i16> %wide.masked.load to <8 x i64> 2431 %3 = getelementptr inbounds i16, ptr %y, i32 %index 2432 %4 = bitcast ptr %3 to ptr 2433 %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef) 2434 %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64> 2435 %6 = mul nsw <8 x i64> %5, %2 2436 %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer 2437 %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7) 2438 %9 = add i64 %8, %vec.phi 2439 %index.next = add i32 %index, 8 2440 %10 = icmp eq i32 %index.next, %n.vec 2441 br i1 %10, label %for.cond.cleanup, label %vector.body 2442 2443for.cond.cleanup: ; preds = %vector.body, %entry 2444 %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ] 2445 ret i64 %s.0.lcssa 2446} 2447 2448declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1 2449declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #2 2450declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1 2451declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>) #2 2452declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3 2453declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1 2454declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>) #2 2455declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3 2456declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3 2457declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3 2458declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3 2459declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3 2460declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3 2461 2462declare i32 
@llvm.vector.reduce.add.v4i32(<4 x i32>) 2463declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) 2464declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) 2465declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) 2466declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) 2467declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>) 2468declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) 2469declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) 2470declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) 2471declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) 2472declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) 2473declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) 2474declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) 2475