; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedldst %s -o - | FileCheck %s

; Masked gathers whose offset vectors have a constant added to them, either
; once (the *_mini* / *_minipred* functions) or on every iteration of a loop.

; v4i32 gather with an all-true mask; the i32 element offsets are %offs + 4.
define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <4 x i32> %offs) {
; CHECK-LABEL: gather_inc_mini_4i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    movs r1, #4
; CHECK-NEXT:    vadd.i32 q1, q0, r1
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
  %2 = getelementptr inbounds i32, ptr %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %wide.masked.gather
}

; v4i32 gather with an all-true mask where the GEP is over i8, so %offs + 16
; is a byte offset. Uses opaque pointers like every other function in this file.
define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32_i8(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <4 x i32> %offs) {
; CHECK-LABEL: gather_inc_mini_4i32_i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    movs r1, #16
; CHECK-NEXT:    vadd.i32 q1, q0, r1
; CHECK-NEXT:    vldrw.u32 q0, [r0, q1]
; CHECK-NEXT:    bx lr
  %1 = add <4 x i32> %offs, <i32 16, i32 16, i32 16, i32 16>
  %2 = getelementptr inbounds i8, ptr %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %wide.masked.gather
}

; v4i32 gather, i32 element offsets %offs + 4, with only lanes 0 and 2 enabled.
define arm_aapcs_vfpcc <4 x i32> @gather_inc_minipred_4i32(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <4 x i32> %offs) {
; CHECK-LABEL: gather_inc_minipred_4i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    movs r1, #4
; CHECK-NEXT:    movw r2, #3855
; CHECK-NEXT:    vadd.i32 q1, q0, r1
; CHECK-NEXT:    vmsr p0, r2
; CHECK-NEXT:    vpst
; CHECK-NEXT:    vldrwt.u32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT:    bx lr
  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
  %2 = getelementptr inbounds i32, ptr %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> undef)
  ret <4 x i32> %wide.masked.gather
}

; v8i16 gather with an all-true mask; the i16 element offsets are the
; <8 x i32> vector %offs + 8.
define arm_aapcs_vfpcc <8 x i16> @gather_inc_mini_8i16(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <8 x i32> %offs) {
; CHECK-LABEL: gather_inc_mini_8i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    vshl.i32 q1, q1, #1
; CHECK-NEXT:    mov.w r12, #16
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q1, q1, r12
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r1, lr, d3
; CHECK-NEXT:    vadd.i32 q0, q0, r12
; CHECK-NEXT:    vmov r0, r3, d1
; CHECK-NEXT:    vmov r2, r4, d2
; CHECK-NEXT:    ldrh r6, [r1]
; CHECK-NEXT:    vmov r1, r5, d0
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh.w r12, [lr]
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q0[0], r1
; CHECK-NEXT:    vmov.16 q0[1], r5
; CHECK-NEXT:    vmov.16 q0[2], r0
; CHECK-NEXT:    vmov.16 q0[3], r3
; CHECK-NEXT:    vmov.16 q0[4], r2
; CHECK-NEXT:    vmov.16 q0[5], r4
; CHECK-NEXT:    vmov.16 q0[6], r6
; CHECK-NEXT:    vmov.16 q0[7], r12
; CHECK-NEXT:    pop {r4, r5, r6, pc}
  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %2 = getelementptr inbounds i16, ptr %data, <8 x i32> %1
  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  ret <8 x i16> %wide.masked.gather
}

; v8i16 gather, i16 element offsets %offs + 8, with only the even lanes enabled.
define arm_aapcs_vfpcc <8 x i16> @gather_inc_minipred_8i16(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <8 x i32> %offs) {
; CHECK-LABEL: gather_inc_minipred_8i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    movs r1, #16
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vshl.i32 q1, q1, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r1
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vadd.i32 q1, q1, r1
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    vmov.16 q0[0], r2
; CHECK-NEXT:    ldrh r0, [r0]
; CHECK-NEXT:    vmov.16 q0[2], r3
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    vmov.16 q0[4], r0
; CHECK-NEXT:    vmov.16 q0[6], r1
; CHECK-NEXT:    bx lr
  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %2 = getelementptr inbounds i16, ptr %data, <8 x i32> %1
  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %2, i32 4, <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> undef)
  ret <8 x i16> %wide.masked.gather
}

; v16i8 gather with an all-true mask; the byte offsets are the <16 x i32>
; vector %offs + 16.
define arm_aapcs_vfpcc <16 x i8> @gather_inc_mini_16i8(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <16 x i32> %offs) {
; CHECK-LABEL: gather_inc_mini_16i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r6, r7, lr}
; CHECK-NEXT:    push {r4, r5, r6, r7, lr}
; CHECK-NEXT:    movs r5, #16
; CHECK-NEXT:    vadd.i32 q3, q3, r0
; CHECK-NEXT:    vadd.i32 q3, q3, r5
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r1, r2, d7
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vmov r3, r4, d6
; CHECK-NEXT:    vadd.i32 q3, q0, r5
; CHECK-NEXT:    vadd.i32 q0, q2, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r5
; CHECK-NEXT:    vadd.i32 q2, q0, r5
; CHECK-NEXT:    ldrb.w r12, [r1]
; CHECK-NEXT:    ldrb r1, [r3]
; CHECK-NEXT:    ldrb.w lr, [r2]
; CHECK-NEXT:    ldrb r3, [r4]
; CHECK-NEXT:    vmov r2, r4, d6
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    vmov.8 q0[0], r2
; CHECK-NEXT:    vmov r2, r6, d5
; CHECK-NEXT:    vmov.8 q0[1], r4
; CHECK-NEXT:    ldrb r4, [r2]
; CHECK-NEXT:    ldrb r2, [r6]
; CHECK-NEXT:    vmov r6, r7, d7
; CHECK-NEXT:    ldrb r0, [r6]
; CHECK-NEXT:    ldrb r7, [r7]
; CHECK-NEXT:    vmov.8 q0[2], r0
; CHECK-NEXT:    vmov r0, r5, d2
; CHECK-NEXT:    vmov.8 q0[3], r7
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.8 q0[4], r0
; CHECK-NEXT:    vmov.8 q0[5], r5
; CHECK-NEXT:    vmov r0, r5, d3
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.8 q0[6], r0
; CHECK-NEXT:    vmov.8 q0[7], r5
; CHECK-NEXT:    vmov r0, r5, d4
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.8 q0[8], r0
; CHECK-NEXT:    vmov.8 q0[9], r5
; CHECK-NEXT:    vmov.8 q0[10], r4
; CHECK-NEXT:    vmov.8 q0[11], r2
; CHECK-NEXT:    vmov.8 q0[12], r1
; CHECK-NEXT:    vmov.8 q0[13], r3
; CHECK-NEXT:    vmov.8 q0[14], r12
; CHECK-NEXT:    vmov.8 q0[15], lr
; CHECK-NEXT:    pop {r4, r5, r6, r7, pc}
  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = getelementptr inbounds i8, ptr %data, <16 x i32> %1
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
  ret <16 x i8> %wide.masked.gather
}

; v16i8 gather, byte offsets %offs + 16, with only the even lanes enabled.
define arm_aapcs_vfpcc <16 x i8> @gather_inc_minipred_16i8(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, <16 x i32> %offs) {
; CHECK-LABEL: gather_inc_minipred_16i8:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    movs r1, #16
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r1
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r2, s4
; CHECK-NEXT:    vadd.i32 q2, q2, r1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    vadd.i32 q0, q0, r1
; CHECK-NEXT:    vmov r4, s0
; CHECK-NEXT:    vmov r5, s2
; CHECK-NEXT:    ldrb.w r12, [r2]
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    ldrb r3, [r3]
; CHECK-NEXT:    ldrb r4, [r4]
; CHECK-NEXT:    ldrb r5, [r5]
; CHECK-NEXT:    vmov.8 q0[0], r4
; CHECK-NEXT:    vmov.8 q0[2], r5
; CHECK-NEXT:    vmov.8 q0[4], r12
; CHECK-NEXT:    ldrb.w lr, [r2]
; CHECK-NEXT:    vmov r2, s6
; CHECK-NEXT:    vadd.i32 q1, q3, r0
; CHECK-NEXT:    vadd.i32 q1, q1, r1
; CHECK-NEXT:    vmov r0, s4
; CHECK-NEXT:    vmov r1, s6
; CHECK-NEXT:    ldrb r2, [r2]
; CHECK-NEXT:    vmov.8 q0[6], r2
; CHECK-NEXT:    vmov.8 q0[8], lr
; CHECK-NEXT:    ldrb r0, [r0]
; CHECK-NEXT:    vmov.8 q0[10], r3
; CHECK-NEXT:    ldrb r1, [r1]
; CHECK-NEXT:    vmov.8 q0[12], r0
; CHECK-NEXT:    vmov.8 q0[14], r1
; CHECK-NEXT:    pop {r4, r5, r7, pc}
  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = getelementptr inbounds i8, ptr %data, <16 x i32> %1
  %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %2, i32 2, <16 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <16 x i8> undef)
  ret <16 x i8> %wide.masked.gather
}

; Loop gathering i32 elements at index vec.ind * 3 + 6, where vec.ind starts
; at <0, 2, 4, 6> and is advanced by 8 per iteration; results are stored to %dst.
define arm_aapcs_vfpcc void @gather_pre_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: gather_pre_inc:
; CHECK:       @ %bb.0: @ %vector.ph
; CHECK-NEXT:    adr r3, .LCPI7_0
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    bne .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %end
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI7_0:
; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
; CHECK-NEXT:    .long 0 @ 0x0
vector.ph:                                        ; entry block (no predecessors)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
  %2 = getelementptr inbounds i32, ptr %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %3 = getelementptr inbounds i32, ptr %dst, i32 %index
  store <4 x i32> %wide.masked.gather, ptr %3, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %end, label %vector.body

end:                                              ; preds = %vector.body
  ret void
}

; Same loop as gather_pre_inc, but the gather GEP is over i8, so
; vec.ind * 3 + 6 is a byte offset.
define arm_aapcs_vfpcc void @gather_pre_inc_i8(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: gather_pre_inc_i8:
; CHECK:       @ %bb.0: @ %vector.ph
; CHECK-NEXT:    adr r3, .LCPI8_0
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [q0, #24]!
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    bne .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %end
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI8_0:
; CHECK-NEXT:    .long 4294967278 @ 0xffffffee
; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
; CHECK-NEXT:    .long 4294967290 @ 0xfffffffa
; CHECK-NEXT:    .long 0 @ 0x0
vector.ph:                                        ; entry block (no predecessors)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6>
  %2 = getelementptr inbounds i8, ptr %data, <4 x i32> %1
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %3 = getelementptr inbounds i32, ptr %dst, i32 %index
  store <4 x i32> %wide.masked.gather, ptr %3, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
  %4 = icmp eq i32 %index.next, %n.vec
  br i1 %4, label %end, label %vector.body

end:                                              ; preds = %vector.body
  ret void
}

; Loop gathering i32 elements at index vec.ind * 3 (no constant added), where
; vec.ind starts at <0, 2, 4, 6> and is advanced by 8 per iteration.
define arm_aapcs_vfpcc void @gather_post_inc(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n.vec43) {
; CHECK-LABEL: gather_post_inc:
; CHECK:       @ %bb.0: @ %vector.ph41
; CHECK-NEXT:    adr r3, .LCPI9_0
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:  .LBB9_1: @ %vector.body39
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
; CHECK-NEXT:    subs r2, #4
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    bne .LBB9_1
; CHECK-NEXT:  @ %bb.2: @ %end
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI9_0:
; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
vector.ph41:                                      ; entry block (no predecessors)
  br label %vector.body39

vector.body39:                                    ; preds = %vector.body39, %vector.ph41
  %index44 = phi i32 [ 0, %vector.ph41 ], [ %index.next45, %vector.body39 ]
  %vec.ind50 = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph41 ], [ %vec.ind.next51, %vector.body39 ]
  %0 = mul nuw nsw <4 x i32> %vec.ind50, <i32 3, i32 3, i32 3, i32 3>
  %1 = getelementptr inbounds i32, ptr %data, <4 x i32> %0
  %wide.masked.gather55 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %2 = getelementptr inbounds i32, ptr %dst, i32 %index44
  store <4 x i32> %wide.masked.gather55, ptr %2, align 4
  %index.next45 = add i32 %index44, 4
  %vec.ind.next51 = add <4 x i32> %vec.ind50, <i32 8, i32 8, i32 8, i32 8>
  %3 = icmp eq i32 %index.next45, %n.vec43
  br i1 %3, label %end, label %vector.body39

end:                                              ; preds = %vector.body39
  ret void
}

; Guarded loop nest: the inner loop gathers i32 elements at index vec.ind,
; which starts at <0, 1, 2, 3> and is advanced by 4 per iteration; the outer
; loop re-enters vector.ph while %n.vec != %n.
define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v4i32_simple:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB10_1: @ %vector.ph.preheader
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    bic r12, r2, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    sub.w lr, r12, #4
; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
; CHECK-NEXT:    adr r3, .LCPI10_0
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:  .LBB10_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB10_3 Depth 2
; CHECK-NEXT:    dls lr, r4
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:  .LBB10_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB10_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q2, [q1, #16]!
; CHECK-NEXT:    vstrb.8 q2, [r0], #16
; CHECK-NEXT:    le lr, .LBB10_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB10_2 Depth=1
; CHECK-NEXT:    cmp r12, r2
; CHECK-NEXT:    bne .LBB10_2
; CHECK-NEXT:  @ %bb.5:
; CHECK-NEXT:    pop.w {r4, lr}
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI10_0:
; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %middle.block, %entry
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %data, <4 x i32> %vec.ind
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %0, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %1 = getelementptr inbounds i32, ptr %dst, i32 %index
  store <4 x i32> %wide.masked.gather, ptr %1, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %2 = icmp eq i32 %index.next, %n.vec
  br i1 %2, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void
}

; Like gather_inc_v4i32_simple, but each iteration performs three gathers at
; indices vec.ind * 3, vec.ind * 3 + 1 and vec.ind * 3 + 2 and stores their sum.
define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v4i32_complex:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB11_1: @ %vector.ph.preheader
; CHECK-NEXT:    .save {r4, r5, r7, lr}
; CHECK-NEXT:    push {r4, r5, r7, lr}
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    bic r12, r2, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    sub.w lr, r12, #4
; CHECK-NEXT:    adr r4, .LCPI11_1
; CHECK-NEXT:    adr r5, .LCPI11_2
; CHECK-NEXT:    vldrw.u32 q1, [r4]
; CHECK-NEXT:    add.w r3, r3, lr, lsr #2
; CHECK-NEXT:    adr.w lr, .LCPI11_0
; CHECK-NEXT:    vldrw.u32 q0, [r5]
; CHECK-NEXT:    vldrw.u32 q2, [lr]
; CHECK-NEXT:    vadd.i32 q1, q1, r0
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:  .LBB11_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB11_3 Depth 2
; CHECK-NEXT:    dls lr, r3
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    vmov q3, q1
; CHECK-NEXT:    vmov q4, q0
; CHECK-NEXT:    vmov q5, q2
; CHECK-NEXT:  .LBB11_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB11_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q6, [q5, #48]!
; CHECK-NEXT:    vldrw.u32 q7, [q3, #48]!
; CHECK-NEXT:    vadd.i32 q6, q7, q6
; CHECK-NEXT:    vldrw.u32 q7, [q4, #48]!
; CHECK-NEXT:    vadd.i32 q6, q6, q7
; CHECK-NEXT:    vstrb.8 q6, [r0], #16
; CHECK-NEXT:    le lr, .LBB11_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB11_2 Depth=1
; CHECK-NEXT:    cmp r12, r2
; CHECK-NEXT:    bne .LBB11_2
; CHECK-NEXT:  @ %bb.5:
; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    pop.w {r4, r5, r7, lr}
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI11_0:
; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
; CHECK-NEXT:  .LCPI11_1:
; CHECK-NEXT:    .long 4294967252 @ 0xffffffd4
; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
; CHECK-NEXT:    .long 4294967276 @ 0xffffffec
; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
; CHECK-NEXT:  .LCPI11_2:
; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %middle.block, %entry
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
  %1 = getelementptr inbounds i32, ptr %data, <4 x i32> %0
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
  %3 = getelementptr inbounds i32, ptr %data, <4 x i32> %2
  %wide.masked.gather24 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
  %5 = getelementptr inbounds i32, ptr %data, <4 x i32> %4
  %wide.masked.gather25 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %6 = add nsw <4 x i32> %wide.masked.gather24, %wide.masked.gather
  %7 = add nsw <4 x i32> %6, %wide.masked.gather25
  %8 = getelementptr inbounds i32, ptr %dst, i32 %index
  store <4 x i32> %7, ptr %8, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
  %9 = icmp eq i32 %index.next, %n.vec
  br i1 %9, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void
}

; Like gather_inc_v4i32_simple, but vec.ind is advanced by 127 per iteration.
define arm_aapcs_vfpcc void @gather_inc_v4i32_large(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v4i32_large:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB12_1: @ %vector.ph.preheader
; CHECK-NEXT:    .save {r4, lr}
; CHECK-NEXT:    push {r4, lr}
; CHECK-NEXT:    bic r12, r2, #3
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    sub.w lr, r12, #4
; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
; CHECK-NEXT:    adr r3, .LCPI12_0
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:  .LBB12_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB12_3 Depth 2
; CHECK-NEXT:    dls lr, r4
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:  .LBB12_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB12_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q2, [q1, #508]!
; CHECK-NEXT:    vstrb.8 q2, [r0], #16
; CHECK-NEXT:    le lr, .LBB12_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB12_2 Depth=1
; CHECK-NEXT:    cmp r12, r2
; CHECK-NEXT:    bne .LBB12_2
; CHECK-NEXT:  @ %bb.5:
; CHECK-NEXT:    pop.w {r4, lr}
; CHECK-NEXT:    bx lr
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI12_0:
; CHECK-NEXT:    .long 4294966788 @ 0xfffffe04
; CHECK-NEXT:    .long 4294966792 @ 0xfffffe08
; CHECK-NEXT:    .long 4294966796 @ 0xfffffe0c
; CHECK-NEXT:    .long 4294966800 @ 0xfffffe10
entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %middle.block, %entry
  %n.vec = and i32 %n, -4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %data, <4 x i32> %vec.ind
  %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %0, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  %1 = getelementptr inbounds i32, ptr %dst, i32 %index
  store <4 x i32> %wide.masked.gather, ptr %1, align 4
  %index.next = add i32 %index, 4
  %vec.ind.next = add <4 x i32> %vec.ind, <i32 127, i32 127, i32 127, i32 127>
  %2 = icmp eq i32 %index.next, %n.vec
  br i1 %2, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void
}

; TODO: uneven - I think it's not possible to create such an example, because vec.ind will always be increased by a vector with 4 elements (=> x*4 = even)

; TODO: What is sxth?
; Guarded loop nest gathering i16 elements at index vec.ind, an <8 x i16>
; induction vector that starts at <0, ..., 7> and is advanced by 8 per iteration.
define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v8i16_simple:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #28
; CHECK-NEXT:    sub sp, #28
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    strd r1, r2, [sp, #4] @ 8-byte Folded Spill
; CHECK-NEXT:    blt .LBB13_5
; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    movs r6, #1
; CHECK-NEXT:    add r2, sp, #12
; CHECK-NEXT:    mov.w r9, #8
; CHECK-NEXT:    bic r1, r1, #7
; CHECK-NEXT:    str r1, [sp] @ 4-byte Spill
; CHECK-NEXT:    sub.w r3, r1, #8
; CHECK-NEXT:    add.w r8, r6, r3, lsr #3
; CHECK-NEXT:    adr r3, .LCPI13_0
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:  .LBB13_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB13_3 Depth 2
; CHECK-NEXT:    dls lr, r8
; CHECK-NEXT:    vmov q1, q0
; CHECK-NEXT:    ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT:  .LBB13_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB13_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vstrw.32 q1, [r2]
; CHECK-NEXT:    mov r12, r2
; CHECK-NEXT:    vldrh.s32 q2, [r2, #8]
; CHECK-NEXT:    vadd.i16 q1, q1, r9
; CHECK-NEXT:    vshl.i32 q2, q2, #1
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r7, r5, d5
; CHECK-NEXT:    vmov r3, r4, d4
; CHECK-NEXT:    vldrh.s32 q2, [r2]
; CHECK-NEXT:    vshl.i32 q2, q2, #1
; CHECK-NEXT:    vadd.i32 q2, q2, r0
; CHECK-NEXT:    vmov r1, r10, d5
; CHECK-NEXT:    ldrh r7, [r7]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    ldrh.w r2, [r10]
; CHECK-NEXT:    ldrh.w r10, [r3]
; CHECK-NEXT:    vmov r3, r11, d4
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh.w r11, [r11]
; CHECK-NEXT:    vmov.16 q2[0], r3
; CHECK-NEXT:    vmov.16 q2[1], r11
; CHECK-NEXT:    vmov.16 q2[2], r1
; CHECK-NEXT:    vmov.16 q2[3], r2
; CHECK-NEXT:    mov r2, r12
; CHECK-NEXT:    vmov.16 q2[4], r10
; CHECK-NEXT:    vmov.16 q2[5], r4
; CHECK-NEXT:    vmov.16 q2[6], r7
; CHECK-NEXT:    vmov.16 q2[7], r5
; CHECK-NEXT:    vstrb.8 q2, [r6], #16
; CHECK-NEXT:    le lr, .LBB13_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB13_2 Depth=1
; CHECK-NEXT:    ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT:    ldr r3, [sp] @ 4-byte Reload
; CHECK-NEXT:    cmp r3, r1
; CHECK-NEXT:    bne .LBB13_2
; CHECK-NEXT:  .LBB13_5: @ %for.cond.cleanup
; CHECK-NEXT:    add sp, #28
; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.6:
; CHECK-NEXT:  .LCPI13_0:
; CHECK-NEXT:    .short 0 @ 0x0
; CHECK-NEXT:    .short 1 @ 0x1
; CHECK-NEXT:    .short 2 @ 0x2
; CHECK-NEXT:    .short 3 @ 0x3
; CHECK-NEXT:    .short 4 @ 0x4
; CHECK-NEXT:    .short 5 @ 0x5
; CHECK-NEXT:    .short 6 @ 0x6
; CHECK-NEXT:    .short 7 @ 0x7


entry:
  %cmp22 = icmp sgt i32 %n, 0
  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %middle.block, %entry
  %n.vec = and i32 %n, -8
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %vec.ind = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %vector.ph ], [ %vec.ind.next, %vector.body ]
  %0 = getelementptr inbounds i16, ptr %data, <8 x i16> %vec.ind
  %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %0, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
  %1 = getelementptr inbounds i16, ptr %dst, i32 %index
  store <8 x i16> %wide.masked.gather, ptr %1, align 2
  %index.next = add i32 %index, 8
  %vec.ind.next = add <8 x i16> %vec.ind, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = icmp eq i32 %index.next, %n.vec
  br i1 %2, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i32 %n.vec, %n
  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph

for.cond.cleanup:                                 ; preds = %middle.block, %entry
  ret void
}

; TODO: This looks absolutely terrifying :(
define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) {
; CHECK-LABEL: gather_inc_v8i16_complex:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT:    .pad #4
; CHECK-NEXT:    sub sp, #4
; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT:    .pad #136
; CHECK-NEXT:    sub sp, #136
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    strd r1, r2, [sp, #64] @ 8-byte Folded Spill
; CHECK-NEXT:    blt.w .LBB14_5
; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
; CHECK-NEXT:    ldr r1, [sp, #68] @ 4-byte Reload
; CHECK-NEXT:    adr r3, .LCPI14_2
; CHECK-NEXT:    vldrw.u32 q0, [r3]
; CHECK-NEXT:    movs r2, #1
; CHECK-NEXT:    bic r1, r1, #7
; CHECK-NEXT:    str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT:    subs r1, #8
; CHECK-NEXT:    vstrw.32 q0, [sp, #40] @ 16-byte Spill
; CHECK-NEXT:    vmov.i16 q2, #0x18
; CHECK-NEXT:    add.w r1, r2, r1, lsr #3
; CHECK-NEXT:    str r1, [sp, #60] @ 4-byte Spill
; CHECK-NEXT:    adr r1, .LCPI14_0
; CHECK-NEXT:    adr r2, .LCPI14_1
; CHECK-NEXT:    vldrw.u32 q0, [r1]
; CHECK-NEXT:    vstrw.32 q2, [sp, #72] @ 16-byte Spill
; CHECK-NEXT:    vstrw.32 q0, [sp, #24] @ 16-byte Spill
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    add r2, sp, #120
; CHECK-NEXT:    vstrw.32 q0, [sp, #8] @ 16-byte Spill
; CHECK-NEXT:  .LBB14_2: @ %vector.ph
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB14_3 Depth 2
; CHECK-NEXT:    ldr r1, [sp, #60] @ 4-byte Reload
; CHECK-NEXT:    add.w r10, sp, #104
; CHECK-NEXT:    dls lr, r1
; CHECK-NEXT:    ldr r7, [sp, #64] @ 4-byte Reload
; CHECK-NEXT:    vldrw.u32 q4, [sp, #24] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q5, [sp, #40] @ 16-byte Reload
; CHECK-NEXT:    vldrw.u32 q6, [sp, #8] @ 16-byte Reload
; CHECK-NEXT:  .LBB14_3: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB14_2 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vstrw.32 q5, [r2]
; CHECK-NEXT:    mov r8, r2
; CHECK-NEXT:    vldrh.s32 q0, [r2, #8]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r1, r3, d0
; CHECK-NEXT:    vmov r4, r5, d1
; CHECK-NEXT:    vldrh.s32 q0, [r2]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q2, q0, r0
; CHECK-NEXT:    vmov r6, r2, d4
; CHECK-NEXT:    ldrh r1, [r1]
; CHECK-NEXT:    ldrh.w r12, [r4]
; CHECK-NEXT:    add r4, sp, #88
; CHECK-NEXT:    ldrh.w r11, [r5]
; CHECK-NEXT:    ldrh r3, [r3]
; CHECK-NEXT:    ldrh r5, [r6]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vstrw.32 q6, [r4]
; CHECK-NEXT:    vldrh.s32 q0, [r4]
; CHECK-NEXT:    vmov.16 q7[0], r5
; CHECK-NEXT:    vmov.16 q7[1], r2
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r6, r9, d0
; CHECK-NEXT:    vmov r2, r5, d1
; CHECK-NEXT:    vldrh.s32 q0, [r4, #8]
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    ldrh r6, [r6]
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    vmov.16 q1[0], r6
; CHECK-NEXT:    ldrh.w r6, [r9]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q1[1], r6
; CHECK-NEXT:    vmov.16 q1[2], r2
; CHECK-NEXT:    vmov r2, r6, d0
; CHECK-NEXT:    vmov.16 q1[3], r5
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r6, [r6]
; CHECK-NEXT:    vmov.16 q1[4], r2
; CHECK-NEXT:    vmov r2, r5, d1
; CHECK-NEXT:    vmov.16 q1[5], r6
; CHECK-NEXT:    mov r6, r10
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vstrw.32 q4, [r10]
; CHECK-NEXT:    vldrh.s32 q0, [r6]
; CHECK-NEXT:    vmov.16 q1[6], r2
; CHECK-NEXT:    vmov.16 q1[7], r5
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov r2, r5, d0
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q3[0], r2
; CHECK-NEXT:    vmov.16 q3[1], r5
; CHECK-NEXT:    vmov r2, r5, d5
; CHECK-NEXT:    vldrw.u32 q2, [sp, #72] @ 16-byte Reload
; CHECK-NEXT:    vadd.i16 q6, q6, q2
; CHECK-NEXT:    vadd.i16 q5, q5, q2
; CHECK-NEXT:    vadd.i16 q4, q4, q2
; CHECK-NEXT:    ldrh.w r9, [r2]
; CHECK-NEXT:    vmov r2, r4, d1
; CHECK-NEXT:    vldrh.s32 q0, [r6, #8]
; CHECK-NEXT:    ldrh r5, [r5]
; CHECK-NEXT:    vmov.16 q7[2], r9
; CHECK-NEXT:    vshl.i32 q0, q0, #1
; CHECK-NEXT:    vmov.16 q7[3], r5
; CHECK-NEXT:    vadd.i32 q0, q0, r0
; CHECK-NEXT:    vmov.16 q7[4], r1
; CHECK-NEXT:    vmov.16 q7[5], r3
; CHECK-NEXT:    vmov.16 q7[6], r12
; CHECK-NEXT:    vmov.16 q7[7], r11
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q3[2], r2
; CHECK-NEXT:    vmov.16 q3[3], r4
; CHECK-NEXT:    vmov r2, r4, d0
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q3[4], r2
; CHECK-NEXT:    vmov.16 q3[5], r4
; CHECK-NEXT:    vmov r2, r4, d1
; CHECK-NEXT:    ldrh r2, [r2]
; CHECK-NEXT:    ldrh r4, [r4]
; CHECK-NEXT:    vmov.16 q3[6], r2
; CHECK-NEXT:    mov r2, r8
; CHECK-NEXT:    vmov.16 q3[7], r4
; CHECK-NEXT:    vadd.i16 q0, q3, q1
; CHECK-NEXT:    vadd.i16 q0, q0, q7
; CHECK-NEXT:    vstrb.8 q0, [r7], #16
; CHECK-NEXT:    le lr, .LBB14_3
; CHECK-NEXT:  @ %bb.4: @ %middle.block
; CHECK-NEXT:    @ in Loop: Header=BB14_2 Depth=1
847; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload 848; CHECK-NEXT: ldr r3, [sp, #68] @ 4-byte Reload 849; CHECK-NEXT: cmp r1, r3 850; CHECK-NEXT: bne.w .LBB14_2 851; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup 852; CHECK-NEXT: add sp, #136 853; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 854; CHECK-NEXT: add sp, #4 855; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 856; CHECK-NEXT: .p2align 4 857; CHECK-NEXT: @ %bb.6: 858; CHECK-NEXT: .LCPI14_0: 859; CHECK-NEXT: .short 1 @ 0x1 860; CHECK-NEXT: .short 4 @ 0x4 861; CHECK-NEXT: .short 7 @ 0x7 862; CHECK-NEXT: .short 10 @ 0xa 863; CHECK-NEXT: .short 13 @ 0xd 864; CHECK-NEXT: .short 16 @ 0x10 865; CHECK-NEXT: .short 19 @ 0x13 866; CHECK-NEXT: .short 22 @ 0x16 867; CHECK-NEXT: .LCPI14_1: 868; CHECK-NEXT: .short 0 @ 0x0 869; CHECK-NEXT: .short 3 @ 0x3 870; CHECK-NEXT: .short 6 @ 0x6 871; CHECK-NEXT: .short 9 @ 0x9 872; CHECK-NEXT: .short 12 @ 0xc 873; CHECK-NEXT: .short 15 @ 0xf 874; CHECK-NEXT: .short 18 @ 0x12 875; CHECK-NEXT: .short 21 @ 0x15 876; CHECK-NEXT: .LCPI14_2: 877; CHECK-NEXT: .short 2 @ 0x2 878; CHECK-NEXT: .short 5 @ 0x5 879; CHECK-NEXT: .short 8 @ 0x8 880; CHECK-NEXT: .short 11 @ 0xb 881; CHECK-NEXT: .short 14 @ 0xe 882; CHECK-NEXT: .short 17 @ 0x11 883; CHECK-NEXT: .short 20 @ 0x14 884; CHECK-NEXT: .short 23 @ 0x17 885 886 887entry: 888 %cmp22 = icmp sgt i32 %n, 0 889 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup 890 891vector.ph: ; preds = %for.body.preheader 892 %n.vec = and i32 %n, -8 893 br label %vector.body 894 895vector.body: ; preds = %vector.body, %vector.ph 896 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 897 %vec.ind = phi <8 x i16> [ <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, %vector.ph ], [ %vec.ind.next, %vector.body ] 898 %0 = mul nuw nsw <8 x i16> %vec.ind, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> 899 %1 = getelementptr inbounds i16, ptr %data, <8 x i16> %0 900 %wide.masked.gather = call <8 x i16> 
@llvm.masked.gather.v8i16.v8p0(<8 x ptr> %1, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) 901 %2 = add nuw nsw <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 902 %3 = getelementptr inbounds i16, ptr %data, <8 x i16> %2 903 %wide.masked.gather24 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %3, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) 904 %4 = add nuw nsw <8 x i16> %0, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2> 905 %5 = getelementptr inbounds i16, ptr %data, <8 x i16> %4 906 %wide.masked.gather25 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr> %5, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) 907 %6 = add nsw <8 x i16> %wide.masked.gather24, %wide.masked.gather 908 %7 = add nsw <8 x i16> %6, %wide.masked.gather25 909 %8 = getelementptr inbounds i16, ptr %dst, i32 %index 910 store <8 x i16> %7, ptr %8, align 2 911 %index.next = add i32 %index, 8 912 %vec.ind.next = add <8 x i16> %vec.ind, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8> 913 %9 = icmp eq i32 %index.next, %n.vec 914 br i1 %9, label %middle.block, label %vector.body 915 916middle.block: ; preds = %vector.body 917 %cmp.n = icmp eq i32 %n.vec, %n 918 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph 919 920for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 921 ret void 922} 923 924 925define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) { 926; CHECK-LABEL: gather_inc_v16i8_complex: 927; CHECK: @ %bb.0: @ %entry 928; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 929; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 930; CHECK-NEXT: .pad #4 931; CHECK-NEXT: sub sp, #4 932; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 933; CHECK-NEXT: 
vpush {d8, d9, d10, d11, d12, d13, d14, d15} 934; CHECK-NEXT: .pad #312 935; CHECK-NEXT: sub sp, #312 936; CHECK-NEXT: cmp r2, #1 937; CHECK-NEXT: str r1, [sp, #116] @ 4-byte Spill 938; CHECK-NEXT: blt.w .LBB15_5 939; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader 940; CHECK-NEXT: adr r1, .LCPI15_0 941; CHECK-NEXT: adr r6, .LCPI15_8 942; CHECK-NEXT: vldrw.u32 q0, [r1] 943; CHECK-NEXT: adr r1, .LCPI15_1 944; CHECK-NEXT: adr r7, .LCPI15_7 945; CHECK-NEXT: adr r3, .LCPI15_6 946; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill 947; CHECK-NEXT: vldrw.u32 q0, [r1] 948; CHECK-NEXT: adr r1, .LCPI15_5 949; CHECK-NEXT: bic r10, r2, #7 950; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill 951; CHECK-NEXT: vldrw.u32 q0, [r6] 952; CHECK-NEXT: adr r6, .LCPI15_9 953; CHECK-NEXT: vmov.i32 q2, #0x30 954; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill 955; CHECK-NEXT: vldrw.u32 q0, [r7] 956; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill 957; CHECK-NEXT: vldrw.u32 q0, [r6] 958; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill 959; CHECK-NEXT: vldrw.u32 q0, [r1] 960; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill 961; CHECK-NEXT: vldrw.u32 q0, [r3] 962; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 963; CHECK-NEXT: .LBB15_2: @ %vector.ph 964; CHECK-NEXT: @ =>This Loop Header: Depth=1 965; CHECK-NEXT: @ Child Loop BB15_3 Depth 2 966; CHECK-NEXT: adr r1, .LCPI15_3 967; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload 968; CHECK-NEXT: vldrw.u32 q0, [r1] 969; CHECK-NEXT: adr r1, .LCPI15_4 970; CHECK-NEXT: vldrw.u32 q5, [r1] 971; CHECK-NEXT: adr r1, .LCPI15_2 972; CHECK-NEXT: vldrw.u32 q3, [r1] 973; CHECK-NEXT: adr r1, .LCPI15_10 974; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill 975; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload 976; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill 977; CHECK-NEXT: vldrw.u32 q3, [r1] 978; CHECK-NEXT: adr r1, .LCPI15_11 979; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload 980; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 
16-byte Spill 981; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload 982; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill 983; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload 984; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill 985; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload 986; CHECK-NEXT: vldrw.u32 q7, [r1] 987; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload 988; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill 989; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload 990; CHECK-NEXT: mov r11, r10 991; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill 992; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill 993; CHECK-NEXT: .LBB15_3: @ %vector.body 994; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1 995; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 996; CHECK-NEXT: vadd.i32 q4, q1, r0 997; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill 998; CHECK-NEXT: vmov r1, lr, d8 999; CHECK-NEXT: vadd.i32 q7, q7, r0 1000; CHECK-NEXT: vmov r5, r4, d15 1001; CHECK-NEXT: vadd.i32 q6, q0, r0 1002; CHECK-NEXT: vmov r6, r7, d13 1003; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill 1004; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload 1005; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill 1006; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload 1007; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload 1008; CHECK-NEXT: vadd.i32 q1, q1, r0 1009; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill 1010; CHECK-NEXT: vadd.i32 q0, q0, r0 1011; CHECK-NEXT: subs.w r11, r11, #16 1012; CHECK-NEXT: ldrb.w r9, [r1] 1013; CHECK-NEXT: vmov r1, r3, d14 1014; CHECK-NEXT: ldrb r5, [r5] 1015; CHECK-NEXT: ldrb r7, [r7] 1016; CHECK-NEXT: ldrb r1, [r1] 1017; CHECK-NEXT: vmov.8 q7[0], r1 1018; CHECK-NEXT: ldrb r1, [r3] 1019; CHECK-NEXT: vmov.8 q7[1], r1 1020; CHECK-NEXT: vmov r1, r3, d12 1021; CHECK-NEXT: vmov.8 q7[2], r5 1022; CHECK-NEXT: ldrb r5, [r6] 1023; CHECK-NEXT: ldrb r6, [r4] 1024; CHECK-NEXT: vmov.8 q7[3], r6 1025; CHECK-NEXT: ldrb r1, 
[r1] 1026; CHECK-NEXT: ldrb r3, [r3] 1027; CHECK-NEXT: vmov.8 q6[0], r1 1028; CHECK-NEXT: vmov r6, r1, d2 1029; CHECK-NEXT: vmov.8 q6[1], r3 1030; CHECK-NEXT: vmov.8 q6[2], r5 1031; CHECK-NEXT: vmov.8 q6[3], r7 1032; CHECK-NEXT: ldrb.w r7, [lr] 1033; CHECK-NEXT: vmov.8 q6[4], r9 1034; CHECK-NEXT: vmov.8 q6[5], r7 1035; CHECK-NEXT: ldrb r4, [r1] 1036; CHECK-NEXT: vmov r1, r5, d3 1037; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload 1038; CHECK-NEXT: ldrb.w r12, [r1] 1039; CHECK-NEXT: vmov r1, r3, d9 1040; CHECK-NEXT: ldrb r5, [r5] 1041; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload 1042; CHECK-NEXT: ldrb r1, [r1] 1043; CHECK-NEXT: ldrb r3, [r3] 1044; CHECK-NEXT: vmov.8 q6[6], r1 1045; CHECK-NEXT: vmov r1, r7, d0 1046; CHECK-NEXT: vmov.8 q6[7], r3 1047; CHECK-NEXT: ldrb r1, [r1] 1048; CHECK-NEXT: ldrb r7, [r7] 1049; CHECK-NEXT: vmov.8 q7[4], r1 1050; CHECK-NEXT: vmov r1, r3, d1 1051; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload 1052; CHECK-NEXT: vmov.8 q7[5], r7 1053; CHECK-NEXT: vadd.i32 q0, q0, r0 1054; CHECK-NEXT: ldrb r1, [r1] 1055; CHECK-NEXT: ldrb r3, [r3] 1056; CHECK-NEXT: vmov.8 q7[6], r1 1057; CHECK-NEXT: ldrb r1, [r6] 1058; CHECK-NEXT: vmov r7, r6, d0 1059; CHECK-NEXT: vmov.8 q7[7], r3 1060; CHECK-NEXT: vmov r3, lr, d1 1061; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload 1062; CHECK-NEXT: vmov.8 q7[8], r1 1063; CHECK-NEXT: vadd.i32 q0, q0, r0 1064; CHECK-NEXT: vmov.8 q7[9], r4 1065; CHECK-NEXT: vmov r4, r1, d0 1066; CHECK-NEXT: vmov.8 q7[10], r12 1067; CHECK-NEXT: vmov.8 q7[11], r5 1068; CHECK-NEXT: ldrb r7, [r7] 1069; CHECK-NEXT: ldrb r6, [r6] 1070; CHECK-NEXT: ldrb r3, [r3] 1071; CHECK-NEXT: ldrb r4, [r4] 1072; CHECK-NEXT: ldrb r1, [r1] 1073; CHECK-NEXT: vmov.8 q6[8], r4 1074; CHECK-NEXT: vmov r5, r4, d1 1075; CHECK-NEXT: vmov.8 q6[9], r1 1076; CHECK-NEXT: vadd.i32 q0, q5, r0 1077; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload 1078; CHECK-NEXT: ldrb r5, [r5] 1079; CHECK-NEXT: ldrb r4, [r4] 1080; CHECK-NEXT: vmov.8 
q6[10], r5 1081; CHECK-NEXT: vmov.8 q6[11], r4 1082; CHECK-NEXT: vmov.8 q6[12], r7 1083; CHECK-NEXT: vmov.8 q6[13], r6 1084; CHECK-NEXT: vmov.8 q6[14], r3 1085; CHECK-NEXT: vmov r1, r3, d0 1086; CHECK-NEXT: ldrb r1, [r1] 1087; CHECK-NEXT: vmov.8 q7[12], r1 1088; CHECK-NEXT: ldrb r1, [r3] 1089; CHECK-NEXT: vmov.8 q7[13], r1 1090; CHECK-NEXT: vmov r1, r3, d1 1091; CHECK-NEXT: vadd.i32 q0, q1, r0 1092; CHECK-NEXT: vadd.i32 q1, q1, q2 1093; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill 1094; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload 1095; CHECK-NEXT: vadd.i32 q1, q1, q2 1096; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill 1097; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload 1098; CHECK-NEXT: vadd.i32 q1, q1, q2 1099; CHECK-NEXT: ldrb r1, [r1] 1100; CHECK-NEXT: vmov.8 q7[14], r1 1101; CHECK-NEXT: ldrb r1, [r3] 1102; CHECK-NEXT: vmov.8 q7[15], r1 1103; CHECK-NEXT: ldrb.w r1, [lr] 1104; CHECK-NEXT: vmov.8 q6[15], r1 1105; CHECK-NEXT: vmov r1, r3, d0 1106; CHECK-NEXT: vadd.i8 q6, q6, q7 1107; CHECK-NEXT: ldrb r1, [r1] 1108; CHECK-NEXT: ldrb r3, [r3] 1109; CHECK-NEXT: vmov.8 q7[0], r1 1110; CHECK-NEXT: vmov.8 q7[1], r3 1111; CHECK-NEXT: vmov r1, r3, d1 1112; CHECK-NEXT: vadd.i32 q0, q3, r0 1113; CHECK-NEXT: vadd.i32 q3, q3, q2 1114; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill 1115; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload 1116; CHECK-NEXT: vadd.i32 q3, q3, q2 1117; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill 1118; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload 1119; CHECK-NEXT: vadd.i32 q3, q3, q2 1120; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill 1121; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload 1122; CHECK-NEXT: vadd.i32 q3, q3, q2 1123; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill 1124; CHECK-NEXT: ldrb r1, [r1] 1125; CHECK-NEXT: vmov.8 q7[2], r1 1126; CHECK-NEXT: ldrb r1, [r3] 1127; CHECK-NEXT: vmov.8 q7[3], r1 1128; CHECK-NEXT: vmov r1, r3, d0 1129; CHECK-NEXT: ldrb r1, [r1] 1130; 
CHECK-NEXT: vmov.8 q7[4], r1 1131; CHECK-NEXT: ldrb r1, [r3] 1132; CHECK-NEXT: vmov.8 q7[5], r1 1133; CHECK-NEXT: vmov r1, r3, d1 1134; CHECK-NEXT: vadd.i32 q0, q5, r0 1135; CHECK-NEXT: vadd.i32 q5, q5, q2 1136; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill 1137; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload 1138; CHECK-NEXT: vadd.i32 q5, q5, q2 1139; CHECK-NEXT: ldrb r1, [r1] 1140; CHECK-NEXT: vmov.8 q7[6], r1 1141; CHECK-NEXT: ldrb r1, [r3] 1142; CHECK-NEXT: vmov.8 q7[7], r1 1143; CHECK-NEXT: vmov r1, r3, d0 1144; CHECK-NEXT: ldrb r1, [r1] 1145; CHECK-NEXT: vmov.8 q7[8], r1 1146; CHECK-NEXT: ldrb r1, [r3] 1147; CHECK-NEXT: vmov.8 q7[9], r1 1148; CHECK-NEXT: vmov r1, r3, d1 1149; CHECK-NEXT: vadd.i32 q0, q4, r0 1150; CHECK-NEXT: vadd.i32 q4, q4, q2 1151; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill 1152; CHECK-NEXT: ldrb r1, [r1] 1153; CHECK-NEXT: vmov.8 q7[10], r1 1154; CHECK-NEXT: ldrb r1, [r3] 1155; CHECK-NEXT: vmov.8 q7[11], r1 1156; CHECK-NEXT: vmov r1, r3, d0 1157; CHECK-NEXT: ldrb r1, [r1] 1158; CHECK-NEXT: vmov.8 q7[12], r1 1159; CHECK-NEXT: ldrb r1, [r3] 1160; CHECK-NEXT: vmov.8 q7[13], r1 1161; CHECK-NEXT: vmov r1, r3, d1 1162; CHECK-NEXT: ldrb r1, [r1] 1163; CHECK-NEXT: vmov.8 q7[14], r1 1164; CHECK-NEXT: ldrb r1, [r3] 1165; CHECK-NEXT: vmov.8 q7[15], r1 1166; CHECK-NEXT: vadd.i8 q0, q6, q7 1167; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload 1168; CHECK-NEXT: vstrb.8 q0, [r8], #16 1169; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload 1170; CHECK-NEXT: vadd.i32 q7, q7, q2 1171; CHECK-NEXT: vadd.i32 q0, q0, q2 1172; CHECK-NEXT: bne.w .LBB15_3 1173; CHECK-NEXT: @ %bb.4: @ %middle.block 1174; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1 1175; CHECK-NEXT: cmp r10, r2 1176; CHECK-NEXT: bne.w .LBB15_2 1177; CHECK-NEXT: .LBB15_5: @ %for.cond.cleanup 1178; CHECK-NEXT: add sp, #312 1179; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1180; CHECK-NEXT: add sp, #4 1181; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, 
pc} 1182; CHECK-NEXT: .p2align 4 1183; CHECK-NEXT: @ %bb.6: 1184; CHECK-NEXT: .LCPI15_0: 1185; CHECK-NEXT: .long 38 @ 0x26 1186; CHECK-NEXT: .long 41 @ 0x29 1187; CHECK-NEXT: .long 44 @ 0x2c 1188; CHECK-NEXT: .long 47 @ 0x2f 1189; CHECK-NEXT: .LCPI15_1: 1190; CHECK-NEXT: .long 14 @ 0xe 1191; CHECK-NEXT: .long 17 @ 0x11 1192; CHECK-NEXT: .long 20 @ 0x14 1193; CHECK-NEXT: .long 23 @ 0x17 1194; CHECK-NEXT: .LCPI15_2: 1195; CHECK-NEXT: .long 24 @ 0x18 1196; CHECK-NEXT: .long 27 @ 0x1b 1197; CHECK-NEXT: .long 30 @ 0x1e 1198; CHECK-NEXT: .long 33 @ 0x21 1199; CHECK-NEXT: .LCPI15_3: 1200; CHECK-NEXT: .long 1 @ 0x1 1201; CHECK-NEXT: .long 4 @ 0x4 1202; CHECK-NEXT: .long 7 @ 0x7 1203; CHECK-NEXT: .long 10 @ 0xa 1204; CHECK-NEXT: .LCPI15_4: 1205; CHECK-NEXT: .long 36 @ 0x24 1206; CHECK-NEXT: .long 39 @ 0x27 1207; CHECK-NEXT: .long 42 @ 0x2a 1208; CHECK-NEXT: .long 45 @ 0x2d 1209; CHECK-NEXT: .LCPI15_5: 1210; CHECK-NEXT: .long 25 @ 0x19 1211; CHECK-NEXT: .long 28 @ 0x1c 1212; CHECK-NEXT: .long 31 @ 0x1f 1213; CHECK-NEXT: .long 34 @ 0x22 1214; CHECK-NEXT: .LCPI15_6: 1215; CHECK-NEXT: .long 13 @ 0xd 1216; CHECK-NEXT: .long 16 @ 0x10 1217; CHECK-NEXT: .long 19 @ 0x13 1218; CHECK-NEXT: .long 22 @ 0x16 1219; CHECK-NEXT: .LCPI15_7: 1220; CHECK-NEXT: .long 2 @ 0x2 1221; CHECK-NEXT: .long 5 @ 0x5 1222; CHECK-NEXT: .long 8 @ 0x8 1223; CHECK-NEXT: .long 11 @ 0xb 1224; CHECK-NEXT: .LCPI15_8: 1225; CHECK-NEXT: .long 26 @ 0x1a 1226; CHECK-NEXT: .long 29 @ 0x1d 1227; CHECK-NEXT: .long 32 @ 0x20 1228; CHECK-NEXT: .long 35 @ 0x23 1229; CHECK-NEXT: .LCPI15_9: 1230; CHECK-NEXT: .long 37 @ 0x25 1231; CHECK-NEXT: .long 40 @ 0x28 1232; CHECK-NEXT: .long 43 @ 0x2b 1233; CHECK-NEXT: .long 46 @ 0x2e 1234; CHECK-NEXT: .LCPI15_10: 1235; CHECK-NEXT: .long 12 @ 0xc 1236; CHECK-NEXT: .long 15 @ 0xf 1237; CHECK-NEXT: .long 18 @ 0x12 1238; CHECK-NEXT: .long 21 @ 0x15 1239; CHECK-NEXT: .LCPI15_11: 1240; CHECK-NEXT: .long 0 @ 0x0 1241; CHECK-NEXT: .long 3 @ 0x3 1242; CHECK-NEXT: .long 6 @ 0x6 1243; 
CHECK-NEXT: .long 9 @ 0x9 1244 1245 1246entry: 1247 %cmp22 = icmp sgt i32 %n, 0 1248 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup 1249 1250vector.ph: ; preds = %for.body.preheader 1251 %n.vec = and i32 %n, -8 1252 br label %vector.body 1253 1254vector.body: ; preds = %vector.body, %vector.ph 1255 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1256 %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ] 1257 %0 = mul nuw nsw <16 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> 1258 %1 = getelementptr inbounds i8, ptr %data, <16 x i32> %0 1259 %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %1, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) 1260 %2 = add nuw nsw <16 x i32> %0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> 1261 %3 = getelementptr inbounds i8, ptr %data, <16 x i32> %2 1262 %wide.masked.gather24 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %3, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) 1263 %4 = add nuw nsw <16 x i32> %0, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> 1264 %5 = getelementptr inbounds i8, ptr %data, <16 x i32> %4 1265 %wide.masked.gather25 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %5, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, 
i1 true>, <16 x i8> undef) 1266 %6 = add nsw <16 x i8> %wide.masked.gather24, %wide.masked.gather 1267 %7 = add nsw <16 x i8> %6, %wide.masked.gather25 1268 %8 = getelementptr inbounds i8, ptr %dst, i32 %index 1269 store <16 x i8> %7, ptr %8, align 2 1270 %index.next = add i32 %index, 16 1271 %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1272 %9 = icmp eq i32 %index.next, %n.vec 1273 br i1 %9, label %middle.block, label %vector.body 1274 1275middle.block: ; preds = %vector.body 1276 %cmp.n = icmp eq i32 %n.vec, %n 1277 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph 1278 1279for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1280 ret void 1281} 1282 1283define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(ptr noalias nocapture readonly %data, ptr noalias nocapture %dst, i32 %n) { 1284; CHECK-LABEL: gather_inc_v16i8_simple: 1285; CHECK: @ %bb.0: @ %entry 1286; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} 1287; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} 1288; CHECK-NEXT: .pad #4 1289; CHECK-NEXT: sub sp, #4 1290; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 1291; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} 1292; CHECK-NEXT: .pad #64 1293; CHECK-NEXT: sub sp, #64 1294; CHECK-NEXT: cmp r2, #1 1295; CHECK-NEXT: strd r1, r2, [sp, #56] @ 8-byte Folded Spill 1296; CHECK-NEXT: blt.w .LBB16_5 1297; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader 1298; CHECK-NEXT: adr r5, .LCPI16_3 1299; CHECK-NEXT: adr r7, .LCPI16_1 1300; CHECK-NEXT: vldrw.u32 q0, [r5] 1301; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload 1302; CHECK-NEXT: adr r3, .LCPI16_0 1303; CHECK-NEXT: adr r6, .LCPI16_2 1304; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill 1305; CHECK-NEXT: vldrw.u32 q0, [r7] 1306; CHECK-NEXT: bic r9, r1, #7 1307; CHECK-NEXT: vldrw.u32 q3, [r3] 1308; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte 
Spill 1309; CHECK-NEXT: vldrw.u32 q0, [r6] 1310; CHECK-NEXT: mov.w lr, #16 1311; CHECK-NEXT: str.w r9, [sp, #52] @ 4-byte Spill 1312; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill 1313; CHECK-NEXT: .LBB16_2: @ %vector.ph 1314; CHECK-NEXT: @ =>This Loop Header: Depth=1 1315; CHECK-NEXT: @ Child Loop BB16_3 Depth 2 1316; CHECK-NEXT: ldr.w r8, [sp, #56] @ 4-byte Reload 1317; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload 1318; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload 1319; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload 1320; CHECK-NEXT: vmov q4, q3 1321; CHECK-NEXT: .LBB16_3: @ %vector.body 1322; CHECK-NEXT: @ Parent Loop BB16_2 Depth=1 1323; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 1324; CHECK-NEXT: vadd.i32 q1, q5, r0 1325; CHECK-NEXT: vadd.i32 q2, q4, r0 1326; CHECK-NEXT: vmov r7, r3, d3 1327; CHECK-NEXT: vadd.i32 q6, q0, lr 1328; CHECK-NEXT: vmov r5, r6, d5 1329; CHECK-NEXT: subs.w r9, r9, #16 1330; CHECK-NEXT: vmov r4, r10, d2 1331; CHECK-NEXT: vadd.i32 q1, q7, lr 1332; CHECK-NEXT: vadd.i32 q4, q4, lr 1333; CHECK-NEXT: vadd.i32 q5, q5, lr 1334; CHECK-NEXT: ldrb.w r11, [r3] 1335; CHECK-NEXT: ldrb r3, [r7] 1336; CHECK-NEXT: vmov r7, r12, d4 1337; CHECK-NEXT: vadd.i32 q2, q7, r0 1338; CHECK-NEXT: vadd.i32 q7, q0, r0 1339; CHECK-NEXT: ldrb r5, [r5] 1340; CHECK-NEXT: ldrb r6, [r6] 1341; CHECK-NEXT: ldrb r4, [r4] 1342; CHECK-NEXT: ldrb.w r10, [r10] 1343; CHECK-NEXT: ldrb r7, [r7] 1344; CHECK-NEXT: ldrb.w r1, [r12] 1345; CHECK-NEXT: vmov.8 q0[0], r7 1346; CHECK-NEXT: vmov.8 q0[1], r1 1347; CHECK-NEXT: vmov r1, r7, d15 1348; CHECK-NEXT: vmov.8 q0[2], r5 1349; CHECK-NEXT: vmov.8 q0[3], r6 1350; CHECK-NEXT: vmov.8 q0[4], r4 1351; CHECK-NEXT: vmov r4, r2, d4 1352; CHECK-NEXT: vmov.8 q0[5], r10 1353; CHECK-NEXT: vmov.8 q0[6], r3 1354; CHECK-NEXT: vmov.8 q0[7], r11 1355; CHECK-NEXT: ldrb r6, [r7] 1356; CHECK-NEXT: vmov r5, r7, d5 1357; CHECK-NEXT: ldrb r1, [r1] 1358; CHECK-NEXT: ldrb r2, [r2] 1359; CHECK-NEXT: ldrb r3, [r5] 1360; CHECK-NEXT: ldrb.w 
r12, [r7] 1361; CHECK-NEXT: ldrb r5, [r4] 1362; CHECK-NEXT: vmov r4, r7, d14 1363; CHECK-NEXT: vmov q7, q1 1364; CHECK-NEXT: ldrb r4, [r4] 1365; CHECK-NEXT: ldrb r7, [r7] 1366; CHECK-NEXT: vmov.8 q0[8], r4 1367; CHECK-NEXT: vmov.8 q0[9], r7 1368; CHECK-NEXT: vmov.8 q0[10], r1 1369; CHECK-NEXT: vmov.8 q0[11], r6 1370; CHECK-NEXT: vmov.8 q0[12], r5 1371; CHECK-NEXT: vmov.8 q0[13], r2 1372; CHECK-NEXT: vmov.8 q0[14], r3 1373; CHECK-NEXT: vmov.8 q0[15], r12 1374; CHECK-NEXT: vstrb.8 q0, [r8], #16 1375; CHECK-NEXT: vmov q0, q6 1376; CHECK-NEXT: bne .LBB16_3 1377; CHECK-NEXT: @ %bb.4: @ %middle.block 1378; CHECK-NEXT: @ in Loop: Header=BB16_2 Depth=1 1379; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload 1380; CHECK-NEXT: ldr.w r9, [sp, #52] @ 4-byte Reload 1381; CHECK-NEXT: cmp r9, r1 1382; CHECK-NEXT: bne .LBB16_2 1383; CHECK-NEXT: .LBB16_5: @ %for.cond.cleanup 1384; CHECK-NEXT: add sp, #64 1385; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} 1386; CHECK-NEXT: add sp, #4 1387; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} 1388; CHECK-NEXT: .p2align 4 1389; CHECK-NEXT: @ %bb.6: 1390; CHECK-NEXT: .LCPI16_0: 1391; CHECK-NEXT: .long 0 @ 0x0 1392; CHECK-NEXT: .long 1 @ 0x1 1393; CHECK-NEXT: .long 2 @ 0x2 1394; CHECK-NEXT: .long 3 @ 0x3 1395; CHECK-NEXT: .LCPI16_1: 1396; CHECK-NEXT: .long 8 @ 0x8 1397; CHECK-NEXT: .long 9 @ 0x9 1398; CHECK-NEXT: .long 10 @ 0xa 1399; CHECK-NEXT: .long 11 @ 0xb 1400; CHECK-NEXT: .LCPI16_2: 1401; CHECK-NEXT: .long 4 @ 0x4 1402; CHECK-NEXT: .long 5 @ 0x5 1403; CHECK-NEXT: .long 6 @ 0x6 1404; CHECK-NEXT: .long 7 @ 0x7 1405; CHECK-NEXT: .LCPI16_3: 1406; CHECK-NEXT: .long 12 @ 0xc 1407; CHECK-NEXT: .long 13 @ 0xd 1408; CHECK-NEXT: .long 14 @ 0xe 1409; CHECK-NEXT: .long 15 @ 0xf 1410 1411 1412entry: 1413 %cmp22 = icmp sgt i32 %n, 0 1414 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup 1415 1416vector.ph: ; preds = %for.body.preheader 1417 %n.vec = and i32 %n, -8 1418 br label %vector.body 1419 1420vector.body: ; preds = 
%vector.body, %vector.ph 1421 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1422 %vec.ind = phi <16 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>, %vector.ph ], [ %vec.ind.next, %vector.body ] 1423 %0 = getelementptr inbounds i8, ptr %data, <16 x i32> %vec.ind 1424 %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr> %0, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) 1425 %1 = getelementptr inbounds i8, ptr %dst, i32 %index 1426 store <16 x i8> %wide.masked.gather, ptr %1, align 2 1427 %index.next = add i32 %index, 16 1428 %vec.ind.next = add <16 x i32> %vec.ind, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> 1429 %2 = icmp eq i32 %index.next, %n.vec 1430 br i1 %2, label %middle.block, label %vector.body 1431 1432middle.block: ; preds = %vector.body 1433 %cmp.n = icmp eq i32 %n.vec, %n 1434 br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph 1435 1436for.cond.cleanup: ; preds = %for.body, %middle.block, %entry 1437 ret void 1438} 1439 1440define void @shl(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) { 1441; CHECK-LABEL: shl: 1442; CHECK: @ %bb.0: @ %entry 1443; CHECK-NEXT: .save {r7, lr} 1444; CHECK-NEXT: push {r7, lr} 1445; CHECK-NEXT: cmp r2, #1 1446; CHECK-NEXT: it lt 1447; CHECK-NEXT: poplt {r7, pc} 1448; CHECK-NEXT: .LBB17_1: @ %vector.ph 1449; CHECK-NEXT: adr r3, .LCPI17_0 1450; CHECK-NEXT: vldrw.u32 q0, [r3] 1451; CHECK-NEXT: vadd.i32 q0, q0, r1 1452; CHECK-NEXT: dlstp.32 lr, r2 1453; CHECK-NEXT: .LBB17_2: @ %vector.body 1454; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1455; CHECK-NEXT: vldrw.u32 q1, [q0, #64]! 
1456; CHECK-NEXT: vstrw.32 q1, [r0], #16 1457; CHECK-NEXT: letp lr, .LBB17_2 1458; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup 1459; CHECK-NEXT: pop {r7, pc} 1460; CHECK-NEXT: .p2align 4 1461; CHECK-NEXT: @ %bb.4: 1462; CHECK-NEXT: .LCPI17_0: 1463; CHECK-NEXT: .long 4294967232 @ 0xffffffc0 1464; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 1465; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 1466; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 1467entry: 1468 %cmp6 = icmp sgt i32 %n, 0 1469 br i1 %cmp6, label %vector.ph, label %for.cond.cleanup 1470 1471vector.ph: ; preds = %entry 1472 %n.rnd.up = add i32 %n, 3 1473 %n.vec = and i32 %n.rnd.up, -4 1474 br label %vector.body 1475 1476vector.body: ; preds = %vector.body, %vector.ph 1477 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1478 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ] 1479 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) 1480 %0 = shl nsw <4 x i32> %vec.ind, <i32 2, i32 2, i32 2, i32 2> 1481 %1 = getelementptr inbounds i32, ptr %y, <4 x i32> %0 1482 %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 1483 %2 = getelementptr inbounds i32, ptr %x, i32 %index 1484 call void @llvm.masked.store.v4i32.p0(<4 x i32> %wide.masked.gather, ptr %2, i32 4, <4 x i1> %active.lane.mask) 1485 %index.next = add i32 %index, 4 1486 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> 1487 %3 = icmp eq i32 %index.next, %n.vec 1488 br i1 %3, label %for.cond.cleanup, label %vector.body 1489 1490for.cond.cleanup: ; preds = %vector.body, %entry 1491 ret void 1492} 1493 1494define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) { 1495; CHECK-LABEL: shlor: 1496; CHECK: @ %bb.0: @ %entry 1497; CHECK-NEXT: .save {r4, r5, r6, lr} 1498; CHECK-NEXT: push {r4, r5, r6, lr} 1499; CHECK-NEXT: .vsave {d8, d9, d10, d11, 
d12, d13} 1500; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} 1501; CHECK-NEXT: cmp r2, #1 1502; CHECK-NEXT: blt .LBB18_3 1503; CHECK-NEXT: @ %bb.1: @ %vector.ph 1504; CHECK-NEXT: adr.w lr, .LCPI18_0 1505; CHECK-NEXT: adr r4, .LCPI18_1 1506; CHECK-NEXT: adr r5, .LCPI18_2 1507; CHECK-NEXT: adr r6, .LCPI18_3 1508; CHECK-NEXT: vldrw.u32 q0, [r6] 1509; CHECK-NEXT: vldrw.u32 q1, [r5] 1510; CHECK-NEXT: vldrw.u32 q2, [r4] 1511; CHECK-NEXT: vldrw.u32 q3, [lr] 1512; CHECK-NEXT: vadd.i32 q0, q0, r1 1513; CHECK-NEXT: vadd.i32 q1, q1, r1 1514; CHECK-NEXT: vadd.i32 q2, q2, r1 1515; CHECK-NEXT: vadd.i32 q3, q3, r1 1516; CHECK-NEXT: dlstp.32 lr, r2 1517; CHECK-NEXT: .LBB18_2: @ %vector.body 1518; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 1519; CHECK-NEXT: vldrw.u32 q4, [q3, #128]! 1520; CHECK-NEXT: vldrw.u32 q5, [q2, #128]! 1521; CHECK-NEXT: vldrw.u32 q6, [q0, #128]! 1522; CHECK-NEXT: vadd.i32 q4, q5, q4 1523; CHECK-NEXT: vldrw.u32 q5, [q1, #128]! 1524; CHECK-NEXT: vadd.i32 q4, q4, q5 1525; CHECK-NEXT: vadd.i32 q4, q4, q6 1526; CHECK-NEXT: vstrw.32 q4, [r0], #16 1527; CHECK-NEXT: letp lr, .LBB18_2 1528; CHECK-NEXT: .LBB18_3: @ %for.cond.cleanup 1529; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} 1530; CHECK-NEXT: pop {r4, r5, r6, pc} 1531; CHECK-NEXT: .p2align 4 1532; CHECK-NEXT: @ %bb.4: 1533; CHECK-NEXT: .LCPI18_0: 1534; CHECK-NEXT: .long 4294967168 @ 0xffffff80 1535; CHECK-NEXT: .long 4294967200 @ 0xffffffa0 1536; CHECK-NEXT: .long 4294967232 @ 0xffffffc0 1537; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 1538; CHECK-NEXT: .LCPI18_1: 1539; CHECK-NEXT: .long 4294967176 @ 0xffffff88 1540; CHECK-NEXT: .long 4294967208 @ 0xffffffa8 1541; CHECK-NEXT: .long 4294967240 @ 0xffffffc8 1542; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 1543; CHECK-NEXT: .LCPI18_2: 1544; CHECK-NEXT: .long 4294967184 @ 0xffffff90 1545; CHECK-NEXT: .long 4294967216 @ 0xffffffb0 1546; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 1547; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 1548; CHECK-NEXT: .LCPI18_3: 1549; 
CHECK-NEXT: .long 4294967192 @ 0xffffff98 1550; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 1551; CHECK-NEXT: .long 4294967256 @ 0xffffffd8 1552; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 1553entry: 1554 %cmp23 = icmp sgt i32 %n, 0 1555 br i1 %cmp23, label %vector.ph, label %for.cond.cleanup 1556 1557vector.ph: ; preds = %entry 1558 %n.rnd.up = add i32 %n, 3 1559 %n.vec = and i32 %n.rnd.up, -4 1560 br label %vector.body 1561 1562vector.body: ; preds = %vector.body, %vector.ph 1563 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] 1564 %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ] 1565 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n) 1566 %0 = shl nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3> 1567 %1 = getelementptr inbounds i32, ptr %y, <4 x i32> %0 1568 %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 1569 %2 = or <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2> 1570 %3 = getelementptr inbounds i32, ptr %y, <4 x i32> %2 1571 %wide.masked.gather25 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 1572 %4 = add nsw <4 x i32> %wide.masked.gather25, %wide.masked.gather 1573 %5 = or <4 x i32> %0, <i32 4, i32 4, i32 4, i32 4> 1574 %6 = getelementptr inbounds i32, ptr %y, <4 x i32> %5 1575 %wide.masked.gather26 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %6, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 1576 %7 = add nsw <4 x i32> %4, %wide.masked.gather26 1577 %8 = or <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6> 1578 %9 = getelementptr inbounds i32, ptr %y, <4 x i32> %8 1579 %wide.masked.gather27 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %9, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) 1580 %10 = add nsw <4 x i32> %7, %wide.masked.gather27 1581 %11 = getelementptr inbounds 
i32, ptr %x, i32 %index 1582 call void @llvm.masked.store.v4i32.p0(<4 x i32> %10, ptr %11, i32 4, <4 x i1> %active.lane.mask) 1583 %index.next = add i32 %index, 4 1584 %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4> 1585 %12 = icmp eq i32 %index.next, %n.vec 1586 br i1 %12, label %for.cond.cleanup, label %vector.body 1587 1588for.cond.cleanup: ; preds = %vector.body, %entry 1589 ret void 1590} 1591 1592 1593declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>) 1594declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>) 1595declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i32>) 1596declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>) 1597declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>) 1598declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>) 1599declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>) 1600declare <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i16>) 1601declare <4 x i16> @llvm.masked.gather.v4i16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i16>) 1602declare <8 x i16> @llvm.masked.gather.v8i16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i16>) 1603declare <16 x i16> @llvm.masked.gather.v16i16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i16>) 1604declare <4 x half> @llvm.masked.gather.v4f16.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x half>) 1605declare <8 x half> @llvm.masked.gather.v8f16.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x half>) 1606declare <16 x half> @llvm.masked.gather.v16f16.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x half>) 1607declare <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i8>) 1608declare <8 x i8> @llvm.masked.gather.v8i8.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i8>) 1609declare <16 x i8> @llvm.masked.gather.v16i8.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i8>) 
; Masked gather producing <32 x i8>. Operands, in order: vector of pointers,
; i32 alignment, <32 x i1> lane mask, pass-through vector.
declare <32 x i8> @llvm.masked.gather.v32i8.v32p0(<32 x ptr>, i32, <32 x i1>, <32 x i8>)

; Masked store of a <4 x i32> value. Operands, in order: value to store,
; destination pointer, i32 alignment, <4 x i1> lane mask.
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32, <4 x i1>)

; Produces the <4 x i1> predicate from two i32 operands; the masked vector
; loop preceding these declarations calls it as (i32 %index, i32 %n).
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)