; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s --verify-machineinstrs -o - | FileCheck %s

; Codegen tests for MVE vector shifts whose shift amount is a scalar that is
; splatted outside the loop. Each function broadcasts %shift with an
; insertelement/shufflevector pair in %vector.ph and uses the splat in a shl,
; lshr or ashr inside %vector.body. The expected assembly shifts by the scalar
; register directly (vshl.<type> q0, r2) with no vector duplicate of the shift
; amount inside the loop; for the right shifts the amount is negated once
; (rsbs r2, r2, #0) ahead of the loop.
;
; Every loop advances %index by 4 elements regardless of the element width, so
; the i16 and i8 variants step by 8 and 4 bytes per iteration (see the
; post-increment immediates) while still loading and storing a full 128-bit
; vector.
;
; The splats use poison rather than undef as the placeholder operand, since
; undef is deprecated in LLVM IR.

; shl of <4 x i32> by a splatted i32: expects vshl.u32 by r2.
define dso_local arm_aapcs_vfpcc void @sink_shl_i32(ptr nocapture readonly %in, ptr noalias nocapture %out, i32 %shift, i32 %N) {
; CHECK-LABEL: sink_shl_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vshl.u32 q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %exit
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.ph

vector.ph:
  %n.vec = and i32 %N, -4
  ; Loop-invariant splat of the scalar shift amount.
  %broadcast.splatinsert10 = insertelement <4 x i32> poison, i32 %shift, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %gep.in = getelementptr inbounds i32, ptr %in, i32 %index
  %wide.load = load <4 x i32>, ptr %gep.in, align 4
  %res = shl <4 x i32> %wide.load, %broadcast.splat11
  %gep.out = getelementptr inbounds i32, ptr %out, i32 %index
  store <4 x i32> %res, ptr %gep.out, align 4
  %index.next = add i32 %index, 4
  %cmp = icmp eq i32 %index.next, %n.vec
  br i1 %cmp, label %exit, label %vector.body

exit:
  ret void
}

; shl of <8 x i16> by a splatted i16: expects vshl.u16 by r2.
define dso_local arm_aapcs_vfpcc void @sink_shl_i16(ptr nocapture readonly %in, ptr noalias nocapture %out, i16 %shift, i32 %N) {
; CHECK-LABEL: sink_shl_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
; CHECK-NEXT:    vshl.u16 q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #8
; CHECK-NEXT:    le lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %exit
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.ph

vector.ph:
  %n.vec = and i32 %N, -4
  ; Loop-invariant splat of the scalar shift amount.
  %broadcast.splatinsert10 = insertelement <8 x i16> poison, i16 %shift, i32 0
  %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %gep.in = getelementptr inbounds i16, ptr %in, i32 %index
  %wide.load = load <8 x i16>, ptr %gep.in, align 4
  %res = shl <8 x i16> %wide.load, %broadcast.splat11
  %gep.out = getelementptr inbounds i16, ptr %out, i32 %index
  store <8 x i16> %res, ptr %gep.out, align 4
  %index.next = add i32 %index, 4
  %cmp = icmp eq i32 %index.next, %n.vec
  br i1 %cmp, label %exit, label %vector.body

exit:
  ret void
}

; shl of <16 x i8> by a splatted i8: expects vshl.u8 by r2.
define dso_local arm_aapcs_vfpcc void @sink_shl_i8(ptr nocapture readonly %in, ptr noalias nocapture %out, i8 %shift, i32 %N) {
; CHECK-LABEL: sink_shl_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #4
; CHECK-NEXT:    vshl.u8 q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #4
; CHECK-NEXT:    le lr, .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %exit
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.ph

vector.ph:
  %n.vec = and i32 %N, -4
  ; Loop-invariant splat of the scalar shift amount.
  %broadcast.splatinsert10 = insertelement <16 x i8> poison, i8 %shift, i32 0
  %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> poison, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %gep.in = getelementptr inbounds i8, ptr %in, i32 %index
  %wide.load = load <16 x i8>, ptr %gep.in, align 4
  %res = shl <16 x i8> %wide.load, %broadcast.splat11
  %gep.out = getelementptr inbounds i8, ptr %out, i32 %index
  store <16 x i8> %res, ptr %gep.out, align 4
  %index.next = add i32 %index, 4
  %cmp = icmp eq i32 %index.next, %n.vec
  br i1 %cmp, label %exit, label %vector.body

exit:
  ret void
}

; lshr of <4 x i32> by a splatted i32: expects the amount negated ahead of the
; loop and vshl.u32 by r2 inside it.
define dso_local arm_aapcs_vfpcc void @sink_lshr_i32(ptr nocapture readonly %in, ptr noalias nocapture %out, i32 %shift, i32 %N) {
; CHECK-LABEL: sink_lshr_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:  .LBB3_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vshl.u32 q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB3_1
; CHECK-NEXT:  @ %bb.2: @ %exit
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.ph

vector.ph:
  %n.vec = and i32 %N, -4
  ; Loop-invariant splat of the scalar shift amount.
  %broadcast.splatinsert10 = insertelement <4 x i32> poison, i32 %shift, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %gep.in = getelementptr inbounds i32, ptr %in, i32 %index
  %wide.load = load <4 x i32>, ptr %gep.in, align 4
  %res = lshr <4 x i32> %wide.load, %broadcast.splat11
  %gep.out = getelementptr inbounds i32, ptr %out, i32 %index
  store <4 x i32> %res, ptr %gep.out, align 4
  %index.next = add i32 %index, 4
  %cmp = icmp eq i32 %index.next, %n.vec
  br i1 %cmp, label %exit, label %vector.body

exit:
  ret void
}

; lshr of <8 x i16> by a splatted i16: expects the amount negated ahead of the
; loop and vshl.u16 by r2 inside it.
define dso_local arm_aapcs_vfpcc void @sink_lshr_i16(ptr nocapture readonly %in, ptr noalias nocapture %out, i16 %shift, i32 %N) {
; CHECK-LABEL: sink_lshr_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:  .LBB4_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
; CHECK-NEXT:    vshl.u16 q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #8
; CHECK-NEXT:    le lr, .LBB4_1
; CHECK-NEXT:  @ %bb.2: @ %exit
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.ph

vector.ph:
  %n.vec = and i32 %N, -4
  ; Loop-invariant splat of the scalar shift amount.
  %broadcast.splatinsert10 = insertelement <8 x i16> poison, i16 %shift, i32 0
  %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %gep.in = getelementptr inbounds i16, ptr %in, i32 %index
  %wide.load = load <8 x i16>, ptr %gep.in, align 4
  %res = lshr <8 x i16> %wide.load, %broadcast.splat11
  %gep.out = getelementptr inbounds i16, ptr %out, i32 %index
  store <8 x i16> %res, ptr %gep.out, align 4
  %index.next = add i32 %index, 4
  %cmp = icmp eq i32 %index.next, %n.vec
  br i1 %cmp, label %exit, label %vector.body

exit:
  ret void
}

; lshr of <16 x i8> by a splatted i8: expects the amount negated ahead of the
; loop and vshl.u8 by r2 inside it.
define dso_local arm_aapcs_vfpcc void @sink_lshr_i8(ptr nocapture readonly %in, ptr noalias nocapture %out, i8 %shift, i32 %N) {
; CHECK-LABEL: sink_lshr_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:  .LBB5_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #4
; CHECK-NEXT:    vshl.u8 q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #4
; CHECK-NEXT:    le lr, .LBB5_1
; CHECK-NEXT:  @ %bb.2: @ %exit
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.ph

vector.ph:
  %n.vec = and i32 %N, -4
  ; Loop-invariant splat of the scalar shift amount.
  %broadcast.splatinsert10 = insertelement <16 x i8> poison, i8 %shift, i32 0
  %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> poison, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %gep.in = getelementptr inbounds i8, ptr %in, i32 %index
  %wide.load = load <16 x i8>, ptr %gep.in, align 4
  %res = lshr <16 x i8> %wide.load, %broadcast.splat11
  %gep.out = getelementptr inbounds i8, ptr %out, i32 %index
  store <16 x i8> %res, ptr %gep.out, align 4
  %index.next = add i32 %index, 4
  %cmp = icmp eq i32 %index.next, %n.vec
  br i1 %cmp, label %exit, label %vector.body

exit:
  ret void
}

; ashr of <4 x i32> by a splatted i32: expects the amount negated ahead of the
; loop and the signed form vshl.s32 by r2 inside it.
define dso_local arm_aapcs_vfpcc void @sink_ashr_i32(ptr nocapture readonly %in, ptr noalias nocapture %out, i32 %shift, i32 %N) {
; CHECK-LABEL: sink_ashr_i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:  .LBB6_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vshl.s32 q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    le lr, .LBB6_1
; CHECK-NEXT:  @ %bb.2: @ %exit
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.ph

vector.ph:
  %n.vec = and i32 %N, -4
  ; Loop-invariant splat of the scalar shift amount.
  %broadcast.splatinsert10 = insertelement <4 x i32> poison, i32 %shift, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %gep.in = getelementptr inbounds i32, ptr %in, i32 %index
  %wide.load = load <4 x i32>, ptr %gep.in, align 4
  %res = ashr <4 x i32> %wide.load, %broadcast.splat11
  %gep.out = getelementptr inbounds i32, ptr %out, i32 %index
  store <4 x i32> %res, ptr %gep.out, align 4
  %index.next = add i32 %index, 4
  %cmp = icmp eq i32 %index.next, %n.vec
  br i1 %cmp, label %exit, label %vector.body

exit:
  ret void
}

; ashr of <8 x i16> by a splatted i16: expects the amount negated ahead of the
; loop and the signed form vshl.s16 by r2 inside it.
define dso_local arm_aapcs_vfpcc void @sink_ashr_i16(ptr nocapture readonly %in, ptr noalias nocapture %out, i16 %shift, i32 %N) {
; CHECK-LABEL: sink_ashr_i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:  .LBB7_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #8
; CHECK-NEXT:    vshl.s16 q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #8
; CHECK-NEXT:    le lr, .LBB7_1
; CHECK-NEXT:  @ %bb.2: @ %exit
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.ph

vector.ph:
  %n.vec = and i32 %N, -4
  ; Loop-invariant splat of the scalar shift amount.
  %broadcast.splatinsert10 = insertelement <8 x i16> poison, i16 %shift, i32 0
  %broadcast.splat11 = shufflevector <8 x i16> %broadcast.splatinsert10, <8 x i16> poison, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %gep.in = getelementptr inbounds i16, ptr %in, i32 %index
  %wide.load = load <8 x i16>, ptr %gep.in, align 4
  %res = ashr <8 x i16> %wide.load, %broadcast.splat11
  %gep.out = getelementptr inbounds i16, ptr %out, i32 %index
  store <8 x i16> %res, ptr %gep.out, align 4
  %index.next = add i32 %index, 4
  %cmp = icmp eq i32 %index.next, %n.vec
  br i1 %cmp, label %exit, label %vector.body

exit:
  ret void
}

; ashr of <16 x i8> by a splatted i8: expects the amount negated ahead of the
; loop and the signed form vshl.s8 by r2 inside it.
define dso_local arm_aapcs_vfpcc void @sink_ashr_i8(ptr nocapture readonly %in, ptr noalias nocapture %out, i8 %shift, i32 %N) {
; CHECK-LABEL: sink_ashr_i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    bic r3, r3, #3
; CHECK-NEXT:    rsbs r2, r2, #0
; CHECK-NEXT:    sub.w r12, r3, #4
; CHECK-NEXT:    movs r3, #1
; CHECK-NEXT:    add.w lr, r3, r12, lsr #2
; CHECK-NEXT:  .LBB8_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #4
; CHECK-NEXT:    vshl.s8 q0, r2
; CHECK-NEXT:    vstrb.8 q0, [r1], #4
; CHECK-NEXT:    le lr, .LBB8_1
; CHECK-NEXT:  @ %bb.2: @ %exit
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.ph

vector.ph:
  %n.vec = and i32 %N, -4
  ; Loop-invariant splat of the scalar shift amount.
  %broadcast.splatinsert10 = insertelement <16 x i8> poison, i8 %shift, i32 0
  %broadcast.splat11 = shufflevector <16 x i8> %broadcast.splatinsert10, <16 x i8> poison, <16 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %gep.in = getelementptr inbounds i8, ptr %in, i32 %index
  %wide.load = load <16 x i8>, ptr %gep.in, align 4
  %res = ashr <16 x i8> %wide.load, %broadcast.splat11
  %gep.out = getelementptr inbounds i8, ptr %out, i32 %index
  store <16 x i8> %res, ptr %gep.out, align 4
  %index.next = add i32 %index, 4
  %cmp = icmp eq i32 %index.next, %n.vec
  br i1 %cmp, label %exit, label %vector.body

exit:
  ret void
}