; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve %s -o - | FileCheck %s

; Each function below is the same vectorised i32 sum reduction over %x:
;   * the induction variable starts at 0, advances by 4 lanes per iteration
;     and the loop exits when it reaches 504, i.e. after 126 iterations;
;   * the masked load is predicated on get.active.lane.mask(%index, N),
;     where N is the constant element count encoded in the function name
;     (test_<N>_504);
;   * inactive lanes are forced to zero by the select before the horizontal
;     add, so they never contribute to the result.
; The only thing that varies between the functions is N. The assertions
; record, for each N, whether the loop is emitted as a tail-predicated
; low-overhead loop (dlstp/letp) or as a plain low-overhead loop (le) with
; an explicitly predicated body.

; N = 500: the expected output keeps an explicit lane-index compare
; (vqadd + vptt) inside a plain `le` loop with a trip count of 126; no
; dlstp/letp is formed.
; NOTE(review): presumably this is because 500 elements only need 125 vector
; iterations while the loop runs 126, so the element count is not consistent
; with the trip count - confirm against the MVE tail-predication pass.
define dso_local i32 @test_500_504(ptr nocapture readonly %x) {
; CHECK-LABEL: test_500_504:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w lr, #126
; CHECK-NEXT:    adr r2, .LCPI0_0
; CHECK-NEXT:    vldrw.u32 q0, [r2]
; CHECK-NEXT:    mov.w r2, #500
; CHECK-NEXT:    vdup.32 q1, r2
; CHECK-NEXT:    movs r1, #0
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:  .LBB0_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vqadd.u32 q2, q0, r1
; CHECK-NEXT:    adds r1, #4
; CHECK-NEXT:    vptt.u32 hi, q1, q2
; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
; CHECK-NEXT:    vaddvat.u32 r2, q2
; CHECK-NEXT:    le lr, .LBB0_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
; CHECK-NEXT:    .p2align 4
; CHECK-NEXT:  @ %bb.3:
; CHECK-NEXT:  .LCPI0_0:
; CHECK-NEXT:    .long 0 @ 0x0
; CHECK-NEXT:    .long 1 @ 0x1
; CHECK-NEXT:    .long 2 @ 0x2
; CHECK-NEXT:    .long 3 @ 0x3
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %entry ], [ %acc, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 500)
  %addr = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Zero the inactive lanes so the reduction only sees loaded elements.
  %live.lanes = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
  %lane.sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %live.lanes)
  %acc = add i32 %lane.sum, %vec.phi
  %index.next = add i32 %index, 4
  %done = icmp eq i32 %index.next, 504
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret i32 %acc
}

; N = 501: the expected output is a tail-predicated loop; dlstp.32 is given
; the element count 501 and the body is an unpredicated vldrw/vaddva pair.
define dso_local i32 @test_501_504(ptr nocapture readonly %x) {
; CHECK-LABEL: test_501_504:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    movw r1, #501
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB1_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    letp lr, .LBB1_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %entry ], [ %acc, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 501)
  %addr = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Zero the inactive lanes so the reduction only sees loaded elements.
  %live.lanes = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
  %lane.sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %live.lanes)
  %acc = add i32 %lane.sum, %vec.phi
  %index.next = add i32 %index, 4
  %done = icmp eq i32 %index.next, 504
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret i32 %acc
}

; N = 502: the expected output is a tail-predicated loop; dlstp.32 is given
; the element count 502.
define dso_local i32 @test_502_504(ptr nocapture readonly %x) {
; CHECK-LABEL: test_502_504:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w r1, #502
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB2_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    letp lr, .LBB2_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %entry ], [ %acc, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 502)
  %addr = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Zero the inactive lanes so the reduction only sees loaded elements.
  %live.lanes = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
  %lane.sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %live.lanes)
  %acc = add i32 %lane.sum, %vec.phi
  %index.next = add i32 %index, 4
  %done = icmp eq i32 %index.next, 504
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret i32 %acc
}

; N = 503: the expected output is a tail-predicated loop; dlstp.32 is given
; the element count 503.
define dso_local i32 @test_503_504(ptr nocapture readonly %x) {
; CHECK-LABEL: test_503_504:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    movw r1, #503
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB3_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    letp lr, .LBB3_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %entry ], [ %acc, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 503)
  %addr = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Zero the inactive lanes so the reduction only sees loaded elements.
  %live.lanes = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
  %lane.sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %live.lanes)
  %acc = add i32 %lane.sum, %vec.phi
  %index.next = add i32 %index, 4
  %done = icmp eq i32 %index.next, 504
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret i32 %acc
}

; N = 504: the element count equals the loop bound; the expected output is a
; tail-predicated loop with dlstp.32 given the element count 504.
define dso_local i32 @test_504_504(ptr nocapture readonly %x) {
; CHECK-LABEL: test_504_504:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    mov.w r1, #504
; CHECK-NEXT:    movs r2, #0
; CHECK-NEXT:    dlstp.32 lr, r1
; CHECK-NEXT:  .LBB4_1: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vaddva.u32 r2, q0
; CHECK-NEXT:    letp lr, .LBB4_1
; CHECK-NEXT:  @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %vec.phi = phi i32 [ 0, %entry ], [ %acc, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 504)
  %addr = getelementptr inbounds i32, ptr %x, i32 %index
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %addr, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  ; Zero the inactive lanes so the reduction only sees loaded elements.
  %live.lanes = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
  %lane.sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %live.lanes)
  %acc = add i32 %lane.sum, %vec.phi
  %index.next = add i32 %index, 4
  %done = icmp eq i32 %index.next, 504
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret i32 %acc
}

declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)