; Test loop tuning.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 -disable-block-placement | FileCheck %s
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -disable-block-placement \
; RUN:   | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-Z13
;
; The z10 invocation uses only the default FileCheck prefix; the z13
; invocation enables a second, z13-specific prefix on top of it.

; Test that strength reduction is applied to addresses with a scale factor,
; but that indexed addressing can still be used.
;
; The loop stores %a into i32 elements 0..99 of %dest.  The expected store
; carries displacement 400 (100 elements * 4 bytes) together with a
; base+index register pair, and no sllg may appear between the function label
; and that store.
define void @f1(ptr %dest, i32 %a) {
; CHECK-LABEL: f1:
; CHECK-NOT: sllg
; CHECK: st %r3, 400({{%r[1-5],%r[1-5]}})
; CHECK: br %r14
entry:
  br label %loop

loop:
  ; %index counts up from 0 in steps of 1.
  %index = phi i64 [ 0, %entry ], [ %next, %loop ]
  ; Element address: the i64 index is scaled by the i32 element size.
  %ptr = getelementptr i32, ptr %dest, i64 %index
  store i32 %a, ptr %ptr
  %next = add i64 %index, 1
  ; Leave the loop once the incremented index reaches 100.
  %cmp = icmp ne i64 %next, 100
  br i1 %cmp, label %loop, label %exit

exit:
  ret void
}

; Test a loop that should be converted into dbr form and then use BRCT.
;
; The i32 counter runs from 0 and is used only to compute the exit condition
; (%next != 100).  The expected code loads 100 into a register ahead of the
; loop and closes the loop with brct on that same register.
define void @f2(ptr %src, ptr %dest) {
; CHECK-LABEL: f2:
; CHECK: lhi [[REG:%r[0-5]]], 100
; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
; CHECK: brct [[REG]], [[LABEL]]
; CHECK: br %r14
entry:
  br label %loop

loop:
  %count = phi i32 [ 0, %entry ], [ %next, %loop.next ]
  %next = add i32 %count, 1
  ; Volatile load: performed on every iteration.
  %val = load volatile i32, ptr %src
  ; A zero value skips the store block.
  %cmp = icmp eq i32 %val, 0
  br i1 %cmp, label %loop.next, label %loop.store

loop.store:
  %add = add i32 %val, 1
  store volatile i32 %add, ptr %dest
  br label %loop.next

loop.next:
  ; Loop back until the incremented counter equals 100.
  %cont = icmp ne i32 %next, 100
  br i1 %cont, label %loop, label %exit

exit:
  ret void
}

; Like f2, but for BRCTG.
; f3 has the same control flow as f2, but the counter, the loaded value and
; the stored value are all i64.  The expected code loads 100 with lghi and
; closes the loop with brctg.
define void @f3(ptr %src, ptr %dest) {
; CHECK-LABEL: f3:
; CHECK: lghi [[REG:%r[0-5]]], 100
; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
; CHECK: brctg [[REG]], [[LABEL]]
; CHECK: br %r14
entry:
  br label %loop

loop:
  ; The counter is used only to compute the exit condition.
  %count = phi i64 [ 0, %entry ], [ %next, %loop.next ]
  %next = add i64 %count, 1
  ; Volatile load: performed on every iteration.
  %val = load volatile i64, ptr %src
  ; A zero value skips the store block.
  %cmp = icmp eq i64 %val, 0
  br i1 %cmp, label %loop.next, label %loop.store

loop.store:
  %add = add i64 %val, 1
  store volatile i64 %add, ptr %dest
  br label %loop.next

loop.next:
  ; Loop back until the incremented counter equals 100.
  %cont = icmp ne i64 %next, 100
  br i1 %cont, label %loop, label %exit

exit:
  ret void
}

; Test a loop with a 64-bit decremented counter in which the 32-bit
; low part of the counter is used after the decrement.  This is an example
; of a subregister use being the only thing that blocks a conversion to BRCTG.
;
; The expected sequence is an explicit aghi -1 on the counter, an lr copying
; it into a second register, an stg of that second register, and a jne closing
; the loop.
define void @f4(ptr %src, ptr %dest, ptr %dest2, i64 %count) {
; CHECK-LABEL: f4:
; CHECK: aghi [[REG:%r[0-5]]], -1
; CHECK: lr [[REG2:%r[0-5]]], [[REG]]
; CHECK: stg [[REG2]],
; CHECK: jne {{\..*}}
; CHECK: br %r14
entry:
  br label %loop

loop:
  ; %left starts at the %count argument and is decremented once per iteration.
  %left = phi i64 [ %count, %entry ], [ %next, %loop.next ]
  store volatile i64 %left, ptr %dest2
  %val = load volatile i32, ptr %src
  ; A zero value skips the store block.
  %cmp = icmp eq i32 %val, 0
  br i1 %cmp, label %loop.next, label %loop.store

loop.store:
  %add = add i32 %val, 1
  store volatile i32 %add, ptr %dest
  br label %loop.next

loop.next:
  %next = add i64 %left, -1
  ; Build a 64-bit value: %val in the high 32 bits, and the low 32 bits of the
  ; decremented counter (mask 4294967295 = 2^32 - 1) in the low 32 bits.
  %ext = zext i32 %val to i64
  %shl = shl i64 %ext, 32
  %and = and i64 %next, 4294967295
  %or = or i64 %shl, %and
  store volatile i64 %or, ptr %dest2
  ; Loop back until the decremented counter reaches zero.
  %cont = icmp ne i64 %next, 0
  br i1 %cont, label %loop, label %exit

exit:
  ret void
}

; Test that negative offsets are avoided for loads of floating point.
; Element type of the arrays walked by f5: three consecutive floats.
%s.float = type { float, float, float }
; For each index i, f5 sums the three float fields of %b[i] and stores the
; result into field 0 of %a[i].  The loop is skipped when %S is zero and
; otherwise ends when the incremented index, truncated to i32, equals %S.
; Only the z13-specific prefix has expectations here: no memory operand with
; a negative displacement may follow the function label.
define void @f5(ptr nocapture %a,
                ptr nocapture readonly %b,
                i32 zeroext %S) {
; CHECK-Z13-LABEL: f5:
; CHECK-Z13-NOT: -{{[0-9]+}}(%r

entry:
  ; Zero trip count: return without entering the loop.
  %cmp9 = icmp eq i32 %S, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  ; Load fields 0, 1 and 2 of %b[%indvars.iv] and add them up.
  %a1 = getelementptr inbounds %s.float, ptr %b, i64 %indvars.iv, i32 0
  %tmp = load float, ptr %a1, align 4
  %b4 = getelementptr inbounds %s.float, ptr %b, i64 %indvars.iv, i32 1
  %tmp1 = load float, ptr %b4, align 4
  %add = fadd float %tmp, %tmp1
  %c = getelementptr inbounds %s.float, ptr %b, i64 %indvars.iv, i32 2
  %tmp2 = load float, ptr %c, align 4
  %add7 = fadd float %add, %tmp2
  ; Store the sum into field 0 of %a[%indvars.iv].
  %a10 = getelementptr inbounds %s.float, ptr %a, i64 %indvars.iv, i32 0
  store float %add7, ptr %a10, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ; The exit test compares the low 32 bits of the next index with %S.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %S
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; Test that negative offsets are avoided for loads of double.
; Element type of the arrays walked by f6: three consecutive doubles.
%s.double = type { double, double, double }
; For each index i, f6 sums the three double fields of %b[i] and stores the
; result into field 0 of %a[i].  The loop is skipped when %S is zero and
; otherwise ends when the incremented index, truncated to i32, equals %S.
; Only the z13-specific prefix has expectations here: no memory operand with
; a negative displacement may follow the function label.
define void @f6(ptr nocapture %a,
                ptr nocapture readonly %b,
                i32 zeroext %S) {
; CHECK-Z13-LABEL: f6:
; CHECK-Z13-NOT: -{{[0-9]+}}(%r
entry:
  ; Zero trip count: return without entering the loop.
  %cmp9 = icmp eq i32 %S, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  ; Load fields 0, 1 and 2 of %b[%indvars.iv] and add them up.  Every access
  ; in this loop is declared with align 4.
  %a1 = getelementptr inbounds %s.double, ptr %b, i64 %indvars.iv, i32 0
  %tmp = load double, ptr %a1, align 4
  %b4 = getelementptr inbounds %s.double, ptr %b, i64 %indvars.iv, i32 1
  %tmp1 = load double, ptr %b4, align 4
  %add = fadd double %tmp, %tmp1
  %c = getelementptr inbounds %s.double, ptr %b, i64 %indvars.iv, i32 2
  %tmp2 = load double, ptr %c, align 4
  %add7 = fadd double %add, %tmp2
  ; Store the sum into field 0 of %a[%indvars.iv].
  %a10 = getelementptr inbounds %s.double, ptr %a, i64 %indvars.iv, i32 0
  store double %add7, ptr %a10, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ; The exit test compares the low 32 bits of the next index with %S.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %S
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; Test that negative offsets are avoided for memory accesses of vector type.
; Element type of the arrays walked by f7: three consecutive <4 x i32> vectors.
%s.vec = type { <4 x i32>, <4 x i32>, <4 x i32> }
; For each index i, f7 adds the three vector fields of %b[i] (integer add) and
; stores the result into field 0 of %a[i].  The loop is skipped when %S is
; zero and otherwise ends when the incremented index, truncated to i32,
; equals %S.  Only the z13-specific prefix has expectations here: no memory
; operand with a negative displacement may follow the function label.
define void @f7(ptr nocapture %a,
                ptr nocapture readonly %b,
                i32 zeroext %S) {
; CHECK-Z13-LABEL: f7:
; CHECK-Z13-NOT: -{{[0-9]+}}(%r
entry:
  ; Zero trip count: return without entering the loop.
  %cmp9 = icmp eq i32 %S, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
  ; Load fields 0, 1 and 2 of %b[%indvars.iv] and add them up.  Every access
  ; in this loop is declared with align 4.
  %a1 = getelementptr inbounds %s.vec, ptr %b, i64 %indvars.iv, i32 0
  %tmp = load <4 x i32>, ptr %a1, align 4
  %b4 = getelementptr inbounds %s.vec, ptr %b, i64 %indvars.iv, i32 1
  %tmp1 = load <4 x i32>, ptr %b4, align 4
  %add = add <4 x i32> %tmp1, %tmp
  %c = getelementptr inbounds %s.vec, ptr %b, i64 %indvars.iv, i32 2
  %tmp2 = load <4 x i32>, ptr %c, align 4
  %add7 = add <4 x i32> %add, %tmp2
  ; Store the sum into field 0 of %a[%indvars.iv].
  %a10 = getelementptr inbounds %s.vec, ptr %a, i64 %indvars.iv, i32 0
  store <4 x i32> %add7, ptr %a10, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  ; The exit test compares the low 32 bits of the next index with %S.
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %S
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; Test that a memcpy loop does not get a lot of lays before each mvc (D12 and no index-reg).
; Anonymous struct types.  %0 (three pointers in total, via %1) is the element
; type stepped over by the getelementptrs in f8.
%0 = type { %1, ptr }
%1 = type { ptr, ptr }
%2 = type <{ %3, i32, [4 x i8] }>
%3 = type { ptr, ptr, ptr }

; NOTE(review): attribute group #0 has no definition in the visible part of
; this file -- confirm whether one is expected.
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1) #0

; f8 is an endless loop (bb5 branches only to itself) that performs four
; 24-byte memcpys per iteration, at element offsets -1, -2, -3 and -4 from the
; current source and destination pointers, and then continues from the -4
; positions.  Under the z13-specific prefix, four mvc instructions are
; expected on consecutive lines.
define void @f8() {
; CHECK-Z13-LABEL: f8:
; CHECK-Z13: mvc
; CHECK-Z13-NEXT: mvc
; CHECK-Z13-NEXT: mvc
; CHECK-Z13-NEXT: mvc

bb:
  %tmp = load ptr, ptr undef, align 8
  br i1 undef, label %bb2, label %bb1

bb1:                                              ; preds = %bb
  br label %bb2

bb2:                                              ; preds = %bb1, %bb
  %tmp3 = phi ptr [ %tmp, %bb ], [ undef, %bb1 ]
  %tmp4 = phi ptr [ undef, %bb ], [ undef, %bb1 ]
  br label %bb5

bb5:                                              ; preds = %bb5, %bb2
  ; %tmp6 is the current destination pointer, %tmp7 the current source.
  %tmp6 = phi ptr [ %tmp21, %bb5 ], [ %tmp3, %bb2 ]
  %tmp7 = phi ptr [ %tmp20, %bb5 ], [ %tmp4, %bb2 ]
  ; Copy element -1.
  %tmp8 = getelementptr inbounds %0, ptr %tmp7, i64 -1
  %tmp9 = getelementptr inbounds %0, ptr %tmp6, i64 -1
  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp9, ptr align 8 %tmp8, i64 24, i1 false)
  ; Copy element -2.
  %tmp12 = getelementptr inbounds %0, ptr %tmp7, i64 -2
  %tmp13 = getelementptr inbounds %0, ptr %tmp6, i64 -2
  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp13, ptr align 8 %tmp12, i64 24, i1 false)
  ; Copy element -3.
  %tmp16 = getelementptr inbounds %0, ptr %tmp7, i64 -3
  %tmp17 = getelementptr inbounds %0, ptr %tmp6, i64 -3
  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp17, ptr align 8 %tmp16, i64 24, i1 false)
  ; Copy element -4; these two addresses also feed the phis for the next
  ; iteration.
  %tmp20 = getelementptr inbounds %0, ptr %tmp7, i64 -4
  %tmp21 = getelementptr inbounds %0, ptr %tmp6, i64 -4
  tail call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp21, ptr align 8 %tmp20, i64 24, i1 false)
  br label %bb5
}

; Test that a chsi does not need an aghik inside the loop (no index reg)
;
; Each iteration loads the i32 at the current index and compares it with
; zero, then loads the i32 at (index | 1) and advances the index by 4.  Under
; the z13-specific prefix, the loop header must reach its chsi with no aghik
; in between.
define void @f9() {
; CHECK-Z13-LABEL: f9:
; CHECK-Z13: # =>This Inner Loop Header: Depth=1
; CHECK-Z13-NOT: aghik
; CHECK-Z13: chsi

entry:
  br label %for.body.i63

for.body.i63:                                     ; preds = %for.inc.i, %entry
  %indvars.iv155.i = phi i64 [ 0, %entry ], [ %indvars.iv.next156.i.3, %for.inc.i ]
  %arrayidx.i62 = getelementptr inbounds i32, ptr undef, i64 %indvars.iv155.i
  %tmp = load i32, ptr %arrayidx.i62, align 4
  ; A nonzero element leads to the unreachable block.
  %cmp9.i = icmp eq i32 %tmp, 0
  br i1 %cmp9.i, label %for.inc.i, label %if.then10.i

if.then10.i:                                      ; preds = %for.body.i63
  unreachable

for.inc.i:                                        ; preds = %for.body.i63
  %indvars.iv.next156.i = or i64 %indvars.iv155.i, 1
  %arrayidx.i62.1 = getelementptr inbounds i32, ptr undef, i64 %indvars.iv.next156.i
  ; The result of this load is not used by any visible instruction.
  %tmp1 = load i32, ptr %arrayidx.i62.1, align 4
  %indvars.iv.next156.i.3 = add nsw i64 %indvars.iv155.i, 4
  br label %for.body.i63
}

; Test that offsets are in range for i128 memory accesses.
;
; For %IV = 0, 1, 2 the loop stores the same i128 constant into elements
; [%IV][6][6] and [%IV][6][8] of a stack-allocated [3 x [7 x [10 x i128]]].
; Under the z13-specific prefix, no lay may appear after the loop header.
define void @fun10() {
; CHECK-Z13-LABEL: fun10:
; CHECK-Z13: # =>This Inner Loop Header: Depth=1
; CHECK-Z13-NOT: lay
entry:
  %A1 = alloca [3 x [7 x [10 x i128]]], align 8
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %IV = phi i64 [ 0, %entry ], [ %IV.next, %for.body ]
  %Addr1 = getelementptr inbounds [3 x [7 x [10 x i128]]], ptr %A1, i64 0, i64 %IV, i64 6, i64 6
  store i128 17174966165894859678, ptr %Addr1, align 8
  %Addr2 = getelementptr inbounds [3 x [7 x [10 x i128]]], ptr %A1, i64 0, i64 %IV, i64 6, i64 8
  store i128 17174966165894859678, ptr %Addr2, align 8
  %IV.next = add nuw nsw i64 %IV, 1
  ; Leave the loop after the third iteration.
  %exitcond.not.i.i = icmp eq i64 %IV.next, 3
  br i1 %exitcond.not.i.i, label %exit, label %for.body

exit:                                             ; preds = %for.body
  unreachable
}
