; REQUIRES: asserts
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,CHECK-A57
; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m3 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s

; Both invocations send the generated assembly (stdout) to /dev/null and feed
; the machine-scheduler debug trace (stderr) to FileCheck. Only the cortex-a57
; invocation additionally enables the A57-specific check prefix.

; Test ldr clustering.
; Two i32 loads from %a at element offsets 1 and 2 are expected to be
; clustered as SU(1) and SU(2).
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldr_int:%bb.0
; CHECK: Cluster ld/st SU(1) - SU(2)
; CHECK: SU(1): %{{[0-9]+}}:gpr32 = LDRWui
; CHECK: SU(2): %{{[0-9]+}}:gpr32 = LDRWui
define i32 @ldr_int(ptr %a) nounwind {
  %p1 = getelementptr inbounds i32, ptr %a, i32 1
  %tmp1 = load i32, ptr %p1, align 2
  %p2 = getelementptr inbounds i32, ptr %a, i32 2
  %tmp2 = load i32, ptr %p2, align 2
  %tmp3 = add i32 %tmp1, %tmp2
  ret i32 %tmp3
}

; Test ldpsw clustering.
; Two adjacent i32 loads from %p, each sign-extended to i64, are expected to
; be clustered as a pair of LDRSWui.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_sext_int:%bb.0
; CHECK: Cluster ld/st SU(1) - SU(2)
; CHECK: SU(1): %{{[0-9]+}}:gpr64 = LDRSWui
; CHECK: SU(2): %{{[0-9]+}}:gpr64 = LDRSWui
define i64 @ldp_sext_int(ptr %p) nounwind {
  %tmp = load i32, ptr %p, align 4
  %add.ptr = getelementptr inbounds i32, ptr %p, i64 1
  %tmp1 = load i32, ptr %add.ptr, align 4
  %sexttmp = sext i32 %tmp to i64
  %sexttmp1 = sext i32 %tmp1 to i64
  %add = add nsw i64 %sexttmp1, %sexttmp
  ret i64 %add
}

; Test ldur clustering.
; Same shape as ldr_int but with negative element offsets (-1 and -2), for
; which the checks expect the unscaled LDURWi form.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldur_int:%bb.0
; CHECK: Cluster ld/st SU(1) - SU(2)
; CHECK: SU(1): %{{[0-9]+}}:gpr32 = LDURWi
; CHECK: SU(2): %{{[0-9]+}}:gpr32 = LDURWi
define i32 @ldur_int(ptr %a) nounwind {
  %p1 = getelementptr inbounds i32, ptr %a, i32 -1
  %tmp1 = load i32, ptr %p1, align 2
  %p2 = getelementptr inbounds i32, ptr %a, i32 -2
  %tmp2 = load i32, ptr %p2, align 2
  %tmp3 = add i32 %tmp1, %tmp2
  ret i32 %tmp3
}

; Test sext + zext clustering.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_half_sext_zext_int:%bb.0
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDRSWui
; CHECK: SU(4): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui
; The first i32 loaded from %p is sign-extended and its neighbour is
; zero-extended; the extra i64 load from %q is not part of the expected cluster.
define i64 @ldp_half_sext_zext_int(ptr %q, ptr %p) nounwind {
  %tmp0 = load i64, ptr %q, align 4
  %tmp = load i32, ptr %p, align 4
  %add.ptr = getelementptr inbounds i32, ptr %p, i64 1
  %tmp1 = load i32, ptr %add.ptr, align 4
  %sexttmp = sext i32 %tmp to i64
  %sexttmp1 = zext i32 %tmp1 to i64
  %add = add nsw i64 %sexttmp1, %sexttmp
  %add1 = add nsw i64 %add, %tmp0
  ret i64 %add1
}

; Test zext + sext clustering.
; Mirror image of the previous test: the first i32 is zero-extended and the
; second is sign-extended.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldp_half_zext_sext_int:%bb.0
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3): undef %{{[0-9]+}}.sub_32:gpr64 = LDRWui
; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRSWui
define i64 @ldp_half_zext_sext_int(ptr %q, ptr %p) nounwind {
  %tmp0 = load i64, ptr %q, align 4
  %tmp = load i32, ptr %p, align 4
  %add.ptr = getelementptr inbounds i32, ptr %p, i64 1
  %tmp1 = load i32, ptr %add.ptr, align 4
  %sexttmp = zext i32 %tmp to i64
  %sexttmp1 = sext i32 %tmp1 to i64
  %add = add nsw i64 %sexttmp1, %sexttmp
  %add1 = add nsw i64 %add, %tmp0
  ret i64 %add1
}

; Verify we don't cluster volatile loads.
; Identical to ldr_int except that both loads are volatile, so no cluster
; message may appear between the function label and the first load.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldr_int_volatile:%bb.0
; CHECK-NOT: Cluster ld/st
; CHECK: SU(1): %{{[0-9]+}}:gpr32 = LDRWui
; CHECK: SU(2): %{{[0-9]+}}:gpr32 = LDRWui
define i32 @ldr_int_volatile(ptr %a) nounwind {
  %p1 = getelementptr inbounds i32, ptr %a, i32 1
  %tmp1 = load volatile i32, ptr %p1, align 2
  %p2 = getelementptr inbounds i32, ptr %a, i32 2
  %tmp2 = load volatile i32, ptr %p2, align 2
  %tmp3 = add i32 %tmp1, %tmp2
  ret i32 %tmp3
}

; Test ldq clustering (no clustering for Exynos).
; NOTE(review): the cluster expectation below uses the common prefix, so it is
; enforced for the exynos-m3 invocation too; the "no clustering for Exynos"
; remark in this test's header looks stale - confirm and update.
; CHECK: ********** MI Scheduling **********
; CHECK-LABEL: ldq_cluster:%bb.0
; CHECK: Cluster ld/st SU(1) - SU(3)
; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDRQui
; CHECK: SU(3): %{{[0-9]+}}:fpr128 = LDRQui
; Two 128-bit vector loads, 16 bytes apart, separated by an unrelated add.
define <2 x i64> @ldq_cluster(ptr %p) {
  %tmp1 = load <2 x i64>, ptr %p, align 8
  %add.ptr2 = getelementptr inbounds i64, ptr %p, i64 2
  %tmp2 = add nsw <2 x i64> %tmp1, %tmp1
  %tmp3 = load <2 x i64>, ptr %add.ptr2, align 8
  %res = mul nsw <2 x i64> %tmp2, %tmp3
  ret <2 x i64> %res
}

; Test LDURSi / LDRSui clustering
; A float load at byte offset -4 (unscaled form) and one at offset 0 (scaled
; form) from the same base are expected to be clustered. The base register is
; captured once and re-used so both loads are verified to share it.
;
; CHECK: ********** MI Scheduling **********
; CHECK: LDURSi_LDRSui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3): %{{[0-9]+}}:fpr32 = LDURSi [[BASE:%[0-9]+]]:gpr64
; CHECK: SU(4): %{{[0-9]+}}:fpr32 = LDRSui [[BASE]]:gpr64
;
; NOTE(review): %wa and %wb are declared readonly but are stored through
; below; confirm whether the attribute should be dropped.
define void @LDURSi_LDRSui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -4
  %r52 = load float, ptr %r51, align 4
  %r53 = load float, ptr %arg, align 4
  store float %r52, ptr %wa
  store float %r53, ptr %wb
  ret void
}

; Test LDURQi / LDRQui clustering
; Inside the loop, each of the two pointer phis is read at byte offset -16 and
; at offset 0; each LDURQi is expected to pair with a later LDRQui.
;
; CHECK: ********** MI Scheduling **********
; CHECK: LDURQi_LDRQui:%bb.1 vector_body
;
; CHECK: Cluster ld/st SU(0) - SU(4)
; CHECK: Cluster ld/st SU(1) - SU(5)
;
; CHECK: SU(0): %{{[0-9]+}}:fpr128 = LDURQi
; CHECK: SU(1): %{{[0-9]+}}:fpr128 = LDURQi
; CHECK: SU(4): %{{[0-9]+}}:fpr128 = LDRQui
; CHECK: SU(5): %{{[0-9]+}}:fpr128 = LDRQui
;
define void @LDURQi_LDRQui(ptr nocapture readonly %arg) {
entry:
  br label %vector_body
vector_body:
  %phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
  %phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
  %phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
  %r51 = getelementptr i8, ptr %phi1, i64 -16
  %r52 = load <2 x double>, ptr %r51, align 8
  %r53 = getelementptr i8, ptr %phi2, i64 -16
  %r54 = load <2 x double>, ptr %r53, align 8
  %r55 = fmul fast <2 x double> %r54, <double 3.0, double 4.0>
  %r56 = fsub fast <2 x double> %r52, %r55
  store <2 x double> %r56, ptr %r51, align 1
  %r57 = load <2 x double>, ptr %phi1, align 8
  %r58 = load <2 x double>, ptr %phi2, align 8
  %r59 = fmul fast <2 x double> %r58, <double 3.0, double 4.0>
  %r60 = fsub fast <2 x double> %r57, %r59
  store <2 x double> %r60, ptr %phi1, align 1
  %r61 = add i32 %phi3, 4
  %r62 = getelementptr i8, ptr %phi2, i64 32
  %r63 = getelementptr i8, ptr %phi1, i64 32
  %r.not = icmp eq i32 %r61, 0
  br i1 %r.not, label %exit, label %vector_body
exit:
  ret void
}

; Test LDURDi / LDRDui clustering
; Same loop shape as LDURQi_LDRQui, using 64-bit <2 x float> accesses at byte
; offsets -8 and 0.
;
; CHECK: ********** MI Scheduling **********
; CHECK: LDURDi_LDRDui:%bb.1 vector_body
;
; CHECK: Cluster ld/st SU(0) - SU(4)
; CHECK: Cluster ld/st SU(1) - SU(5)
;
; CHECK: SU(0): %{{[0-9]+}}:fpr64 = LDURDi
; CHECK: SU(1): %{{[0-9]+}}:fpr64 = LDURDi
; CHECK: SU(4): %{{[0-9]+}}:fpr64 = LDRDui
; CHECK: SU(5): %{{[0-9]+}}:fpr64 = LDRDui
;
define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
entry:
  br label %vector_body
vector_body:
  %phi1 = phi ptr [ null, %entry ], [ %r63, %vector_body ]
  %phi2 = phi ptr [ %arg, %entry ], [ %r62, %vector_body ]
  %phi3 = phi i32 [ 0, %entry ], [ %r61, %vector_body ]
  %r51 = getelementptr i8, ptr %phi1, i64 -8
  %r52 = load <2 x float>, ptr %r51, align 8
  %r53 = getelementptr i8, ptr %phi2, i64 -8
  %r54 = load <2 x float>, ptr %r53, align 8
  %r55 = fmul fast <2 x float> %r54, <float 3.0, float 4.0>
  %r56 = fsub fast <2 x float> %r52, %r55
  store <2 x float> %r56, ptr %r51, align 1
  %r57 = load <2 x float>, ptr %phi1, align 8
  %r58 = load <2 x float>, ptr %phi2, align 8
  %r59 = fmul fast <2 x float> %r58, <float 3.0, float 4.0>
  %r60 = fsub fast <2 x float> %r57, %r59
  store <2 x float> %r60, ptr %phi1, align 1
  %r61 = add i32 %phi3, 4
  %r62 = getelementptr i8, ptr %phi2, i64 32
  %r63 = getelementptr i8, ptr %phi1, i64 32
  %r.not = icmp eq i32 %r61, 0
  br i1 %r.not, label %exit, label %vector_body
exit:
  ret void
}

; Test LDURXi / LDRXui clustering
; i64 loads at byte offsets -8 and 0 from %arg.
;
; CHECK: ********** MI Scheduling **********
; CHECK: LDURXi_LDRXui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3): %{{[0-9]+}}:gpr64 = LDURXi
; CHECK: SU(4): %{{[0-9]+}}:gpr64 = LDRXui
;
; NOTE(review): %wa and %wb are declared readonly but are stored through
; below; confirm whether the attribute should be dropped.
define void @LDURXi_LDRXui(ptr nocapture readonly %arg, ptr nocapture readonly %wa, ptr nocapture readonly %wb) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -8
  %r52 = load i64, ptr %r51, align 8
  %r53 = load i64, ptr %arg, align 8
  store i64 %r52, ptr %wa
  store i64 %r53, ptr %wb
  ret void
}

; Store clustering tests. Each function stores one value just below %arg
; (unscaled form) and one at %arg (scaled form).
; NOTE(review): in all of the store tests %arg is declared readonly although
; it is stored through; confirm whether the attribute should be dropped.

; Test STURWi / STRWui clustering
;
; CHECK: ********** MI Scheduling **********
; CHECK: STURWi_STRWui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3): STURWi %{{[0-9]+}}:gpr32
; CHECK: SU(4): STRWui %{{[0-9]+}}:gpr32
;
define void @STURWi_STRWui(ptr nocapture readonly %arg, i32 %b, i32 %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -4
  store i32 %b, ptr %r51
  store i32 %c, ptr %arg
  ret void
}

; Test STURXi / STRXui clustering
;
; CHECK: ********** MI Scheduling **********
; CHECK: STURXi_STRXui:%bb.0 entry
; CHECK: Cluster ld/st SU(3) - SU(4)
; CHECK: SU(3): STURXi %{{[0-9]+}}:gpr64
; CHECK: SU(4): STRXui %{{[0-9]+}}:gpr64
;
define void @STURXi_STRXui(ptr nocapture readonly %arg, i64 %b, i64 %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -8
  store i64 %b, ptr %r51
  store i64 %c, ptr %arg
  ret void
}

; Test STURSi / STRSui clustering
; The floating-point/vector store expectations from here on are only
; verified for the cortex-a57 invocation.
;
; CHECK-A57: ********** MI Scheduling **********
; CHECK-A57: STURSi_STRSui:%bb.0 entry
; CHECK-A57: Cluster ld/st SU(3) - SU(4)
; CHECK-A57: SU(3): STURSi %{{[0-9]+}}:fpr32
; CHECK-A57: SU(4): STRSui %{{[0-9]+}}:fpr32
;
define void @STURSi_STRSui(ptr nocapture readonly %arg, float %b, float %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -4
  store float %b, ptr %r51
  store float %c, ptr %arg
  ret void
}

; Test STURDi / STRDui clustering
;
; CHECK-A57: ********** MI Scheduling **********
; CHECK-A57: STURDi_STRDui:%bb.0 entry
; CHECK-A57: Cluster ld/st SU(3) - SU(4)
; CHECK-A57: SU(3): STURDi %{{[0-9]+}}:fpr64
; CHECK-A57: SU(4): STRDui %{{[0-9]+}}:fpr64
;
define void @STURDi_STRDui(ptr nocapture readonly %arg, <2 x float> %b, <2 x float> %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -8
  store <2 x float> %b, ptr %r51
  store <2 x float> %c, ptr %arg
  ret void
}

; Test STURQi / STRQui clustering
;
; CHECK-A57: ********** MI Scheduling **********
; CHECK-A57: STURQi_STRQui:%bb.0 entry
; CHECK-A57: Cluster ld/st SU(3) - SU(4)
; CHECK-A57: SU(3): STURQi %{{[0-9]+}}:fpr128
; CHECK-A57: SU(4): STRQui %{{[0-9]+}}:fpr128
;
define void @STURQi_STRQui(ptr nocapture readonly %arg, <2 x double> %b, <2 x double> %c) {
entry:
  %r51 = getelementptr i8, ptr %arg, i64 -16
  store <2 x double> %b, ptr %r51
  store <2 x double> %c, ptr %arg
  ret void
}