; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -ppc-asm-full-reg-names -verify-machineinstrs -ppc-formprep-chain-commoning \
; RUN:   -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 < %s | FileCheck %s

; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
;
; chains:
; 1: base: base1 + offset, offsets: (0, offset)
; 2: base: base1 + 3*offset, offsets: (0, offset)
;
; long long two_chain_same_offset_succ(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 3 * offset;
;   long long o4 = base1 + 4 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
define i64 @two_chain_same_offset_succ(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_same_offset_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB0_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r8, r4, r7
; CHECK-NEXT: add r7, r5, r4
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: add r7, r3, r7
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB0_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r7)
; CHECK-NEXT: ldx r8, r7, r4
; CHECK-NEXT: ld r9, 0(r5)
; CHECK-NEXT: ldx r10, r5, r4
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: mulld r6, r6, r9
; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB0_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul2 = mul nsw i64 %offset, 3
  %mul4 = shl nsw i64 %offset, 2
  %cmp46 = icmp sgt i64 %n, 0
  br i1 %cmp46, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
  ret i64 %sum.0.lcssa

for.body: ; preds = %entry, %for.body
  %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
  %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.047, %base1
  %add.ptr9.idx = add i64 %add, %offset
  %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
  %0 = load i64, ptr %add.ptr9, align 8
  %add.ptr10.idx = add i64 %add, %mul
  %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
  %1 = load i64, ptr %add.ptr10, align 8
  %add.ptr11.idx = add i64 %add, %mul2
  %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
  %2 = load i64, ptr %add.ptr11, align 8
  %add.ptr12.idx = add i64 %add, %mul4
  %add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
  %3 = load i64, ptr %add.ptr12, align 8
  %mul13 = mul i64 %1, %0
  %mul14 = mul i64 %mul13, %2
  %mul15 = mul i64 %mul14, %3
  %add16 = add i64 %mul15, %sum.048
  %inc = add nuw nsw i64 %i.047, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
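
; A rough C sketch (not part of the test input; b1/b2 are illustrative names)
; of the address form the chain commoning above is expected to produce: the
; four addresses become two chain bases that both reuse the displacement
; `offset`, which matches the two ld/ldx pairs in the CHECK lines.
;
;   char *b1 = p + base1 + offset;     // chain 1 base
;   char *b2 = p + base1 + 3 * offset; // chain 2 base
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(b1 + i);
;     unsigned long x2 = *(unsigned long *)(b1 + i + offset);
;     unsigned long x3 = *(unsigned long *)(b2 + i);
;     unsigned long x4 = *(unsigned long *)(b2 + i + offset);
;     sum += x1 * x2 * x3 * x4;
;   }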

; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + offset
; 4: + offset
; 5: + offset
;
; These addresses cannot all be commoned into chains because one chain would be left with a single address.
; It is not profitable to common chains if not all addresses are in chains.
;
; long long not_perfect_chain_all_same_offset_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 3 * offset;
;   long long o4 = base1 + 4 * offset;
;   long long o5 = base1 + 5 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   char *p5 = p + o5;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     unsigned long x5 = *(unsigned long *)(p5 + i);
;     sum += x1 * x2 * x3 * x4 * x5;
;   }
;   return sum;
; }
;
define i64 @not_perfect_chain_all_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_perfect_chain_all_same_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB1_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r8, r4, r7
; CHECK-NEXT: sldi r9, r4, 2
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r10, r4, r9
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB1_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ldx r6, r5, r4
; CHECK-NEXT: ldx r11, r5, r7
; CHECK-NEXT: ldx r12, r5, r8
; CHECK-NEXT: ldx r0, r5, r9
; CHECK-NEXT: mulld r6, r11, r6
; CHECK-NEXT: ldx r30, r5, r10
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r6, r12
; CHECK-NEXT: mulld r6, r6, r0
; CHECK-NEXT: maddld r3, r6, r30, r3
; CHECK-NEXT: bdnz .LBB1_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB1_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul2 = mul nsw i64 %offset, 3
  %mul4 = shl nsw i64 %offset, 2
  %mul6 = mul nsw i64 %offset, 5
  %cmp58 = icmp sgt i64 %n, 0
  br i1 %cmp58, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add21, %for.body ]
  ret i64 %sum.0.lcssa

for.body: ; preds = %entry, %for.body
  %sum.060 = phi i64 [ %add21, %for.body ], [ 0, %entry ]
  %i.059 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.059, %base1
  %add.ptr12.idx = add i64 %add, %offset
  %add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
  %0 = load i64, ptr %add.ptr12, align 8
  %add.ptr13.idx = add i64 %add, %mul
  %add.ptr13 = getelementptr inbounds i8, ptr %p, i64 %add.ptr13.idx
  %1 = load i64, ptr %add.ptr13, align 8
  %add.ptr14.idx = add i64 %add, %mul2
  %add.ptr14 = getelementptr inbounds i8, ptr %p, i64 %add.ptr14.idx
  %2 = load i64, ptr %add.ptr14, align 8
  %add.ptr15.idx = add i64 %add, %mul4
  %add.ptr15 = getelementptr inbounds i8, ptr %p, i64 %add.ptr15.idx
  %3 = load i64, ptr %add.ptr15, align 8
  %add.ptr16.idx = add i64 %add, %mul6
  %add.ptr16 = getelementptr inbounds i8, ptr %p, i64 %add.ptr16.idx
  %4 = load i64, ptr %add.ptr16, align 8
  %mul17 = mul i64 %1, %0
  %mul18 = mul i64 %mul17, %2
  %mul19 = mul i64 %mul18, %3
  %mul20 = mul i64 %mul19, %4
  %add21 = add i64 %mul20, %sum.060
  %inc = add nuw nsw i64 %i.059, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; addresses:
; 1: base1
; 2: + 2*offset
; 3: + offset
;
; We need at least 4 addresses to form 2 chains that reuse at least 1 offset.
;
; long long no_enough_elements_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 3 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     sum += x1 * x2 * x3;
;   }
;   return sum;
; }
;
define i64 @no_enough_elements_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_enough_elements_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB2_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: add r4, r4, r7
; CHECK-NEXT: .p2align 5
; CHECK-NEXT: .LBB2_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r5)
; CHECK-NEXT: ldx r8, r5, r7
; CHECK-NEXT: ldx r9, r5, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: maddld r3, r6, r9, r3
; CHECK-NEXT: bdnz .LBB2_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB2_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul1 = mul nsw i64 %offset, 3
  %cmp32 = icmp sgt i64 %n, 0
  br i1 %cmp32, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add10, %for.body ]
  ret i64 %sum.0.lcssa

for.body: ; preds = %entry, %for.body
  %sum.034 = phi i64 [ %add10, %for.body ], [ 0, %entry ]
  %i.033 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add.ptr5.idx = add i64 %i.033, %base1
  %add.ptr5 = getelementptr inbounds i8, ptr %p, i64 %add.ptr5.idx
  %0 = load i64, ptr %add.ptr5, align 8
  %add.ptr6.idx = add i64 %add.ptr5.idx, %mul
  %add.ptr6 = getelementptr inbounds i8, ptr %p, i64 %add.ptr6.idx
  %1 = load i64, ptr %add.ptr6, align 8
  %add.ptr7.idx = add i64 %add.ptr5.idx, %mul1
  %add.ptr7 = getelementptr inbounds i8, ptr %p, i64 %add.ptr7.idx
  %2 = load i64, ptr %add.ptr7, align 8
  %mul8 = mul i64 %1, %0
  %mul9 = mul i64 %mul8, %2
  %add10 = add i64 %mul9, %sum.034
  %inc = add nuw nsw i64 %i.033, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; addresses:
; 1: base1
; 2: + 2*offset
; 3: + 2*offset
; 4: + 3*offset
;
; The diff between address 2 and address 1 is 2*offset, and this offset is not reused by any other pair of addresses,
; so we cannot common any chains.
;
; long long no_reuseable_offset_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 4 * offset;
;   long long o4 = base1 + 7 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
define i64 @no_reuseable_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: no_reuseable_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB3_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r9, r4, 3
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: sub r4, r9, r4
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB3_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r5)
; CHECK-NEXT: ldx r9, r5, r7
; CHECK-NEXT: ldx r10, r5, r8
; CHECK-NEXT: ldx r11, r5, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r9, r6
; CHECK-NEXT: mulld r6, r6, r10
; CHECK-NEXT: maddld r3, r6, r11, r3
; CHECK-NEXT: bdnz .LBB3_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB3_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul1 = shl nsw i64 %offset, 2
  %mul3 = mul nsw i64 %offset, 7
  %cmp44 = icmp sgt i64 %n, 0
  br i1 %cmp44, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
  ret i64 %sum.0.lcssa

for.body: ; preds = %entry, %for.body
  %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
  %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add.ptr8.idx = add i64 %i.045, %base1
  %add.ptr8 = getelementptr inbounds i8, ptr %p, i64 %add.ptr8.idx
  %0 = load i64, ptr %add.ptr8, align 8
  %add.ptr9.idx = add i64 %add.ptr8.idx, %mul
  %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
  %1 = load i64, ptr %add.ptr9, align 8
  %add.ptr10.idx = add i64 %add.ptr8.idx, %mul1
  %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
  %2 = load i64, ptr %add.ptr10, align 8
  %add.ptr11.idx = add i64 %add.ptr8.idx, %mul3
  %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
  %3 = load i64, ptr %add.ptr11, align 8
  %mul12 = mul i64 %1, %0
  %mul13 = mul i64 %mul12, %2
  %mul14 = mul i64 %mul13, %3
  %add15 = add i64 %mul14, %sum.046
  %inc = add nuw nsw i64 %i.045, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; addresses:
; 1: base1 + offset
; 2: + offset
; 3: + 3*offset
; 4: + 2*offset
; 5: + 1*offset
; 6: + 2*offset
;
; The diff between address 2 and address 1 is 1*offset, and this offset is reused between address 4 and address 5,
; but the diff between address 3 and address 2 (3*offset) is not the same as the diff between address 6
; and address 5 (2*offset), so we cannot common chains for these addresses.
;
; long long not_same_offset_fail(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 2 * offset;
;   long long o3 = base1 + 5 * offset;
;   long long o4 = base1 + 7 * offset;
;   long long o5 = base1 + 8 * offset;
;   long long o6 = base1 + 10 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   char *p5 = p + o5;
;   char *p6 = p + o6;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     unsigned long x5 = *(unsigned long *)(p5 + i);
;     unsigned long x6 = *(unsigned long *)(p6 + i);
;     sum += x1 * x2 * x3 * x4 * x5 * x6;
;   }
;   return sum;
; }
;
define i64 @not_same_offset_fail(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: not_same_offset_fail:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB4_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: mulli r11, r4, 10
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: add r8, r4, r8
; CHECK-NEXT: sldi r9, r4, 3
; CHECK-NEXT: sub r10, r9, r4
; CHECK-NEXT: sldi r7, r4, 1
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB4_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ldx r6, r5, r4
; CHECK-NEXT: ldx r12, r5, r7
; CHECK-NEXT: ldx r0, r5, r8
; CHECK-NEXT: ldx r30, r5, r10
; CHECK-NEXT: mulld r6, r12, r6
; CHECK-NEXT: ldx r29, r5, r9
; CHECK-NEXT: ldx r28, r5, r11
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r6, r0
; CHECK-NEXT: mulld r6, r6, r30
; CHECK-NEXT: mulld r6, r6, r29
; CHECK-NEXT: maddld r3, r6, r28, r3
; CHECK-NEXT: bdnz .LBB4_2
; CHECK-NEXT: # %bb.3:
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB4_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
  %mul = shl nsw i64 %offset, 1
  %mul2 = mul nsw i64 %offset, 5
  %mul4 = mul nsw i64 %offset, 7
  %mul6 = shl nsw i64 %offset, 3
  %mul8 = mul nsw i64 %offset, 10
  %cmp70 = icmp sgt i64 %n, 0
  br i1 %cmp70, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add26, %for.body ]
  ret i64 %sum.0.lcssa

for.body: ; preds = %entry, %for.body
  %sum.072 = phi i64 [ %add26, %for.body ], [ 0, %entry ]
  %i.071 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.071, %base1
  %add.ptr15.idx = add i64 %add, %offset
  %add.ptr15 = getelementptr inbounds i8, ptr %p, i64 %add.ptr15.idx
  %0 = load i64, ptr %add.ptr15, align 8
  %add.ptr16.idx = add i64 %add, %mul
  %add.ptr16 = getelementptr inbounds i8, ptr %p, i64 %add.ptr16.idx
  %1 = load i64, ptr %add.ptr16, align 8
  %add.ptr17.idx = add i64 %add, %mul2
  %add.ptr17 = getelementptr inbounds i8, ptr %p, i64 %add.ptr17.idx
  %2 = load i64, ptr %add.ptr17, align 8
  %add.ptr18.idx = add i64 %add, %mul4
  %add.ptr18 = getelementptr inbounds i8, ptr %p, i64 %add.ptr18.idx
  %3 = load i64, ptr %add.ptr18, align 8
  %add.ptr19.idx = add i64 %add, %mul6
  %add.ptr19 = getelementptr inbounds i8, ptr %p, i64 %add.ptr19.idx
  %4 = load i64, ptr %add.ptr19, align 8
  %add.ptr20.idx = add i64 %add, %mul8
  %add.ptr20 = getelementptr inbounds i8, ptr %p, i64 %add.ptr20.idx
  %5 = load i64, ptr %add.ptr20, align 8
  %mul21 = mul i64 %1, %0
  %mul22 = mul i64 %mul21, %2
  %mul23 = mul i64 %mul22, %3
  %mul24 = mul i64 %mul23, %4
  %mul25 = mul i64 %mul24, %5
  %add26 = add i64 %mul25, %sum.072
  %inc = add nuw nsw i64 %i.071, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}

; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base1 + 4*offset, offsets: (0, 2*offset)
;
; long long two_chain_different_offsets_succ(char *p, long long offset, long long base1, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 3 * offset;
;   long long o3 = base1 + 4 * offset;
;   long long o4 = base1 + 6 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
define i64 @two_chain_different_offsets_succ(ptr %p, i64 %offset, i64 %base1, i64 %n) {
; CHECK-LABEL: two_chain_different_offsets_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: ble cr0, .LBB5_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r8, r4, 2
; CHECK-NEXT: add r7, r5, r4
; CHECK-NEXT: mtctr r6
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: add r7, r3, r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB5_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r6, 0(r7)
; CHECK-NEXT: ldx r8, r7, r4
; CHECK-NEXT: ld r9, 0(r5)
; CHECK-NEXT: ldx r10, r5, r4
; CHECK-NEXT: addi r7, r7, 1
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: mulld r6, r8, r6
; CHECK-NEXT: mulld r6, r6, r9
; CHECK-NEXT: maddld r3, r6, r10, r3
; CHECK-NEXT: bdnz .LBB5_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB5_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
  %mul = mul nsw i64 %offset, 3
  %mul2 = shl nsw i64 %offset, 2
  %mul4 = mul nsw i64 %offset, 6
  %cmp46 = icmp sgt i64 %n, 0
  br i1 %cmp46, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add16, %for.body ]
  ret i64 %sum.0.lcssa

for.body: ; preds = %entry, %for.body
  %sum.048 = phi i64 [ %add16, %for.body ], [ 0, %entry ]
  %i.047 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.047, %base1
  %add.ptr9.idx = add i64 %add, %offset
  %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
  %0 = load i64, ptr %add.ptr9, align 8
  %add.ptr10.idx = add i64 %add, %mul
  %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
  %1 = load i64, ptr %add.ptr10, align 8
  %add.ptr11.idx = add i64 %add, %mul2
  %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
  %2 = load i64, ptr %add.ptr11, align 8
  %add.ptr12.idx = add i64 %add, %mul4
  %add.ptr12 = getelementptr inbounds i8, ptr %p, i64 %add.ptr12.idx
  %3 = load i64, ptr %add.ptr12, align 8
  %mul13 = mul i64 %1, %0
  %mul14 = mul i64 %mul13, %2
  %mul15 = mul i64 %mul14, %3
  %add16 = add i64 %mul15, %sum.048
  %inc = add nuw nsw i64 %i.047, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
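
; A rough C sketch (not part of the test input; b1/b2 are illustrative names)
; of the commoned form expected for two_chain_different_offsets_succ: each
; chain is based at its first address, and the reused displacement is 2*offset.
;
;   char *b1 = p + base1 + offset;     // chain 1 base
;   char *b2 = p + base1 + 4 * offset; // chain 2 base
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(b1 + i);
;     unsigned long x2 = *(unsigned long *)(b1 + i + 2 * offset);
;     unsigned long x3 = *(unsigned long *)(b2 + i);
;     unsigned long x4 = *(unsigned long *)(b2 + i + 2 * offset);
;     sum += x1 * x2 * x3 * x4;
;   }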

; addresses:
; 1: base1 + offset
; 2: + 2*offset
; 3: + base2 - base1 - 2*offset
; 4: + 2*offset
;
; chains:
; 1: base1 + offset, offsets: (0, 2*offset)
; 2: base2 + offset, offsets: (0, 2*offset)
;
; long long two_chain_two_bases_succ(char *p, long long offset, long long base1, long long base2, long long n) {
;   long long o1 = base1 + offset;
;   long long o2 = base1 + 3 * offset;
;   long long o3 = base2 + offset;
;   long long o4 = base2 + 3 * offset;
;   char *p1 = p + o1;
;   char *p2 = p + o2;
;   char *p3 = p + o3;
;   char *p4 = p + o4;
;   long long sum = 0;
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(p1 + i);
;     unsigned long x2 = *(unsigned long *)(p2 + i);
;     unsigned long x3 = *(unsigned long *)(p3 + i);
;     unsigned long x4 = *(unsigned long *)(p4 + i);
;     sum += x1 * x2 * x3 * x4;
;   }
;   return sum;
; }
;
define i64 @two_chain_two_bases_succ(ptr %p, i64 %offset, i64 %base1, i64 %base2, i64 %n) {
; CHECK-LABEL: two_chain_two_bases_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r7, 0
; CHECK-NEXT: ble cr0, .LBB6_4
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: add r5, r5, r4
; CHECK-NEXT: add r6, r6, r4
; CHECK-NEXT: mtctr r7
; CHECK-NEXT: sldi r4, r4, 1
; CHECK-NEXT: add r5, r3, r5
; CHECK-NEXT: add r6, r3, r6
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB6_2: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: ld r7, 0(r5)
; CHECK-NEXT: ldx r8, r5, r4
; CHECK-NEXT: ld r9, 0(r6)
; CHECK-NEXT: ldx r10, r6, r4
; CHECK-NEXT: addi r5, r5, 1
; CHECK-NEXT: addi r6, r6, 1
; CHECK-NEXT: mulld r7, r8, r7
; CHECK-NEXT: mulld r7, r7, r9
; CHECK-NEXT: maddld r3, r7, r10, r3
; CHECK-NEXT: bdnz .LBB6_2
; CHECK-NEXT: # %bb.3: # %for.cond.cleanup
; CHECK-NEXT: blr
; CHECK-NEXT: .LBB6_4:
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: blr
entry:
  %mul = mul nsw i64 %offset, 3
  %cmp44 = icmp sgt i64 %n, 0
  br i1 %cmp44, label %for.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.body, %entry
  %sum.0.lcssa = phi i64 [ 0, %entry ], [ %add15, %for.body ]
  ret i64 %sum.0.lcssa

for.body: ; preds = %entry, %for.body
  %sum.046 = phi i64 [ %add15, %for.body ], [ 0, %entry ]
  %i.045 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
  %add = add i64 %i.045, %base1
  %add.ptr8.idx = add i64 %add, %offset
  %add.ptr8 = getelementptr inbounds i8, ptr %p, i64 %add.ptr8.idx
  %0 = load i64, ptr %add.ptr8, align 8
  %add1 = add i64 %i.045, %mul
  %add.ptr9.idx = add i64 %add1, %base1
  %add.ptr9 = getelementptr inbounds i8, ptr %p, i64 %add.ptr9.idx
  %1 = load i64, ptr %add.ptr9, align 8
  %add2 = add i64 %i.045, %base2
  %add.ptr10.idx = add i64 %add2, %offset
  %add.ptr10 = getelementptr inbounds i8, ptr %p, i64 %add.ptr10.idx
  %2 = load i64, ptr %add.ptr10, align 8
  %add.ptr11.idx = add i64 %add2, %mul
  %add.ptr11 = getelementptr inbounds i8, ptr %p, i64 %add.ptr11.idx
  %3 = load i64, ptr %add.ptr11, align 8
  %mul12 = mul i64 %1, %0
  %mul13 = mul i64 %mul12, %2
  %mul14 = mul i64 %mul13, %3
  %add15 = add i64 %mul14, %sum.046
  %inc = add nuw nsw i64 %i.045, 1
  %exitcond.not = icmp eq i64 %inc, %n
  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
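
; A rough C sketch (not part of the test input; b1/b2 are illustrative names)
; of the commoned form expected for two_chain_two_bases_succ: one chain per
; base, both reusing the same 2*offset displacement, so the loop only needs to
; advance two pointers per iteration.
;
;   char *b1 = p + base1 + offset; // chain 1 base
;   char *b2 = p + base2 + offset; // chain 2 base
;   for (long long i = 0; i < n; ++i) {
;     unsigned long x1 = *(unsigned long *)(b1 + i);
;     unsigned long x2 = *(unsigned long *)(b1 + i + 2 * offset);
;     unsigned long x3 = *(unsigned long *)(b2 + i);
;     unsigned long x4 = *(unsigned long *)(b2 + i + 2 * offset);
;     sum += x1 * x2 * x3 * x4;
;   }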
;
; Check that chain commoning can reduce register pressure and save register spills/reloads.
;
; int spill_reduce_succ(double *input1, double *input2, double *output, long long m, long long inc1, long long inc2, long long inc3, long long inc4, long long inc) {
;   inc = inc4;
; #pragma unroll 4
;   for (long long i = 0; i < 4 * m; i++) {
;     output[inc + inc1] += input1[inc + inc1] * input2[inc + inc1];
;     output[inc + inc2] += input1[inc + inc2] * input2[inc + inc2];
;     output[inc + inc3] += input1[inc + inc3] * input2[inc + inc3];
;     inc = inc + inc4;
;   }
;   return 0;
; }
;
define signext i32 @spill_reduce_succ(ptr %input1, ptr %input2, ptr %output, i64 %m, i64 %inc1, i64 %inc2, i64 %inc3, i64 %inc4, i64 %inc) {
; CHECK-LABEL: spill_reduce_succ:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: cmpdi r6, 0
; CHECK-NEXT: std r14, -144(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r15, -136(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r16, -128(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r17, -120(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r18, -112(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r19, -104(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r20, -96(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r21, -88(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r22, -80(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r23, -72(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r24, -64(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r25, -56(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r31, -8(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r2, -152(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r9, -184(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r8, -176(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r7, -168(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r3, -160(r1) # 8-byte Folded Spill
; CHECK-NEXT: ble cr0, .LBB7_7
; CHECK-NEXT: # %bb.1: # %for.body.preheader
; CHECK-NEXT: sldi r6, r6, 2
; CHECK-NEXT: li r7, 1
; CHECK-NEXT: mr r30, r10
; CHECK-NEXT: cmpdi r6, 1
; CHECK-NEXT: iselgt r7, r6, r7
; CHECK-NEXT: addi r8, r7, -1
; CHECK-NEXT: clrldi r6, r7, 63
; CHECK-NEXT: cmpldi r8, 3
; CHECK-NEXT: blt cr0, .LBB7_4
; CHECK-NEXT: # %bb.2: # %for.body.preheader.new
; CHECK-NEXT: ld r14, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: mulli r24, r30, 24
; CHECK-NEXT: ld r16, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r15, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r3, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: rldicl r0, r7, 62, 2
; CHECK-NEXT: sldi r11, r30, 5
; CHECK-NEXT: sldi r19, r30, 4
; CHECK-NEXT: sldi r7, r14, 3
; CHECK-NEXT: add r14, r30, r14
; CHECK-NEXT: sldi r10, r16, 3
; CHECK-NEXT: sldi r12, r15, 3
; CHECK-NEXT: add r16, r30, r16
; CHECK-NEXT: add r15, r30, r15
; CHECK-NEXT: add r27, r11, r7
; CHECK-NEXT: add r22, r24, r7
; CHECK-NEXT: add r17, r19, r7
; CHECK-NEXT: sldi r2, r14, 3
; CHECK-NEXT: add r26, r24, r10
; CHECK-NEXT: add r25, r24, r12
; CHECK-NEXT: add r21, r19, r10
; CHECK-NEXT: add r20, r19, r12
; CHECK-NEXT: add r8, r11, r10
; CHECK-NEXT: sldi r16, r16, 3
; CHECK-NEXT: add r29, r5, r27
; CHECK-NEXT: add r28, r4, r27
; CHECK-NEXT: add r27, r3, r27
; CHECK-NEXT: add r24, r5, r22
; CHECK-NEXT: add r23, r4, r22
; CHECK-NEXT: add r22, r3, r22
; CHECK-NEXT: add r19, r5, r17
; CHECK-NEXT: add r18, r4, r17
; CHECK-NEXT: add r17, r3, r17
; CHECK-NEXT: add r14, r5, r2
; CHECK-NEXT: add r31, r4, r2
; CHECK-NEXT: add r2, r3, r2
; CHECK-NEXT: add r9, r5, r8
; CHECK-NEXT: add r8, r11, r12
; CHECK-NEXT: add r26, r5, r26
; CHECK-NEXT: add r25, r5, r25
; CHECK-NEXT: add r21, r5, r21
; CHECK-NEXT: add r20, r5, r20
; CHECK-NEXT: add r16, r5, r16
; CHECK-NEXT: add r8, r5, r8
; CHECK-NEXT: rldicl r3, r0, 2, 1
; CHECK-NEXT: addi r3, r3, -4
; CHECK-NEXT: sub r0, r12, r7
; CHECK-NEXT: sub r12, r10, r7
; CHECK-NEXT: li r7, 0
; CHECK-NEXT: mr r10, r30
; CHECK-NEXT: sldi r15, r15, 3
; CHECK-NEXT: add r15, r5, r15
; CHECK-NEXT: rldicl r3, r3, 62, 2
; CHECK-NEXT: addi r3, r3, 1
; CHECK-NEXT: mtctr r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_3: # %for.body
; CHECK-NEXT: #
; CHECK-NEXT: lfd f0, 0(r2)
; CHECK-NEXT: lfd f1, 0(r31)
; CHECK-NEXT: add r3, r10, r30
; CHECK-NEXT: add r3, r3, r30
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r14)
; CHECK-NEXT: add r3, r3, r30
; CHECK-NEXT: add r10, r3, r30
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r14)
; CHECK-NEXT: add r14, r14, r11
; CHECK-NEXT: lfdx f0, r2, r0
; CHECK-NEXT: lfdx f1, r31, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r15, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r15, r7
; CHECK-NEXT: lfdx f0, r2, r12
; CHECK-NEXT: lfdx f1, r31, r12
; CHECK-NEXT: add r2, r2, r11
; CHECK-NEXT: add r31, r31, r11
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r16, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r16, r7
; CHECK-NEXT: lfd f0, 0(r17)
; CHECK-NEXT: lfd f1, 0(r18)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r19, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r19, r7
; CHECK-NEXT: lfdx f0, r17, r0
; CHECK-NEXT: lfdx f1, r18, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r20, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r20, r7
; CHECK-NEXT: lfdx f0, r17, r12
; CHECK-NEXT: lfdx f1, r18, r12
; CHECK-NEXT: add r17, r17, r11
; CHECK-NEXT: add r18, r18, r11
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r21, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r21, r7
; CHECK-NEXT: lfd f0, 0(r22)
; CHECK-NEXT: lfd f1, 0(r23)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r24, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r24, r7
; CHECK-NEXT: lfdx f0, r22, r0
; CHECK-NEXT: lfdx f1, r23, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r25, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r25, r7
; CHECK-NEXT: lfdx f0, r22, r12
; CHECK-NEXT: lfdx f1, r23, r12
; CHECK-NEXT: add r22, r22, r11
; CHECK-NEXT: add r23, r23, r11
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r26, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r26, r7
; CHECK-NEXT: lfd f0, 0(r27)
; CHECK-NEXT: lfd f1, 0(r28)
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r29, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r29, r7
; CHECK-NEXT: lfdx f0, r27, r0
; CHECK-NEXT: lfdx f1, r28, r0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r8, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r8, r7
; CHECK-NEXT: lfdx f0, r27, r12
; CHECK-NEXT: lfdx f1, r28, r12
; CHECK-NEXT: add r27, r27, r11
; CHECK-NEXT: add r28, r28, r11
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r9, r7
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r9, r7
; CHECK-NEXT: add r7, r7, r11
; CHECK-NEXT: bdnz .LBB7_3
; CHECK-NEXT: .LBB7_4: # %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: beq cr0, .LBB7_7
; CHECK-NEXT: # %bb.5: # %for.body.epil.preheader
; CHECK-NEXT: ld r3, -184(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r0, -160(r1) # 8-byte Folded Reload
; CHECK-NEXT: sldi r8, r30, 3
; CHECK-NEXT: add r3, r10, r3
; CHECK-NEXT: sldi r3, r3, 3
; CHECK-NEXT: add r7, r5, r3
; CHECK-NEXT: add r9, r4, r3
; CHECK-NEXT: add r11, r0, r3
; CHECK-NEXT: ld r3, -176(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r3, r10, r3
; CHECK-NEXT: sldi r3, r3, 3
; CHECK-NEXT: add r12, r5, r3
; CHECK-NEXT: add r30, r4, r3
; CHECK-NEXT: add r29, r0, r3
; CHECK-NEXT: ld r3, -168(r1) # 8-byte Folded Reload
; CHECK-NEXT: add r3, r10, r3
; CHECK-NEXT: li r10, 0
; CHECK-NEXT: sldi r3, r3, 3
; CHECK-NEXT: add r5, r5, r3
; CHECK-NEXT: add r4, r4, r3
; CHECK-NEXT: add r3, r0, r3
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: .LBB7_6: # %for.body.epil
; CHECK-NEXT: #
; CHECK-NEXT: lfdx f0, r3, r10
; CHECK-NEXT: lfdx f1, r4, r10
; CHECK-NEXT: addi r6, r6, -1
; CHECK-NEXT: cmpldi r6, 0
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfd f1, 0(r5)
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfd f0, 0(r5)
; CHECK-NEXT: add r5, r5, r8
; CHECK-NEXT: lfdx f0, r29, r10
; CHECK-NEXT: lfdx f1, r30, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r12, r10
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r12, r10
; CHECK-NEXT: lfdx f0, r11, r10
; CHECK-NEXT: lfdx f1, r9, r10
; CHECK-NEXT: xsmuldp f0, f0, f1
; CHECK-NEXT: lfdx f1, r7, r10
; CHECK-NEXT: xsadddp f0, f1, f0
; CHECK-NEXT: stfdx f0, r7, r10
; CHECK-NEXT: add r10, r10, r8
; CHECK-NEXT: bne cr0, .LBB7_6
; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup
; CHECK-NEXT: ld r2, -152(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r31, -8(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
; CHECK-NEXT: li r3, 0
; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r25, -56(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r24, -64(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r23, -72(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r22, -80(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r21, -88(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r20, -96(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r19, -104(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r18, -112(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r17, -120(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r16, -128(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r15, -136(r1) # 8-byte Folded Reload
; CHECK-NEXT: ld r14, -144(r1) # 8-byte Folded Reload
; CHECK-NEXT: blr
entry:
  %cmp49 = icmp sgt i64 %m, 0
  br i1 %cmp49, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  %0 = shl i64 %m, 2
  %smax52 = call i64 @llvm.smax.i64(i64 %0, i64 1)
  %1 = add nsw i64 %smax52, -1
  %xtraiter = and i64 %smax52, 1
  %2 = icmp ult i64 %1, 3
  br i1 %2, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new

for.body.preheader.new: ; preds = %for.body.preheader
  %unroll_iter = and i64 %smax52, 9223372036854775804
  br label %for.body

for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
  %inc.addr.050.unr = phi i64 [ %inc4, %for.body.preheader ], [ %add23.3, %for.body ]
  %lcmp.mod.not = icmp eq i64 %xtraiter, 0
  br i1 %lcmp.mod.not, label %for.cond.cleanup, label %for.body.epil

for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
  %inc.addr.050.epil = phi i64 [ %add23.epil, %for.body.epil ], [ %inc.addr.050.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
  %epil.iter = phi i64 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
  %add.epil = add nsw i64 %inc.addr.050.epil, %inc1
  %arrayidx.epil = getelementptr inbounds double, ptr %input1, i64 %add.epil
  %3 = load double, ptr %arrayidx.epil, align 8
  %arrayidx2.epil = getelementptr inbounds double, ptr %input2, i64 %add.epil
  %4 = load double, ptr %arrayidx2.epil, align 8
  %mul3.epil = fmul double %3, %4
  %arrayidx5.epil = getelementptr inbounds double, ptr %output, i64 %add.epil
  %5 = load double, ptr %arrayidx5.epil, align 8
  %add6.epil = fadd double %5, %mul3.epil
  store double %add6.epil, ptr %arrayidx5.epil, align 8
  %add7.epil = add nsw i64 %inc.addr.050.epil, %inc2
  %arrayidx8.epil = getelementptr inbounds double, ptr %input1, i64 %add7.epil
  %6 = load double, ptr %arrayidx8.epil, align 8
  %arrayidx10.epil = getelementptr inbounds double, ptr %input2, i64 %add7.epil
  %7 = load double, ptr %arrayidx10.epil, align 8
  %mul11.epil = fmul double %6, %7
  %arrayidx13.epil = getelementptr inbounds double, ptr %output, i64 %add7.epil
  %8 = load double, ptr %arrayidx13.epil, align 8
  %add14.epil = fadd double %8, %mul11.epil
  store double %add14.epil, ptr %arrayidx13.epil, align 8
  %add15.epil = add nsw i64 %inc.addr.050.epil, %inc3
  %arrayidx16.epil = getelementptr inbounds double, ptr %input1, i64 %add15.epil
  %9 = load double, ptr %arrayidx16.epil, align 8
  %arrayidx18.epil = getelementptr inbounds double, ptr %input2, i64 %add15.epil
  %10 = load double, ptr %arrayidx18.epil, align 8
  %mul19.epil = fmul double %9, %10
  %arrayidx21.epil = getelementptr inbounds double, ptr %output, i64 %add15.epil
  %11 = load double, ptr %arrayidx21.epil, align 8
  %add22.epil = fadd double %11, %mul19.epil
  store double %add22.epil, ptr %arrayidx21.epil, align 8
  %add23.epil = add nsw i64 %inc.addr.050.epil, %inc4
  %epil.iter.sub = add nsw i64 %epil.iter, -1
  %epil.iter.cmp.not = icmp eq i64 %epil.iter.sub, 0
  br i1 %epil.iter.cmp.not, label %for.cond.cleanup, label %for.body.epil

for.cond.cleanup: ; preds = %for.body.epil, %for.cond.cleanup.loopexit.unr-lcssa, %entry
  ret i32 0

for.body: ; preds = %for.body, %for.body.preheader.new
  %inc.addr.050 = phi i64 [ %inc4, %for.body.preheader.new ], [ %add23.3, %for.body ]
  %niter = phi i64 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
  %add = add nsw i64 %inc.addr.050, %inc1
  %arrayidx = getelementptr inbounds double, ptr %input1, i64 %add
  %12 = load double, ptr %arrayidx, align 8
  %arrayidx2 = getelementptr inbounds double, ptr %input2, i64 %add
  %13 = load double, ptr %arrayidx2, align 8
  %mul3 = fmul double %12, %13
  %arrayidx5 = getelementptr inbounds double, ptr %output, i64 %add
  %14 = load double, ptr %arrayidx5, align 8
  %add6 = fadd double %14, %mul3
  store double %add6, ptr %arrayidx5, align 8
  %add7 = add nsw i64 %inc.addr.050, %inc2
  %arrayidx8 = getelementptr inbounds double, ptr %input1, i64 %add7
  %15 = load double, ptr %arrayidx8, align 8
  %arrayidx10 = getelementptr inbounds double, ptr %input2, i64 %add7
  %16 = load double, ptr %arrayidx10, align 8
  %mul11 = fmul double %15, %16
  %arrayidx13 = getelementptr inbounds double, ptr %output, i64 %add7
  %17 = load double, ptr %arrayidx13, align 8
  %add14 = fadd double %17, %mul11
  store double %add14, ptr %arrayidx13, align 8
  %add15 = add nsw i64 %inc.addr.050, %inc3
  %arrayidx16 = getelementptr inbounds double, ptr %input1, i64 %add15
  %18 = load double, ptr %arrayidx16, align 8
  %arrayidx18 = getelementptr inbounds double, ptr %input2, i64 %add15
  %19 = load double, ptr %arrayidx18, align 8
  %mul19 = fmul double %18, %19
  %arrayidx21 = getelementptr inbounds double, ptr %output, i64 %add15
  %20 = load double, ptr %arrayidx21, align 8
  %add22 = fadd double %20, %mul19
  store double %add22, ptr %arrayidx21, align 8
  %add23 = add nsw i64 %inc.addr.050, %inc4
  %add.1 = add nsw i64 %add23, %inc1
  %arrayidx.1 = getelementptr inbounds double, ptr %input1, i64 %add.1
  %21 = load double, ptr %arrayidx.1, align 8
  %arrayidx2.1 = getelementptr inbounds double, ptr %input2, i64 %add.1
  %22 = load double, ptr %arrayidx2.1, align 8
  %mul3.1 = fmul double %21, %22
  %arrayidx5.1 = getelementptr inbounds double, ptr %output, i64 %add.1
  %23 = load double, ptr %arrayidx5.1, align 8
  %add6.1 = fadd double %23, %mul3.1
  store double %add6.1, ptr %arrayidx5.1, align 8
  %add7.1 = add nsw i64 %add23, %inc2
  %arrayidx8.1 = getelementptr inbounds double, ptr %input1, i64 %add7.1
  %24 = load double, ptr %arrayidx8.1, align 8
  %arrayidx10.1 = getelementptr inbounds double, ptr %input2, i64 %add7.1
  %25 = load double, ptr %arrayidx10.1, align 8
  %mul11.1 = fmul double %24, %25
  %arrayidx13.1 = getelementptr inbounds double, ptr %output, i64 %add7.1
  %26 = load double, ptr %arrayidx13.1, align 8
  %add14.1 = fadd double %26, %mul11.1
  store double %add14.1, ptr %arrayidx13.1, align 8
  %add15.1 = add nsw i64 %add23, %inc3
  %arrayidx16.1 = getelementptr inbounds double, ptr %input1, i64 %add15.1
  %27 = load double, ptr %arrayidx16.1, align 8
  %arrayidx18.1 = getelementptr inbounds double, ptr %input2, i64 %add15.1
  %28 = load double, ptr %arrayidx18.1, align 8
  %mul19.1 = fmul double %27, %28
  %arrayidx21.1 = getelementptr inbounds double, ptr %output, i64 %add15.1
  %29 = load double, ptr %arrayidx21.1, align 8
  %add22.1 = fadd double %29, %mul19.1
  store double %add22.1, ptr %arrayidx21.1, align 8
  %add23.1 = add nsw i64 %add23, %inc4
  %add.2 = add nsw i64 %add23.1, %inc1
  %arrayidx.2 = getelementptr inbounds double, ptr %input1, i64 %add.2
  %30 = load double, ptr %arrayidx.2, align 8
  %arrayidx2.2 = getelementptr inbounds double, ptr %input2, i64 %add.2
  %31 = load double, ptr %arrayidx2.2, align 8
  %mul3.2 = fmul double %30, %31
  %arrayidx5.2 = getelementptr inbounds double, ptr %output, i64 %add.2
  %32 = load double, ptr %arrayidx5.2, align 8
  %add6.2 = fadd double %32, %mul3.2
  store double %add6.2, ptr %arrayidx5.2, align 8
  %add7.2 = add nsw i64 %add23.1, %inc2
  %arrayidx8.2 = getelementptr inbounds double, ptr %input1, i64 %add7.2
  %33 = load double, ptr %arrayidx8.2, align 8
  %arrayidx10.2 = getelementptr inbounds double, ptr %input2, i64 %add7.2
  %34 = load double, ptr %arrayidx10.2, align 8
  %mul11.2 = fmul double %33, %34
  %arrayidx13.2 = getelementptr inbounds double, ptr %output, i64 %add7.2
  %35 = load double, ptr %arrayidx13.2, align 8
  %add14.2 = fadd double %35, %mul11.2
  store double %add14.2, ptr %arrayidx13.2, align 8
  %add15.2 = add nsw i64 %add23.1, %inc3
  %arrayidx16.2 = getelementptr inbounds double, ptr %input1, i64 %add15.2
  %36 = load double, ptr %arrayidx16.2, align 8
  %arrayidx18.2 = getelementptr inbounds double, ptr %input2, i64 %add15.2
  %37 = load double, ptr %arrayidx18.2, align 8
  %mul19.2 = fmul double %36, %37
  %arrayidx21.2 = getelementptr inbounds double, ptr %output, i64 %add15.2
  %38 = load double, ptr %arrayidx21.2, align 8
  %add22.2 = fadd double %38, %mul19.2
  store double %add22.2, ptr %arrayidx21.2, align 8
  %add23.2 = add nsw i64 %add23.1, %inc4
  %add.3 = add nsw i64 %add23.2, %inc1
  %arrayidx.3 = getelementptr inbounds double, ptr %input1, i64 %add.3
  %39 = load double, ptr %arrayidx.3, align 8
  %arrayidx2.3 = getelementptr inbounds double, ptr %input2, i64 %add.3
  %40 = load double, ptr %arrayidx2.3, align 8
  %mul3.3 = fmul double %39, %40
  %arrayidx5.3 = getelementptr inbounds double, ptr %output, i64 %add.3
  %41 = load double, ptr %arrayidx5.3, align 8
  %add6.3 = fadd double %41, %mul3.3
  store double %add6.3, ptr %arrayidx5.3, align 8
  %add7.3 = add nsw i64 %add23.2, %inc2
  %arrayidx8.3 = getelementptr inbounds double, ptr %input1, i64 %add7.3
  %42 = load double, ptr %arrayidx8.3, align 8
  %arrayidx10.3 = getelementptr inbounds double, ptr %input2, i64 %add7.3
  %43 = load double, ptr %arrayidx10.3, align 8
  %mul11.3 = fmul double %42, %43
  %arrayidx13.3 = getelementptr inbounds double, ptr %output, i64 %add7.3
  %44 = load double, ptr %arrayidx13.3, align 8
  %add14.3 = fadd double %44, %mul11.3
  store double %add14.3, ptr %arrayidx13.3, align 8
  %add15.3 = add nsw i64 %add23.2, %inc3
  %arrayidx16.3 = getelementptr inbounds double, ptr %input1, i64 %add15.3
  %45 = load double, ptr %arrayidx16.3, align 8
  %arrayidx18.3 = getelementptr inbounds double, ptr %input2, i64 %add15.3
  %46 = load double, ptr %arrayidx18.3, align 8
  %mul19.3 = fmul double %45, %46
  %arrayidx21.3 = getelementptr inbounds double, ptr %output, i64 %add15.3
  %47 = load double, ptr %arrayidx21.3, align 8
  %add22.3 = fadd double %47, %mul19.3
  store double %add22.3, ptr %arrayidx21.3, align 8
  %add23.3 = add nsw i64 %add23.2, %inc4
  %niter.nsub.3 = add i64 %niter, -4
  %niter.ncmp.3.not = icmp eq i64 %niter.nsub.3, 0
  br i1 %niter.ncmp.3.not, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
}

declare i64 @llvm.smax.i64(i64, i64)