1 /* Loop interchange. 2 Copyright (C) 2017-2020 Free Software Foundation, Inc. 3 Contributed by ARM Ltd. 4 5 This file is part of GCC. 6 7 GCC is free software; you can redistribute it and/or modify it 8 under the terms of the GNU General Public License as published by the 9 Free Software Foundation; either version 3, or (at your option) any 10 later version. 11 12 GCC is distributed in the hope that it will be useful, but WITHOUT 13 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with GCC; see the file COPYING3. If not see 19 <http://www.gnu.org/licenses/>. */ 20 21 #include "config.h" 22 #include "system.h" 23 #include "coretypes.h" 24 #include "backend.h" 25 #include "is-a.h" 26 #include "tree.h" 27 #include "gimple.h" 28 #include "tree-pass.h" 29 #include "ssa.h" 30 #include "gimple-pretty-print.h" 31 #include "fold-const.h" 32 #include "gimplify.h" 33 #include "gimple-iterator.h" 34 #include "gimplify-me.h" 35 #include "cfgloop.h" 36 #include "tree-ssa.h" 37 #include "tree-scalar-evolution.h" 38 #include "tree-ssa-loop-manip.h" 39 #include "tree-ssa-loop-niter.h" 40 #include "tree-ssa-loop-ivopts.h" 41 #include "tree-ssa-dce.h" 42 #include "tree-data-ref.h" 43 #include "tree-vectorizer.h" 44 45 /* This pass performs loop interchange: for example, the loop nest 46 47 for (int j = 0; j < N; j++) 48 for (int k = 0; k < N; k++) 49 for (int i = 0; i < N; i++) 50 c[i][j] = c[i][j] + a[i][k]*b[k][j]; 51 52 is transformed to 53 54 for (int i = 0; i < N; i++) 55 for (int j = 0; j < N; j++) 56 for (int k = 0; k < N; k++) 57 c[i][j] = c[i][j] + a[i][k]*b[k][j]; 58 59 This pass implements loop interchange in the following steps: 60 61 1) Find perfect loop nest for each innermost loop and compute data 62 dependence relations for it. For above example, loop nest is 63 <loop_j, loop_k, loop_i>. 64 2) From innermost to outermost loop, this pass tries to interchange 65 each loop pair. For above case, it firstly tries to interchange 66 <loop_k, loop_i> and loop nest becomes <loop_j, loop_i, loop_k>. 67 Then it tries to interchange <loop_j, loop_i> and loop nest becomes 68 <loop_i, loop_j, loop_k>. The overall effect is to move innermost 69 loop to the outermost position. For loop pair <loop_i, loop_j> 70 to be interchanged, we: 71 3) Check if data dependence relations are valid for loop interchange. 72 4) Check if both loops can be interchanged in terms of transformation. 73 5) Check if interchanging the two loops is profitable. 74 6) Interchange the two loops by mapping induction variables. 75 76 This pass also handles reductions in loop nest. So far we only support 77 simple reduction of inner loop and double reduction of the loop nest. */ 78 79 /* Maximum number of stmts in each loop that should be interchanged. */ 80 #define MAX_NUM_STMT (param_loop_interchange_max_num_stmts) 81 /* Maximum number of data references in loop nest. */ 82 #define MAX_DATAREFS (param_loop_max_datarefs_for_datadeps) 83 84 /* Comparison ratio of access stride between inner/outer loops to be 85 interchanged. This is the minimum stride ratio for loop interchange 86 to be profitable. */ 87 #define OUTER_STRIDE_RATIO (param_loop_interchange_stride_ratio) 88 /* The same as above, but we require higher ratio for interchanging the 89 innermost two loops. */ 90 #define INNER_STRIDE_RATIO ((OUTER_STRIDE_RATIO) + 1) 91 92 /* Comparison ratio of stmt cost between inner/outer loops. Loops won't 93 be interchanged if outer loop has too many stmts. */ 94 #define STMT_COST_RATIO (3) 95 96 /* Vector of strides that DR accesses in each level loop of a loop nest. */ 97 #define DR_ACCESS_STRIDE(dr) ((vec<tree> *) dr->aux) 98 99 /* Structure recording loop induction variable. */ 100 typedef struct induction 101 { 102 /* IV itself. */ 103 tree var; 104 /* IV's initializing value, which is the init arg of the IV PHI node. */ 105 tree init_val; 106 /* IV's initializing expr, which is (the expanded result of) init_val. */ 107 tree init_expr; 108 /* IV's step. */ 109 tree step; 110 } *induction_p; 111 112 /* Enum type for loop reduction variable. */ 113 enum reduction_type 114 { 115 UNKNOWN_RTYPE = 0, 116 SIMPLE_RTYPE, 117 DOUBLE_RTYPE 118 }; 119 120 /* Structure recording loop reduction variable. */ 121 typedef struct reduction 122 { 123 /* Reduction itself. */ 124 tree var; 125 /* PHI node defining reduction variable. */ 126 gphi *phi; 127 /* Init and next variables of the reduction. */ 128 tree init; 129 tree next; 130 /* Lcssa PHI node if reduction is used outside of its definition loop. */ 131 gphi *lcssa_phi; 132 /* Stmts defining init and next. */ 133 gimple *producer; 134 gimple *consumer; 135 /* If init is loaded from memory, this is the loading memory reference. */ 136 tree init_ref; 137 /* If reduction is finally stored to memory, this is the stored memory 138 reference. */ 139 tree fini_ref; 140 enum reduction_type type; 141 } *reduction_p; 142 143 144 /* Dump reduction RE. */ 145 146 static void 147 dump_reduction (reduction_p re) 148 { 149 if (re->type == SIMPLE_RTYPE) 150 fprintf (dump_file, " Simple reduction: "); 151 else if (re->type == DOUBLE_RTYPE) 152 fprintf (dump_file, " Double reduction: "); 153 else 154 fprintf (dump_file, " Unknown reduction: "); 155 156 print_gimple_stmt (dump_file, re->phi, 0); 157 } 158 159 /* Dump LOOP's induction IV. */ 160 static void 161 dump_induction (class loop *loop, induction_p iv) 162 { 163 fprintf (dump_file, " Induction: "); 164 print_generic_expr (dump_file, iv->var, TDF_SLIM); 165 fprintf (dump_file, " = {"); 166 print_generic_expr (dump_file, iv->init_expr, TDF_SLIM); 167 fprintf (dump_file, ", "); 168 print_generic_expr (dump_file, iv->step, TDF_SLIM); 169 fprintf (dump_file, "}_%d\n", loop->num); 170 } 171 172 /* Loop candidate for interchange. */ 173 174 class loop_cand 175 { 176 public: 177 loop_cand (class loop *, class loop *); 178 ~loop_cand (); 179 180 reduction_p find_reduction_by_stmt (gimple *); 181 void classify_simple_reduction (reduction_p); 182 bool analyze_iloop_reduction_var (tree); 183 bool analyze_oloop_reduction_var (loop_cand *, tree); 184 bool analyze_induction_var (tree, tree); 185 bool analyze_carried_vars (loop_cand *); 186 bool analyze_lcssa_phis (void); 187 bool can_interchange_p (loop_cand *); 188 void undo_simple_reduction (reduction_p, bitmap); 189 190 /* The loop itself. */ 191 class loop *m_loop; 192 /* The outer loop for interchange. It equals to loop if this loop cand 193 itself represents the outer loop. */ 194 class loop *m_outer; 195 /* Vector of induction variables in loop. */ 196 vec<induction_p> m_inductions; 197 /* Vector of reduction variables in loop. */ 198 vec<reduction_p> m_reductions; 199 /* Lcssa PHI nodes of this loop. */ 200 vec<gphi *> m_lcssa_nodes; 201 /* Single exit edge of this loop. */ 202 edge m_exit; 203 /* Basic blocks of this loop. */ 204 basic_block *m_bbs; 205 /* Number of stmts of this loop. Inner loops' stmts are not included. */ 206 int m_num_stmts; 207 /* Number of constant initialized simple reduction. */ 208 int m_const_init_reduc; 209 }; 210 211 /* Constructor. */ 212 213 loop_cand::loop_cand (class loop *loop, class loop *outer) 214 : m_loop (loop), m_outer (outer), m_exit (single_exit (loop)), 215 m_bbs (get_loop_body (loop)), m_num_stmts (0), m_const_init_reduc (0) 216 { 217 m_inductions.create (3); 218 m_reductions.create (3); 219 m_lcssa_nodes.create (3); 220 } 221 222 /* Destructor. */ 223 224 loop_cand::~loop_cand () 225 { 226 induction_p iv; 227 for (unsigned i = 0; m_inductions.iterate (i, &iv); ++i) 228 free (iv); 229 230 reduction_p re; 231 for (unsigned i = 0; m_reductions.iterate (i, &re); ++i) 232 free (re); 233 234 m_inductions.release (); 235 m_reductions.release (); 236 m_lcssa_nodes.release (); 237 free (m_bbs); 238 } 239 240 /* Return single use stmt of VAR in LOOP, otherwise return NULL. */ 241 242 static gimple * 243 single_use_in_loop (tree var, class loop *loop) 244 { 245 gimple *stmt, *res = NULL; 246 use_operand_p use_p; 247 imm_use_iterator iterator; 248 249 FOR_EACH_IMM_USE_FAST (use_p, iterator, var) 250 { 251 stmt = USE_STMT (use_p); 252 if (is_gimple_debug (stmt)) 253 continue; 254 255 if (!flow_bb_inside_loop_p (loop, gimple_bb (stmt))) 256 continue; 257 258 if (res) 259 return NULL; 260 261 res = stmt; 262 } 263 return res; 264 } 265 266 /* Return true if E is unsupported in loop interchange, i.e, E is a complex 267 edge or part of irreducible loop. */ 268 269 static inline bool 270 unsupported_edge (edge e) 271 { 272 return (e->flags & (EDGE_COMPLEX | EDGE_IRREDUCIBLE_LOOP)); 273 } 274 275 /* Return the reduction if STMT is one of its lcssa PHI, producer or consumer 276 stmt. */ 277 278 reduction_p 279 loop_cand::find_reduction_by_stmt (gimple *stmt) 280 { 281 gphi *phi = dyn_cast <gphi *> (stmt); 282 reduction_p re; 283 284 for (unsigned i = 0; m_reductions.iterate (i, &re); ++i) 285 if ((phi != NULL && phi == re->lcssa_phi) 286 || (stmt == re->producer || stmt == re->consumer)) 287 return re; 288 289 return NULL; 290 } 291 292 /* Return true if current loop_cand be interchanged. ILOOP is not NULL if 293 current loop_cand is outer loop in loop nest. */ 294 295 bool 296 loop_cand::can_interchange_p (loop_cand *iloop) 297 { 298 /* For now we only support at most one reduction. */ 299 unsigned allowed_reduction_num = 1; 300 301 /* Only support reduction if the loop nest to be interchanged is the 302 innermostin two loops. */ 303 if ((iloop == NULL && m_loop->inner != NULL) 304 || (iloop != NULL && iloop->m_loop->inner != NULL)) 305 allowed_reduction_num = 0; 306 307 if (m_reductions.length () > allowed_reduction_num 308 || (m_reductions.length () == 1 309 && m_reductions[0]->type == UNKNOWN_RTYPE)) 310 return false; 311 312 /* Only support lcssa PHI node which is for reduction. */ 313 if (m_lcssa_nodes.length () > allowed_reduction_num) 314 return false; 315 316 /* Check if basic block has any unsupported operation. Note basic blocks 317 of inner loops are not checked here. */ 318 for (unsigned i = 0; i < m_loop->num_nodes; i++) 319 { 320 basic_block bb = m_bbs[i]; 321 gphi_iterator psi; 322 gimple_stmt_iterator gsi; 323 324 /* Skip basic blocks of inner loops. */ 325 if (bb->loop_father != m_loop) 326 continue; 327 328 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 329 { 330 gimple *stmt = gsi_stmt (gsi); 331 if (is_gimple_debug (stmt)) 332 continue; 333 334 if (gimple_has_side_effects (stmt)) 335 return false; 336 337 m_num_stmts++; 338 if (gcall *call = dyn_cast <gcall *> (stmt)) 339 { 340 /* In basic block of outer loop, the call should be cheap since 341 it will be moved to inner loop. */ 342 if (iloop != NULL 343 && !gimple_inexpensive_call_p (call)) 344 return false; 345 continue; 346 } 347 348 if (!iloop || !gimple_vuse (stmt)) 349 continue; 350 351 /* Support stmt accessing memory in outer loop only if it is for 352 inner loop's reduction. */ 353 if (iloop->find_reduction_by_stmt (stmt)) 354 continue; 355 356 tree lhs; 357 /* Support loop invariant memory reference if it's only used once by 358 inner loop. */ 359 /* ??? How's this checking for invariantness? */ 360 if (gimple_assign_single_p (stmt) 361 && (lhs = gimple_assign_lhs (stmt)) != NULL_TREE 362 && TREE_CODE (lhs) == SSA_NAME 363 && single_use_in_loop (lhs, iloop->m_loop)) 364 continue; 365 366 return false; 367 } 368 /* Check if loop has too many stmts. */ 369 if (m_num_stmts > MAX_NUM_STMT) 370 return false; 371 372 /* Allow PHI nodes in any basic block of inner loop, PHI nodes in outer 373 loop's header, or PHI nodes in dest bb of inner loop's exit edge. */ 374 if (!iloop || bb == m_loop->header 375 || bb == iloop->m_exit->dest) 376 continue; 377 378 /* Don't allow any other PHI nodes. */ 379 for (psi = gsi_start_phis (bb); !gsi_end_p (psi); gsi_next (&psi)) 380 if (!virtual_operand_p (PHI_RESULT (psi.phi ()))) 381 return false; 382 } 383 384 return true; 385 } 386 387 /* Programmers and optimizers (like loop store motion) may optimize code: 388 389 for (int i = 0; i < N; i++) 390 for (int j = 0; j < N; j++) 391 a[i] += b[j][i] * c[j][i]; 392 393 into reduction: 394 395 for (int i = 0; i < N; i++) 396 { 397 // producer. Note sum can be intitialized to a constant. 398 int sum = a[i]; 399 for (int j = 0; j < N; j++) 400 { 401 sum += b[j][i] * c[j][i]; 402 } 403 // consumer. 404 a[i] = sum; 405 } 406 407 The result code can't be interchanged without undoing the optimization. 408 This function classifies this kind reduction and records information so 409 that we can undo the store motion during interchange. */ 410 411 void 412 loop_cand::classify_simple_reduction (reduction_p re) 413 { 414 gimple *producer, *consumer; 415 416 /* Check init variable of reduction and how it is initialized. */ 417 if (TREE_CODE (re->init) == SSA_NAME) 418 { 419 producer = SSA_NAME_DEF_STMT (re->init); 420 re->producer = producer; 421 basic_block bb = gimple_bb (producer); 422 if (!bb || bb->loop_father != m_outer) 423 return; 424 425 if (!gimple_assign_load_p (producer)) 426 return; 427 428 re->init_ref = gimple_assign_rhs1 (producer); 429 } 430 else if (CONSTANT_CLASS_P (re->init)) 431 m_const_init_reduc++; 432 else 433 return; 434 435 /* Check how reduction variable is used. */ 436 consumer = single_use_in_loop (PHI_RESULT (re->lcssa_phi), m_outer); 437 if (!consumer 438 || !gimple_store_p (consumer)) 439 return; 440 441 re->fini_ref = gimple_get_lhs (consumer); 442 re->consumer = consumer; 443 444 /* Simple reduction with constant initializer. */ 445 if (!re->init_ref) 446 { 447 gcc_assert (CONSTANT_CLASS_P (re->init)); 448 re->init_ref = unshare_expr (re->fini_ref); 449 } 450 451 /* Require memory references in producer and consumer are the same so 452 that we can undo reduction during interchange. */ 453 if (re->init_ref && !operand_equal_p (re->init_ref, re->fini_ref, 0)) 454 return; 455 456 re->type = SIMPLE_RTYPE; 457 } 458 459 /* Analyze reduction variable VAR for inner loop of the loop nest to be 460 interchanged. Return true if analysis succeeds. */ 461 462 bool 463 loop_cand::analyze_iloop_reduction_var (tree var) 464 { 465 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var)); 466 gphi *lcssa_phi = NULL, *use_phi; 467 tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop)); 468 tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop)); 469 reduction_p re; 470 gimple *stmt, *next_def, *single_use = NULL; 471 use_operand_p use_p; 472 imm_use_iterator iterator; 473 474 if (TREE_CODE (next) != SSA_NAME) 475 return false; 476 477 next_def = SSA_NAME_DEF_STMT (next); 478 basic_block bb = gimple_bb (next_def); 479 if (!bb || !flow_bb_inside_loop_p (m_loop, bb)) 480 return false; 481 482 /* In restricted reduction, the var is (and must be) used in defining 483 the updated var. The process can be depicted as below: 484 485 var ;; = PHI<init, next> 486 | 487 | 488 v 489 +---------------------+ 490 | reduction operators | <-- other operands 491 +---------------------+ 492 | 493 | 494 v 495 next 496 497 In terms loop interchange, we don't change how NEXT is computed based 498 on VAR and OTHER OPERANDS. In case of double reduction in loop nest 499 to be interchanged, we don't changed it at all. In the case of simple 500 reduction in inner loop, we only make change how VAR/NEXT is loaded or 501 stored. With these conditions, we can relax restrictions on reduction 502 in a way that reduction operation is seen as black box. In general, 503 we can ignore reassociation of reduction operator; we can handle fake 504 reductions in which VAR is not even used to compute NEXT. */ 505 if (! single_imm_use (var, &use_p, &single_use) 506 || ! flow_bb_inside_loop_p (m_loop, gimple_bb (single_use))) 507 return false; 508 509 /* Check the reduction operation. We require a left-associative operation. 510 For FP math we also need to be allowed to associate operations. */ 511 if (gassign *ass = dyn_cast <gassign *> (single_use)) 512 { 513 enum tree_code code = gimple_assign_rhs_code (ass); 514 if (! (associative_tree_code (code) 515 || (code == MINUS_EXPR 516 && use_p->use == gimple_assign_rhs1_ptr (ass))) 517 || (FLOAT_TYPE_P (TREE_TYPE (var)) 518 && ! flag_associative_math)) 519 return false; 520 } 521 else 522 return false; 523 524 /* Handle and verify a series of stmts feeding the reduction op. */ 525 if (single_use != next_def 526 && !check_reduction_path (dump_user_location_t (), m_loop, phi, next, 527 gimple_assign_rhs_code (single_use))) 528 return false; 529 530 /* Only support cases in which INIT is used in inner loop. */ 531 if (TREE_CODE (init) == SSA_NAME) 532 FOR_EACH_IMM_USE_FAST (use_p, iterator, init) 533 { 534 stmt = USE_STMT (use_p); 535 if (is_gimple_debug (stmt)) 536 continue; 537 538 if (!flow_bb_inside_loop_p (m_loop, gimple_bb (stmt))) 539 return false; 540 } 541 542 FOR_EACH_IMM_USE_FAST (use_p, iterator, next) 543 { 544 stmt = USE_STMT (use_p); 545 if (is_gimple_debug (stmt)) 546 continue; 547 548 /* Or else it's used in PHI itself. */ 549 use_phi = dyn_cast <gphi *> (stmt); 550 if (use_phi == phi) 551 continue; 552 553 if (use_phi != NULL 554 && lcssa_phi == NULL 555 && gimple_bb (stmt) == m_exit->dest 556 && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next) 557 lcssa_phi = use_phi; 558 else 559 return false; 560 } 561 if (!lcssa_phi) 562 return false; 563 564 re = XCNEW (struct reduction); 565 re->var = var; 566 re->init = init; 567 re->next = next; 568 re->phi = phi; 569 re->lcssa_phi = lcssa_phi; 570 571 classify_simple_reduction (re); 572 573 if (dump_file && (dump_flags & TDF_DETAILS)) 574 dump_reduction (re); 575 576 m_reductions.safe_push (re); 577 return true; 578 } 579 580 /* Analyze reduction variable VAR for outer loop of the loop nest to be 581 interchanged. ILOOP is not NULL and points to inner loop. For the 582 moment, we only support double reduction for outer loop, like: 583 584 for (int i = 0; i < n; i++) 585 { 586 int sum = 0; 587 588 for (int j = 0; j < n; j++) // outer loop 589 for (int k = 0; k < n; k++) // inner loop 590 sum += a[i][k]*b[k][j]; 591 592 s[i] = sum; 593 } 594 595 Note the innermost two loops are the loop nest to be interchanged. 596 Return true if analysis succeeds. */ 597 598 bool 599 loop_cand::analyze_oloop_reduction_var (loop_cand *iloop, tree var) 600 { 601 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var)); 602 gphi *lcssa_phi = NULL, *use_phi; 603 tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop)); 604 tree next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (m_loop)); 605 reduction_p re; 606 gimple *stmt, *next_def; 607 use_operand_p use_p; 608 imm_use_iterator iterator; 609 610 if (TREE_CODE (next) != SSA_NAME) 611 return false; 612 613 next_def = SSA_NAME_DEF_STMT (next); 614 basic_block bb = gimple_bb (next_def); 615 if (!bb || !flow_bb_inside_loop_p (m_loop, bb)) 616 return false; 617 618 /* Find inner loop's simple reduction that uses var as initializer. */ 619 reduction_p inner_re = NULL; 620 for (unsigned i = 0; iloop->m_reductions.iterate (i, &inner_re); ++i) 621 if (inner_re->init == var || operand_equal_p (inner_re->init, var, 0)) 622 break; 623 624 if (inner_re == NULL 625 || inner_re->type != UNKNOWN_RTYPE 626 || inner_re->producer != phi) 627 return false; 628 629 /* In case of double reduction, outer loop's reduction should be updated 630 by inner loop's simple reduction. */ 631 if (next_def != inner_re->lcssa_phi) 632 return false; 633 634 /* Outer loop's reduction should only be used to initialize inner loop's 635 simple reduction. */ 636 if (! single_imm_use (var, &use_p, &stmt) 637 || stmt != inner_re->phi) 638 return false; 639 640 /* Check this reduction is correctly used outside of loop via lcssa phi. */ 641 FOR_EACH_IMM_USE_FAST (use_p, iterator, next) 642 { 643 stmt = USE_STMT (use_p); 644 if (is_gimple_debug (stmt)) 645 continue; 646 647 /* Or else it's used in PHI itself. */ 648 use_phi = dyn_cast <gphi *> (stmt); 649 if (use_phi == phi) 650 continue; 651 652 if (lcssa_phi == NULL 653 && use_phi != NULL 654 && gimple_bb (stmt) == m_exit->dest 655 && PHI_ARG_DEF_FROM_EDGE (use_phi, m_exit) == next) 656 lcssa_phi = use_phi; 657 else 658 return false; 659 } 660 if (!lcssa_phi) 661 return false; 662 663 re = XCNEW (struct reduction); 664 re->var = var; 665 re->init = init; 666 re->next = next; 667 re->phi = phi; 668 re->lcssa_phi = lcssa_phi; 669 re->type = DOUBLE_RTYPE; 670 inner_re->type = DOUBLE_RTYPE; 671 672 if (dump_file && (dump_flags & TDF_DETAILS)) 673 dump_reduction (re); 674 675 m_reductions.safe_push (re); 676 return true; 677 } 678 679 /* Return true if VAR is induction variable of current loop whose scev is 680 specified by CHREC. */ 681 682 bool 683 loop_cand::analyze_induction_var (tree var, tree chrec) 684 { 685 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (var)); 686 tree init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (m_loop)); 687 688 /* Var is loop invariant, though it's unlikely to happen. */ 689 if (tree_does_not_contain_chrecs (chrec)) 690 { 691 /* Punt on floating point invariants if honoring signed zeros, 692 representing that as + 0.0 would change the result if init 693 is -0.0. Similarly for SNaNs it can raise exception. */ 694 if (HONOR_SIGNED_ZEROS (chrec) || HONOR_SNANS (chrec)) 695 return false; 696 struct induction *iv = XCNEW (struct induction); 697 iv->var = var; 698 iv->init_val = init; 699 iv->init_expr = chrec; 700 iv->step = build_zero_cst (TREE_TYPE (chrec)); 701 m_inductions.safe_push (iv); 702 return true; 703 } 704 705 if (TREE_CODE (chrec) != POLYNOMIAL_CHREC 706 || CHREC_VARIABLE (chrec) != (unsigned) m_loop->num 707 || tree_contains_chrecs (CHREC_LEFT (chrec), NULL) 708 || tree_contains_chrecs (CHREC_RIGHT (chrec), NULL)) 709 return false; 710 711 struct induction *iv = XCNEW (struct induction); 712 iv->var = var; 713 iv->init_val = init; 714 iv->init_expr = CHREC_LEFT (chrec); 715 iv->step = CHREC_RIGHT (chrec); 716 717 if (dump_file && (dump_flags & TDF_DETAILS)) 718 dump_induction (m_loop, iv); 719 720 m_inductions.safe_push (iv); 721 return true; 722 } 723 724 /* Return true if all loop carried variables defined in loop header can 725 be successfully analyzed. */ 726 727 bool 728 loop_cand::analyze_carried_vars (loop_cand *iloop) 729 { 730 edge e = loop_preheader_edge (m_outer); 731 gphi_iterator gsi; 732 733 if (dump_file && (dump_flags & TDF_DETAILS)) 734 fprintf (dump_file, "\nLoop(%d) carried vars:\n", m_loop->num); 735 736 for (gsi = gsi_start_phis (m_loop->header); !gsi_end_p (gsi); gsi_next (&gsi)) 737 { 738 gphi *phi = gsi.phi (); 739 740 tree var = PHI_RESULT (phi); 741 if (virtual_operand_p (var)) 742 continue; 743 744 tree chrec = analyze_scalar_evolution (m_loop, var); 745 chrec = instantiate_scev (e, m_loop, chrec); 746 747 /* Analyze var as reduction variable. */ 748 if (chrec_contains_undetermined (chrec) 749 || chrec_contains_symbols_defined_in_loop (chrec, m_outer->num)) 750 { 751 if (iloop && !analyze_oloop_reduction_var (iloop, var)) 752 return false; 753 if (!iloop && !analyze_iloop_reduction_var (var)) 754 return false; 755 } 756 /* Analyze var as induction variable. */ 757 else if (!analyze_induction_var (var, chrec)) 758 return false; 759 } 760 761 return true; 762 } 763 764 /* Return TRUE if loop closed PHI nodes can be analyzed successfully. */ 765 766 bool 767 loop_cand::analyze_lcssa_phis (void) 768 { 769 gphi_iterator gsi; 770 for (gsi = gsi_start_phis (m_exit->dest); !gsi_end_p (gsi); gsi_next (&gsi)) 771 { 772 gphi *phi = gsi.phi (); 773 774 if (virtual_operand_p (PHI_RESULT (phi))) 775 continue; 776 777 /* TODO: We only support lcssa phi for reduction for now. */ 778 if (!find_reduction_by_stmt (phi)) 779 return false; 780 } 781 782 return true; 783 } 784 785 /* CONSUMER is a stmt in BB storing reduction result into memory object. 786 When the reduction is intialized from constant value, we need to add 787 a stmt loading from the memory object to target basic block in inner 788 loop during undoing the reduction. Problem is that memory reference 789 may use ssa variables not dominating the target basic block. This 790 function finds all stmts on which CONSUMER depends in basic block BB, 791 records and returns them via STMTS. */ 792 793 static void 794 find_deps_in_bb_for_stmt (gimple_seq *stmts, basic_block bb, gimple *consumer) 795 { 796 auto_vec<gimple *, 4> worklist; 797 use_operand_p use_p; 798 ssa_op_iter iter; 799 gimple *stmt, *def_stmt; 800 gimple_stmt_iterator gsi; 801 802 /* First clear flag for stmts in bb. */ 803 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 804 gimple_set_plf (gsi_stmt (gsi), GF_PLF_1, false); 805 806 /* DFS search all depended stmts in bb and mark flag for these stmts. */ 807 worklist.safe_push (consumer); 808 while (!worklist.is_empty ()) 809 { 810 stmt = worklist.pop (); 811 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE) 812 { 813 def_stmt = SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p)); 814 815 if (is_a <gphi *> (def_stmt) 816 || gimple_bb (def_stmt) != bb 817 || gimple_plf (def_stmt, GF_PLF_1)) 818 continue; 819 820 worklist.safe_push (def_stmt); 821 } 822 gimple_set_plf (stmt, GF_PLF_1, true); 823 } 824 for (gsi = gsi_start_nondebug_bb (bb); 825 !gsi_end_p (gsi) && (stmt = gsi_stmt (gsi)) != consumer;) 826 { 827 /* Move dep stmts to sequence STMTS. */ 828 if (gimple_plf (stmt, GF_PLF_1)) 829 { 830 gsi_remove (&gsi, false); 831 gimple_seq_add_stmt_without_update (stmts, stmt); 832 } 833 else 834 gsi_next_nondebug (&gsi); 835 } 836 } 837 838 /* User can write, optimizers can generate simple reduction RE for inner 839 loop. In order to make interchange valid, we have to undo reduction by 840 moving producer and consumer stmts into the inner loop. For example, 841 below code: 842 843 init = MEM_REF[idx]; //producer 844 loop: 845 var = phi<init, next> 846 next = var op ... 847 reduc_sum = phi<next> 848 MEM_REF[idx] = reduc_sum //consumer 849 850 is transformed into: 851 852 loop: 853 new_var = MEM_REF[idx]; //producer after moving 854 next = new_var op ... 855 MEM_REF[idx] = next; //consumer after moving 856 857 Note if the reduction variable is initialized to constant, like: 858 859 var = phi<0.0, next> 860 861 we compute new_var as below: 862 863 loop: 864 tmp = MEM_REF[idx]; 865 new_var = !first_iteration ? tmp : 0.0; 866 867 so that the initial const is used in the first iteration of loop. Also 868 record ssa variables for dead code elimination in DCE_SEEDS. */ 869 870 void 871 loop_cand::undo_simple_reduction (reduction_p re, bitmap dce_seeds) 872 { 873 gimple *stmt; 874 gimple_stmt_iterator from, to = gsi_after_labels (m_loop->header); 875 gimple_seq stmts = NULL; 876 tree new_var; 877 878 /* Prepare the initialization stmts and insert it to inner loop. */ 879 if (re->producer != NULL) 880 { 881 gimple_set_vuse (re->producer, NULL_TREE); 882 update_stmt (re->producer); 883 from = gsi_for_stmt (re->producer); 884 gsi_remove (&from, false); 885 gimple_seq_add_stmt_without_update (&stmts, re->producer); 886 new_var = re->init; 887 } 888 else 889 { 890 /* Find all stmts on which expression "MEM_REF[idx]" depends. */ 891 find_deps_in_bb_for_stmt (&stmts, gimple_bb (re->consumer), re->consumer); 892 /* Because we generate new stmt loading from the MEM_REF to TMP. */ 893 tree cond, tmp = copy_ssa_name (re->var); 894 stmt = gimple_build_assign (tmp, re->init_ref); 895 gimple_seq_add_stmt_without_update (&stmts, stmt); 896 897 /* Init new_var to MEM_REF or CONST depending on if it is the first 898 iteration. */ 899 induction_p iv = m_inductions[0]; 900 cond = fold_build2 (NE_EXPR, boolean_type_node, iv->var, iv->init_val); 901 new_var = copy_ssa_name (re->var); 902 stmt = gimple_build_assign (new_var, COND_EXPR, cond, tmp, re->init); 903 gimple_seq_add_stmt_without_update (&stmts, stmt); 904 } 905 gsi_insert_seq_before (&to, stmts, GSI_SAME_STMT); 906 907 /* Replace all uses of reduction var with new variable. */ 908 use_operand_p use_p; 909 imm_use_iterator iterator; 910 FOR_EACH_IMM_USE_STMT (stmt, iterator, re->var) 911 { 912 FOR_EACH_IMM_USE_ON_STMT (use_p, iterator) 913 SET_USE (use_p, new_var); 914 915 update_stmt (stmt); 916 } 917 918 /* Move consumer stmt into inner loop, just after reduction next's def. */ 919 unlink_stmt_vdef (re->consumer); 920 release_ssa_name (gimple_vdef (re->consumer)); 921 gimple_set_vdef (re->consumer, NULL_TREE); 922 gimple_set_vuse (re->consumer, NULL_TREE); 923 gimple_assign_set_rhs1 (re->consumer, re->next); 924 update_stmt (re->consumer); 925 from = gsi_for_stmt (re->consumer); 926 to = gsi_for_stmt (SSA_NAME_DEF_STMT (re->next)); 927 gsi_move_after (&from, &to); 928 929 /* Mark the reduction variables for DCE. */ 930 bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (re->var)); 931 bitmap_set_bit (dce_seeds, SSA_NAME_VERSION (PHI_RESULT (re->lcssa_phi))); 932 } 933 934 /* Free DATAREFS and its auxiliary memory. */ 935 936 static void 937 free_data_refs_with_aux (vec<data_reference_p> datarefs) 938 { 939 data_reference_p dr; 940 for (unsigned i = 0; datarefs.iterate (i, &dr); ++i) 941 if (dr->aux != NULL) 942 { 943 DR_ACCESS_STRIDE (dr)->release (); 944 delete (vec<tree> *) dr->aux; 945 } 946 947 free_data_refs (datarefs); 948 } 949 950 /* Class for loop interchange transformation. */ 951 952 class tree_loop_interchange 953 { 954 public: 955 tree_loop_interchange (vec<class loop *> loop_nest) 956 : m_loop_nest (loop_nest), m_niters_iv_var (NULL_TREE), 957 m_dce_seeds (BITMAP_ALLOC (NULL)) { } 958 ~tree_loop_interchange () { BITMAP_FREE (m_dce_seeds); } 959 bool interchange (vec<data_reference_p>, vec<ddr_p>); 960 961 private: 962 void update_data_info (unsigned, unsigned, vec<data_reference_p>, vec<ddr_p>); 963 bool valid_data_dependences (unsigned, unsigned, vec<ddr_p>); 964 void interchange_loops (loop_cand &, loop_cand &); 965 void map_inductions_to_loop (loop_cand &, loop_cand &); 966 void move_code_to_inner_loop (class loop *, class loop *, basic_block *); 967 968 /* The whole loop nest in which interchange is ongoing. */ 969 vec<class loop *> m_loop_nest; 970 /* We create new IV which is only used in loop's exit condition check. 971 In case of 3-level loop nest interchange, when we interchange the 972 innermost two loops, new IV created in the middle level loop does 973 not need to be preserved in interchanging the outermost two loops 974 later. We record the IV so that it can be skipped. */ 975 tree m_niters_iv_var; 976 /* Bitmap of seed variables for dead code elimination after interchange. */ 977 bitmap m_dce_seeds; 978 }; 979 980 /* Update data refs' access stride and dependence information after loop 981 interchange. I_IDX/O_IDX gives indices of interchanged loops in loop 982 nest. DATAREFS are data references. DDRS are data dependences. */ 983 984 void 985 tree_loop_interchange::update_data_info (unsigned i_idx, unsigned o_idx, 986 vec<data_reference_p> datarefs, 987 vec<ddr_p> ddrs) 988 { 989 struct data_reference *dr; 990 struct data_dependence_relation *ddr; 991 992 /* Update strides of data references. */ 993 for (unsigned i = 0; datarefs.iterate (i, &dr); ++i) 994 { 995 vec<tree> *stride = DR_ACCESS_STRIDE (dr); 996 gcc_assert (stride->length () > i_idx); 997 std::swap ((*stride)[i_idx], (*stride)[o_idx]); 998 } 999 /* Update data dependences. */ 1000 for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i) 1001 if (DDR_ARE_DEPENDENT (ddr) != chrec_known) 1002 { 1003 for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j) 1004 { 1005 lambda_vector dist_vect = DDR_DIST_VECT (ddr, j); 1006 std::swap (dist_vect[i_idx], dist_vect[o_idx]); 1007 } 1008 } 1009 } 1010 1011 /* Check data dependence relations, return TRUE if it's valid to interchange 1012 two loops specified by I_IDX/O_IDX. Theoretically, interchanging the two 1013 loops is valid only if dist vector, after interchanging, doesn't have '>' 1014 as the leftmost non-'=' direction. Practically, this function have been 1015 conservative here by not checking some valid cases. */ 1016 1017 bool 1018 tree_loop_interchange::valid_data_dependences (unsigned i_idx, unsigned o_idx, 1019 vec<ddr_p> ddrs) 1020 { 1021 struct data_dependence_relation *ddr; 1022 1023 for (unsigned i = 0; ddrs.iterate (i, &ddr); ++i) 1024 { 1025 /* Skip no-dependence case. */ 1026 if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 1027 continue; 1028 1029 for (unsigned j = 0; j < DDR_NUM_DIST_VECTS (ddr); ++j) 1030 { 1031 lambda_vector dist_vect = DDR_DIST_VECT (ddr, j); 1032 unsigned level = dependence_level (dist_vect, m_loop_nest.length ()); 1033 1034 /* If there is no carried dependence. */ 1035 if (level == 0) 1036 continue; 1037 1038 level --; 1039 1040 /* If dependence is not carried by any loop in between the two 1041 loops [oloop, iloop] to interchange. */ 1042 if (level < o_idx || level > i_idx) 1043 continue; 1044 1045 /* Be conservative, skip case if either direction at i_idx/o_idx 1046 levels is not '=' or '<'. */ 1047 if ((!DDR_REVERSED_P (ddr) && dist_vect[i_idx] < 0) 1048 || (DDR_REVERSED_P (ddr) && dist_vect[i_idx] > 0) 1049 || (!DDR_REVERSED_P (ddr) && dist_vect[o_idx] < 0) 1050 || (DDR_REVERSED_P (ddr) && dist_vect[o_idx] > 0)) 1051 return false; 1052 } 1053 } 1054 1055 return true; 1056 } 1057 1058 /* Interchange two loops specified by ILOOP and OLOOP. */ 1059 1060 void 1061 tree_loop_interchange::interchange_loops (loop_cand &iloop, loop_cand &oloop) 1062 { 1063 reduction_p re; 1064 gimple_stmt_iterator gsi; 1065 tree i_niters, o_niters, var_after; 1066 1067 /* Undo inner loop's simple reduction. */ 1068 for (unsigned i = 0; iloop.m_reductions.iterate (i, &re); ++i) 1069 if (re->type != DOUBLE_RTYPE) 1070 { 1071 if (re->producer) 1072 reset_debug_uses (re->producer); 1073 1074 iloop.undo_simple_reduction (re, m_dce_seeds); 1075 } 1076 1077 /* Only need to reset debug uses for double reduction. */ 1078 for (unsigned i = 0; oloop.m_reductions.iterate (i, &re); ++i) 1079 { 1080 gcc_assert (re->type == DOUBLE_RTYPE); 1081 reset_debug_uses (SSA_NAME_DEF_STMT (re->var)); 1082 reset_debug_uses (SSA_NAME_DEF_STMT (re->next)); 1083 } 1084 1085 /* Prepare niters for both loops. */ 1086 class loop *loop_nest = m_loop_nest[0]; 1087 edge instantiate_below = loop_preheader_edge (loop_nest); 1088 gsi = gsi_last_bb (loop_preheader_edge (loop_nest)->src); 1089 i_niters = number_of_latch_executions (iloop.m_loop); 1090 i_niters = analyze_scalar_evolution (loop_outer (iloop.m_loop), i_niters); 1091 i_niters = instantiate_scev (instantiate_below, loop_outer (iloop.m_loop), 1092 i_niters); 1093 i_niters = force_gimple_operand_gsi (&gsi, unshare_expr (i_niters), true, 1094 NULL_TREE, false, GSI_CONTINUE_LINKING); 1095 o_niters = number_of_latch_executions (oloop.m_loop); 1096 if (oloop.m_loop != loop_nest) 1097 { 1098 o_niters = analyze_scalar_evolution (loop_outer (oloop.m_loop), o_niters); 1099 o_niters = instantiate_scev (instantiate_below, loop_outer (oloop.m_loop), 1100 o_niters); 1101 } 1102 o_niters = force_gimple_operand_gsi (&gsi, unshare_expr (o_niters), true, 1103 NULL_TREE, false, GSI_CONTINUE_LINKING); 1104 1105 /* Move src's code to tgt loop. This is necessary when src is the outer 1106 loop and tgt is the inner loop. */ 1107 move_code_to_inner_loop (oloop.m_loop, iloop.m_loop, oloop.m_bbs); 1108 1109 /* Map outer loop's IV to inner loop, and vice versa. */ 1110 map_inductions_to_loop (oloop, iloop); 1111 map_inductions_to_loop (iloop, oloop); 1112 1113 /* Create canonical IV for both loops. Note canonical IV for outer/inner 1114 loop is actually from inner/outer loop. Also we record the new IV 1115 created for the outer loop so that it can be skipped in later loop 1116 interchange. */ 1117 create_canonical_iv (oloop.m_loop, oloop.m_exit, 1118 i_niters, &m_niters_iv_var, &var_after); 1119 bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after)); 1120 create_canonical_iv (iloop.m_loop, iloop.m_exit, 1121 o_niters, NULL, &var_after); 1122 bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after)); 1123 1124 /* Scrap niters estimation of interchanged loops. */ 1125 iloop.m_loop->any_upper_bound = false; 1126 iloop.m_loop->any_likely_upper_bound = false; 1127 free_numbers_of_iterations_estimates (iloop.m_loop); 1128 oloop.m_loop->any_upper_bound = false; 1129 oloop.m_loop->any_likely_upper_bound = false; 1130 free_numbers_of_iterations_estimates (oloop.m_loop); 1131 1132 /* Clear all cached scev information. This is expensive but shouldn't be 1133 a problem given we interchange in very limited times. */ 1134 scev_reset_htab (); 1135 1136 /* ??? The association between the loop data structure and the 1137 CFG changed, so what was loop N at the source level is now 1138 loop M. We should think of retaining the association or breaking 1139 it fully by creating a new loop instead of re-using the "wrong" one. */ 1140 } 1141 1142 /* Map induction variables of SRC loop to TGT loop. The function firstly 1143 creates the same IV of SRC loop in TGT loop, then deletes the original 1144 IV and re-initialize it using the newly created IV. For example, loop 1145 nest: 1146 1147 for (i = 0; i < N; i++) 1148 for (j = 0; j < M; j++) 1149 { 1150 //use of i; 1151 //use of j; 1152 } 1153 1154 will be transformed into: 1155 1156 for (jj = 0; jj < M; jj++) 1157 for (ii = 0; ii < N; ii++) 1158 { 1159 //use of ii; 1160 //use of jj; 1161 } 1162 1163 after loop interchange. */ 1164 1165 void 1166 tree_loop_interchange::map_inductions_to_loop (loop_cand &src, loop_cand &tgt) 1167 { 1168 induction_p iv; 1169 edge e = tgt.m_exit; 1170 gimple_stmt_iterator incr_pos = gsi_last_bb (e->src), gsi; 1171 1172 /* Map source loop's IV to target loop. */ 1173 for (unsigned i = 0; src.m_inductions.iterate (i, &iv); ++i) 1174 { 1175 gimple *use_stmt, *stmt = SSA_NAME_DEF_STMT (iv->var); 1176 gcc_assert (is_a <gphi *> (stmt)); 1177 1178 use_operand_p use_p; 1179 /* Only map original IV to target loop. */ 1180 if (m_niters_iv_var != iv->var) 1181 { 1182 /* Map the IV by creating the same one in target loop. */ 1183 tree var_before, var_after; 1184 tree base = unshare_expr (iv->init_expr); 1185 tree step = unshare_expr (iv->step); 1186 create_iv (base, step, SSA_NAME_VAR (iv->var), 1187 tgt.m_loop, &incr_pos, false, &var_before, &var_after); 1188 bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_before)); 1189 bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (var_after)); 1190 1191 /* Replace uses of the original IV var with newly created IV var. */ 1192 imm_use_iterator imm_iter; 1193 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, iv->var) 1194 { 1195 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) 1196 SET_USE (use_p, var_before); 1197 1198 update_stmt (use_stmt); 1199 } 1200 } 1201 1202 /* Mark all uses for DCE. */ 1203 ssa_op_iter op_iter; 1204 FOR_EACH_PHI_OR_STMT_USE (use_p, stmt, op_iter, SSA_OP_USE) 1205 { 1206 tree use = USE_FROM_PTR (use_p); 1207 if (TREE_CODE (use) == SSA_NAME 1208 && ! SSA_NAME_IS_DEFAULT_DEF (use)) 1209 bitmap_set_bit (m_dce_seeds, SSA_NAME_VERSION (use)); 1210 } 1211 1212 /* Delete definition of the original IV in the source loop. */ 1213 gsi = gsi_for_stmt (stmt); 1214 remove_phi_node (&gsi, true); 1215 } 1216 } 1217 1218 /* Move stmts of outer loop to inner loop. */ 1219 1220 void 1221 tree_loop_interchange::move_code_to_inner_loop (class loop *outer, 1222 class loop *inner, 1223 basic_block *outer_bbs) 1224 { 1225 basic_block oloop_exit_bb = single_exit (outer)->src; 1226 gimple_stmt_iterator gsi, to; 1227 1228 for (unsigned i = 0; i < outer->num_nodes; i++) 1229 { 1230 basic_block bb = outer_bbs[i]; 1231 1232 /* Skip basic blocks of inner loop. */ 1233 if (flow_bb_inside_loop_p (inner, bb)) 1234 continue; 1235 1236 /* Move code from header/latch to header/latch. */ 1237 if (bb == outer->header) 1238 to = gsi_after_labels (inner->header); 1239 else if (bb == outer->latch) 1240 to = gsi_after_labels (inner->latch); 1241 else 1242 /* Otherwise, simply move to exit->src. */ 1243 to = gsi_last_bb (single_exit (inner)->src); 1244 1245 for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) 1246 { 1247 gimple *stmt = gsi_stmt (gsi); 1248 1249 if (oloop_exit_bb == bb 1250 && stmt == gsi_stmt (gsi_last_bb (oloop_exit_bb))) 1251 { 1252 gsi_next (&gsi); 1253 continue; 1254 } 1255 1256 if (gimple_vdef (stmt)) 1257 { 1258 unlink_stmt_vdef (stmt); 1259 release_ssa_name (gimple_vdef (stmt)); 1260 gimple_set_vdef (stmt, NULL_TREE); 1261 } 1262 if (gimple_vuse (stmt)) 1263 { 1264 gimple_set_vuse (stmt, NULL_TREE); 1265 update_stmt (stmt); 1266 } 1267 1268 reset_debug_uses (stmt); 1269 gsi_move_before (&gsi, &to); 1270 } 1271 } 1272 } 1273 1274 /* Given data reference DR in LOOP_NEST, the function computes DR's access 1275 stride at each level of loop from innermost LOOP to outer. On success, 1276 it saves access stride at each level loop in a vector which is pointed 1277 by DR->aux. For example: 1278 1279 int arr[100][100][100]; 1280 for (i = 0; i < 100; i++) ;(DR->aux)strides[0] = 40000 1281 for (j = 100; j > 0; j--) ;(DR->aux)strides[1] = 400 1282 for (k = 0; k < 100; k++) ;(DR->aux)strides[2] = 4 1283 arr[i][j - 1][k] = 0; */ 1284 1285 static void 1286 compute_access_stride (class loop *loop_nest, class loop *loop, 1287 data_reference_p dr) 1288 { 1289 vec<tree> *strides = new vec<tree> (); 1290 basic_block bb = gimple_bb (DR_STMT (dr)); 1291 1292 while (!flow_bb_inside_loop_p (loop, bb)) 1293 { 1294 strides->safe_push (build_int_cst (sizetype, 0)); 1295 loop = loop_outer (loop); 1296 } 1297 gcc_assert (loop == bb->loop_father); 1298 1299 tree ref = DR_REF (dr); 1300 if (TREE_CODE (ref) == COMPONENT_REF 1301 && DECL_BIT_FIELD (TREE_OPERAND (ref, 1))) 1302 { 1303 /* We can't take address of bitfields. If the bitfield is at constant 1304 offset from the start of the struct, just use address of the 1305 struct, for analysis of the strides that shouldn't matter. */ 1306 if (!TREE_OPERAND (ref, 2) 1307 || TREE_CODE (TREE_OPERAND (ref, 2)) == INTEGER_CST) 1308 ref = TREE_OPERAND (ref, 0); 1309 /* Otherwise, if we have a bit field representative, use that. */ 1310 else if (DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1)) 1311 != NULL_TREE) 1312 { 1313 tree repr = DECL_BIT_FIELD_REPRESENTATIVE (TREE_OPERAND (ref, 1)); 1314 ref = build3 (COMPONENT_REF, TREE_TYPE (repr), TREE_OPERAND (ref, 0), 1315 repr, TREE_OPERAND (ref, 2)); 1316 } 1317 /* Otherwise punt. */ 1318 else 1319 { 1320 dr->aux = strides; 1321 return; 1322 } 1323 } 1324 tree scev_base = build_fold_addr_expr (ref); 1325 tree scev = analyze_scalar_evolution (loop, scev_base); 1326 scev = instantiate_scev (loop_preheader_edge (loop_nest), loop, scev); 1327 if (! chrec_contains_undetermined (scev)) 1328 { 1329 tree sl = scev; 1330 class loop *expected = loop; 1331 while (TREE_CODE (sl) == POLYNOMIAL_CHREC) 1332 { 1333 class loop *sl_loop = get_chrec_loop (sl); 1334 while (sl_loop != expected) 1335 { 1336 strides->safe_push (size_int (0)); 1337 expected = loop_outer (expected); 1338 } 1339 strides->safe_push (CHREC_RIGHT (sl)); 1340 sl = CHREC_LEFT (sl); 1341 expected = loop_outer (expected); 1342 } 1343 if (! tree_contains_chrecs (sl, NULL)) 1344 while (expected != loop_outer (loop_nest)) 1345 { 1346 strides->safe_push (size_int (0)); 1347 expected = loop_outer (expected); 1348 } 1349 } 1350 1351 dr->aux = strides; 1352 } 1353 1354 /* Given loop nest LOOP_NEST with innermost LOOP, the function computes 1355 access strides with respect to each level loop for all data refs in 1356 DATAREFS from inner loop to outer loop. On success, it returns the 1357 outermost loop that access strides can be computed successfully for 1358 all data references. If access strides cannot be computed at least 1359 for two levels of loop for any data reference, it returns NULL. */ 1360 1361 static class loop * 1362 compute_access_strides (class loop *loop_nest, class loop *loop, 1363 vec<data_reference_p> datarefs) 1364 { 1365 unsigned i, j, num_loops = (unsigned) -1; 1366 data_reference_p dr; 1367 vec<tree> *stride; 1368 1369 for (i = 0; datarefs.iterate (i, &dr); ++i) 1370 { 1371 compute_access_stride (loop_nest, loop, dr); 1372 stride = DR_ACCESS_STRIDE (dr); 1373 if (stride->length () < num_loops) 1374 { 1375 num_loops = stride->length (); 1376 if (num_loops < 2) 1377 return NULL; 1378 } 1379 } 1380 1381 for (i = 0; datarefs.iterate (i, &dr); ++i) 1382 { 1383 stride = DR_ACCESS_STRIDE (dr); 1384 if (stride->length () > num_loops) 1385 stride->truncate (num_loops); 1386 1387 for (j = 0; j < (num_loops >> 1); ++j) 1388 std::swap ((*stride)[j], (*stride)[num_loops - j - 1]); 1389 } 1390 1391 loop = superloop_at_depth (loop, loop_depth (loop) + 1 - num_loops); 1392 gcc_assert (loop_nest == loop || flow_loop_nested_p (loop_nest, loop)); 1393 return loop; 1394 } 1395 1396 /* Prune access strides for data references in DATAREFS by removing strides 1397 of loops that isn't in current LOOP_NEST. */ 1398 1399 static void 1400 prune_access_strides_not_in_loop (class loop *loop_nest, 1401 class loop *innermost, 1402 vec<data_reference_p> datarefs) 1403 { 1404 data_reference_p dr; 1405 unsigned num_loops = loop_depth (innermost) - loop_depth (loop_nest) + 1; 1406 gcc_assert (num_loops > 1); 1407 1408 /* Block remove strides of loops that is not in current loop nest. */ 1409 for (unsigned i = 0; datarefs.iterate (i, &dr); ++i) 1410 { 1411 vec<tree> *stride = DR_ACCESS_STRIDE (dr); 1412 if (stride->length () > num_loops) 1413 stride->block_remove (0, stride->length () - num_loops); 1414 } 1415 } 1416 1417 /* Dump access strides for all DATAREFS. */ 1418 1419 static void 1420 dump_access_strides (vec<data_reference_p> datarefs) 1421 { 1422 data_reference_p dr; 1423 fprintf (dump_file, "Access Strides for DRs:\n"); 1424 for (unsigned i = 0; datarefs.iterate (i, &dr); ++i) 1425 { 1426 fprintf (dump_file, " "); 1427 print_generic_expr (dump_file, DR_REF (dr), TDF_SLIM); 1428 fprintf (dump_file, ":\t\t<"); 1429 1430 vec<tree> *stride = DR_ACCESS_STRIDE (dr); 1431 unsigned num_loops = stride->length (); 1432 for (unsigned j = 0; j < num_loops; ++j) 1433 { 1434 print_generic_expr (dump_file, (*stride)[j], TDF_SLIM); 1435 fprintf (dump_file, "%s", (j < num_loops - 1) ? ",\t" : ">\n"); 1436 } 1437 } 1438 } 1439 1440 /* Return true if it's profitable to interchange two loops whose index 1441 in whole loop nest vector are I_IDX/O_IDX respectively. The function 1442 computes and compares three types information from all DATAREFS: 1443 1) Access stride for loop I_IDX and O_IDX. 1444 2) Number of invariant memory references with respect to I_IDX before 1445 and after loop interchange. 1446 3) Flags indicating if all memory references access sequential memory 1447 in ILOOP, before and after loop interchange. 1448 If INNMOST_LOOP_P is true, the two loops for interchanging are the two 1449 innermost loops in loop nest. This function also dumps information if 1450 DUMP_INFO_P is true. */ 1451 1452 static bool 1453 should_interchange_loops (unsigned i_idx, unsigned o_idx, 1454 vec<data_reference_p> datarefs, 1455 unsigned i_stmt_cost, unsigned o_stmt_cost, 1456 bool innermost_loops_p, bool dump_info_p = true) 1457 { 1458 unsigned HOST_WIDE_INT ratio; 1459 unsigned i, j, num_old_inv_drs = 0, num_new_inv_drs = 0; 1460 struct data_reference *dr; 1461 bool all_seq_dr_before_p = true, all_seq_dr_after_p = true; 1462 widest_int iloop_strides = 0, oloop_strides = 0; 1463 unsigned num_unresolved_drs = 0; 1464 unsigned num_resolved_ok_drs = 0; 1465 unsigned num_resolved_not_ok_drs = 0; 1466 1467 if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS)) 1468 fprintf (dump_file, "\nData ref strides:\n\tmem_ref:\t\tiloop\toloop\n"); 1469 1470 for (i = 0; datarefs.iterate (i, &dr); ++i) 1471 { 1472 vec<tree> *stride = DR_ACCESS_STRIDE (dr); 1473 tree iloop_stride = (*stride)[i_idx], oloop_stride = (*stride)[o_idx]; 1474 1475 bool subloop_stride_p = false; 1476 /* Data ref can't be invariant or sequential access at current loop if 1477 its address changes with respect to any subloops. */ 1478 for (j = i_idx + 1; j < stride->length (); ++j) 1479 if (!integer_zerop ((*stride)[j])) 1480 { 1481 subloop_stride_p = true; 1482 break; 1483 } 1484 1485 if (integer_zerop (iloop_stride)) 1486 { 1487 if (!subloop_stride_p) 1488 num_old_inv_drs++; 1489 } 1490 if (integer_zerop (oloop_stride)) 1491 { 1492 if (!subloop_stride_p) 1493 num_new_inv_drs++; 1494 } 1495 1496 if (TREE_CODE (iloop_stride) == INTEGER_CST 1497 && TREE_CODE (oloop_stride) == INTEGER_CST) 1498 { 1499 iloop_strides = wi::add (iloop_strides, wi::to_widest (iloop_stride)); 1500 oloop_strides = wi::add (oloop_strides, wi::to_widest (oloop_stride)); 1501 } 1502 else if (multiple_of_p (TREE_TYPE (iloop_stride), 1503 iloop_stride, oloop_stride)) 1504 num_resolved_ok_drs++; 1505 else if (multiple_of_p (TREE_TYPE (iloop_stride), 1506 oloop_stride, iloop_stride)) 1507 num_resolved_not_ok_drs++; 1508 else 1509 num_unresolved_drs++; 1510 1511 /* Data ref can't be sequential access if its address changes in sub 1512 loop. */ 1513 if (subloop_stride_p) 1514 { 1515 all_seq_dr_before_p = false; 1516 all_seq_dr_after_p = false; 1517 continue; 1518 } 1519 /* Track if all data references are sequential accesses before/after loop 1520 interchange. Note invariant is considered sequential here. */ 1521 tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); 1522 if (all_seq_dr_before_p 1523 && ! (integer_zerop (iloop_stride) 1524 || operand_equal_p (access_size, iloop_stride, 0))) 1525 all_seq_dr_before_p = false; 1526 if (all_seq_dr_after_p 1527 && ! (integer_zerop (oloop_stride) 1528 || operand_equal_p (access_size, oloop_stride, 0))) 1529 all_seq_dr_after_p = false; 1530 } 1531 1532 if (dump_info_p && dump_file && (dump_flags & TDF_DETAILS)) 1533 { 1534 fprintf (dump_file, "\toverall:\t\t"); 1535 print_decu (iloop_strides, dump_file); 1536 fprintf (dump_file, "\t"); 1537 print_decu (oloop_strides, dump_file); 1538 fprintf (dump_file, "\n"); 1539 1540 fprintf (dump_file, "Invariant data ref: before(%d), after(%d)\n", 1541 num_old_inv_drs, num_new_inv_drs); 1542 fprintf (dump_file, "All consecutive stride: before(%s), after(%s)\n", 1543 all_seq_dr_before_p ? "true" : "false", 1544 all_seq_dr_after_p ? "true" : "false"); 1545 fprintf (dump_file, "OK to interchage with variable strides: %d\n", 1546 num_resolved_ok_drs); 1547 fprintf (dump_file, "Not OK to interchage with variable strides: %d\n", 1548 num_resolved_not_ok_drs); 1549 fprintf (dump_file, "Variable strides we cannot decide: %d\n", 1550 num_unresolved_drs); 1551 fprintf (dump_file, "Stmt cost of inner loop: %d\n", i_stmt_cost); 1552 fprintf (dump_file, "Stmt cost of outer loop: %d\n", o_stmt_cost); 1553 } 1554 1555 if (num_unresolved_drs != 0 || num_resolved_not_ok_drs != 0) 1556 return false; 1557 1558 /* Stmts of outer loop will be moved to inner loop. If there are two many 1559 such stmts, it could make inner loop costly. Here we compare stmt cost 1560 between outer and inner loops. */ 1561 if (i_stmt_cost && o_stmt_cost 1562 && num_old_inv_drs + o_stmt_cost > num_new_inv_drs 1563 && o_stmt_cost * STMT_COST_RATIO > i_stmt_cost) 1564 return false; 1565 1566 /* We use different stride comparison ratio for interchanging innermost 1567 two loops or not. The idea is to be conservative in interchange for 1568 the innermost loops. */ 1569 ratio = innermost_loops_p ? INNER_STRIDE_RATIO : OUTER_STRIDE_RATIO; 1570 /* Do interchange if it gives better data locality behavior. */ 1571 if (wi::gtu_p (iloop_strides, wi::mul (oloop_strides, ratio))) 1572 return true; 1573 if (wi::gtu_p (iloop_strides, oloop_strides)) 1574 { 1575 /* Or it creates more invariant memory references. */ 1576 if ((!all_seq_dr_before_p || all_seq_dr_after_p) 1577 && num_new_inv_drs > num_old_inv_drs) 1578 return true; 1579 /* Or it makes all memory references sequential. */ 1580 if (num_new_inv_drs >= num_old_inv_drs 1581 && !all_seq_dr_before_p && all_seq_dr_after_p) 1582 return true; 1583 } 1584 1585 return false; 1586 } 1587 1588 /* Try to interchange inner loop of a loop nest to outer level. */ 1589 1590 bool 1591 tree_loop_interchange::interchange (vec<data_reference_p> datarefs, 1592 vec<ddr_p> ddrs) 1593 { 1594 dump_user_location_t loc = find_loop_location (m_loop_nest[0]); 1595 bool changed_p = false; 1596 /* In each iteration we try to interchange I-th loop with (I+1)-th loop. 1597 The overall effect is to push inner loop to outermost level in whole 1598 loop nest. */ 1599 for (unsigned i = m_loop_nest.length (); i > 1; --i) 1600 { 1601 unsigned i_idx = i - 1, o_idx = i - 2; 1602 1603 /* Check validity for loop interchange. */ 1604 if (!valid_data_dependences (i_idx, o_idx, ddrs)) 1605 break; 1606 1607 loop_cand iloop (m_loop_nest[i_idx], m_loop_nest[o_idx]); 1608 loop_cand oloop (m_loop_nest[o_idx], m_loop_nest[o_idx]); 1609 1610 /* Check if we can do transformation for loop interchange. */ 1611 if (!iloop.analyze_carried_vars (NULL) 1612 || !iloop.analyze_lcssa_phis () 1613 || !oloop.analyze_carried_vars (&iloop) 1614 || !oloop.analyze_lcssa_phis () 1615 || !iloop.can_interchange_p (NULL) 1616 || !oloop.can_interchange_p (&iloop)) 1617 break; 1618 1619 /* Outer loop's stmts will be moved to inner loop during interchange. 1620 If there are many of them, it may make inner loop very costly. We 1621 need to check number of outer loop's stmts in profit cost model of 1622 interchange. */ 1623 int stmt_cost = oloop.m_num_stmts; 1624 /* Count out the exit checking stmt of outer loop. */ 1625 stmt_cost --; 1626 /* Count out IV's increasing stmt, IVOPTs takes care if it. */ 1627 stmt_cost -= oloop.m_inductions.length (); 1628 /* Count in the additional load and cond_expr stmts caused by inner 1629 loop's constant initialized reduction. */ 1630 stmt_cost += iloop.m_const_init_reduc * 2; 1631 if (stmt_cost < 0) 1632 stmt_cost = 0; 1633 1634 /* Check profitability for loop interchange. */ 1635 if (should_interchange_loops (i_idx, o_idx, datarefs, 1636 (unsigned) iloop.m_num_stmts, 1637 (unsigned) stmt_cost, 1638 iloop.m_loop->inner == NULL)) 1639 { 1640 if (dump_file && (dump_flags & TDF_DETAILS)) 1641 fprintf (dump_file, 1642 "Loop_pair<outer:%d, inner:%d> is interchanged\n\n", 1643 oloop.m_loop->num, iloop.m_loop->num); 1644 1645 changed_p = true; 1646 interchange_loops (iloop, oloop); 1647 /* No need to update if there is no further loop interchange. */ 1648 if (o_idx > 0) 1649 update_data_info (i_idx, o_idx, datarefs, ddrs); 1650 } 1651 else 1652 { 1653 if (dump_file && (dump_flags & TDF_DETAILS)) 1654 fprintf (dump_file, 1655 "Loop_pair<outer:%d, inner:%d> is not interchanged\n\n", 1656 oloop.m_loop->num, iloop.m_loop->num); 1657 } 1658 } 1659 simple_dce_from_worklist (m_dce_seeds); 1660 1661 if (changed_p && dump_enabled_p ()) 1662 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loc, 1663 "loops interchanged in loop nest\n"); 1664 1665 return changed_p; 1666 } 1667 1668 1669 /* Loop interchange pass. */ 1670 1671 namespace { 1672 1673 const pass_data pass_data_linterchange = 1674 { 1675 GIMPLE_PASS, /* type */ 1676 "linterchange", /* name */ 1677 OPTGROUP_LOOP, /* optinfo_flags */ 1678 TV_LINTERCHANGE, /* tv_id */ 1679 PROP_cfg, /* properties_required */ 1680 0, /* properties_provided */ 1681 0, /* properties_destroyed */ 1682 0, /* todo_flags_start */ 1683 0, /* todo_flags_finish */ 1684 }; 1685 1686 class pass_linterchange : public gimple_opt_pass 1687 { 1688 public: 1689 pass_linterchange (gcc::context *ctxt) 1690 : gimple_opt_pass (pass_data_linterchange, ctxt) 1691 {} 1692 1693 /* opt_pass methods: */ 1694 opt_pass * clone () { return new pass_linterchange (m_ctxt); } 1695 virtual bool gate (function *) { return flag_loop_interchange; } 1696 virtual unsigned int execute (function *); 1697 1698 }; // class pass_linterchange 1699 1700 1701 /* Return true if LOOP has proper form for interchange. We check three 1702 conditions in the function: 1703 1) In general, a loop can be interchanged only if it doesn't have 1704 basic blocks other than header, exit and latch besides possible 1705 inner loop nest. This basically restricts loop interchange to 1706 below form loop nests: 1707 1708 header<---+ 1709 | | 1710 v | 1711 INNER_LOOP | 1712 | | 1713 v | 1714 exit--->latch 1715 1716 2) Data reference in basic block that executes in different times 1717 than loop head/exit is not allowed. 1718 3) Record the innermost outer loop that doesn't form rectangle loop 1719 nest with LOOP. */ 1720 1721 static bool 1722 proper_loop_form_for_interchange (class loop *loop, class loop **min_outer) 1723 { 1724 edge e0, e1, exit; 1725 1726 /* Don't interchange if loop has unsupported information for the moment. */ 1727 if (loop->safelen > 0 1728 || loop->constraints != 0 1729 || loop->can_be_parallel 1730 || loop->dont_vectorize 1731 || loop->force_vectorize 1732 || loop->in_oacc_kernels_region 1733 || loop->orig_loop_num != 0 1734 || loop->simduid != NULL_TREE) 1735 return false; 1736 1737 /* Don't interchange if outer loop has basic block other than header, exit 1738 and latch. */ 1739 if (loop->inner != NULL 1740 && loop->num_nodes != loop->inner->num_nodes + 3) 1741 return false; 1742 1743 if ((exit = single_dom_exit (loop)) == NULL) 1744 return false; 1745 1746 /* Check control flow on loop header/exit blocks. */ 1747 if (loop->header == exit->src 1748 && (EDGE_COUNT (loop->header->preds) != 2 1749 || EDGE_COUNT (loop->header->succs) != 2)) 1750 return false; 1751 else if (loop->header != exit->src 1752 && (EDGE_COUNT (loop->header->preds) != 2 1753 || !single_succ_p (loop->header) 1754 || unsupported_edge (single_succ_edge (loop->header)) 1755 || EDGE_COUNT (exit->src->succs) != 2 1756 || !single_pred_p (exit->src) 1757 || unsupported_edge (single_pred_edge (exit->src)))) 1758 return false; 1759 1760 e0 = EDGE_PRED (loop->header, 0); 1761 e1 = EDGE_PRED (loop->header, 1); 1762 if (unsupported_edge (e0) || unsupported_edge (e1) 1763 || (e0->src != loop->latch && e1->src != loop->latch) 1764 || (e0->src->loop_father == loop && e1->src->loop_father == loop)) 1765 return false; 1766 1767 e0 = EDGE_SUCC (exit->src, 0); 1768 e1 = EDGE_SUCC (exit->src, 1); 1769 if (unsupported_edge (e0) || unsupported_edge (e1) 1770 || (e0->dest != loop->latch && e1->dest != loop->latch) 1771 || (e0->dest->loop_father == loop && e1->dest->loop_father == loop)) 1772 return false; 1773 1774 /* Don't interchange if any reference is in basic block that doesn't 1775 dominate exit block. */ 1776 basic_block *bbs = get_loop_body (loop); 1777 for (unsigned i = 0; i < loop->num_nodes; i++) 1778 { 1779 basic_block bb = bbs[i]; 1780 1781 if (bb->loop_father != loop 1782 || bb == loop->header || bb == exit->src 1783 || dominated_by_p (CDI_DOMINATORS, exit->src, bb)) 1784 continue; 1785 1786 for (gimple_stmt_iterator gsi = gsi_start_nondebug_bb (bb); 1787 !gsi_end_p (gsi); gsi_next_nondebug (&gsi)) 1788 if (gimple_vuse (gsi_stmt (gsi))) 1789 { 1790 free (bbs); 1791 return false; 1792 } 1793 } 1794 free (bbs); 1795 1796 tree niters = number_of_latch_executions (loop); 1797 niters = analyze_scalar_evolution (loop_outer (loop), niters); 1798 if (!niters || chrec_contains_undetermined (niters)) 1799 return false; 1800 1801 /* Record the innermost outer loop that doesn't form rectangle loop nest. */ 1802 for (loop_p loop2 = loop_outer (loop); 1803 loop2 && flow_loop_nested_p (*min_outer, loop2); 1804 loop2 = loop_outer (loop2)) 1805 { 1806 niters = instantiate_scev (loop_preheader_edge (loop2), 1807 loop_outer (loop), niters); 1808 if (!evolution_function_is_invariant_p (niters, loop2->num)) 1809 { 1810 *min_outer = loop2; 1811 break; 1812 } 1813 } 1814 return true; 1815 } 1816 1817 /* Return true if any two adjacent loops in loop nest [INNERMOST, LOOP_NEST] 1818 should be interchanged by looking into all DATAREFS. */ 1819 1820 static bool 1821 should_interchange_loop_nest (class loop *loop_nest, class loop *innermost, 1822 vec<data_reference_p> datarefs) 1823 { 1824 unsigned idx = loop_depth (innermost) - loop_depth (loop_nest); 1825 gcc_assert (idx > 0); 1826 1827 /* Check if any two adjacent loops should be interchanged. */ 1828 for (class loop *loop = innermost; 1829 loop != loop_nest; loop = loop_outer (loop), idx--) 1830 if (should_interchange_loops (idx, idx - 1, datarefs, 0, 0, 1831 loop == innermost, false)) 1832 return true; 1833 1834 return false; 1835 } 1836 1837 /* Given loop nest LOOP_NEST and data references DATAREFS, compute data 1838 dependences for loop interchange and store it in DDRS. Note we compute 1839 dependences directly rather than call generic interface so that we can 1840 return on unknown dependence instantly. */ 1841 1842 static bool 1843 tree_loop_interchange_compute_ddrs (vec<loop_p> loop_nest, 1844 vec<data_reference_p> datarefs, 1845 vec<ddr_p> *ddrs) 1846 { 1847 struct data_reference *a, *b; 1848 class loop *innermost = loop_nest.last (); 1849 1850 for (unsigned i = 0; datarefs.iterate (i, &a); ++i) 1851 { 1852 bool a_outer_p = gimple_bb (DR_STMT (a))->loop_father != innermost; 1853 for (unsigned j = i + 1; datarefs.iterate (j, &b); ++j) 1854 if (DR_IS_WRITE (a) || DR_IS_WRITE (b)) 1855 { 1856 bool b_outer_p = gimple_bb (DR_STMT (b))->loop_father != innermost; 1857 /* Don't support multiple write references in outer loop. */ 1858 if (a_outer_p && b_outer_p && DR_IS_WRITE (a) && DR_IS_WRITE (b)) 1859 return false; 1860 1861 ddr_p ddr = initialize_data_dependence_relation (a, b, loop_nest); 1862 ddrs->safe_push (ddr); 1863 compute_affine_dependence (ddr, loop_nest[0]); 1864 1865 /* Give up if ddr is unknown dependence or classic direct vector 1866 is not available. */ 1867 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know 1868 || (DDR_ARE_DEPENDENT (ddr) == NULL_TREE 1869 && DDR_NUM_DIR_VECTS (ddr) == 0)) 1870 return false; 1871 1872 /* If either data references is in outer loop of nest, we require 1873 no dependence here because the data reference need to be moved 1874 into inner loop during interchange. */ 1875 if (a_outer_p && b_outer_p 1876 && operand_equal_p (DR_REF (a), DR_REF (b), 0)) 1877 continue; 1878 if (DDR_ARE_DEPENDENT (ddr) != chrec_known 1879 && (a_outer_p || b_outer_p)) 1880 return false; 1881 } 1882 } 1883 1884 return true; 1885 } 1886 1887 /* Prune DATAREFS by removing any data reference not inside of LOOP. */ 1888 1889 static inline void 1890 prune_datarefs_not_in_loop (class loop *loop, vec<data_reference_p> datarefs) 1891 { 1892 unsigned i, j; 1893 struct data_reference *dr; 1894 1895 for (i = 0, j = 0; datarefs.iterate (i, &dr); ++i) 1896 { 1897 if (flow_bb_inside_loop_p (loop, gimple_bb (DR_STMT (dr)))) 1898 datarefs[j++] = dr; 1899 else 1900 { 1901 if (dr->aux) 1902 { 1903 DR_ACCESS_STRIDE (dr)->release (); 1904 delete (vec<tree> *) dr->aux; 1905 } 1906 free_data_ref (dr); 1907 } 1908 } 1909 datarefs.truncate (j); 1910 } 1911 1912 /* Find and store data references in DATAREFS for LOOP nest. If there's 1913 difficult data reference in a basic block, we shrink the loop nest to 1914 inner loop of that basic block's father loop. On success, return the 1915 outer loop of the result loop nest. */ 1916 1917 static class loop * 1918 prepare_data_references (class loop *loop, vec<data_reference_p> *datarefs) 1919 { 1920 class loop *loop_nest = loop; 1921 vec<data_reference_p> *bb_refs; 1922 basic_block bb, *bbs = get_loop_body_in_dom_order (loop); 1923 1924 for (unsigned i = 0; i < loop->num_nodes; i++) 1925 bbs[i]->aux = NULL; 1926 1927 /* Find data references for all basic blocks. Shrink loop nest on difficult 1928 data reference. */ 1929 for (unsigned i = 0; loop_nest && i < loop->num_nodes; ++i) 1930 { 1931 bb = bbs[i]; 1932 if (!flow_bb_inside_loop_p (loop_nest, bb)) 1933 continue; 1934 1935 bb_refs = new vec<data_reference_p> (); 1936 if (find_data_references_in_bb (loop, bb, bb_refs) == chrec_dont_know) 1937 { 1938 loop_nest = bb->loop_father->inner; 1939 if (loop_nest && !loop_nest->inner) 1940 loop_nest = NULL; 1941 1942 free_data_refs (*bb_refs); 1943 delete bb_refs; 1944 } 1945 else if (bb_refs->is_empty ()) 1946 delete bb_refs; 1947 else 1948 bb->aux = bb_refs; 1949 } 1950 1951 /* Collect all data references in loop nest. */ 1952 for (unsigned i = 0; i < loop->num_nodes; i++) 1953 { 1954 bb = bbs[i]; 1955 if (!bb->aux) 1956 continue; 1957 1958 bb_refs = (vec<data_reference_p> *) bb->aux; 1959 if (loop_nest && flow_bb_inside_loop_p (loop_nest, bb)) 1960 datarefs->safe_splice (*bb_refs); 1961 else 1962 free_data_refs (*bb_refs); 1963 1964 delete bb_refs; 1965 bb->aux = NULL; 1966 } 1967 free (bbs); 1968 1969 return loop_nest; 1970 } 1971 1972 /* Given innermost LOOP, return true if perfect loop nest can be found and 1973 data dependences can be computed. If succeed, record the perfect loop 1974 nest in LOOP_NEST; record all data references in DATAREFS and record all 1975 data dependence relations in DDRS. 1976 1977 We do support a restricted form of imperfect loop nest, i.e, loop nest 1978 with load/store in outer loop initializing/finalizing simple reduction 1979 of the innermost loop. For such outer loop reference, we require that 1980 it has no dependence with others sinve it will be moved to inner loop 1981 in interchange. */ 1982 1983 static bool 1984 prepare_perfect_loop_nest (class loop *loop, vec<loop_p> *loop_nest, 1985 vec<data_reference_p> *datarefs, vec<ddr_p> *ddrs) 1986 { 1987 class loop *start_loop = NULL, *innermost = loop; 1988 class loop *outermost = loops_for_fn (cfun)->tree_root; 1989 1990 /* Find loop nest from the innermost loop. The outermost is the innermost 1991 outer*/ 1992 while (loop->num != 0 && loop->inner == start_loop 1993 && flow_loop_nested_p (outermost, loop)) 1994 { 1995 if (!proper_loop_form_for_interchange (loop, &outermost)) 1996 break; 1997 1998 start_loop = loop; 1999 /* If this loop has sibling loop, the father loop won't be in perfect 2000 loop nest. */ 2001 if (loop->next != NULL) 2002 break; 2003 2004 loop = loop_outer (loop); 2005 } 2006 if (!start_loop || !start_loop->inner) 2007 return false; 2008 2009 /* Prepare the data reference vector for the loop nest, pruning outer 2010 loops we cannot handle. */ 2011 start_loop = prepare_data_references (start_loop, datarefs); 2012 if (!start_loop 2013 /* Check if there is no data reference. */ 2014 || datarefs->is_empty () 2015 /* Check if there are too many of data references. */ 2016 || (int) datarefs->length () > MAX_DATAREFS) 2017 return false; 2018 2019 /* Compute access strides for all data references, pruning outer 2020 loops we cannot analyze refs in. */ 2021 start_loop = compute_access_strides (start_loop, innermost, *datarefs); 2022 if (!start_loop) 2023 return false; 2024 2025 /* Check if any interchange is profitable in the loop nest. */ 2026 if (!should_interchange_loop_nest (start_loop, innermost, *datarefs)) 2027 return false; 2028 2029 /* Check if data dependences can be computed for loop nest starting from 2030 start_loop. */ 2031 loop = start_loop; 2032 do { 2033 loop_nest->truncate (0); 2034 2035 if (loop != start_loop) 2036 prune_datarefs_not_in_loop (start_loop, *datarefs); 2037 2038 if (find_loop_nest (start_loop, loop_nest) 2039 && tree_loop_interchange_compute_ddrs (*loop_nest, *datarefs, ddrs)) 2040 { 2041 if (dump_file && (dump_flags & TDF_DETAILS)) 2042 fprintf (dump_file, 2043 "\nConsider loop interchange for loop_nest<%d - %d>\n", 2044 start_loop->num, innermost->num); 2045 2046 if (loop != start_loop) 2047 prune_access_strides_not_in_loop (start_loop, innermost, *datarefs); 2048 2049 if (dump_file && (dump_flags & TDF_DETAILS)) 2050 dump_access_strides (*datarefs); 2051 2052 return true; 2053 } 2054 2055 free_dependence_relations (*ddrs); 2056 *ddrs = vNULL; 2057 /* Try to compute data dependences with the outermost loop stripped. */ 2058 loop = start_loop; 2059 start_loop = start_loop->inner; 2060 } while (start_loop && start_loop->inner); 2061 2062 return false; 2063 } 2064 2065 /* Main entry for loop interchange pass. */ 2066 2067 unsigned int 2068 pass_linterchange::execute (function *fun) 2069 { 2070 if (number_of_loops (fun) <= 2) 2071 return 0; 2072 2073 bool changed_p = false; 2074 class loop *loop; 2075 FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) 2076 { 2077 vec<loop_p> loop_nest = vNULL; 2078 vec<data_reference_p> datarefs = vNULL; 2079 vec<ddr_p> ddrs = vNULL; 2080 if (prepare_perfect_loop_nest (loop, &loop_nest, &datarefs, &ddrs)) 2081 { 2082 tree_loop_interchange loop_interchange (loop_nest); 2083 changed_p |= loop_interchange.interchange (datarefs, ddrs); 2084 } 2085 free_dependence_relations (ddrs); 2086 free_data_refs_with_aux (datarefs); 2087 loop_nest.release (); 2088 } 2089 2090 return changed_p ? (TODO_update_ssa_only_virtuals) : 0; 2091 } 2092 2093 } // anon namespace 2094 2095 gimple_opt_pass * 2096 make_pass_linterchange (gcc::context *ctxt) 2097 { 2098 return new pass_linterchange (ctxt); 2099 } 2100