1*38fd1498Szrj /* Loop unroll-and-jam. 2*38fd1498Szrj Copyright (C) 2017-2018 Free Software Foundation, Inc. 3*38fd1498Szrj 4*38fd1498Szrj This file is part of GCC. 5*38fd1498Szrj 6*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it 7*38fd1498Szrj under the terms of the GNU General Public License as published by the 8*38fd1498Szrj Free Software Foundation; either version 3, or (at your option) any 9*38fd1498Szrj later version. 10*38fd1498Szrj 11*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT 12*38fd1498Szrj ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14*38fd1498Szrj for more details. 15*38fd1498Szrj 16*38fd1498Szrj You should have received a copy of the GNU General Public License 17*38fd1498Szrj along with GCC; see the file COPYING3. If not see 18*38fd1498Szrj <http://www.gnu.org/licenses/>. */ 19*38fd1498Szrj 20*38fd1498Szrj #include "config.h" 21*38fd1498Szrj #include "system.h" 22*38fd1498Szrj #include "coretypes.h" 23*38fd1498Szrj #include "params.h" 24*38fd1498Szrj #include "tree-pass.h" 25*38fd1498Szrj #include "backend.h" 26*38fd1498Szrj #include "tree.h" 27*38fd1498Szrj #include "gimple.h" 28*38fd1498Szrj #include "ssa.h" 29*38fd1498Szrj #include "fold-const.h" 30*38fd1498Szrj #include "tree-cfg.h" 31*38fd1498Szrj #include "tree-ssa.h" 32*38fd1498Szrj #include "tree-ssa-loop-niter.h" 33*38fd1498Szrj #include "tree-ssa-loop.h" 34*38fd1498Szrj #include "tree-ssa-loop-manip.h" 35*38fd1498Szrj #include "cfgloop.h" 36*38fd1498Szrj #include "tree-scalar-evolution.h" 37*38fd1498Szrj #include "gimple-iterator.h" 38*38fd1498Szrj #include "cfghooks.h" 39*38fd1498Szrj #include "tree-data-ref.h" 40*38fd1498Szrj #include "tree-ssa-loop-ivopts.h" 41*38fd1498Szrj #include "tree-vectorizer.h" 42*38fd1498Szrj 43*38fd1498Szrj /* Unroll and Jam transformation 44*38fd1498Szrj 45*38fd1498Szrj This is a combination of two transformations, where the second 46*38fd1498Szrj is not always valid. It's applicable if a loop nest has redundancies 47*38fd1498Szrj over the iterations of an outer loop while not having that with 48*38fd1498Szrj an inner loop. 49*38fd1498Szrj 50*38fd1498Szrj Given this nest: 51*38fd1498Szrj for (i) { 52*38fd1498Szrj for (j) { 53*38fd1498Szrj B(i,j) 54*38fd1498Szrj } 55*38fd1498Szrj } 56*38fd1498Szrj 57*38fd1498Szrj first unroll: 58*38fd1498Szrj for (i by 2) { 59*38fd1498Szrj for (j) { 60*38fd1498Szrj B(i,j) 61*38fd1498Szrj } 62*38fd1498Szrj for (j) { 63*38fd1498Szrj B(i+1,j) 64*38fd1498Szrj } 65*38fd1498Szrj } 66*38fd1498Szrj 67*38fd1498Szrj then fuse the two adjacent inner loops resulting from that: 68*38fd1498Szrj for (i by 2) { 69*38fd1498Szrj for (j) { 70*38fd1498Szrj B(i,j) 71*38fd1498Szrj B(i+1,j) 72*38fd1498Szrj } 73*38fd1498Szrj } 74*38fd1498Szrj 75*38fd1498Szrj As the order of evaluations of the body B changes this is valid 76*38fd1498Szrj only in certain situations: all distance vectors need to be forward. 77*38fd1498Szrj Additionally if there are multiple induction variables than just 78*38fd1498Szrj a counting control IV (j above) we can also deal with some situations. 79*38fd1498Szrj 80*38fd1498Szrj The validity is checked by unroll_jam_possible_p, and the data-dep 81*38fd1498Szrj testing below. 82*38fd1498Szrj 83*38fd1498Szrj A trivial example where the fusion is wrong would be when 84*38fd1498Szrj B(i,j) == x[j-1] = x[j]; 85*38fd1498Szrj for (i by 2) { 86*38fd1498Szrj for (j) { 87*38fd1498Szrj x[j-1] = x[j]; 88*38fd1498Szrj } 89*38fd1498Szrj for (j) { 90*38fd1498Szrj x[j-1] = x[j]; 91*38fd1498Szrj } 92*38fd1498Szrj } effect: move content to front by two elements 93*38fd1498Szrj --> 94*38fd1498Szrj for (i by 2) { 95*38fd1498Szrj for (j) { 96*38fd1498Szrj x[j-1] = x[j]; 97*38fd1498Szrj x[j-1] = x[j]; 98*38fd1498Szrj } 99*38fd1498Szrj } effect: move content to front by one element 100*38fd1498Szrj */ 101*38fd1498Szrj 102*38fd1498Szrj /* Modify the loop tree for the fact that all code once belonging 103*38fd1498Szrj to the OLD loop or the outer loop of OLD now is inside LOOP. */ 104*38fd1498Szrj 105*38fd1498Szrj static void 106*38fd1498Szrj merge_loop_tree (struct loop *loop, struct loop *old) 107*38fd1498Szrj { 108*38fd1498Szrj basic_block *bbs; 109*38fd1498Szrj int i, n; 110*38fd1498Szrj struct loop *subloop; 111*38fd1498Szrj edge e; 112*38fd1498Szrj edge_iterator ei; 113*38fd1498Szrj 114*38fd1498Szrj /* Find its nodes. */ 115*38fd1498Szrj bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 116*38fd1498Szrj n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); 117*38fd1498Szrj 118*38fd1498Szrj for (i = 0; i < n; i++) 119*38fd1498Szrj { 120*38fd1498Szrj /* If the block was direct child of OLD loop it's now part 121*38fd1498Szrj of LOOP. If it was outside OLD, then it moved into LOOP 122*38fd1498Szrj as well. This avoids changing the loop father for BBs 123*38fd1498Szrj in inner loops of OLD. */ 124*38fd1498Szrj if (bbs[i]->loop_father == old 125*38fd1498Szrj || loop_depth (bbs[i]->loop_father) < loop_depth (old)) 126*38fd1498Szrj { 127*38fd1498Szrj remove_bb_from_loops (bbs[i]); 128*38fd1498Szrj add_bb_to_loop (bbs[i], loop); 129*38fd1498Szrj continue; 130*38fd1498Szrj } 131*38fd1498Szrj 132*38fd1498Szrj /* If we find a direct subloop of OLD, move it to LOOP. */ 133*38fd1498Szrj subloop = bbs[i]->loop_father; 134*38fd1498Szrj if (loop_outer (subloop) == old && subloop->header == bbs[i]) 135*38fd1498Szrj { 136*38fd1498Szrj flow_loop_tree_node_remove (subloop); 137*38fd1498Szrj flow_loop_tree_node_add (loop, subloop); 138*38fd1498Szrj } 139*38fd1498Szrj } 140*38fd1498Szrj 141*38fd1498Szrj /* Update the information about loop exit edges. */ 142*38fd1498Szrj for (i = 0; i < n; i++) 143*38fd1498Szrj { 144*38fd1498Szrj FOR_EACH_EDGE (e, ei, bbs[i]->succs) 145*38fd1498Szrj { 146*38fd1498Szrj rescan_loop_exit (e, false, false); 147*38fd1498Szrj } 148*38fd1498Szrj } 149*38fd1498Szrj 150*38fd1498Szrj loop->num_nodes = n; 151*38fd1498Szrj 152*38fd1498Szrj free (bbs); 153*38fd1498Szrj } 154*38fd1498Szrj 155*38fd1498Szrj /* BB is part of the outer loop of an unroll-and-jam situation. 156*38fd1498Szrj Check if any statements therein would prevent the transformation. */ 157*38fd1498Szrj 158*38fd1498Szrj static bool 159*38fd1498Szrj bb_prevents_fusion_p (basic_block bb) 160*38fd1498Szrj { 161*38fd1498Szrj gimple_stmt_iterator gsi; 162*38fd1498Szrj /* BB is duplicated by outer unrolling and then all N-1 first copies 163*38fd1498Szrj move into the body of the fused inner loop. If BB exits the outer loop 164*38fd1498Szrj the last copy still doess so, and the first N-1 copies are cancelled 165*38fd1498Szrj by loop unrolling, so also after fusion it's the exit block. 166*38fd1498Szrj But there might be other reasons that prevent fusion: 167*38fd1498Szrj * stores or unknown side-effects prevent fusion 168*38fd1498Szrj * loads don't 169*38fd1498Szrj * computations into SSA names: these aren't problematic. Their 170*38fd1498Szrj result will be unused on the exit edges of the first N-1 copies 171*38fd1498Szrj (those aren't taken after unrolling). If they are used on the 172*38fd1498Szrj other edge (the one leading to the outer latch block) they are 173*38fd1498Szrj loop-carried (on the outer loop) and the Nth copy of BB will 174*38fd1498Szrj compute them again (i.e. the first N-1 copies will be dead). */ 175*38fd1498Szrj for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 176*38fd1498Szrj { 177*38fd1498Szrj gimple *g = gsi_stmt (gsi); 178*38fd1498Szrj if (gimple_vdef (g) || gimple_has_side_effects (g)) 179*38fd1498Szrj return true; 180*38fd1498Szrj } 181*38fd1498Szrj return false; 182*38fd1498Szrj } 183*38fd1498Szrj 184*38fd1498Szrj /* Given an inner loop LOOP (of some OUTER loop) determine if 185*38fd1498Szrj we can safely fuse copies of it (generated by outer unrolling). 186*38fd1498Szrj If so return true, otherwise return false. */ 187*38fd1498Szrj 188*38fd1498Szrj static bool 189*38fd1498Szrj unroll_jam_possible_p (struct loop *outer, struct loop *loop) 190*38fd1498Szrj { 191*38fd1498Szrj basic_block *bbs; 192*38fd1498Szrj int i, n; 193*38fd1498Szrj struct tree_niter_desc niter; 194*38fd1498Szrj 195*38fd1498Szrj /* When fusing the loops we skip the latch block 196*38fd1498Szrj of the first one, so it mustn't have any effects to 197*38fd1498Szrj preserve. */ 198*38fd1498Szrj if (!empty_block_p (loop->latch)) 199*38fd1498Szrj return false; 200*38fd1498Szrj 201*38fd1498Szrj if (!single_exit (loop)) 202*38fd1498Szrj return false; 203*38fd1498Szrj 204*38fd1498Szrj /* We need a perfect nest. Quick check for adjacent inner loops. */ 205*38fd1498Szrj if (outer->inner != loop || loop->next) 206*38fd1498Szrj return false; 207*38fd1498Szrj 208*38fd1498Szrj /* Prevent head-controlled inner loops, that we usually have. 209*38fd1498Szrj The guard block would need to be accepted 210*38fd1498Szrj (invariant condition either entering or skipping the loop), 211*38fd1498Szrj without also accepting arbitrary control flow. When unswitching 212*38fd1498Szrj ran before us (as with -O3) this won't be a problem because its 213*38fd1498Szrj outer loop unswitching will have moved out the invariant condition. 214*38fd1498Szrj 215*38fd1498Szrj If we do that we need to extend fuse_loops() to cope with this 216*38fd1498Szrj by threading through the (still invariant) copied condition 217*38fd1498Szrj between the two loop copies. */ 218*38fd1498Szrj if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header)) 219*38fd1498Szrj return false; 220*38fd1498Szrj 221*38fd1498Szrj /* The number of iterations of the inner loop must be loop invariant 222*38fd1498Szrj with respect to the outer loop. */ 223*38fd1498Szrj if (!number_of_iterations_exit (loop, single_exit (loop), &niter, 224*38fd1498Szrj false, true) 225*38fd1498Szrj || niter.cmp == ERROR_MARK 226*38fd1498Szrj || !integer_zerop (niter.may_be_zero) 227*38fd1498Szrj || !expr_invariant_in_loop_p (outer, niter.niter)) 228*38fd1498Szrj return false; 229*38fd1498Szrj 230*38fd1498Szrj /* And check blocks belonging to just outer loop. */ 231*38fd1498Szrj bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 232*38fd1498Szrj n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun)); 233*38fd1498Szrj 234*38fd1498Szrj for (i = 0; i < n; i++) 235*38fd1498Szrj if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i])) 236*38fd1498Szrj break; 237*38fd1498Szrj free (bbs); 238*38fd1498Szrj if (i != n) 239*38fd1498Szrj return false; 240*38fd1498Szrj 241*38fd1498Szrj /* For now we can safely fuse copies of LOOP only if all 242*38fd1498Szrj loop carried variables are inductions (or the virtual op). 243*38fd1498Szrj 244*38fd1498Szrj We could handle reductions as well (the initial value in the second 245*38fd1498Szrj body would be the after-iter value of the first body) if it's over 246*38fd1498Szrj an associative and commutative operation. We wouldn't 247*38fd1498Szrj be able to handle unknown cycles. */ 248*38fd1498Szrj gphi_iterator psi; 249*38fd1498Szrj for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi)) 250*38fd1498Szrj { 251*38fd1498Szrj affine_iv iv; 252*38fd1498Szrj tree op = gimple_phi_result (psi.phi ()); 253*38fd1498Szrj 254*38fd1498Szrj if (virtual_operand_p (op)) 255*38fd1498Szrj continue; 256*38fd1498Szrj if (!simple_iv (loop, loop, op, &iv, true)) 257*38fd1498Szrj return false; 258*38fd1498Szrj /* The inductions must be regular, loop invariant step and initial 259*38fd1498Szrj value. */ 260*38fd1498Szrj if (!expr_invariant_in_loop_p (outer, iv.step) 261*38fd1498Szrj || !expr_invariant_in_loop_p (outer, iv.base)) 262*38fd1498Szrj return false; 263*38fd1498Szrj /* XXX With more effort we could also be able to deal with inductions 264*38fd1498Szrj where the initial value is loop variant but a simple IV in the 265*38fd1498Szrj outer loop. The initial value for the second body would be 266*38fd1498Szrj the original initial value plus iv.base.step. The next value 267*38fd1498Szrj for the fused loop would be the original next value of the first 268*38fd1498Szrj copy, _not_ the next value of the second body. */ 269*38fd1498Szrj } 270*38fd1498Szrj 271*38fd1498Szrj return true; 272*38fd1498Szrj } 273*38fd1498Szrj 274*38fd1498Szrj /* Fuse LOOP with all further neighbors. The loops are expected to 275*38fd1498Szrj be in appropriate form. */ 276*38fd1498Szrj 277*38fd1498Szrj static void 278*38fd1498Szrj fuse_loops (struct loop *loop) 279*38fd1498Szrj { 280*38fd1498Szrj struct loop *next = loop->next; 281*38fd1498Szrj 282*38fd1498Szrj while (next) 283*38fd1498Szrj { 284*38fd1498Szrj edge e; 285*38fd1498Szrj 286*38fd1498Szrj remove_branch (single_pred_edge (loop->latch)); 287*38fd1498Szrj /* Make delete_basic_block not fiddle with the loop structure. */ 288*38fd1498Szrj basic_block oldlatch = loop->latch; 289*38fd1498Szrj loop->latch = NULL; 290*38fd1498Szrj delete_basic_block (oldlatch); 291*38fd1498Szrj e = redirect_edge_and_branch (loop_latch_edge (next), 292*38fd1498Szrj loop->header); 293*38fd1498Szrj loop->latch = e->src; 294*38fd1498Szrj flush_pending_stmts (e); 295*38fd1498Szrj 296*38fd1498Szrj gcc_assert (EDGE_COUNT (next->header->preds) == 1); 297*38fd1498Szrj 298*38fd1498Szrj /* The PHI nodes of the second body (single-argument now) 299*38fd1498Szrj need adjustments to use the right values: either directly 300*38fd1498Szrj the value of the corresponding PHI in the first copy or 301*38fd1498Szrj the one leaving the first body which unrolling did for us. 302*38fd1498Szrj 303*38fd1498Szrj See also unroll_jam_possible_p() for further possibilities. */ 304*38fd1498Szrj gphi_iterator psi_first, psi_second; 305*38fd1498Szrj e = single_pred_edge (next->header); 306*38fd1498Szrj for (psi_first = gsi_start_phis (loop->header), 307*38fd1498Szrj psi_second = gsi_start_phis (next->header); 308*38fd1498Szrj !gsi_end_p (psi_first); 309*38fd1498Szrj gsi_next (&psi_first), gsi_next (&psi_second)) 310*38fd1498Szrj { 311*38fd1498Szrj gphi *phi_first = psi_first.phi (); 312*38fd1498Szrj gphi *phi_second = psi_second.phi (); 313*38fd1498Szrj tree firstop = gimple_phi_result (phi_first); 314*38fd1498Szrj /* The virtual operand is correct already as it's 315*38fd1498Szrj always live at exit, hence has a LCSSA node and outer 316*38fd1498Szrj loop unrolling updated SSA form. */ 317*38fd1498Szrj if (virtual_operand_p (firstop)) 318*38fd1498Szrj continue; 319*38fd1498Szrj 320*38fd1498Szrj /* Due to unroll_jam_possible_p() we know that this is 321*38fd1498Szrj an induction. The second body goes over the same 322*38fd1498Szrj iteration space. */ 323*38fd1498Szrj add_phi_arg (phi_second, firstop, e, 324*38fd1498Szrj gimple_location (phi_first)); 325*38fd1498Szrj } 326*38fd1498Szrj gcc_assert (gsi_end_p (psi_second)); 327*38fd1498Szrj 328*38fd1498Szrj merge_loop_tree (loop, next); 329*38fd1498Szrj gcc_assert (!next->num_nodes); 330*38fd1498Szrj struct loop *ln = next->next; 331*38fd1498Szrj delete_loop (next); 332*38fd1498Szrj next = ln; 333*38fd1498Szrj } 334*38fd1498Szrj rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); 335*38fd1498Szrj } 336*38fd1498Szrj 337*38fd1498Szrj /* Returns true if the distance in DDR can be determined and adjusts 338*38fd1498Szrj the unroll factor in *UNROLL to make unrolling valid for that distance. 339*38fd1498Szrj Otherwise return false. 340*38fd1498Szrj 341*38fd1498Szrj If this data dep can lead to a removed memory reference, increment 342*38fd1498Szrj *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor 343*38fd1498Szrj for this to happen. */ 344*38fd1498Szrj 345*38fd1498Szrj static bool 346*38fd1498Szrj adjust_unroll_factor (struct data_dependence_relation *ddr, 347*38fd1498Szrj unsigned *unroll, unsigned *profit_unroll, 348*38fd1498Szrj unsigned *removed) 349*38fd1498Szrj { 350*38fd1498Szrj bool ret = false; 351*38fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) != chrec_known) 352*38fd1498Szrj { 353*38fd1498Szrj if (DDR_NUM_DIST_VECTS (ddr) == 0) 354*38fd1498Szrj return false; 355*38fd1498Szrj unsigned i; 356*38fd1498Szrj lambda_vector dist_v; 357*38fd1498Szrj FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 358*38fd1498Szrj { 359*38fd1498Szrj /* A distance (a,b) is at worst transformed into (a/N,b) by the 360*38fd1498Szrj unrolling (factor N), so the transformation is valid if 361*38fd1498Szrj a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll 362*38fd1498Szrj factor needs to be limited so that the first condition holds. 363*38fd1498Szrj That may limit the factor down to zero in the worst case. */ 364*38fd1498Szrj int dist = dist_v[0]; 365*38fd1498Szrj if (dist < 0) 366*38fd1498Szrj gcc_unreachable (); 367*38fd1498Szrj else if ((unsigned)dist >= *unroll) 368*38fd1498Szrj ; 369*38fd1498Szrj else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) 370*38fd1498Szrj || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) 371*38fd1498Szrj && dist > 0)) 372*38fd1498Szrj ; 373*38fd1498Szrj else 374*38fd1498Szrj *unroll = dist; 375*38fd1498Szrj 376*38fd1498Szrj /* With a distance (a,0) it's always profitable to unroll-and-jam 377*38fd1498Szrj (by a+1), because one memory reference will go away. With 378*38fd1498Szrj (a,b) and b != 0 that's less clear. We will increase the 379*38fd1498Szrj number of streams without lowering the number of mem refs. 380*38fd1498Szrj So for now only handle the first situation. */ 381*38fd1498Szrj if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) 382*38fd1498Szrj { 383*38fd1498Szrj *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1); 384*38fd1498Szrj (*removed)++; 385*38fd1498Szrj } 386*38fd1498Szrj 387*38fd1498Szrj ret = true; 388*38fd1498Szrj } 389*38fd1498Szrj } 390*38fd1498Szrj return ret; 391*38fd1498Szrj } 392*38fd1498Szrj 393*38fd1498Szrj /* Main entry point for the unroll-and-jam transformation 394*38fd1498Szrj described above. */ 395*38fd1498Szrj 396*38fd1498Szrj static unsigned int 397*38fd1498Szrj tree_loop_unroll_and_jam (void) 398*38fd1498Szrj { 399*38fd1498Szrj struct loop *loop; 400*38fd1498Szrj bool changed = false; 401*38fd1498Szrj 402*38fd1498Szrj gcc_assert (scev_initialized_p ()); 403*38fd1498Szrj 404*38fd1498Szrj /* Go through all innermost loops. */ 405*38fd1498Szrj FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) 406*38fd1498Szrj { 407*38fd1498Szrj struct loop *outer = loop_outer (loop); 408*38fd1498Szrj 409*38fd1498Szrj if (loop_depth (loop) < 2 410*38fd1498Szrj || optimize_loop_nest_for_size_p (outer)) 411*38fd1498Szrj continue; 412*38fd1498Szrj 413*38fd1498Szrj if (!unroll_jam_possible_p (outer, loop)) 414*38fd1498Szrj continue; 415*38fd1498Szrj 416*38fd1498Szrj vec<data_reference_p> datarefs; 417*38fd1498Szrj vec<ddr_p> dependences; 418*38fd1498Szrj unsigned unroll_factor, profit_unroll, removed; 419*38fd1498Szrj struct tree_niter_desc desc; 420*38fd1498Szrj bool unroll = false; 421*38fd1498Szrj 422*38fd1498Szrj auto_vec<loop_p, 3> loop_nest; 423*38fd1498Szrj dependences.create (10); 424*38fd1498Szrj datarefs.create (10); 425*38fd1498Szrj if (!compute_data_dependences_for_loop (outer, true, &loop_nest, 426*38fd1498Szrj &datarefs, &dependences)) 427*38fd1498Szrj { 428*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS)) 429*38fd1498Szrj fprintf (dump_file, "Cannot analyze data dependencies\n"); 430*38fd1498Szrj free_data_refs (datarefs); 431*38fd1498Szrj free_dependence_relations (dependences); 432*38fd1498Szrj return false; 433*38fd1498Szrj } 434*38fd1498Szrj if (!datarefs.length ()) 435*38fd1498Szrj continue; 436*38fd1498Szrj 437*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS)) 438*38fd1498Szrj dump_data_dependence_relations (dump_file, dependences); 439*38fd1498Szrj 440*38fd1498Szrj unroll_factor = (unsigned)-1; 441*38fd1498Szrj profit_unroll = 1; 442*38fd1498Szrj removed = 0; 443*38fd1498Szrj 444*38fd1498Szrj /* Check all dependencies. */ 445*38fd1498Szrj unsigned i; 446*38fd1498Szrj struct data_dependence_relation *ddr; 447*38fd1498Szrj FOR_EACH_VEC_ELT (dependences, i, ddr) 448*38fd1498Szrj { 449*38fd1498Szrj struct data_reference *dra, *drb; 450*38fd1498Szrj 451*38fd1498Szrj /* If the refs are independend there's nothing to do. */ 452*38fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 453*38fd1498Szrj continue; 454*38fd1498Szrj dra = DDR_A (ddr); 455*38fd1498Szrj drb = DDR_B (ddr); 456*38fd1498Szrj /* Nothing interesting for the self dependencies. */ 457*38fd1498Szrj if (dra == drb) 458*38fd1498Szrj continue; 459*38fd1498Szrj 460*38fd1498Szrj /* Now check the distance vector, for determining a sensible 461*38fd1498Szrj outer unroll factor, and for validity of merging the inner 462*38fd1498Szrj loop copies. */ 463*38fd1498Szrj if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll, 464*38fd1498Szrj &removed)) 465*38fd1498Szrj { 466*38fd1498Szrj /* Couldn't get the distance vector. For two reads that's 467*38fd1498Szrj harmless (we assume we should unroll). For at least 468*38fd1498Szrj one write this means we can't check the dependence direction 469*38fd1498Szrj and hence can't determine safety. */ 470*38fd1498Szrj 471*38fd1498Szrj if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) 472*38fd1498Szrj { 473*38fd1498Szrj unroll_factor = 0; 474*38fd1498Szrj break; 475*38fd1498Szrj } 476*38fd1498Szrj } 477*38fd1498Szrj } 478*38fd1498Szrj 479*38fd1498Szrj /* We regard a user-specified minimum percentage of zero as a request 480*38fd1498Szrj to ignore all profitability concerns and apply the transformation 481*38fd1498Szrj always. */ 482*38fd1498Szrj if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) 483*38fd1498Szrj profit_unroll = 2; 484*38fd1498Szrj else if (removed * 100 / datarefs.length () 485*38fd1498Szrj < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) 486*38fd1498Szrj profit_unroll = 1; 487*38fd1498Szrj if (unroll_factor > profit_unroll) 488*38fd1498Szrj unroll_factor = profit_unroll; 489*38fd1498Szrj if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL)) 490*38fd1498Szrj unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL); 491*38fd1498Szrj unroll = (unroll_factor > 1 492*38fd1498Szrj && can_unroll_loop_p (outer, unroll_factor, &desc)); 493*38fd1498Szrj 494*38fd1498Szrj if (unroll) 495*38fd1498Szrj { 496*38fd1498Szrj if (dump_enabled_p ()) 497*38fd1498Szrj dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, 498*38fd1498Szrj find_loop_location (outer), 499*38fd1498Szrj "applying unroll and jam with factor %d\n", 500*38fd1498Szrj unroll_factor); 501*38fd1498Szrj initialize_original_copy_tables (); 502*38fd1498Szrj tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer), 503*38fd1498Szrj &desc); 504*38fd1498Szrj free_original_copy_tables (); 505*38fd1498Szrj fuse_loops (outer->inner); 506*38fd1498Szrj changed = true; 507*38fd1498Szrj } 508*38fd1498Szrj 509*38fd1498Szrj loop_nest.release (); 510*38fd1498Szrj free_dependence_relations (dependences); 511*38fd1498Szrj free_data_refs (datarefs); 512*38fd1498Szrj } 513*38fd1498Szrj 514*38fd1498Szrj if (changed) 515*38fd1498Szrj { 516*38fd1498Szrj scev_reset (); 517*38fd1498Szrj free_dominance_info (CDI_DOMINATORS); 518*38fd1498Szrj return TODO_cleanup_cfg; 519*38fd1498Szrj } 520*38fd1498Szrj return 0; 521*38fd1498Szrj } 522*38fd1498Szrj 523*38fd1498Szrj /* Pass boilerplate */ 524*38fd1498Szrj 525*38fd1498Szrj namespace { 526*38fd1498Szrj 527*38fd1498Szrj const pass_data pass_data_loop_jam = 528*38fd1498Szrj { 529*38fd1498Szrj GIMPLE_PASS, /* type */ 530*38fd1498Szrj "unrolljam", /* name */ 531*38fd1498Szrj OPTGROUP_LOOP, /* optinfo_flags */ 532*38fd1498Szrj TV_LOOP_JAM, /* tv_id */ 533*38fd1498Szrj PROP_cfg, /* properties_required */ 534*38fd1498Szrj 0, /* properties_provided */ 535*38fd1498Szrj 0, /* properties_destroyed */ 536*38fd1498Szrj 0, /* todo_flags_start */ 537*38fd1498Szrj 0, /* todo_flags_finish */ 538*38fd1498Szrj }; 539*38fd1498Szrj 540*38fd1498Szrj class pass_loop_jam : public gimple_opt_pass 541*38fd1498Szrj { 542*38fd1498Szrj public: 543*38fd1498Szrj pass_loop_jam (gcc::context *ctxt) 544*38fd1498Szrj : gimple_opt_pass (pass_data_loop_jam, ctxt) 545*38fd1498Szrj {} 546*38fd1498Szrj 547*38fd1498Szrj /* opt_pass methods: */ 548*38fd1498Szrj virtual bool gate (function *) { return flag_unroll_jam != 0; } 549*38fd1498Szrj virtual unsigned int execute (function *); 550*38fd1498Szrj 551*38fd1498Szrj }; 552*38fd1498Szrj 553*38fd1498Szrj unsigned int 554*38fd1498Szrj pass_loop_jam::execute (function *fun) 555*38fd1498Szrj { 556*38fd1498Szrj if (number_of_loops (fun) <= 1) 557*38fd1498Szrj return 0; 558*38fd1498Szrj 559*38fd1498Szrj return tree_loop_unroll_and_jam (); 560*38fd1498Szrj } 561*38fd1498Szrj 562*38fd1498Szrj } 563*38fd1498Szrj 564*38fd1498Szrj gimple_opt_pass * 565*38fd1498Szrj make_pass_loop_jam (gcc::context *ctxt) 566*38fd1498Szrj { 567*38fd1498Szrj return new pass_loop_jam (ctxt); 568*38fd1498Szrj } 569