138fd1498Szrj /* Loop unroll-and-jam. 238fd1498Szrj Copyright (C) 2017-2018 Free Software Foundation, Inc. 338fd1498Szrj 438fd1498Szrj This file is part of GCC. 538fd1498Szrj 638fd1498Szrj GCC is free software; you can redistribute it and/or modify it 738fd1498Szrj under the terms of the GNU General Public License as published by the 838fd1498Szrj Free Software Foundation; either version 3, or (at your option) any 938fd1498Szrj later version. 1038fd1498Szrj 1138fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT 1238fd1498Szrj ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 1338fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 1438fd1498Szrj for more details. 1538fd1498Szrj 1638fd1498Szrj You should have received a copy of the GNU General Public License 1738fd1498Szrj along with GCC; see the file COPYING3. If not see 1838fd1498Szrj <http://www.gnu.org/licenses/>. */ 1938fd1498Szrj 2038fd1498Szrj #include "config.h" 2138fd1498Szrj #include "system.h" 2238fd1498Szrj #include "coretypes.h" 2338fd1498Szrj #include "params.h" 2438fd1498Szrj #include "tree-pass.h" 2538fd1498Szrj #include "backend.h" 2638fd1498Szrj #include "tree.h" 2738fd1498Szrj #include "gimple.h" 2838fd1498Szrj #include "ssa.h" 2938fd1498Szrj #include "fold-const.h" 3038fd1498Szrj #include "tree-cfg.h" 3138fd1498Szrj #include "tree-ssa.h" 3238fd1498Szrj #include "tree-ssa-loop-niter.h" 3338fd1498Szrj #include "tree-ssa-loop.h" 3438fd1498Szrj #include "tree-ssa-loop-manip.h" 3538fd1498Szrj #include "cfgloop.h" 3638fd1498Szrj #include "tree-scalar-evolution.h" 3738fd1498Szrj #include "gimple-iterator.h" 3838fd1498Szrj #include "cfghooks.h" 3938fd1498Szrj #include "tree-data-ref.h" 4038fd1498Szrj #include "tree-ssa-loop-ivopts.h" 4138fd1498Szrj #include "tree-vectorizer.h" 4238fd1498Szrj 4338fd1498Szrj /* Unroll and Jam transformation 4438fd1498Szrj 4538fd1498Szrj This is a combination of two transformations, where the second 4638fd1498Szrj is not always valid. It's applicable if a loop nest has redundancies 4738fd1498Szrj over the iterations of an outer loop while not having that with 4838fd1498Szrj an inner loop. 4938fd1498Szrj 5038fd1498Szrj Given this nest: 5138fd1498Szrj for (i) { 5238fd1498Szrj for (j) { 5338fd1498Szrj B(i,j) 5438fd1498Szrj } 5538fd1498Szrj } 5638fd1498Szrj 5738fd1498Szrj first unroll: 5838fd1498Szrj for (i by 2) { 5938fd1498Szrj for (j) { 6038fd1498Szrj B(i,j) 6138fd1498Szrj } 6238fd1498Szrj for (j) { 6338fd1498Szrj B(i+1,j) 6438fd1498Szrj } 6538fd1498Szrj } 6638fd1498Szrj 6738fd1498Szrj then fuse the two adjacent inner loops resulting from that: 6838fd1498Szrj for (i by 2) { 6938fd1498Szrj for (j) { 7038fd1498Szrj B(i,j) 7138fd1498Szrj B(i+1,j) 7238fd1498Szrj } 7338fd1498Szrj } 7438fd1498Szrj 7538fd1498Szrj As the order of evaluations of the body B changes this is valid 7638fd1498Szrj only in certain situations: all distance vectors need to be forward. 7738fd1498Szrj Additionally if there are multiple induction variables than just 7838fd1498Szrj a counting control IV (j above) we can also deal with some situations. 7938fd1498Szrj 8038fd1498Szrj The validity is checked by unroll_jam_possible_p, and the data-dep 8138fd1498Szrj testing below. 8238fd1498Szrj 8338fd1498Szrj A trivial example where the fusion is wrong would be when 8438fd1498Szrj B(i,j) == x[j-1] = x[j]; 8538fd1498Szrj for (i by 2) { 8638fd1498Szrj for (j) { 8738fd1498Szrj x[j-1] = x[j]; 8838fd1498Szrj } 8938fd1498Szrj for (j) { 9038fd1498Szrj x[j-1] = x[j]; 9138fd1498Szrj } 9238fd1498Szrj } effect: move content to front by two elements 9338fd1498Szrj --> 9438fd1498Szrj for (i by 2) { 9538fd1498Szrj for (j) { 9638fd1498Szrj x[j-1] = x[j]; 9738fd1498Szrj x[j-1] = x[j]; 9838fd1498Szrj } 9938fd1498Szrj } effect: move content to front by one element 10038fd1498Szrj */ 10138fd1498Szrj 10238fd1498Szrj /* Modify the loop tree for the fact that all code once belonging 10338fd1498Szrj to the OLD loop or the outer loop of OLD now is inside LOOP. */ 10438fd1498Szrj 10538fd1498Szrj static void 10638fd1498Szrj merge_loop_tree (struct loop *loop, struct loop *old) 10738fd1498Szrj { 10838fd1498Szrj basic_block *bbs; 10938fd1498Szrj int i, n; 11038fd1498Szrj struct loop *subloop; 11138fd1498Szrj edge e; 11238fd1498Szrj edge_iterator ei; 11338fd1498Szrj 11438fd1498Szrj /* Find its nodes. */ 11538fd1498Szrj bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 11638fd1498Szrj n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); 11738fd1498Szrj 11838fd1498Szrj for (i = 0; i < n; i++) 11938fd1498Szrj { 12038fd1498Szrj /* If the block was direct child of OLD loop it's now part 12138fd1498Szrj of LOOP. If it was outside OLD, then it moved into LOOP 12238fd1498Szrj as well. This avoids changing the loop father for BBs 12338fd1498Szrj in inner loops of OLD. */ 12438fd1498Szrj if (bbs[i]->loop_father == old 12538fd1498Szrj || loop_depth (bbs[i]->loop_father) < loop_depth (old)) 12638fd1498Szrj { 12738fd1498Szrj remove_bb_from_loops (bbs[i]); 12838fd1498Szrj add_bb_to_loop (bbs[i], loop); 12938fd1498Szrj continue; 13038fd1498Szrj } 13138fd1498Szrj 13238fd1498Szrj /* If we find a direct subloop of OLD, move it to LOOP. */ 13338fd1498Szrj subloop = bbs[i]->loop_father; 13438fd1498Szrj if (loop_outer (subloop) == old && subloop->header == bbs[i]) 13538fd1498Szrj { 13638fd1498Szrj flow_loop_tree_node_remove (subloop); 13738fd1498Szrj flow_loop_tree_node_add (loop, subloop); 13838fd1498Szrj } 13938fd1498Szrj } 14038fd1498Szrj 14138fd1498Szrj /* Update the information about loop exit edges. */ 14238fd1498Szrj for (i = 0; i < n; i++) 14338fd1498Szrj { 14438fd1498Szrj FOR_EACH_EDGE (e, ei, bbs[i]->succs) 14538fd1498Szrj { 14638fd1498Szrj rescan_loop_exit (e, false, false); 14738fd1498Szrj } 14838fd1498Szrj } 14938fd1498Szrj 15038fd1498Szrj loop->num_nodes = n; 15138fd1498Szrj 15238fd1498Szrj free (bbs); 15338fd1498Szrj } 15438fd1498Szrj 15538fd1498Szrj /* BB is part of the outer loop of an unroll-and-jam situation. 15638fd1498Szrj Check if any statements therein would prevent the transformation. */ 15738fd1498Szrj 15838fd1498Szrj static bool 15938fd1498Szrj bb_prevents_fusion_p (basic_block bb) 16038fd1498Szrj { 16138fd1498Szrj gimple_stmt_iterator gsi; 16238fd1498Szrj /* BB is duplicated by outer unrolling and then all N-1 first copies 16338fd1498Szrj move into the body of the fused inner loop. If BB exits the outer loop 164*58e805e6Szrj the last copy still does so, and the first N-1 copies are cancelled 16538fd1498Szrj by loop unrolling, so also after fusion it's the exit block. 16638fd1498Szrj But there might be other reasons that prevent fusion: 16738fd1498Szrj * stores or unknown side-effects prevent fusion 16838fd1498Szrj * loads don't 16938fd1498Szrj * computations into SSA names: these aren't problematic. Their 17038fd1498Szrj result will be unused on the exit edges of the first N-1 copies 17138fd1498Szrj (those aren't taken after unrolling). If they are used on the 17238fd1498Szrj other edge (the one leading to the outer latch block) they are 17338fd1498Szrj loop-carried (on the outer loop) and the Nth copy of BB will 17438fd1498Szrj compute them again (i.e. the first N-1 copies will be dead). */ 17538fd1498Szrj for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) 17638fd1498Szrj { 17738fd1498Szrj gimple *g = gsi_stmt (gsi); 17838fd1498Szrj if (gimple_vdef (g) || gimple_has_side_effects (g)) 17938fd1498Szrj return true; 18038fd1498Szrj } 18138fd1498Szrj return false; 18238fd1498Szrj } 18338fd1498Szrj 18438fd1498Szrj /* Given an inner loop LOOP (of some OUTER loop) determine if 18538fd1498Szrj we can safely fuse copies of it (generated by outer unrolling). 18638fd1498Szrj If so return true, otherwise return false. */ 18738fd1498Szrj 18838fd1498Szrj static bool 18938fd1498Szrj unroll_jam_possible_p (struct loop *outer, struct loop *loop) 19038fd1498Szrj { 19138fd1498Szrj basic_block *bbs; 19238fd1498Szrj int i, n; 19338fd1498Szrj struct tree_niter_desc niter; 19438fd1498Szrj 19538fd1498Szrj /* When fusing the loops we skip the latch block 19638fd1498Szrj of the first one, so it mustn't have any effects to 19738fd1498Szrj preserve. */ 19838fd1498Szrj if (!empty_block_p (loop->latch)) 19938fd1498Szrj return false; 20038fd1498Szrj 20138fd1498Szrj if (!single_exit (loop)) 20238fd1498Szrj return false; 20338fd1498Szrj 20438fd1498Szrj /* We need a perfect nest. Quick check for adjacent inner loops. */ 20538fd1498Szrj if (outer->inner != loop || loop->next) 20638fd1498Szrj return false; 20738fd1498Szrj 20838fd1498Szrj /* Prevent head-controlled inner loops, that we usually have. 20938fd1498Szrj The guard block would need to be accepted 21038fd1498Szrj (invariant condition either entering or skipping the loop), 21138fd1498Szrj without also accepting arbitrary control flow. When unswitching 21238fd1498Szrj ran before us (as with -O3) this won't be a problem because its 21338fd1498Szrj outer loop unswitching will have moved out the invariant condition. 21438fd1498Szrj 21538fd1498Szrj If we do that we need to extend fuse_loops() to cope with this 21638fd1498Szrj by threading through the (still invariant) copied condition 21738fd1498Szrj between the two loop copies. */ 21838fd1498Szrj if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header)) 21938fd1498Szrj return false; 22038fd1498Szrj 22138fd1498Szrj /* The number of iterations of the inner loop must be loop invariant 22238fd1498Szrj with respect to the outer loop. */ 22338fd1498Szrj if (!number_of_iterations_exit (loop, single_exit (loop), &niter, 22438fd1498Szrj false, true) 22538fd1498Szrj || niter.cmp == ERROR_MARK 22638fd1498Szrj || !integer_zerop (niter.may_be_zero) 22738fd1498Szrj || !expr_invariant_in_loop_p (outer, niter.niter)) 22838fd1498Szrj return false; 22938fd1498Szrj 230*58e805e6Szrj /* If the inner loop produces any values that are used inside the 231*58e805e6Szrj outer loop (except the virtual op) then it can flow 232*58e805e6Szrj back (perhaps indirectly) into the inner loop. This prevents 233*58e805e6Szrj fusion: without fusion the value at the last iteration is used, 234*58e805e6Szrj with fusion the value after the initial iteration is used. 235*58e805e6Szrj 236*58e805e6Szrj If all uses are outside the outer loop this doesn't prevent fusion; 237*58e805e6Szrj the value of the last iteration is still used (and the values from 238*58e805e6Szrj all intermediate iterations are dead). */ 239*58e805e6Szrj gphi_iterator psi; 240*58e805e6Szrj for (psi = gsi_start_phis (single_exit (loop)->dest); 241*58e805e6Szrj !gsi_end_p (psi); gsi_next (&psi)) 242*58e805e6Szrj { 243*58e805e6Szrj imm_use_iterator imm_iter; 244*58e805e6Szrj use_operand_p use_p; 245*58e805e6Szrj tree op = gimple_phi_result (psi.phi ()); 246*58e805e6Szrj if (virtual_operand_p (op)) 247*58e805e6Szrj continue; 248*58e805e6Szrj FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op) 249*58e805e6Szrj { 250*58e805e6Szrj gimple *use_stmt = USE_STMT (use_p); 251*58e805e6Szrj if (!is_gimple_debug (use_stmt) 252*58e805e6Szrj && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt))) 253*58e805e6Szrj return false; 254*58e805e6Szrj } 255*58e805e6Szrj } 256*58e805e6Szrj 25738fd1498Szrj /* And check blocks belonging to just outer loop. */ 25838fd1498Szrj bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); 25938fd1498Szrj n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun)); 26038fd1498Szrj 26138fd1498Szrj for (i = 0; i < n; i++) 26238fd1498Szrj if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i])) 26338fd1498Szrj break; 26438fd1498Szrj free (bbs); 26538fd1498Szrj if (i != n) 26638fd1498Szrj return false; 26738fd1498Szrj 26838fd1498Szrj /* For now we can safely fuse copies of LOOP only if all 26938fd1498Szrj loop carried variables are inductions (or the virtual op). 27038fd1498Szrj 27138fd1498Szrj We could handle reductions as well (the initial value in the second 27238fd1498Szrj body would be the after-iter value of the first body) if it's over 27338fd1498Szrj an associative and commutative operation. We wouldn't 27438fd1498Szrj be able to handle unknown cycles. */ 27538fd1498Szrj for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi)) 27638fd1498Szrj { 27738fd1498Szrj affine_iv iv; 27838fd1498Szrj tree op = gimple_phi_result (psi.phi ()); 27938fd1498Szrj 28038fd1498Szrj if (virtual_operand_p (op)) 28138fd1498Szrj continue; 28238fd1498Szrj if (!simple_iv (loop, loop, op, &iv, true)) 28338fd1498Szrj return false; 28438fd1498Szrj /* The inductions must be regular, loop invariant step and initial 28538fd1498Szrj value. */ 28638fd1498Szrj if (!expr_invariant_in_loop_p (outer, iv.step) 28738fd1498Szrj || !expr_invariant_in_loop_p (outer, iv.base)) 28838fd1498Szrj return false; 28938fd1498Szrj /* XXX With more effort we could also be able to deal with inductions 29038fd1498Szrj where the initial value is loop variant but a simple IV in the 29138fd1498Szrj outer loop. The initial value for the second body would be 29238fd1498Szrj the original initial value plus iv.base.step. The next value 29338fd1498Szrj for the fused loop would be the original next value of the first 29438fd1498Szrj copy, _not_ the next value of the second body. */ 29538fd1498Szrj } 29638fd1498Szrj 29738fd1498Szrj return true; 29838fd1498Szrj } 29938fd1498Szrj 30038fd1498Szrj /* Fuse LOOP with all further neighbors. The loops are expected to 30138fd1498Szrj be in appropriate form. */ 30238fd1498Szrj 30338fd1498Szrj static void 30438fd1498Szrj fuse_loops (struct loop *loop) 30538fd1498Szrj { 30638fd1498Szrj struct loop *next = loop->next; 30738fd1498Szrj 30838fd1498Szrj while (next) 30938fd1498Szrj { 31038fd1498Szrj edge e; 31138fd1498Szrj 31238fd1498Szrj remove_branch (single_pred_edge (loop->latch)); 31338fd1498Szrj /* Make delete_basic_block not fiddle with the loop structure. */ 31438fd1498Szrj basic_block oldlatch = loop->latch; 31538fd1498Szrj loop->latch = NULL; 31638fd1498Szrj delete_basic_block (oldlatch); 31738fd1498Szrj e = redirect_edge_and_branch (loop_latch_edge (next), 31838fd1498Szrj loop->header); 31938fd1498Szrj loop->latch = e->src; 32038fd1498Szrj flush_pending_stmts (e); 32138fd1498Szrj 32238fd1498Szrj gcc_assert (EDGE_COUNT (next->header->preds) == 1); 32338fd1498Szrj 32438fd1498Szrj /* The PHI nodes of the second body (single-argument now) 32538fd1498Szrj need adjustments to use the right values: either directly 32638fd1498Szrj the value of the corresponding PHI in the first copy or 32738fd1498Szrj the one leaving the first body which unrolling did for us. 32838fd1498Szrj 32938fd1498Szrj See also unroll_jam_possible_p() for further possibilities. */ 33038fd1498Szrj gphi_iterator psi_first, psi_second; 33138fd1498Szrj e = single_pred_edge (next->header); 33238fd1498Szrj for (psi_first = gsi_start_phis (loop->header), 33338fd1498Szrj psi_second = gsi_start_phis (next->header); 33438fd1498Szrj !gsi_end_p (psi_first); 33538fd1498Szrj gsi_next (&psi_first), gsi_next (&psi_second)) 33638fd1498Szrj { 33738fd1498Szrj gphi *phi_first = psi_first.phi (); 33838fd1498Szrj gphi *phi_second = psi_second.phi (); 33938fd1498Szrj tree firstop = gimple_phi_result (phi_first); 34038fd1498Szrj /* The virtual operand is correct already as it's 34138fd1498Szrj always live at exit, hence has a LCSSA node and outer 34238fd1498Szrj loop unrolling updated SSA form. */ 34338fd1498Szrj if (virtual_operand_p (firstop)) 34438fd1498Szrj continue; 34538fd1498Szrj 34638fd1498Szrj /* Due to unroll_jam_possible_p() we know that this is 34738fd1498Szrj an induction. The second body goes over the same 34838fd1498Szrj iteration space. */ 34938fd1498Szrj add_phi_arg (phi_second, firstop, e, 35038fd1498Szrj gimple_location (phi_first)); 35138fd1498Szrj } 35238fd1498Szrj gcc_assert (gsi_end_p (psi_second)); 35338fd1498Szrj 35438fd1498Szrj merge_loop_tree (loop, next); 35538fd1498Szrj gcc_assert (!next->num_nodes); 35638fd1498Szrj struct loop *ln = next->next; 35738fd1498Szrj delete_loop (next); 35838fd1498Szrj next = ln; 35938fd1498Szrj } 36038fd1498Szrj rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); 36138fd1498Szrj } 36238fd1498Szrj 36338fd1498Szrj /* Returns true if the distance in DDR can be determined and adjusts 36438fd1498Szrj the unroll factor in *UNROLL to make unrolling valid for that distance. 36538fd1498Szrj Otherwise return false. 36638fd1498Szrj 36738fd1498Szrj If this data dep can lead to a removed memory reference, increment 36838fd1498Szrj *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor 36938fd1498Szrj for this to happen. */ 37038fd1498Szrj 37138fd1498Szrj static bool 37238fd1498Szrj adjust_unroll_factor (struct data_dependence_relation *ddr, 37338fd1498Szrj unsigned *unroll, unsigned *profit_unroll, 37438fd1498Szrj unsigned *removed) 37538fd1498Szrj { 37638fd1498Szrj bool ret = false; 37738fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) != chrec_known) 37838fd1498Szrj { 37938fd1498Szrj if (DDR_NUM_DIST_VECTS (ddr) == 0) 38038fd1498Szrj return false; 38138fd1498Szrj unsigned i; 38238fd1498Szrj lambda_vector dist_v; 38338fd1498Szrj FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) 38438fd1498Szrj { 38538fd1498Szrj /* A distance (a,b) is at worst transformed into (a/N,b) by the 38638fd1498Szrj unrolling (factor N), so the transformation is valid if 38738fd1498Szrj a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll 38838fd1498Szrj factor needs to be limited so that the first condition holds. 38938fd1498Szrj That may limit the factor down to zero in the worst case. */ 39038fd1498Szrj int dist = dist_v[0]; 39138fd1498Szrj if (dist < 0) 39238fd1498Szrj gcc_unreachable (); 39338fd1498Szrj else if ((unsigned)dist >= *unroll) 39438fd1498Szrj ; 39538fd1498Szrj else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) 39638fd1498Szrj || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) 39738fd1498Szrj && dist > 0)) 39838fd1498Szrj ; 39938fd1498Szrj else 40038fd1498Szrj *unroll = dist; 40138fd1498Szrj 40238fd1498Szrj /* With a distance (a,0) it's always profitable to unroll-and-jam 40338fd1498Szrj (by a+1), because one memory reference will go away. With 40438fd1498Szrj (a,b) and b != 0 that's less clear. We will increase the 40538fd1498Szrj number of streams without lowering the number of mem refs. 40638fd1498Szrj So for now only handle the first situation. */ 40738fd1498Szrj if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) 40838fd1498Szrj { 40938fd1498Szrj *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1); 41038fd1498Szrj (*removed)++; 41138fd1498Szrj } 41238fd1498Szrj 41338fd1498Szrj ret = true; 41438fd1498Szrj } 41538fd1498Szrj } 41638fd1498Szrj return ret; 41738fd1498Szrj } 41838fd1498Szrj 41938fd1498Szrj /* Main entry point for the unroll-and-jam transformation 42038fd1498Szrj described above. */ 42138fd1498Szrj 42238fd1498Szrj static unsigned int 42338fd1498Szrj tree_loop_unroll_and_jam (void) 42438fd1498Szrj { 42538fd1498Szrj struct loop *loop; 42638fd1498Szrj bool changed = false; 42738fd1498Szrj 42838fd1498Szrj gcc_assert (scev_initialized_p ()); 42938fd1498Szrj 43038fd1498Szrj /* Go through all innermost loops. */ 43138fd1498Szrj FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) 43238fd1498Szrj { 43338fd1498Szrj struct loop *outer = loop_outer (loop); 43438fd1498Szrj 43538fd1498Szrj if (loop_depth (loop) < 2 43638fd1498Szrj || optimize_loop_nest_for_size_p (outer)) 43738fd1498Szrj continue; 43838fd1498Szrj 43938fd1498Szrj if (!unroll_jam_possible_p (outer, loop)) 44038fd1498Szrj continue; 44138fd1498Szrj 44238fd1498Szrj vec<data_reference_p> datarefs; 44338fd1498Szrj vec<ddr_p> dependences; 44438fd1498Szrj unsigned unroll_factor, profit_unroll, removed; 44538fd1498Szrj struct tree_niter_desc desc; 44638fd1498Szrj bool unroll = false; 44738fd1498Szrj 44838fd1498Szrj auto_vec<loop_p, 3> loop_nest; 44938fd1498Szrj dependences.create (10); 45038fd1498Szrj datarefs.create (10); 45138fd1498Szrj if (!compute_data_dependences_for_loop (outer, true, &loop_nest, 45238fd1498Szrj &datarefs, &dependences)) 45338fd1498Szrj { 45438fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS)) 45538fd1498Szrj fprintf (dump_file, "Cannot analyze data dependencies\n"); 45638fd1498Szrj free_data_refs (datarefs); 45738fd1498Szrj free_dependence_relations (dependences); 458*58e805e6Szrj continue; 45938fd1498Szrj } 46038fd1498Szrj if (!datarefs.length ()) 46138fd1498Szrj continue; 46238fd1498Szrj 46338fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS)) 46438fd1498Szrj dump_data_dependence_relations (dump_file, dependences); 46538fd1498Szrj 46638fd1498Szrj unroll_factor = (unsigned)-1; 46738fd1498Szrj profit_unroll = 1; 46838fd1498Szrj removed = 0; 46938fd1498Szrj 47038fd1498Szrj /* Check all dependencies. */ 47138fd1498Szrj unsigned i; 47238fd1498Szrj struct data_dependence_relation *ddr; 47338fd1498Szrj FOR_EACH_VEC_ELT (dependences, i, ddr) 47438fd1498Szrj { 47538fd1498Szrj struct data_reference *dra, *drb; 47638fd1498Szrj 47738fd1498Szrj /* If the refs are independend there's nothing to do. */ 47838fd1498Szrj if (DDR_ARE_DEPENDENT (ddr) == chrec_known) 47938fd1498Szrj continue; 48038fd1498Szrj dra = DDR_A (ddr); 48138fd1498Szrj drb = DDR_B (ddr); 48238fd1498Szrj /* Nothing interesting for the self dependencies. */ 48338fd1498Szrj if (dra == drb) 48438fd1498Szrj continue; 48538fd1498Szrj 48638fd1498Szrj /* Now check the distance vector, for determining a sensible 48738fd1498Szrj outer unroll factor, and for validity of merging the inner 48838fd1498Szrj loop copies. */ 48938fd1498Szrj if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll, 49038fd1498Szrj &removed)) 49138fd1498Szrj { 49238fd1498Szrj /* Couldn't get the distance vector. For two reads that's 49338fd1498Szrj harmless (we assume we should unroll). For at least 49438fd1498Szrj one write this means we can't check the dependence direction 49538fd1498Szrj and hence can't determine safety. */ 49638fd1498Szrj 49738fd1498Szrj if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) 49838fd1498Szrj { 49938fd1498Szrj unroll_factor = 0; 50038fd1498Szrj break; 50138fd1498Szrj } 50238fd1498Szrj } 50338fd1498Szrj } 50438fd1498Szrj 50538fd1498Szrj /* We regard a user-specified minimum percentage of zero as a request 50638fd1498Szrj to ignore all profitability concerns and apply the transformation 50738fd1498Szrj always. */ 50838fd1498Szrj if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) 50938fd1498Szrj profit_unroll = 2; 51038fd1498Szrj else if (removed * 100 / datarefs.length () 51138fd1498Szrj < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) 51238fd1498Szrj profit_unroll = 1; 51338fd1498Szrj if (unroll_factor > profit_unroll) 51438fd1498Szrj unroll_factor = profit_unroll; 51538fd1498Szrj if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL)) 51638fd1498Szrj unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL); 51738fd1498Szrj unroll = (unroll_factor > 1 51838fd1498Szrj && can_unroll_loop_p (outer, unroll_factor, &desc)); 51938fd1498Szrj 52038fd1498Szrj if (unroll) 52138fd1498Szrj { 52238fd1498Szrj if (dump_enabled_p ()) 52338fd1498Szrj dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, 52438fd1498Szrj find_loop_location (outer), 52538fd1498Szrj "applying unroll and jam with factor %d\n", 52638fd1498Szrj unroll_factor); 52738fd1498Szrj initialize_original_copy_tables (); 52838fd1498Szrj tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer), 52938fd1498Szrj &desc); 53038fd1498Szrj free_original_copy_tables (); 53138fd1498Szrj fuse_loops (outer->inner); 53238fd1498Szrj changed = true; 53338fd1498Szrj } 53438fd1498Szrj 53538fd1498Szrj loop_nest.release (); 53638fd1498Szrj free_dependence_relations (dependences); 53738fd1498Szrj free_data_refs (datarefs); 53838fd1498Szrj } 53938fd1498Szrj 54038fd1498Szrj if (changed) 54138fd1498Szrj { 54238fd1498Szrj scev_reset (); 54338fd1498Szrj free_dominance_info (CDI_DOMINATORS); 54438fd1498Szrj return TODO_cleanup_cfg; 54538fd1498Szrj } 54638fd1498Szrj return 0; 54738fd1498Szrj } 54838fd1498Szrj 54938fd1498Szrj /* Pass boilerplate */ 55038fd1498Szrj 55138fd1498Szrj namespace { 55238fd1498Szrj 55338fd1498Szrj const pass_data pass_data_loop_jam = 55438fd1498Szrj { 55538fd1498Szrj GIMPLE_PASS, /* type */ 55638fd1498Szrj "unrolljam", /* name */ 55738fd1498Szrj OPTGROUP_LOOP, /* optinfo_flags */ 55838fd1498Szrj TV_LOOP_JAM, /* tv_id */ 55938fd1498Szrj PROP_cfg, /* properties_required */ 56038fd1498Szrj 0, /* properties_provided */ 56138fd1498Szrj 0, /* properties_destroyed */ 56238fd1498Szrj 0, /* todo_flags_start */ 56338fd1498Szrj 0, /* todo_flags_finish */ 56438fd1498Szrj }; 56538fd1498Szrj 56638fd1498Szrj class pass_loop_jam : public gimple_opt_pass 56738fd1498Szrj { 56838fd1498Szrj public: 56938fd1498Szrj pass_loop_jam (gcc::context *ctxt) 57038fd1498Szrj : gimple_opt_pass (pass_data_loop_jam, ctxt) 57138fd1498Szrj {} 57238fd1498Szrj 57338fd1498Szrj /* opt_pass methods: */ 57438fd1498Szrj virtual bool gate (function *) { return flag_unroll_jam != 0; } 57538fd1498Szrj virtual unsigned int execute (function *); 57638fd1498Szrj 57738fd1498Szrj }; 57838fd1498Szrj 57938fd1498Szrj unsigned int 58038fd1498Szrj pass_loop_jam::execute (function *fun) 58138fd1498Szrj { 58238fd1498Szrj if (number_of_loops (fun) <= 1) 58338fd1498Szrj return 0; 58438fd1498Szrj 58538fd1498Szrj return tree_loop_unroll_and_jam (); 58638fd1498Szrj } 58738fd1498Szrj 58838fd1498Szrj } 58938fd1498Szrj 59038fd1498Szrj gimple_opt_pass * 59138fd1498Szrj make_pass_loop_jam (gcc::context *ctxt) 59238fd1498Szrj { 59338fd1498Szrj return new pass_loop_jam (ctxt); 59438fd1498Szrj } 595