xref: /dflybsd-src/contrib/gcc-8.0/gcc/gimple-loop-jam.c (revision 38fd149817dfbff97799f62fcb70be98c4e32523)
1*38fd1498Szrj /* Loop unroll-and-jam.
2*38fd1498Szrj    Copyright (C) 2017-2018 Free Software Foundation, Inc.
3*38fd1498Szrj 
4*38fd1498Szrj This file is part of GCC.
5*38fd1498Szrj 
6*38fd1498Szrj GCC is free software; you can redistribute it and/or modify it
7*38fd1498Szrj under the terms of the GNU General Public License as published by the
8*38fd1498Szrj Free Software Foundation; either version 3, or (at your option) any
9*38fd1498Szrj later version.
10*38fd1498Szrj 
11*38fd1498Szrj GCC is distributed in the hope that it will be useful, but WITHOUT
12*38fd1498Szrj ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13*38fd1498Szrj FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14*38fd1498Szrj for more details.
15*38fd1498Szrj 
16*38fd1498Szrj You should have received a copy of the GNU General Public License
17*38fd1498Szrj along with GCC; see the file COPYING3.  If not see
18*38fd1498Szrj <http://www.gnu.org/licenses/>.  */
19*38fd1498Szrj 
20*38fd1498Szrj #include "config.h"
21*38fd1498Szrj #include "system.h"
22*38fd1498Szrj #include "coretypes.h"
23*38fd1498Szrj #include "params.h"
24*38fd1498Szrj #include "tree-pass.h"
25*38fd1498Szrj #include "backend.h"
26*38fd1498Szrj #include "tree.h"
27*38fd1498Szrj #include "gimple.h"
28*38fd1498Szrj #include "ssa.h"
29*38fd1498Szrj #include "fold-const.h"
30*38fd1498Szrj #include "tree-cfg.h"
31*38fd1498Szrj #include "tree-ssa.h"
32*38fd1498Szrj #include "tree-ssa-loop-niter.h"
33*38fd1498Szrj #include "tree-ssa-loop.h"
34*38fd1498Szrj #include "tree-ssa-loop-manip.h"
35*38fd1498Szrj #include "cfgloop.h"
36*38fd1498Szrj #include "tree-scalar-evolution.h"
37*38fd1498Szrj #include "gimple-iterator.h"
38*38fd1498Szrj #include "cfghooks.h"
39*38fd1498Szrj #include "tree-data-ref.h"
40*38fd1498Szrj #include "tree-ssa-loop-ivopts.h"
41*38fd1498Szrj #include "tree-vectorizer.h"
42*38fd1498Szrj 
43*38fd1498Szrj /* Unroll and Jam transformation
44*38fd1498Szrj 
45*38fd1498Szrj    This is a combination of two transformations, where the second
46*38fd1498Szrj    is not always valid.  It's applicable if a loop nest has redundancies
47*38fd1498Szrj    over the iterations of an outer loop while not having that with
48*38fd1498Szrj    an inner loop.
49*38fd1498Szrj 
50*38fd1498Szrj    Given this nest:
51*38fd1498Szrj        for (i) {
52*38fd1498Szrj 	 for (j) {
53*38fd1498Szrj 	   B(i,j)
54*38fd1498Szrj 	 }
55*38fd1498Szrj        }
56*38fd1498Szrj 
57*38fd1498Szrj    first unroll:
58*38fd1498Szrj        for (i by 2) {
59*38fd1498Szrj 	 for (j) {
60*38fd1498Szrj 	   B(i,j)
61*38fd1498Szrj 	 }
62*38fd1498Szrj 	 for (j) {
63*38fd1498Szrj 	   B(i+1,j)
64*38fd1498Szrj 	 }
65*38fd1498Szrj        }
66*38fd1498Szrj 
67*38fd1498Szrj    then fuse the two adjacent inner loops resulting from that:
68*38fd1498Szrj        for (i by 2) {
69*38fd1498Szrj 	 for (j) {
70*38fd1498Szrj 	   B(i,j)
71*38fd1498Szrj 	   B(i+1,j)
72*38fd1498Szrj 	 }
73*38fd1498Szrj        }
74*38fd1498Szrj 
75*38fd1498Szrj    As the order of evaluations of the body B changes this is valid
76*38fd1498Szrj    only in certain situations: all distance vectors need to be forward.
77*38fd1498Szrj    Additionally if there are multiple induction variables than just
78*38fd1498Szrj    a counting control IV (j above) we can also deal with some situations.
79*38fd1498Szrj 
80*38fd1498Szrj    The validity is checked by unroll_jam_possible_p, and the data-dep
81*38fd1498Szrj    testing below.
82*38fd1498Szrj 
83*38fd1498Szrj    A trivial example where the fusion is wrong would be when
84*38fd1498Szrj    B(i,j) == x[j-1] = x[j];
85*38fd1498Szrj        for (i by 2) {
86*38fd1498Szrj 	 for (j) {
87*38fd1498Szrj 	   x[j-1] = x[j];
88*38fd1498Szrj 	 }
89*38fd1498Szrj 	 for (j) {
90*38fd1498Szrj 	   x[j-1] = x[j];
91*38fd1498Szrj 	 }
92*38fd1498Szrj        }  effect: move content to front by two elements
93*38fd1498Szrj        -->
94*38fd1498Szrj        for (i by 2) {
95*38fd1498Szrj 	 for (j) {
96*38fd1498Szrj 	   x[j-1] = x[j];
97*38fd1498Szrj 	   x[j-1] = x[j];
98*38fd1498Szrj 	 }
99*38fd1498Szrj        }  effect: move content to front by one element
100*38fd1498Szrj */
101*38fd1498Szrj 
102*38fd1498Szrj /* Modify the loop tree for the fact that all code once belonging
103*38fd1498Szrj    to the OLD loop or the outer loop of OLD now is inside LOOP.  */
104*38fd1498Szrj 
105*38fd1498Szrj static void
106*38fd1498Szrj merge_loop_tree (struct loop *loop, struct loop *old)
107*38fd1498Szrj {
108*38fd1498Szrj   basic_block *bbs;
109*38fd1498Szrj   int i, n;
110*38fd1498Szrj   struct loop *subloop;
111*38fd1498Szrj   edge e;
112*38fd1498Szrj   edge_iterator ei;
113*38fd1498Szrj 
114*38fd1498Szrj   /* Find its nodes.  */
115*38fd1498Szrj   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
116*38fd1498Szrj   n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
117*38fd1498Szrj 
118*38fd1498Szrj   for (i = 0; i < n; i++)
119*38fd1498Szrj     {
120*38fd1498Szrj       /* If the block was direct child of OLD loop it's now part
121*38fd1498Szrj          of LOOP.  If it was outside OLD, then it moved into LOOP
122*38fd1498Szrj 	 as well.  This avoids changing the loop father for BBs
123*38fd1498Szrj 	 in inner loops of OLD.  */
124*38fd1498Szrj       if (bbs[i]->loop_father == old
125*38fd1498Szrj 	  || loop_depth (bbs[i]->loop_father) < loop_depth (old))
126*38fd1498Szrj 	{
127*38fd1498Szrj 	  remove_bb_from_loops (bbs[i]);
128*38fd1498Szrj 	  add_bb_to_loop (bbs[i], loop);
129*38fd1498Szrj 	  continue;
130*38fd1498Szrj 	}
131*38fd1498Szrj 
132*38fd1498Szrj       /* If we find a direct subloop of OLD, move it to LOOP.  */
133*38fd1498Szrj       subloop = bbs[i]->loop_father;
134*38fd1498Szrj       if (loop_outer (subloop) == old && subloop->header == bbs[i])
135*38fd1498Szrj 	{
136*38fd1498Szrj 	  flow_loop_tree_node_remove (subloop);
137*38fd1498Szrj 	  flow_loop_tree_node_add (loop, subloop);
138*38fd1498Szrj 	}
139*38fd1498Szrj     }
140*38fd1498Szrj 
141*38fd1498Szrj   /* Update the information about loop exit edges.  */
142*38fd1498Szrj   for (i = 0; i < n; i++)
143*38fd1498Szrj     {
144*38fd1498Szrj       FOR_EACH_EDGE (e, ei, bbs[i]->succs)
145*38fd1498Szrj 	{
146*38fd1498Szrj 	  rescan_loop_exit (e, false, false);
147*38fd1498Szrj 	}
148*38fd1498Szrj     }
149*38fd1498Szrj 
150*38fd1498Szrj   loop->num_nodes = n;
151*38fd1498Szrj 
152*38fd1498Szrj   free (bbs);
153*38fd1498Szrj }
154*38fd1498Szrj 
155*38fd1498Szrj /* BB is part of the outer loop of an unroll-and-jam situation.
156*38fd1498Szrj    Check if any statements therein would prevent the transformation.  */
157*38fd1498Szrj 
158*38fd1498Szrj static bool
159*38fd1498Szrj bb_prevents_fusion_p (basic_block bb)
160*38fd1498Szrj {
161*38fd1498Szrj   gimple_stmt_iterator gsi;
162*38fd1498Szrj   /* BB is duplicated by outer unrolling and then all N-1 first copies
163*38fd1498Szrj      move into the body of the fused inner loop.  If BB exits the outer loop
164*38fd1498Szrj      the last copy still doess so, and the first N-1 copies are cancelled
165*38fd1498Szrj      by loop unrolling, so also after fusion it's the exit block.
166*38fd1498Szrj      But there might be other reasons that prevent fusion:
167*38fd1498Szrj        * stores or unknown side-effects prevent fusion
168*38fd1498Szrj        * loads don't
169*38fd1498Szrj        * computations into SSA names: these aren't problematic.  Their
170*38fd1498Szrj          result will be unused on the exit edges of the first N-1 copies
171*38fd1498Szrj 	 (those aren't taken after unrolling).  If they are used on the
172*38fd1498Szrj 	 other edge (the one leading to the outer latch block) they are
173*38fd1498Szrj 	 loop-carried (on the outer loop) and the Nth copy of BB will
174*38fd1498Szrj 	 compute them again (i.e. the first N-1 copies will be dead).  */
175*38fd1498Szrj   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
176*38fd1498Szrj     {
177*38fd1498Szrj       gimple *g = gsi_stmt (gsi);
178*38fd1498Szrj       if (gimple_vdef (g) || gimple_has_side_effects (g))
179*38fd1498Szrj 	return true;
180*38fd1498Szrj     }
181*38fd1498Szrj   return false;
182*38fd1498Szrj }
183*38fd1498Szrj 
184*38fd1498Szrj /* Given an inner loop LOOP (of some OUTER loop) determine if
185*38fd1498Szrj    we can safely fuse copies of it (generated by outer unrolling).
186*38fd1498Szrj    If so return true, otherwise return false.  */
187*38fd1498Szrj 
188*38fd1498Szrj static bool
189*38fd1498Szrj unroll_jam_possible_p (struct loop *outer, struct loop *loop)
190*38fd1498Szrj {
191*38fd1498Szrj   basic_block *bbs;
192*38fd1498Szrj   int i, n;
193*38fd1498Szrj   struct tree_niter_desc niter;
194*38fd1498Szrj 
195*38fd1498Szrj   /* When fusing the loops we skip the latch block
196*38fd1498Szrj      of the first one, so it mustn't have any effects to
197*38fd1498Szrj      preserve.  */
198*38fd1498Szrj   if (!empty_block_p (loop->latch))
199*38fd1498Szrj     return false;
200*38fd1498Szrj 
201*38fd1498Szrj   if (!single_exit (loop))
202*38fd1498Szrj     return false;
203*38fd1498Szrj 
204*38fd1498Szrj   /* We need a perfect nest.  Quick check for adjacent inner loops.  */
205*38fd1498Szrj   if (outer->inner != loop || loop->next)
206*38fd1498Szrj     return false;
207*38fd1498Szrj 
208*38fd1498Szrj   /* Prevent head-controlled inner loops, that we usually have.
209*38fd1498Szrj      The guard block would need to be accepted
210*38fd1498Szrj      (invariant condition either entering or skipping the loop),
211*38fd1498Szrj      without also accepting arbitrary control flow.  When unswitching
212*38fd1498Szrj      ran before us (as with -O3) this won't be a problem because its
213*38fd1498Szrj      outer loop unswitching will have moved out the invariant condition.
214*38fd1498Szrj 
215*38fd1498Szrj      If we do that we need to extend fuse_loops() to cope with this
216*38fd1498Szrj      by threading through the (still invariant) copied condition
217*38fd1498Szrj      between the two loop copies.  */
218*38fd1498Szrj   if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
219*38fd1498Szrj     return false;
220*38fd1498Szrj 
221*38fd1498Szrj   /* The number of iterations of the inner loop must be loop invariant
222*38fd1498Szrj      with respect to the outer loop.  */
223*38fd1498Szrj   if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
224*38fd1498Szrj 				 false, true)
225*38fd1498Szrj       || niter.cmp == ERROR_MARK
226*38fd1498Szrj       || !integer_zerop (niter.may_be_zero)
227*38fd1498Szrj       || !expr_invariant_in_loop_p (outer, niter.niter))
228*38fd1498Szrj     return false;
229*38fd1498Szrj 
230*38fd1498Szrj   /* And check blocks belonging to just outer loop.  */
231*38fd1498Szrj   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
232*38fd1498Szrj   n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
233*38fd1498Szrj 
234*38fd1498Szrj   for (i = 0; i < n; i++)
235*38fd1498Szrj     if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
236*38fd1498Szrj       break;
237*38fd1498Szrj   free (bbs);
238*38fd1498Szrj   if (i != n)
239*38fd1498Szrj     return false;
240*38fd1498Szrj 
241*38fd1498Szrj   /* For now we can safely fuse copies of LOOP only if all
242*38fd1498Szrj      loop carried variables are inductions (or the virtual op).
243*38fd1498Szrj 
244*38fd1498Szrj      We could handle reductions as well (the initial value in the second
245*38fd1498Szrj      body would be the after-iter value of the first body) if it's over
246*38fd1498Szrj      an associative and commutative operation.  We wouldn't
247*38fd1498Szrj      be able to handle unknown cycles.  */
248*38fd1498Szrj   gphi_iterator psi;
249*38fd1498Szrj   for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
250*38fd1498Szrj     {
251*38fd1498Szrj       affine_iv iv;
252*38fd1498Szrj       tree op = gimple_phi_result (psi.phi ());
253*38fd1498Szrj 
254*38fd1498Szrj       if (virtual_operand_p (op))
255*38fd1498Szrj 	continue;
256*38fd1498Szrj       if (!simple_iv (loop, loop, op, &iv, true))
257*38fd1498Szrj 	return false;
258*38fd1498Szrj       /* The inductions must be regular, loop invariant step and initial
259*38fd1498Szrj          value.  */
260*38fd1498Szrj       if (!expr_invariant_in_loop_p (outer, iv.step)
261*38fd1498Szrj 	  || !expr_invariant_in_loop_p (outer, iv.base))
262*38fd1498Szrj 	return false;
263*38fd1498Szrj       /* XXX With more effort we could also be able to deal with inductions
264*38fd1498Szrj          where the initial value is loop variant but a simple IV in the
265*38fd1498Szrj 	 outer loop.  The initial value for the second body would be
266*38fd1498Szrj 	 the original initial value plus iv.base.step.  The next value
267*38fd1498Szrj 	 for the fused loop would be the original next value of the first
268*38fd1498Szrj 	 copy, _not_ the next value of the second body.  */
269*38fd1498Szrj     }
270*38fd1498Szrj 
271*38fd1498Szrj   return true;
272*38fd1498Szrj }
273*38fd1498Szrj 
274*38fd1498Szrj /* Fuse LOOP with all further neighbors.  The loops are expected to
275*38fd1498Szrj    be in appropriate form.  */
276*38fd1498Szrj 
277*38fd1498Szrj static void
278*38fd1498Szrj fuse_loops (struct loop *loop)
279*38fd1498Szrj {
280*38fd1498Szrj   struct loop *next = loop->next;
281*38fd1498Szrj 
282*38fd1498Szrj   while (next)
283*38fd1498Szrj     {
284*38fd1498Szrj       edge e;
285*38fd1498Szrj 
286*38fd1498Szrj       remove_branch (single_pred_edge (loop->latch));
287*38fd1498Szrj       /* Make delete_basic_block not fiddle with the loop structure.  */
288*38fd1498Szrj       basic_block oldlatch = loop->latch;
289*38fd1498Szrj       loop->latch = NULL;
290*38fd1498Szrj       delete_basic_block (oldlatch);
291*38fd1498Szrj       e = redirect_edge_and_branch (loop_latch_edge (next),
292*38fd1498Szrj 				    loop->header);
293*38fd1498Szrj       loop->latch = e->src;
294*38fd1498Szrj       flush_pending_stmts (e);
295*38fd1498Szrj 
296*38fd1498Szrj       gcc_assert (EDGE_COUNT (next->header->preds) == 1);
297*38fd1498Szrj 
298*38fd1498Szrj       /* The PHI nodes of the second body (single-argument now)
299*38fd1498Szrj          need adjustments to use the right values: either directly
300*38fd1498Szrj 	 the value of the corresponding PHI in the first copy or
301*38fd1498Szrj 	 the one leaving the first body which unrolling did for us.
302*38fd1498Szrj 
303*38fd1498Szrj 	 See also unroll_jam_possible_p() for further possibilities.  */
304*38fd1498Szrj       gphi_iterator psi_first, psi_second;
305*38fd1498Szrj       e = single_pred_edge (next->header);
306*38fd1498Szrj       for (psi_first = gsi_start_phis (loop->header),
307*38fd1498Szrj 	   psi_second = gsi_start_phis (next->header);
308*38fd1498Szrj 	   !gsi_end_p (psi_first);
309*38fd1498Szrj 	   gsi_next (&psi_first), gsi_next (&psi_second))
310*38fd1498Szrj 	{
311*38fd1498Szrj 	  gphi *phi_first = psi_first.phi ();
312*38fd1498Szrj 	  gphi *phi_second = psi_second.phi ();
313*38fd1498Szrj 	  tree firstop = gimple_phi_result (phi_first);
314*38fd1498Szrj 	  /* The virtual operand is correct already as it's
315*38fd1498Szrj 	     always live at exit, hence has a LCSSA node and outer
316*38fd1498Szrj 	     loop unrolling updated SSA form.  */
317*38fd1498Szrj 	  if (virtual_operand_p (firstop))
318*38fd1498Szrj 	    continue;
319*38fd1498Szrj 
320*38fd1498Szrj 	  /* Due to unroll_jam_possible_p() we know that this is
321*38fd1498Szrj 	     an induction.  The second body goes over the same
322*38fd1498Szrj 	     iteration space.  */
323*38fd1498Szrj 	  add_phi_arg (phi_second, firstop, e,
324*38fd1498Szrj 		       gimple_location (phi_first));
325*38fd1498Szrj 	}
326*38fd1498Szrj       gcc_assert (gsi_end_p (psi_second));
327*38fd1498Szrj 
328*38fd1498Szrj       merge_loop_tree (loop, next);
329*38fd1498Szrj       gcc_assert (!next->num_nodes);
330*38fd1498Szrj       struct loop *ln = next->next;
331*38fd1498Szrj       delete_loop (next);
332*38fd1498Szrj       next = ln;
333*38fd1498Szrj     }
334*38fd1498Szrj   rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
335*38fd1498Szrj }
336*38fd1498Szrj 
337*38fd1498Szrj /* Returns true if the distance in DDR can be determined and adjusts
338*38fd1498Szrj    the unroll factor in *UNROLL to make unrolling valid for that distance.
339*38fd1498Szrj    Otherwise return false.
340*38fd1498Szrj 
341*38fd1498Szrj    If this data dep can lead to a removed memory reference, increment
342*38fd1498Szrj    *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
343*38fd1498Szrj    for this to happen.  */
344*38fd1498Szrj 
345*38fd1498Szrj static bool
346*38fd1498Szrj adjust_unroll_factor (struct data_dependence_relation *ddr,
347*38fd1498Szrj 		      unsigned *unroll, unsigned *profit_unroll,
348*38fd1498Szrj 		      unsigned *removed)
349*38fd1498Szrj {
350*38fd1498Szrj   bool ret = false;
351*38fd1498Szrj   if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
352*38fd1498Szrj     {
353*38fd1498Szrj       if (DDR_NUM_DIST_VECTS (ddr) == 0)
354*38fd1498Szrj 	return false;
355*38fd1498Szrj       unsigned i;
356*38fd1498Szrj       lambda_vector dist_v;
357*38fd1498Szrj       FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
358*38fd1498Szrj 	{
359*38fd1498Szrj 	  /* A distance (a,b) is at worst transformed into (a/N,b) by the
360*38fd1498Szrj 	     unrolling (factor N), so the transformation is valid if
361*38fd1498Szrj 	     a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
362*38fd1498Szrj 	     factor needs to be limited so that the first condition holds.
363*38fd1498Szrj 	     That may limit the factor down to zero in the worst case.  */
364*38fd1498Szrj 	  int dist = dist_v[0];
365*38fd1498Szrj 	  if (dist < 0)
366*38fd1498Szrj 	    gcc_unreachable ();
367*38fd1498Szrj 	  else if ((unsigned)dist >= *unroll)
368*38fd1498Szrj 	    ;
369*38fd1498Szrj 	  else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
370*38fd1498Szrj 		   || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
371*38fd1498Szrj 		       && dist > 0))
372*38fd1498Szrj 	    ;
373*38fd1498Szrj 	  else
374*38fd1498Szrj 	    *unroll = dist;
375*38fd1498Szrj 
376*38fd1498Szrj 	  /* With a distance (a,0) it's always profitable to unroll-and-jam
377*38fd1498Szrj 	     (by a+1), because one memory reference will go away.  With
378*38fd1498Szrj 	     (a,b) and b != 0 that's less clear.  We will increase the
379*38fd1498Szrj 	     number of streams without lowering the number of mem refs.
380*38fd1498Szrj 	     So for now only handle the first situation.  */
381*38fd1498Szrj 	  if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
382*38fd1498Szrj 	    {
383*38fd1498Szrj 	      *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
384*38fd1498Szrj 	      (*removed)++;
385*38fd1498Szrj 	    }
386*38fd1498Szrj 
387*38fd1498Szrj 	  ret = true;
388*38fd1498Szrj 	}
389*38fd1498Szrj     }
390*38fd1498Szrj   return ret;
391*38fd1498Szrj }
392*38fd1498Szrj 
393*38fd1498Szrj /* Main entry point for the unroll-and-jam transformation
394*38fd1498Szrj    described above.  */
395*38fd1498Szrj 
396*38fd1498Szrj static unsigned int
397*38fd1498Szrj tree_loop_unroll_and_jam (void)
398*38fd1498Szrj {
399*38fd1498Szrj   struct loop *loop;
400*38fd1498Szrj   bool changed = false;
401*38fd1498Szrj 
402*38fd1498Szrj   gcc_assert (scev_initialized_p ());
403*38fd1498Szrj 
404*38fd1498Szrj   /* Go through all innermost loops.  */
405*38fd1498Szrj   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
406*38fd1498Szrj     {
407*38fd1498Szrj       struct loop *outer = loop_outer (loop);
408*38fd1498Szrj 
409*38fd1498Szrj       if (loop_depth (loop) < 2
410*38fd1498Szrj 	  || optimize_loop_nest_for_size_p (outer))
411*38fd1498Szrj 	continue;
412*38fd1498Szrj 
413*38fd1498Szrj       if (!unroll_jam_possible_p (outer, loop))
414*38fd1498Szrj 	continue;
415*38fd1498Szrj 
416*38fd1498Szrj       vec<data_reference_p> datarefs;
417*38fd1498Szrj       vec<ddr_p> dependences;
418*38fd1498Szrj       unsigned unroll_factor, profit_unroll, removed;
419*38fd1498Szrj       struct tree_niter_desc desc;
420*38fd1498Szrj       bool unroll = false;
421*38fd1498Szrj 
422*38fd1498Szrj       auto_vec<loop_p, 3> loop_nest;
423*38fd1498Szrj       dependences.create (10);
424*38fd1498Szrj       datarefs.create (10);
425*38fd1498Szrj       if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
426*38fd1498Szrj 					       &datarefs, &dependences))
427*38fd1498Szrj 	{
428*38fd1498Szrj 	  if (dump_file && (dump_flags & TDF_DETAILS))
429*38fd1498Szrj 	    fprintf (dump_file, "Cannot analyze data dependencies\n");
430*38fd1498Szrj 	  free_data_refs (datarefs);
431*38fd1498Szrj 	  free_dependence_relations (dependences);
432*38fd1498Szrj 	  return false;
433*38fd1498Szrj 	}
434*38fd1498Szrj       if (!datarefs.length ())
435*38fd1498Szrj 	continue;
436*38fd1498Szrj 
437*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
438*38fd1498Szrj 	dump_data_dependence_relations (dump_file, dependences);
439*38fd1498Szrj 
440*38fd1498Szrj       unroll_factor = (unsigned)-1;
441*38fd1498Szrj       profit_unroll = 1;
442*38fd1498Szrj       removed = 0;
443*38fd1498Szrj 
444*38fd1498Szrj       /* Check all dependencies.  */
445*38fd1498Szrj       unsigned i;
446*38fd1498Szrj       struct data_dependence_relation *ddr;
447*38fd1498Szrj       FOR_EACH_VEC_ELT (dependences, i, ddr)
448*38fd1498Szrj 	{
449*38fd1498Szrj 	  struct data_reference *dra, *drb;
450*38fd1498Szrj 
451*38fd1498Szrj 	  /* If the refs are independend there's nothing to do.  */
452*38fd1498Szrj 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
453*38fd1498Szrj 	    continue;
454*38fd1498Szrj 	  dra = DDR_A (ddr);
455*38fd1498Szrj 	  drb = DDR_B (ddr);
456*38fd1498Szrj 	  /* Nothing interesting for the self dependencies.  */
457*38fd1498Szrj 	  if (dra == drb)
458*38fd1498Szrj 	    continue;
459*38fd1498Szrj 
460*38fd1498Szrj 	  /* Now check the distance vector, for determining a sensible
461*38fd1498Szrj 	     outer unroll factor, and for validity of merging the inner
462*38fd1498Szrj 	     loop copies.  */
463*38fd1498Szrj 	  if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
464*38fd1498Szrj 				     &removed))
465*38fd1498Szrj 	    {
466*38fd1498Szrj 	      /* Couldn't get the distance vector.  For two reads that's
467*38fd1498Szrj 	         harmless (we assume we should unroll).  For at least
468*38fd1498Szrj 		 one write this means we can't check the dependence direction
469*38fd1498Szrj 		 and hence can't determine safety.  */
470*38fd1498Szrj 
471*38fd1498Szrj 	      if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
472*38fd1498Szrj 		{
473*38fd1498Szrj 		  unroll_factor = 0;
474*38fd1498Szrj 		  break;
475*38fd1498Szrj 		}
476*38fd1498Szrj 	    }
477*38fd1498Szrj 	}
478*38fd1498Szrj 
479*38fd1498Szrj       /* We regard a user-specified minimum percentage of zero as a request
480*38fd1498Szrj          to ignore all profitability concerns and apply the transformation
481*38fd1498Szrj 	 always.  */
482*38fd1498Szrj       if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
483*38fd1498Szrj 	profit_unroll = 2;
484*38fd1498Szrj       else if (removed * 100 / datarefs.length ()
485*38fd1498Szrj 	  < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
486*38fd1498Szrj 	profit_unroll = 1;
487*38fd1498Szrj       if (unroll_factor > profit_unroll)
488*38fd1498Szrj 	unroll_factor = profit_unroll;
489*38fd1498Szrj       if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
490*38fd1498Szrj 	unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
491*38fd1498Szrj       unroll = (unroll_factor > 1
492*38fd1498Szrj 		&& can_unroll_loop_p (outer, unroll_factor, &desc));
493*38fd1498Szrj 
494*38fd1498Szrj       if (unroll)
495*38fd1498Szrj 	{
496*38fd1498Szrj 	  if (dump_enabled_p ())
497*38fd1498Szrj 	    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
498*38fd1498Szrj 			     find_loop_location (outer),
499*38fd1498Szrj 			     "applying unroll and jam with factor %d\n",
500*38fd1498Szrj 			     unroll_factor);
501*38fd1498Szrj 	  initialize_original_copy_tables ();
502*38fd1498Szrj 	  tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
503*38fd1498Szrj 			    &desc);
504*38fd1498Szrj 	  free_original_copy_tables ();
505*38fd1498Szrj 	  fuse_loops (outer->inner);
506*38fd1498Szrj 	  changed = true;
507*38fd1498Szrj 	}
508*38fd1498Szrj 
509*38fd1498Szrj       loop_nest.release ();
510*38fd1498Szrj       free_dependence_relations (dependences);
511*38fd1498Szrj       free_data_refs (datarefs);
512*38fd1498Szrj     }
513*38fd1498Szrj 
514*38fd1498Szrj   if (changed)
515*38fd1498Szrj     {
516*38fd1498Szrj       scev_reset ();
517*38fd1498Szrj       free_dominance_info (CDI_DOMINATORS);
518*38fd1498Szrj       return TODO_cleanup_cfg;
519*38fd1498Szrj     }
520*38fd1498Szrj   return 0;
521*38fd1498Szrj }
522*38fd1498Szrj 
523*38fd1498Szrj /* Pass boilerplate */
524*38fd1498Szrj 
525*38fd1498Szrj namespace {
526*38fd1498Szrj 
527*38fd1498Szrj const pass_data pass_data_loop_jam =
528*38fd1498Szrj {
529*38fd1498Szrj   GIMPLE_PASS, /* type */
530*38fd1498Szrj   "unrolljam", /* name */
531*38fd1498Szrj   OPTGROUP_LOOP, /* optinfo_flags */
532*38fd1498Szrj   TV_LOOP_JAM, /* tv_id */
533*38fd1498Szrj   PROP_cfg, /* properties_required */
534*38fd1498Szrj   0, /* properties_provided */
535*38fd1498Szrj   0, /* properties_destroyed */
536*38fd1498Szrj   0, /* todo_flags_start */
537*38fd1498Szrj   0, /* todo_flags_finish */
538*38fd1498Szrj };
539*38fd1498Szrj 
540*38fd1498Szrj class pass_loop_jam : public gimple_opt_pass
541*38fd1498Szrj {
542*38fd1498Szrj public:
543*38fd1498Szrj   pass_loop_jam (gcc::context *ctxt)
544*38fd1498Szrj     : gimple_opt_pass (pass_data_loop_jam, ctxt)
545*38fd1498Szrj   {}
546*38fd1498Szrj 
547*38fd1498Szrj   /* opt_pass methods: */
548*38fd1498Szrj   virtual bool gate (function *) { return flag_unroll_jam != 0; }
549*38fd1498Szrj   virtual unsigned int execute (function *);
550*38fd1498Szrj 
551*38fd1498Szrj };
552*38fd1498Szrj 
553*38fd1498Szrj unsigned int
554*38fd1498Szrj pass_loop_jam::execute (function *fun)
555*38fd1498Szrj {
556*38fd1498Szrj   if (number_of_loops (fun) <= 1)
557*38fd1498Szrj     return 0;
558*38fd1498Szrj 
559*38fd1498Szrj   return tree_loop_unroll_and_jam ();
560*38fd1498Szrj }
561*38fd1498Szrj 
562*38fd1498Szrj }
563*38fd1498Szrj 
564*38fd1498Szrj gimple_opt_pass *
565*38fd1498Szrj make_pass_loop_jam (gcc::context *ctxt)
566*38fd1498Szrj {
567*38fd1498Szrj   return new pass_loop_jam (ctxt);
568*38fd1498Szrj }
569