xref: /dflybsd-src/contrib/gcc-8.0/gcc/gimple-loop-jam.c (revision eb67213abec698ffb555ee926f2761bcb7e95f55)
1 /* Loop unroll-and-jam.
2    Copyright (C) 2017-2018 Free Software Foundation, Inc.
3 
4 This file is part of GCC.
5 
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10 
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
14 for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3.  If not see
18 <http://www.gnu.org/licenses/>.  */
19 
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "params.h"
24 #include "tree-pass.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "ssa.h"
29 #include "fold-const.h"
30 #include "tree-cfg.h"
31 #include "tree-ssa.h"
32 #include "tree-ssa-loop-niter.h"
33 #include "tree-ssa-loop.h"
34 #include "tree-ssa-loop-manip.h"
35 #include "cfgloop.h"
36 #include "tree-scalar-evolution.h"
37 #include "gimple-iterator.h"
38 #include "cfghooks.h"
39 #include "tree-data-ref.h"
40 #include "tree-ssa-loop-ivopts.h"
41 #include "tree-vectorizer.h"
42 
43 /* Unroll and Jam transformation
44 
45    This is a combination of two transformations, where the second
46    is not always valid.  It's applicable if a loop nest has redundancies
47    over the iterations of an outer loop while not having that with
48    an inner loop.
49 
50    Given this nest:
51        for (i) {
52 	 for (j) {
53 	   B(i,j)
54 	 }
55        }
56 
57    first unroll:
58        for (i by 2) {
59 	 for (j) {
60 	   B(i,j)
61 	 }
62 	 for (j) {
63 	   B(i+1,j)
64 	 }
65        }
66 
67    then fuse the two adjacent inner loops resulting from that:
68        for (i by 2) {
69 	 for (j) {
70 	   B(i,j)
71 	   B(i+1,j)
72 	 }
73        }
74 
75    As the order of evaluations of the body B changes this is valid
76    only in certain situations: all distance vectors need to be forward.
77    Additionally if there are multiple induction variables than just
78    a counting control IV (j above) we can also deal with some situations.
79 
80    The validity is checked by unroll_jam_possible_p, and the data-dep
81    testing below.
82 
83    A trivial example where the fusion is wrong would be when
84    B(i,j) == x[j-1] = x[j];
85        for (i by 2) {
86 	 for (j) {
87 	   x[j-1] = x[j];
88 	 }
89 	 for (j) {
90 	   x[j-1] = x[j];
91 	 }
92        }  effect: move content to front by two elements
93        -->
94        for (i by 2) {
95 	 for (j) {
96 	   x[j-1] = x[j];
97 	   x[j-1] = x[j];
98 	 }
99        }  effect: move content to front by one element
100 */
101 
102 /* Modify the loop tree for the fact that all code once belonging
103    to the OLD loop or the outer loop of OLD now is inside LOOP.  */
104 
105 static void
106 merge_loop_tree (struct loop *loop, struct loop *old)
107 {
108   basic_block *bbs;
109   int i, n;
110   struct loop *subloop;
111   edge e;
112   edge_iterator ei;
113 
114   /* Find its nodes.  */
115   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
116   n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
117 
118   for (i = 0; i < n; i++)
119     {
120       /* If the block was direct child of OLD loop it's now part
121          of LOOP.  If it was outside OLD, then it moved into LOOP
122 	 as well.  This avoids changing the loop father for BBs
123 	 in inner loops of OLD.  */
124       if (bbs[i]->loop_father == old
125 	  || loop_depth (bbs[i]->loop_father) < loop_depth (old))
126 	{
127 	  remove_bb_from_loops (bbs[i]);
128 	  add_bb_to_loop (bbs[i], loop);
129 	  continue;
130 	}
131 
132       /* If we find a direct subloop of OLD, move it to LOOP.  */
133       subloop = bbs[i]->loop_father;
134       if (loop_outer (subloop) == old && subloop->header == bbs[i])
135 	{
136 	  flow_loop_tree_node_remove (subloop);
137 	  flow_loop_tree_node_add (loop, subloop);
138 	}
139     }
140 
141   /* Update the information about loop exit edges.  */
142   for (i = 0; i < n; i++)
143     {
144       FOR_EACH_EDGE (e, ei, bbs[i]->succs)
145 	{
146 	  rescan_loop_exit (e, false, false);
147 	}
148     }
149 
150   loop->num_nodes = n;
151 
152   free (bbs);
153 }
154 
155 /* BB is part of the outer loop of an unroll-and-jam situation.
156    Check if any statements therein would prevent the transformation.  */
157 
158 static bool
159 bb_prevents_fusion_p (basic_block bb)
160 {
161   gimple_stmt_iterator gsi;
162   /* BB is duplicated by outer unrolling and then all N-1 first copies
163      move into the body of the fused inner loop.  If BB exits the outer loop
164      the last copy still doess so, and the first N-1 copies are cancelled
165      by loop unrolling, so also after fusion it's the exit block.
166      But there might be other reasons that prevent fusion:
167        * stores or unknown side-effects prevent fusion
168        * loads don't
169        * computations into SSA names: these aren't problematic.  Their
170          result will be unused on the exit edges of the first N-1 copies
171 	 (those aren't taken after unrolling).  If they are used on the
172 	 other edge (the one leading to the outer latch block) they are
173 	 loop-carried (on the outer loop) and the Nth copy of BB will
174 	 compute them again (i.e. the first N-1 copies will be dead).  */
175   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
176     {
177       gimple *g = gsi_stmt (gsi);
178       if (gimple_vdef (g) || gimple_has_side_effects (g))
179 	return true;
180     }
181   return false;
182 }
183 
184 /* Given an inner loop LOOP (of some OUTER loop) determine if
185    we can safely fuse copies of it (generated by outer unrolling).
186    If so return true, otherwise return false.  */
187 
188 static bool
189 unroll_jam_possible_p (struct loop *outer, struct loop *loop)
190 {
191   basic_block *bbs;
192   int i, n;
193   struct tree_niter_desc niter;
194 
195   /* When fusing the loops we skip the latch block
196      of the first one, so it mustn't have any effects to
197      preserve.  */
198   if (!empty_block_p (loop->latch))
199     return false;
200 
201   if (!single_exit (loop))
202     return false;
203 
204   /* We need a perfect nest.  Quick check for adjacent inner loops.  */
205   if (outer->inner != loop || loop->next)
206     return false;
207 
208   /* Prevent head-controlled inner loops, that we usually have.
209      The guard block would need to be accepted
210      (invariant condition either entering or skipping the loop),
211      without also accepting arbitrary control flow.  When unswitching
212      ran before us (as with -O3) this won't be a problem because its
213      outer loop unswitching will have moved out the invariant condition.
214 
215      If we do that we need to extend fuse_loops() to cope with this
216      by threading through the (still invariant) copied condition
217      between the two loop copies.  */
218   if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
219     return false;
220 
221   /* The number of iterations of the inner loop must be loop invariant
222      with respect to the outer loop.  */
223   if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
224 				 false, true)
225       || niter.cmp == ERROR_MARK
226       || !integer_zerop (niter.may_be_zero)
227       || !expr_invariant_in_loop_p (outer, niter.niter))
228     return false;
229 
230   /* And check blocks belonging to just outer loop.  */
231   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
232   n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
233 
234   for (i = 0; i < n; i++)
235     if (bbs[i]->loop_father == outer && bb_prevents_fusion_p (bbs[i]))
236       break;
237   free (bbs);
238   if (i != n)
239     return false;
240 
241   /* For now we can safely fuse copies of LOOP only if all
242      loop carried variables are inductions (or the virtual op).
243 
244      We could handle reductions as well (the initial value in the second
245      body would be the after-iter value of the first body) if it's over
246      an associative and commutative operation.  We wouldn't
247      be able to handle unknown cycles.  */
248   gphi_iterator psi;
249   for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
250     {
251       affine_iv iv;
252       tree op = gimple_phi_result (psi.phi ());
253 
254       if (virtual_operand_p (op))
255 	continue;
256       if (!simple_iv (loop, loop, op, &iv, true))
257 	return false;
258       /* The inductions must be regular, loop invariant step and initial
259          value.  */
260       if (!expr_invariant_in_loop_p (outer, iv.step)
261 	  || !expr_invariant_in_loop_p (outer, iv.base))
262 	return false;
263       /* XXX With more effort we could also be able to deal with inductions
264          where the initial value is loop variant but a simple IV in the
265 	 outer loop.  The initial value for the second body would be
266 	 the original initial value plus iv.base.step.  The next value
267 	 for the fused loop would be the original next value of the first
268 	 copy, _not_ the next value of the second body.  */
269     }
270 
271   return true;
272 }
273 
274 /* Fuse LOOP with all further neighbors.  The loops are expected to
275    be in appropriate form.  */
276 
277 static void
278 fuse_loops (struct loop *loop)
279 {
280   struct loop *next = loop->next;
281 
282   while (next)
283     {
284       edge e;
285 
286       remove_branch (single_pred_edge (loop->latch));
287       /* Make delete_basic_block not fiddle with the loop structure.  */
288       basic_block oldlatch = loop->latch;
289       loop->latch = NULL;
290       delete_basic_block (oldlatch);
291       e = redirect_edge_and_branch (loop_latch_edge (next),
292 				    loop->header);
293       loop->latch = e->src;
294       flush_pending_stmts (e);
295 
296       gcc_assert (EDGE_COUNT (next->header->preds) == 1);
297 
298       /* The PHI nodes of the second body (single-argument now)
299          need adjustments to use the right values: either directly
300 	 the value of the corresponding PHI in the first copy or
301 	 the one leaving the first body which unrolling did for us.
302 
303 	 See also unroll_jam_possible_p() for further possibilities.  */
304       gphi_iterator psi_first, psi_second;
305       e = single_pred_edge (next->header);
306       for (psi_first = gsi_start_phis (loop->header),
307 	   psi_second = gsi_start_phis (next->header);
308 	   !gsi_end_p (psi_first);
309 	   gsi_next (&psi_first), gsi_next (&psi_second))
310 	{
311 	  gphi *phi_first = psi_first.phi ();
312 	  gphi *phi_second = psi_second.phi ();
313 	  tree firstop = gimple_phi_result (phi_first);
314 	  /* The virtual operand is correct already as it's
315 	     always live at exit, hence has a LCSSA node and outer
316 	     loop unrolling updated SSA form.  */
317 	  if (virtual_operand_p (firstop))
318 	    continue;
319 
320 	  /* Due to unroll_jam_possible_p() we know that this is
321 	     an induction.  The second body goes over the same
322 	     iteration space.  */
323 	  add_phi_arg (phi_second, firstop, e,
324 		       gimple_location (phi_first));
325 	}
326       gcc_assert (gsi_end_p (psi_second));
327 
328       merge_loop_tree (loop, next);
329       gcc_assert (!next->num_nodes);
330       struct loop *ln = next->next;
331       delete_loop (next);
332       next = ln;
333     }
334   rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
335 }
336 
337 /* Returns true if the distance in DDR can be determined and adjusts
338    the unroll factor in *UNROLL to make unrolling valid for that distance.
339    Otherwise return false.
340 
341    If this data dep can lead to a removed memory reference, increment
342    *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
343    for this to happen.  */
344 
345 static bool
346 adjust_unroll_factor (struct data_dependence_relation *ddr,
347 		      unsigned *unroll, unsigned *profit_unroll,
348 		      unsigned *removed)
349 {
350   bool ret = false;
351   if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
352     {
353       if (DDR_NUM_DIST_VECTS (ddr) == 0)
354 	return false;
355       unsigned i;
356       lambda_vector dist_v;
357       FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
358 	{
359 	  /* A distance (a,b) is at worst transformed into (a/N,b) by the
360 	     unrolling (factor N), so the transformation is valid if
361 	     a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
362 	     factor needs to be limited so that the first condition holds.
363 	     That may limit the factor down to zero in the worst case.  */
364 	  int dist = dist_v[0];
365 	  if (dist < 0)
366 	    gcc_unreachable ();
367 	  else if ((unsigned)dist >= *unroll)
368 	    ;
369 	  else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
370 		   || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
371 		       && dist > 0))
372 	    ;
373 	  else
374 	    *unroll = dist;
375 
376 	  /* With a distance (a,0) it's always profitable to unroll-and-jam
377 	     (by a+1), because one memory reference will go away.  With
378 	     (a,b) and b != 0 that's less clear.  We will increase the
379 	     number of streams without lowering the number of mem refs.
380 	     So for now only handle the first situation.  */
381 	  if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
382 	    {
383 	      *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
384 	      (*removed)++;
385 	    }
386 
387 	  ret = true;
388 	}
389     }
390   return ret;
391 }
392 
393 /* Main entry point for the unroll-and-jam transformation
394    described above.  */
395 
396 static unsigned int
397 tree_loop_unroll_and_jam (void)
398 {
399   struct loop *loop;
400   bool changed = false;
401 
402   gcc_assert (scev_initialized_p ());
403 
404   /* Go through all innermost loops.  */
405   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
406     {
407       struct loop *outer = loop_outer (loop);
408 
409       if (loop_depth (loop) < 2
410 	  || optimize_loop_nest_for_size_p (outer))
411 	continue;
412 
413       if (!unroll_jam_possible_p (outer, loop))
414 	continue;
415 
416       vec<data_reference_p> datarefs;
417       vec<ddr_p> dependences;
418       unsigned unroll_factor, profit_unroll, removed;
419       struct tree_niter_desc desc;
420       bool unroll = false;
421 
422       auto_vec<loop_p, 3> loop_nest;
423       dependences.create (10);
424       datarefs.create (10);
425       if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
426 					       &datarefs, &dependences))
427 	{
428 	  if (dump_file && (dump_flags & TDF_DETAILS))
429 	    fprintf (dump_file, "Cannot analyze data dependencies\n");
430 	  free_data_refs (datarefs);
431 	  free_dependence_relations (dependences);
432 	  return false;
433 	}
434       if (!datarefs.length ())
435 	continue;
436 
437       if (dump_file && (dump_flags & TDF_DETAILS))
438 	dump_data_dependence_relations (dump_file, dependences);
439 
440       unroll_factor = (unsigned)-1;
441       profit_unroll = 1;
442       removed = 0;
443 
444       /* Check all dependencies.  */
445       unsigned i;
446       struct data_dependence_relation *ddr;
447       FOR_EACH_VEC_ELT (dependences, i, ddr)
448 	{
449 	  struct data_reference *dra, *drb;
450 
451 	  /* If the refs are independend there's nothing to do.  */
452 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
453 	    continue;
454 	  dra = DDR_A (ddr);
455 	  drb = DDR_B (ddr);
456 	  /* Nothing interesting for the self dependencies.  */
457 	  if (dra == drb)
458 	    continue;
459 
460 	  /* Now check the distance vector, for determining a sensible
461 	     outer unroll factor, and for validity of merging the inner
462 	     loop copies.  */
463 	  if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
464 				     &removed))
465 	    {
466 	      /* Couldn't get the distance vector.  For two reads that's
467 	         harmless (we assume we should unroll).  For at least
468 		 one write this means we can't check the dependence direction
469 		 and hence can't determine safety.  */
470 
471 	      if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
472 		{
473 		  unroll_factor = 0;
474 		  break;
475 		}
476 	    }
477 	}
478 
479       /* We regard a user-specified minimum percentage of zero as a request
480          to ignore all profitability concerns and apply the transformation
481 	 always.  */
482       if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
483 	profit_unroll = 2;
484       else if (removed * 100 / datarefs.length ()
485 	  < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
486 	profit_unroll = 1;
487       if (unroll_factor > profit_unroll)
488 	unroll_factor = profit_unroll;
489       if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
490 	unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
491       unroll = (unroll_factor > 1
492 		&& can_unroll_loop_p (outer, unroll_factor, &desc));
493 
494       if (unroll)
495 	{
496 	  if (dump_enabled_p ())
497 	    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
498 			     find_loop_location (outer),
499 			     "applying unroll and jam with factor %d\n",
500 			     unroll_factor);
501 	  initialize_original_copy_tables ();
502 	  tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
503 			    &desc);
504 	  free_original_copy_tables ();
505 	  fuse_loops (outer->inner);
506 	  changed = true;
507 	}
508 
509       loop_nest.release ();
510       free_dependence_relations (dependences);
511       free_data_refs (datarefs);
512     }
513 
514   if (changed)
515     {
516       scev_reset ();
517       free_dominance_info (CDI_DOMINATORS);
518       return TODO_cleanup_cfg;
519     }
520   return 0;
521 }
522 
523 /* Pass boilerplate */
524 
525 namespace {
526 
527 const pass_data pass_data_loop_jam =
528 {
529   GIMPLE_PASS, /* type */
530   "unrolljam", /* name */
531   OPTGROUP_LOOP, /* optinfo_flags */
532   TV_LOOP_JAM, /* tv_id */
533   PROP_cfg, /* properties_required */
534   0, /* properties_provided */
535   0, /* properties_destroyed */
536   0, /* todo_flags_start */
537   0, /* todo_flags_finish */
538 };
539 
540 class pass_loop_jam : public gimple_opt_pass
541 {
542 public:
543   pass_loop_jam (gcc::context *ctxt)
544     : gimple_opt_pass (pass_data_loop_jam, ctxt)
545   {}
546 
547   /* opt_pass methods: */
548   virtual bool gate (function *) { return flag_unroll_jam != 0; }
549   virtual unsigned int execute (function *);
550 
551 };
552 
553 unsigned int
554 pass_loop_jam::execute (function *fun)
555 {
556   if (number_of_loops (fun) <= 1)
557     return 0;
558 
559   return tree_loop_unroll_and_jam ();
560 }
561 
562 }
563 
564 gimple_opt_pass *
565 make_pass_loop_jam (gcc::context *ctxt)
566 {
567   return new pass_loop_jam (ctxt);
568 }
569