/* Loop distribution.
   Copyright (C) 2006-2013 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |   A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |   D(I) = A(I-1)*E
   |ENDDO

   This pass uses an RDG (Reduced Dependence Graph) built on top of
   the data dependence relations.  The RDG is then topologically
   sorted to obtain a map of information producers/consumers, based
   on which it generates the new loops.  */
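
/* With -ftree-loop-distribute-patterns, a partition that only
   initializes or copies memory is not regenerated as a loop but
   replaced by a builtin call: for example

   |DO I = 1, N
   |    A(I) = 0
   |ENDDO

   becomes a single call to memset (see classify_partition,
   generate_memset_builtin, and generate_memcpy_builtin below).  */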

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tree-flow.h"
#include "cfgloop.h"
#include "tree-chrec.h"
#include "tree-data-ref.h"
#include "tree-scalar-evolution.h"
#include "tree-pass.h"

enum partition_kind {
    PKIND_NORMAL, PKIND_REDUCTION, PKIND_MEMSET, PKIND_MEMCPY
};

typedef struct partition_s
{
  bitmap stmts;
  bool has_writes;
  enum partition_kind kind;
  /* data-references a kind != PKIND_NORMAL partition is about.  */
  data_reference_p main_dr;
  data_reference_p secondary_dr;
} *partition_t;


/* Allocate and initialize a partition from BITMAP.  */

static partition_t
partition_alloc (bitmap stmts)
{
  partition_t partition = XCNEW (struct partition_s);
  partition->stmts = stmts ? stmts : BITMAP_ALLOC (NULL);
  partition->has_writes = false;
  partition->kind = PKIND_NORMAL;
  return partition;
}

/* Free PARTITION.  */

static void
partition_free (partition_t partition)
{
  BITMAP_FREE (partition->stmts);
  free (partition);
}

/* Returns true if the partition can be generated as a builtin.  */

static bool
partition_builtin_p (partition_t partition)
{
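  /* Relies on the enum layout: only PKIND_MEMSET and PKIND_MEMCPY
     follow PKIND_REDUCTION.  */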
  return partition->kind > PKIND_REDUCTION;
}

/* Returns true if the partition has any writes.  */

static bool
partition_has_writes (partition_t partition)
{
  return partition->has_writes;
}

/* If bit I is not set, it means that this node represents an
   operation that has already been performed, and that should not be
   performed again.  This is the subgraph of remaining important
   computations that is passed to the DFS algorithm to avoid
   including the same stores several times in different loops.  */
static bitmap remaining_stmts;

/* A node of the RDG is marked in this bitmap when it has as a
   predecessor a node that writes to memory.  */
static bitmap upstream_mem_writes;

/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      gimple use_stmt = USE_STMT (use_p);
      if (!is_gimple_debug (use_stmt)
	  && loop != loop_containing_stmt (use_stmt))
	return true;
    }

  return false;
}

/* Returns true when STMT defines a scalar variable used after the
   loop LOOP.  */

static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}

/* Return a copy of LOOP placed before LOOP.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
  gcc_assert (res != NULL);
  free_original_copy_tables ();
  delete_update_ssa ();

  return res;
}

/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}

/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  */

static void
generate_loops_for_partition (struct loop *loop, partition_t partition,
			      bool copy_p)
{
  unsigned i, x;
  gimple_stmt_iterator bsi;
  basic_block *bbs;

  if (copy_p)
    {
      loop = copy_loop_before (loop);
      gcc_assert (loop != NULL);
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }

  /* Remove stmts not in the PARTITION bitmap.  The order in which we
     visit the phi nodes and the statements is exactly as in
     stmts_from_loop.  */
  bbs = get_loop_body_in_dom_order (loop);

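  /* First, reset the debug uses of the statements we are about to
     remove, so that we do not leave debug stmts referencing SSA
     names that are released below.  */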
  if (MAY_HAVE_DEBUG_STMTS)
    for (x = 0, i = 0; i < loop->num_nodes; i++)
      {
	basic_block bb = bbs[i];

	for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi); gsi_next (&bsi))
	  if (!bitmap_bit_p (partition->stmts, x++))
	    reset_debug_uses (gsi_stmt (bsi));

	for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
	  {
	    gimple stmt = gsi_stmt (bsi);
	    if (gimple_code (stmt) != GIMPLE_LABEL
		&& !is_gimple_debug (stmt)
		&& !bitmap_bit_p (partition->stmts, x++))
	      reset_debug_uses (stmt);
	  }
      }

  for (x = 0, i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	if (!bitmap_bit_p (partition->stmts, x++))
	  {
	    gimple phi = gsi_stmt (bsi);
	    if (virtual_operand_p (gimple_phi_result (phi)))
	      mark_virtual_phi_result_for_renaming (phi);
	    remove_phi_node (&bsi, true);
	  }
	else
	  gsi_next (&bsi);

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL
	      && !is_gimple_debug (stmt)
	      && !bitmap_bit_p (partition->stmts, x++))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&bsi, true);
	      release_defs (stmt);
	    }
	  else
	    gsi_next (&bsi);
	}
    }

  free (bbs);
}

/* Build the size argument for a memory operation call.  */

static tree
build_size_arg_loc (location_t loc, data_reference_p dr, tree nb_iter)
{
  tree size;
  size = fold_build2_loc (loc, MULT_EXPR, sizetype,
			  fold_convert_loc (loc, sizetype, nb_iter),
			  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
  return fold_convert_loc (loc, size_type_node, size);
}

/* Build an address argument for a memory operation call.  */

static tree
build_addr_arg_loc (location_t loc, data_reference_p dr, tree nb_bytes)
{
  tree addr_base;

  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
  addr_base = fold_convert_loc (loc, sizetype, addr_base);

  /* Test for a negative stride, iterating over every element.  */
  if (tree_int_cst_sgn (DR_STEP (dr)) == -1)
    {
      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
				  fold_convert_loc (loc, sizetype, nb_bytes));
      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
				  TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))));
    }

  return fold_build_pointer_plus_loc (loc, DR_BASE_ADDRESS (dr), addr_base);
}

/* Generate a call to memset for PARTITION in LOOP.  */

static void
generate_memset_builtin (struct loop *loop, partition_t partition)
{
  gimple_stmt_iterator gsi;
  gimple stmt, fn_call;
  tree nb_iter, mem, fn, nb_bytes;
  location_t loc;
  tree val;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);
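  /* A store sitting in the loop latch runs once per latch execution;
     anywhere else it runs once per evaluation of the exit
     condition.  */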
  if (gimple_bb (stmt) == loop->latch)
    nb_iter = number_of_latch_executions (loop);
  else
    nb_iter = number_of_exit_cond_executions (loop);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, nb_iter);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  mem = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (stmt);
  if (integer_zerop (val)
      || real_zerop (val)
      || TREE_CODE (val) == CONSTRUCTOR)
    val = integer_zero_node;
  else if (integer_all_onesp (val))
    val = build_int_cst (integer_type_node, -1);
  else
    {
      if (TREE_CODE (val) == INTEGER_CST)
	val = fold_convert (integer_type_node, val);
      else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
	{
	  gimple cstmt;
	  tree tem = make_ssa_name (integer_type_node, NULL);
	  cstmt = gimple_build_assign_with_ops (NOP_EXPR, tem, val, NULL_TREE);
	  gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
	  val = tem;
	}
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (integer_zerop (val))
	fprintf (dump_file, " zero\n");
      else if (integer_all_onesp (val))
	fprintf (dump_file, " minus one\n");
      else
	fprintf (dump_file, "\n");
    }
}

/* Generate a call to memcpy for PARTITION in LOOP.  */

static void
generate_memcpy_builtin (struct loop *loop, partition_t partition)
{
  gimple_stmt_iterator gsi;
  gimple stmt, fn_call;
  tree nb_iter, dest, src, fn, nb_bytes;
  location_t loc;
  enum built_in_function kind;

  stmt = DR_STMT (partition->main_dr);
  loc = gimple_location (stmt);
  if (gimple_bb (stmt) == loop->latch)
    nb_iter = number_of_latch_executions (loop);
  else
    nb_iter = number_of_exit_cond_executions (loop);

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = build_size_arg_loc (loc, partition->main_dr, nb_iter);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  dest = build_addr_arg_loc (loc, partition->main_dr, nb_bytes);
  src = build_addr_arg_loc (loc, partition->secondary_dr, nb_bytes);
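  /* If the source and the destination may overlap we have to generate
     memmove; calling memcpy on overlapping blocks is undefined.  */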
  if (ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMMOVE;
  else
    kind = BUILT_IN_MEMCPY;

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
				   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
	fprintf (dump_file, "generated memcpy\n");
      else
	fprintf (dump_file, "generated memmove\n");
    }
}

/* Remove and destroy the loop LOOP.  */

static void
destroy_loop (struct loop *loop)
{
  unsigned nbbs = loop->num_nodes;
  edge exit = single_exit (loop);
  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
  basic_block *bbs;
  unsigned i;

  bbs = get_loop_body_in_dom_order (loop);

  redirect_edge_pred (exit, src);
  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
  exit->flags |= EDGE_FALLTHRU;
  cancel_loop_tree (loop);
  rescan_loop_exit (exit, false, true);

  for (i = 0; i < nbbs; i++)
    {
      /* We have made sure not to leave any dangling uses of SSA names
	 defined in the loop, with the exception of virtuals.  Make
	 sure we replace all uses of virtual defs that will remain
	 outside of the loop with the bare symbol as delete_basic_block
	 will release them.  */
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple phi = gsi_stmt (gsi);
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    mark_virtual_phi_result_for_renaming (phi);
	}
      for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple stmt = gsi_stmt (gsi);
	  tree vdef = gimple_vdef (stmt);
	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
	    mark_virtual_operand_for_renaming (vdef);
	}
      delete_basic_block (bbs[i]);
    }
  free (bbs);

  set_immediate_dominator (CDI_DOMINATORS, dest,
			   recompute_dominator (CDI_DOMINATORS, dest));
}

/* Generates code for PARTITION.  */

static void
generate_code_for_partition (struct loop *loop,
			     partition_t partition, bool copy_p)
{
  switch (partition->kind)
    {
    case PKIND_MEMSET:
      generate_memset_builtin (loop, partition);
      /* If this is the last partition for which we generate code, we have
	 to destroy the loop.  */
      if (!copy_p)
	destroy_loop (loop);
      break;

    case PKIND_MEMCPY:
      generate_memcpy_builtin (loop, partition);
      /* If this is the last partition for which we generate code, we have
	 to destroy the loop.  */
      if (!copy_p)
	destroy_loop (loop);
      break;

    case PKIND_REDUCTION:
      /* Reductions all have to be in the last partition.  */
      gcc_assert (!copy_p);
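      /* Fall through.  */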
    case PKIND_NORMAL:
      generate_loops_for_partition (loop, partition, copy_p);
      break;

    default:
      gcc_unreachable ();
    }
}


/* Returns true if the node V of RDG cannot be recomputed.  */

static bool
rdg_cannot_recompute_vertex_p (struct graph *rdg, int v)
{
  if (RDG_MEM_WRITE_STMT (rdg, v))
    return true;

  return false;
}

/* Returns true when the vertex V has already been generated in the
   current partition (V is in PROCESSED), or when V belongs to another
   partition and cannot be recomputed (V is not in REMAINING_STMTS).  */

static inline bool
already_processed_vertex_p (bitmap processed, int v)
{
  return (bitmap_bit_p (processed, v)
	  || !bitmap_bit_p (remaining_stmts, v));
}

/* Returns NULL when there is no anti-dependence or output-dependence
   among the successors of vertex V, otherwise returns the edge with the
   dependency.  */

static struct graph_edge *
has_anti_or_output_dependence (struct vertex *v)
{
  struct graph_edge *e;

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      if (RDGE_TYPE (e) == anti_dd
	  || RDGE_TYPE (e) == output_dd)
	return e;

  return NULL;
}

/* Returns true when V has a predecessor that is flagged in
   UPSTREAM_MEM_WRITES without being a memory write itself.  */

static bool
predecessor_has_mem_write (struct graph *rdg, struct vertex *v)
{
  struct graph_edge *e;

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      if (bitmap_bit_p (upstream_mem_writes, e->src)
	  /* Don't consider flow channels: a write to memory followed
	     by a read from memory.  These channels allow the RDG to
	     be split into different partitions.  */
	  && !RDG_MEM_WRITE_STMT (rdg, e->src))
	return true;

  return false;
}

/* Initializes the upstream_mem_writes bitmap following the
   information from RDG.  */

static void
mark_nodes_having_upstream_mem_writes (struct graph *rdg)
{
  int v, x;
  bitmap seen = BITMAP_ALLOC (NULL);

  for (v = rdg->n_vertices - 1; v >= 0; v--)
    if (!bitmap_bit_p (seen, v))
      {
	unsigned i;
	vec<int> nodes;
	nodes.create (3);

	graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

	FOR_EACH_VEC_ELT (nodes, i, x)
	  {
	    if (!bitmap_set_bit (seen, x))
	      continue;

	    if (RDG_MEM_WRITE_STMT (rdg, x)
		|| predecessor_has_mem_write (rdg, &(rdg->vertices[x]))
		/* In anti-dependences the read should occur before
		   the write, which is why both the read and the write
		   should be placed in the same partition.  In output
		   dependences the order of the writes must be
		   preserved.  */
		|| has_anti_or_output_dependence (&(rdg->vertices[x])))
	      bitmap_set_bit (upstream_mem_writes, x);
	  }

	nodes.release ();
      }

  BITMAP_FREE (seen);
}

/* Returns true when vertex U has a memory write node as a predecessor
   in RDG.  */

static bool
has_upstream_mem_writes (int u)
{
  return bitmap_bit_p (upstream_mem_writes, u);
}

static void rdg_flag_vertex_and_dependent (struct graph *, int, partition_t,
					   bitmap, bitmap);

/* Flag the uses of U, stopping the propagation based on the
   information in UPSTREAM_MEM_WRITES.  */

static void
rdg_flag_uses (struct graph *rdg, int u, partition_t partition, bitmap loops,
	       bitmap processed)
{
  use_operand_p use_p;
  struct vertex *x = &(rdg->vertices[u]);
  gimple stmt = RDGV_STMT (x);
  struct graph_edge *anti_dep = has_anti_or_output_dependence (x);

  /* Keep in the same partition the destination of an antidependence,
     because this is a store to the exact same location.  Putting this
     in another partition is bad for cache locality.  */
  if (anti_dep)
    {
      int v = anti_dep->dest;

      if (!already_processed_vertex_p (processed, v))
	rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
				       processed);
    }

  if (gimple_code (stmt) != GIMPLE_PHI)
    {
      if ((use_p = gimple_vuse_op (stmt)) != NULL_USE_OPERAND_P)
	{
	  tree use = USE_FROM_PTR (use_p);

	  if (TREE_CODE (use) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (use))
	    {
	      gimple def_stmt = SSA_NAME_DEF_STMT (use);
	      int v = rdg_vertex_for_stmt (rdg, def_stmt);

	      if (v >= 0
		  && !already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed);
	    }
	}
    }

  if (is_gimple_assign (stmt) && has_upstream_mem_writes (u))
    {
      tree op0 = gimple_assign_lhs (stmt);

      /* Scalar channels don't have enough space for transmitting data
	 between tasks, unless we add more storage by privatizing.  */
      if (is_gimple_reg (op0))
	{
	  use_operand_p use_p;
	  imm_use_iterator iter;

	  FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
	    {
	      int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));

	      if (!already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed);
	    }
	}
    }
}

/* Flag V from RDG as part of PARTITION, and also flag its loop number
   in LOOPS.  */

static void
rdg_flag_vertex (struct graph *rdg, int v, partition_t partition, bitmap loops)
{
  struct loop *loop;

  if (!bitmap_set_bit (partition->stmts, v))
    return;

  loop = loop_containing_stmt (RDG_STMT (rdg, v));
  bitmap_set_bit (loops, loop->num);

  if (rdg_cannot_recompute_vertex_p (rdg, v))
    {
      partition->has_writes = true;
      bitmap_clear_bit (remaining_stmts, v);
    }
}

/* Flag in the bitmap PARTITION the vertex V and all its predecessors.
   Also flag their loop number in LOOPS.  */

static void
rdg_flag_vertex_and_dependent (struct graph *rdg, int v, partition_t partition,
			       bitmap loops, bitmap processed)
{
  unsigned i;
  vec<int> nodes;
  nodes.create (3);
  int x;

  bitmap_set_bit (processed, v);
  rdg_flag_uses (rdg, v, partition, loops, processed);
  graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
  rdg_flag_vertex (rdg, v, partition, loops);

  FOR_EACH_VEC_ELT (nodes, i, x)
    if (!already_processed_vertex_p (processed, x))
      rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed);

  nodes.release ();
}

/* Initialize CONDS with all the condition statements from the basic
   blocks of LOOP.  */

static void
collect_condition_stmts (struct loop *loop, vec<gimple> *conds)
{
  unsigned i;
  edge e;
  vec<edge> exits = get_loop_exit_edges (loop);

  FOR_EACH_VEC_ELT (exits, i, e)
    {
      gimple cond = last_stmt (e->src);

      if (cond)
	conds->safe_push (cond);
    }

  exits.release ();
}

/* Add to PARTITION all the exit condition statements for LOOPS
   together with all their dependent statements determined from
   RDG.  */

static void
rdg_flag_loop_exits (struct graph *rdg, bitmap loops, partition_t partition,
		     bitmap processed)
{
  unsigned i;
  bitmap_iterator bi;
  vec<gimple> conds;
  conds.create (3);

  EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
    collect_condition_stmts (get_loop (i), &conds);

  while (!conds.is_empty ())
    {
      gimple cond = conds.pop ();
      int v = rdg_vertex_for_stmt (rdg, cond);
      bitmap new_loops = BITMAP_ALLOC (NULL);

      if (!already_processed_vertex_p (processed, v))
	rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed);

      EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
	if (bitmap_set_bit (loops, i))
	  collect_condition_stmts (get_loop (i), &conds);

      BITMAP_FREE (new_loops);
    }

  conds.release ();
}

/* Returns a partition in which all the statements needed for computing
   the strongly connected component C of the RDG are flagged, also
   including the loop exit conditions.  */

static partition_t
build_rdg_partition_for_component (struct graph *rdg, rdgc c)
{
  int i, v;
  partition_t partition = partition_alloc (NULL);
  bitmap loops = BITMAP_ALLOC (NULL);
  bitmap processed = BITMAP_ALLOC (NULL);

  FOR_EACH_VEC_ELT (c->vertices, i, v)
    if (!already_processed_vertex_p (processed, v))
      rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed);

  rdg_flag_loop_exits (rdg, loops, partition, processed);

  BITMAP_FREE (processed);
  BITMAP_FREE (loops);
  return partition;
}

/* Free memory for COMPONENTS.  */

static void
free_rdg_components (vec<rdgc> components)
{
  int i;
  rdgc x;

  FOR_EACH_VEC_ELT (components, i, x)
    {
      x->vertices.release ();
      free (x);
    }

  components.release ();
}

/* Build the COMPONENTS vector with the strongly connected components
   of RDG in which the STARTING_VERTICES occur.  */

static void
rdg_build_components (struct graph *rdg, vec<int> starting_vertices,
		      vec<rdgc> *components)
{
  int i, v;
  bitmap saved_components = BITMAP_ALLOC (NULL);
  int n_components = graphds_scc (rdg, NULL);
  /* ??? Macros cannot process template types with more than one
     argument, so we need this typedef.  */
  typedef vec<int> vec_int_heap;
  vec<int> *all_components = XNEWVEC (vec_int_heap, n_components);

  for (i = 0; i < n_components; i++)
    all_components[i].create (3);

  for (i = 0; i < rdg->n_vertices; i++)
    all_components[rdg->vertices[i].component].safe_push (i);

  FOR_EACH_VEC_ELT (starting_vertices, i, v)
    {
      int c = rdg->vertices[v].component;

      if (bitmap_set_bit (saved_components, c))
	{
	  rdgc x = XCNEW (struct rdg_component);
	  x->num = c;
	  x->vertices = all_components[c];

	  components->safe_push (x);
	}
    }

  for (i = 0; i < n_components; i++)
    if (!bitmap_bit_p (saved_components, i))
      all_components[i].release ();

  free (all_components);
  BITMAP_FREE (saved_components);
}

/* Classifies the builtin kind we can generate for PARTITION of RDG
   and LOOP.  For the moment we detect memset and memcpy patterns.  */

static void
classify_partition (loop_p loop, struct graph *rdg, partition_t partition)
{
  bitmap_iterator bi;
  unsigned i;
  tree nb_iter;
  data_reference_p single_load, single_store;
  bool volatiles_p = false;

  partition->kind = PKIND_NORMAL;
  partition->main_dr = NULL;
  partition->secondary_dr = NULL;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple stmt = RDG_STMT (rdg, i);

      if (gimple_has_volatile_ops (stmt))
	volatiles_p = true;

      /* If the stmt has uses outside of the loop, fail.
	 ???  If the stmt is generated in another partition that
	 is not created as a builtin we can ignore this.  */
      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "not generating builtin, partition has "
		     "scalar uses outside of the loop\n");
	  partition->kind = PKIND_REDUCTION;
	  return;
	}
    }

  /* Perform general partition disqualification for builtins.  */
  if (volatiles_p
      || !flag_tree_loop_distribute_patterns)
    return;

  nb_iter = number_of_exit_cond_executions (loop);
  if (!nb_iter || nb_iter == chrec_dont_know)
    return;

  /* Detect memset and memcpy.  */
  single_load = NULL;
  single_store = NULL;
  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple stmt = RDG_STMT (rdg, i);
      data_reference_p dr;
      unsigned j;

      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      /* Any scalar stmts are ok.  */
      if (!gimple_vuse (stmt))
	continue;

      /* Otherwise just regular loads/stores.  */
      if (!gimple_assign_single_p (stmt))
	return;

      /* But exactly one store and/or load.  */
      for (j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
	{
	  if (DR_IS_READ (dr))
	    {
	      if (single_load != NULL)
		return;
	      single_load = dr;
	    }
	  else
	    {
	      if (single_store != NULL)
		return;
	      single_store = dr;
	    }
	}
    }

  if (single_store && !single_load)
    {
      gimple stmt = DR_STMT (single_store);
      tree rhs = gimple_assign_rhs1 (stmt);
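      /* memset can only set every byte to the same value, so the
	 stored value must be zero, all-ones, or a single-byte value
	 whose precision matches its size.  */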
      if (!(integer_zerop (rhs)
	    || real_zerop (rhs)
	    || (TREE_CODE (rhs) == CONSTRUCTOR
		&& !TREE_CLOBBER_P (rhs))
	    || ((integer_all_onesp (rhs)
		 || (INTEGRAL_TYPE_P (TREE_TYPE (rhs))
		     && (TYPE_MODE (TREE_TYPE (rhs))
			 == TYPE_MODE (unsigned_char_type_node))))
		/* For stores of a non-zero value require that the precision
		   of the value matches its actual size.  */
		&& (TYPE_PRECISION (TREE_TYPE (rhs))
		    == GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (rhs)))))))
	return;
      if (TREE_CODE (rhs) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (rhs)
	  && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
	return;
      if (!adjacent_dr_p (single_store))
	return;
      partition->kind = PKIND_MEMSET;
      partition->main_dr = single_store;
    }
  else if (single_store && single_load)
    {
      gimple store = DR_STMT (single_store);
      gimple load = DR_STMT (single_load);
      /* Direct aggregate copy or via an SSA name temporary.  */
      if (load != store
	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
	return;
      if (!adjacent_dr_p (single_store)
	  || !adjacent_dr_p (single_load)
	  || !operand_equal_p (DR_STEP (single_store),
			       DR_STEP (single_load), 0))
	return;
      /* Now check that if there is a dependence this dependence is
         of a suitable form for memmove.  */
      vec<loop_p> loops = vNULL;
      ddr_p ddr;
      loops.safe_push (loop);
      ddr = initialize_data_dependence_relation (single_load, single_store,
						 loops);
      compute_affine_dependence (ddr, loop);
      if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	{
	  free_dependence_relation (ddr);
	  loops.release ();
	  return;
	}
      if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
	{
	  if (DDR_NUM_DIST_VECTS (ddr) == 0)
	    {
	      free_dependence_relation (ddr);
	      loops.release ();
	      return;
	    }
	  lambda_vector dist_v;
	  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
	    {
	      int dist = dist_v[index_in_loop_nest (loop->num,
						    DDR_LOOP_NEST (ddr))];
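	      /* A loop-carried dependence at a positive, non-reversed
		 distance is not of a form that a single block copy
		 can honor, so the partition cannot be generated as
		 memcpy/memmove.  */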
	      if (dist > 0 && !DDR_REVERSED_P (ddr))
		{
		  free_dependence_relation (ddr);
		  loops.release ();
		  return;
		}
	    }
	}
      free_dependence_relation (ddr);
      loops.release ();
      partition->kind = PKIND_MEMCPY;
      partition->main_dr = single_store;
      partition->secondary_dr = single_load;
    }
}

/* For a data reference DR, return the declaration of its base
   address or NULL_TREE if the base is not determined.  */

static tree
ref_base_address (data_reference_p dr)
{
  tree base_address = DR_BASE_ADDRESS (dr);
  if (base_address
      && TREE_CODE (base_address) == ADDR_EXPR)
    return TREE_OPERAND (base_address, 0);

  return base_address;
}

/* Returns true when PARTITION1 and PARTITION2 have similar memory
   accesses in RDG.  */

static bool
similar_memory_accesses (struct graph *rdg, partition_t partition1,
			 partition_t partition2)
{
  unsigned i, j, k, l;
  bitmap_iterator bi, bj;
  data_reference_p ref1, ref2;

  /* First check whether the intersection of the two partitions
     contains any loads or stores.  Common loads are the case that
     happens most often.  */
  EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
    if (RDG_MEM_WRITE_STMT (rdg, i)
	|| RDG_MEM_READS_STMT (rdg, i))
      return true;

  /* Then check all data-references against each other.  */
  EXECUTE_IF_SET_IN_BITMAP (partition1->stmts, 0, i, bi)
    if (RDG_MEM_WRITE_STMT (rdg, i)
	|| RDG_MEM_READS_STMT (rdg, i))
      EXECUTE_IF_SET_IN_BITMAP (partition2->stmts, 0, j, bj)
	if (RDG_MEM_WRITE_STMT (rdg, j)
	    || RDG_MEM_READS_STMT (rdg, j))
	  {
	    FOR_EACH_VEC_ELT (RDG_DATAREFS (rdg, i), k, ref1)
	      {
		tree base1 = ref_base_address (ref1);
		if (base1)
		  FOR_EACH_VEC_ELT (RDG_DATAREFS (rdg, j), l, ref2)
		    if (base1 == ref_base_address (ref2))
		      return true;
	      }
	  }

  return false;
}

/* Aggregate several components into a useful partition that is
   registered in the PARTITIONS vector.  Partitions will be
   distributed in different loops.  */

static void
rdg_build_partitions (struct graph *rdg, vec<rdgc> components,
		      vec<int> *other_stores,
		      vec<partition_t> *partitions, bitmap processed)
{
  int i;
  rdgc x;
  partition_t partition = partition_alloc (NULL);

  FOR_EACH_VEC_ELT (components, i, x)
    {
      partition_t np;
      int v = x->vertices[0];

      if (bitmap_bit_p (processed, v))
	continue;

      np = build_rdg_partition_for_component (rdg, x);
      bitmap_ior_into (partition->stmts, np->stmts);
      partition->has_writes = partition_has_writes (np);
      bitmap_ior_into (processed, np->stmts);
      partition_free (np);

      if (partition_has_writes (partition))
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "ldist useful partition:\n");
	      dump_bitmap (dump_file, partition->stmts);
	    }

	  partitions->safe_push (partition);
	  partition = partition_alloc (NULL);
	}
    }

  /* Add the nodes from the RDG that were not marked as processed, and
     that are used outside the current loop.  These are scalar
     computations that are not yet part of previous partitions.  */
  for (i = 0; i < rdg->n_vertices; i++)
    if (!bitmap_bit_p (processed, i)
	&& rdg_defs_used_in_other_loops_p (rdg, i))
      other_stores->safe_push (i);

  /* If there are still statements left in the OTHER_STORES array,
     create other components and partitions with these stores and
     their dependences.  */
  if (other_stores->length () > 0)
    {
      vec<rdgc> comps;
      comps.create (3);
      vec<int> foo;
      foo.create (3);

      rdg_build_components (rdg, *other_stores, &comps);
      rdg_build_partitions (rdg, comps, &foo, partitions, processed);

      foo.release ();
      free_rdg_components (comps);
    }

  /* If there is something left in the last partition, save it.  */
  if (bitmap_count_bits (partition->stmts) > 0)
    partitions->safe_push (partition);
  else
    partition_free (partition);
}

/* Dump to FILE the PARTITIONS.  */

static void
dump_rdg_partitions (FILE *file, vec<partition_t> partitions)
{
  int i;
  partition_t partition;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    debug_bitmap_file (file, partition->stmts);
}

/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (vec<partition_t> );

DEBUG_FUNCTION void
debug_rdg_partitions (vec<partition_t> partitions)
{
  dump_rdg_partitions (stderr, partitions);
}

/* Returns the number of read and write operations in the RDG.  */

static int
number_of_rw_in_rdg (struct graph *rdg)
{
  int i, res = 0;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns the number of read and write operations in a PARTITION of
   the RDG.  */

static int
number_of_rw_in_partition (struct graph *rdg, partition_t partition)
{
  int res = 0;
  unsigned i;
  bitmap_iterator ii;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
    {
      if (RDG_MEM_WRITE_STMT (rdg, i))
	++res;

      if (RDG_MEM_READS_STMT (rdg, i))
	++res;
    }

  return res;
}

/* Returns true when one of the PARTITIONS contains all the read or
   write operations of RDG.  */

static bool
partition_contains_all_rw (struct graph *rdg,
			   vec<partition_t> partitions)
{
  int i;
  partition_t partition;
  int nrw = number_of_rw_in_rdg (rdg);

  FOR_EACH_VEC_ELT (partitions, i, partition)
    if (nrw == number_of_rw_in_partition (rdg, partition))
      return true;

  return false;
}

/* Generate code from STARTING_VERTICES in RDG.  Returns the number of
   distributed loops.  */

static int
ldist_gen (struct loop *loop, struct graph *rdg,
	   vec<int> starting_vertices)
{
  int i, nbp;
  vec<rdgc> components;
  components.create (3);
  vec<partition_t> partitions;
  partitions.create (3);
  vec<int> other_stores;
  other_stores.create (3);
  partition_t partition;
  bitmap processed = BITMAP_ALLOC (NULL);
  bool any_builtin;

  remaining_stmts = BITMAP_ALLOC (NULL);
  upstream_mem_writes = BITMAP_ALLOC (NULL);

  for (i = 0; i < rdg->n_vertices; i++)
    {
      bitmap_set_bit (remaining_stmts, i);

      /* Save in OTHER_STORES all the memory writes that are not in
	 STARTING_VERTICES.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	{
	  int v;
	  unsigned j;
	  bool found = false;

	  FOR_EACH_VEC_ELT (starting_vertices, j, v)
	    if (i == v)
	      {
		found = true;
		break;
	      }

	  if (!found)
	    other_stores.safe_push (i);
	}
    }

  mark_nodes_having_upstream_mem_writes (rdg);
  rdg_build_components (rdg, starting_vertices, &components);
  rdg_build_partitions (rdg, components, &other_stores, &partitions,
			processed);
  BITMAP_FREE (processed);

  any_builtin = false;
  FOR_EACH_VEC_ELT (partitions, i, partition)
    {
      classify_partition (loop, rdg, partition);
      any_builtin |= partition_builtin_p (partition);
    }

  /* If we are only distributing patterns, fuse all partitions that
     were not properly classified as builtins.  Otherwise fuse
     partitions with similar memory accesses.  */
  if (!flag_tree_loop_distribution)
    {
      partition_t into;
      /* If we did not detect any builtin, simply bail out.  */
      if (!any_builtin)
	{
	  nbp = 0;
	  goto ldist_done;
	}
      /* Only fuse adjacent non-builtin partitions, see PR53616.
         ???  Use dependence information to improve partition ordering.  */
      i = 0;
      do
	{
	  for (; partitions.iterate (i, &into); ++i)
	    if (!partition_builtin_p (into))
	      break;
	  for (++i; partitions.iterate (i, &partition); ++i)
	    if (!partition_builtin_p (partition))
	      {
		bitmap_ior_into (into->stmts, partition->stmts);
		if (partition->kind == PKIND_REDUCTION)
		  into->kind = PKIND_REDUCTION;
		partitions.ordered_remove (i);
		partition_free (partition);
		i--;
	      }
	    else
	      break;
	}
      while ((unsigned) i < partitions.length ());
    }
  else
    {
      partition_t into;
      int j;
      for (i = 0; partitions.iterate (i, &into); ++i)
	{
	  if (partition_builtin_p (into))
	    continue;
	  for (j = i + 1;
	       partitions.iterate (j, &partition); ++j)
	    {
	      if (!partition_builtin_p (partition)
		  /* ???  The following is horribly inefficient,
		     we are re-computing and analyzing data-references
		     of the stmts in the partitions all the time.  */
		  && similar_memory_accesses (rdg, into, partition))
		{
		  if (dump_file && (dump_flags & TDF_DETAILS))
		    {
		      fprintf (dump_file, "fusing partitions\n");
		      dump_bitmap (dump_file, into->stmts);
		      dump_bitmap (dump_file, partition->stmts);
		      fprintf (dump_file, "because they have similar "
			       "memory accesses\n");
		    }
		  bitmap_ior_into (into->stmts, partition->stmts);
		  if (partition->kind == PKIND_REDUCTION)
		    into->kind = PKIND_REDUCTION;
		  partitions.ordered_remove (j);
		  partition_free (partition);
		  j--;
		}
	    }
	}
    }

  /* Fuse all reduction partitions into the last.  */
  if (partitions.length () > 1)
    {
      partition_t into = partitions.last ();
      for (i = partitions.length () - 2; i >= 0; --i)
	{
	  partition_t what = partitions[i];
	  if (what->kind == PKIND_REDUCTION)
	    {
	      if (dump_file && (dump_flags & TDF_DETAILS))
		{
		  fprintf (dump_file, "fusing partitions\n");
		  dump_bitmap (dump_file, into->stmts);
		  dump_bitmap (dump_file, what->stmts);
		  fprintf (dump_file, "because the latter has reductions\n");
		}
	      bitmap_ior_into (into->stmts, what->stmts);
	      into->kind = PKIND_REDUCTION;
	      partitions.ordered_remove (i);
	      partition_free (what);
	    }
	}
    }

  nbp = partitions.length ();
  if (nbp == 0
      || (nbp == 1 && !partition_builtin_p (partitions[0]))
      || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
    {
      nbp = 0;
      goto ldist_done;
    }

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg_partitions (dump_file, partitions);

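  /* Generate a new loop for each partition except the last; the last
     partition reuses (or, for a builtin, destroys) the original
     loop.  */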
  FOR_EACH_VEC_ELT (partitions, i, partition)
    generate_code_for_partition (loop, partition, i < nbp - 1);

 ldist_done:

  BITMAP_FREE (remaining_stmts);
  BITMAP_FREE (upstream_mem_writes);

  FOR_EACH_VEC_ELT (partitions, i, partition)
    partition_free (partition);

  other_stores.release ();
  partitions.release ();
  free_rdg_components (components);
  return nbp;
}

/* Distributes the code from LOOP in such a way that producer
   statements are placed before consumer statements.  When STMTS is
   NULL, this performs a maximal distribution; otherwise it tries to
   separate only the given statements from the LOOP's body.
   Returns the number of distributed loops.  */

static int
distribute_loop (struct loop *loop, vec<gimple> stmts)
{
  int res = 0;
  struct graph *rdg;
  gimple s;
  unsigned i;
  vec<int> vertices;
  vec<ddr_p> dependence_relations;
  vec<data_reference_p> datarefs;
  vec<loop_p> loop_nest;

  datarefs.create (10);
  dependence_relations.create (100);
  loop_nest.create (3);
  rdg = build_rdg (loop, &loop_nest, &dependence_relations, &datarefs);

  if (!rdg)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "FIXME: Loop %d not distributed: failed to build the RDG.\n",
		 loop->num);

      free_dependence_relations (dependence_relations);
      free_data_refs (datarefs);
      loop_nest.release ();
      return res;
    }

  vertices.create (3);

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg (dump_file, rdg);

  FOR_EACH_VEC_ELT (stmts, i, s)
    {
      int v = rdg_vertex_for_stmt (rdg, s);

      if (v >= 0)
	{
	  vertices.safe_push (v);

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file,
		     "ldist asked to generate code for vertex %d\n", v);
	}
    }

  res = ldist_gen (loop, rdg, vertices);
  vertices.release ();
  free_rdg (rdg);
  free_dependence_relations (dependence_relations);
  free_data_refs (datarefs);
  loop_nest.release ();
  return res;
}

/* Distribute all loops in the current function.  */

static unsigned int
tree_loop_distribution (void)
{
  struct loop *loop;
  loop_iterator li;
  bool changed = false;
  basic_block bb;

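  /* Mark all statements as not belonging to an RDG vertex yet; the
     RDG construction records vertex numbers in the gimple UIDs and
     rdg_vertex_for_stmt reads them back.  */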
  FOR_ALL_BB (bb)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
    }

  /* We can at the moment only distribute non-nested loops, thus restrict
     walking to innermost loops.  */
  FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
    {
      vec<gimple> work_list = vNULL;
      basic_block *bbs;
      int num = loop->num;
      int nb_generated_loops = 0;
      unsigned int i;

      /* If the loop doesn't have a single exit we will fail anyway,
	 so check for that early.  */
      if (!single_exit (loop))
	continue;

      /* Only optimize hot loops.  */
      if (!optimize_loop_for_speed_p (loop))
	continue;

      /* Only distribute loops with a header and latch for now.  */
      if (loop->num_nodes > 2)
	continue;

      /* Initialize the worklist with stmts we seed the partitions with.  */
      bbs = get_loop_body_in_dom_order (loop);
      for (i = 0; i < loop->num_nodes; ++i)
	{
	  gimple_stmt_iterator gsi;
	  for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
	    {
	      gimple stmt = gsi_stmt (gsi);
	      /* Distribute stmts which have defs that are used outside of
	         the loop.  */
	      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
		;
	      /* Otherwise only distribute stores for now.  */
	      else if (!gimple_assign_single_p (stmt)
		       || is_gimple_reg (gimple_assign_lhs (stmt)))
		continue;

	      work_list.safe_push (stmt);
	    }
	}
      free (bbs);

      if (work_list.length () > 0)
	nb_generated_loops = distribute_loop (loop, work_list);

      if (nb_generated_loops > 0)
	changed = true;

      if (dump_file && (dump_flags & TDF_DETAILS))
	{
	  if (nb_generated_loops > 1)
	    fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
		     num, nb_generated_loops);
	  else
	    fprintf (dump_file, "Loop %d is the same.\n", num);
	}

      work_list.release ();
    }

  if (changed)
    {
      mark_virtual_operands_for_renaming (cfun);
      rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
    }

#ifdef ENABLE_CHECKING
  verify_loop_structure ();
#endif

  return 0;
}

static bool
gate_tree_loop_distribution (void)
{
  return flag_tree_loop_distribution
    || flag_tree_loop_distribute_patterns;
}

struct gimple_opt_pass pass_loop_distribution =
{
 {
  GIMPLE_PASS,
  "ldist",			/* name */
  OPTGROUP_LOOP,                /* optinfo_flags */
  gate_tree_loop_distribution,  /* gate */
  tree_loop_distribution,       /* execute */
  NULL,				/* sub */
  NULL,				/* next */
  0,				/* static_pass_number */
  TV_TREE_LOOP_DISTRIBUTION,    /* tv_id */
  PROP_cfg | PROP_ssa,		/* properties_required */
  0,				/* properties_provided */
  0,				/* properties_destroyed */
  0,				/* todo_flags_start */
  TODO_ggc_collect
  | TODO_verify_ssa             /* todo_flags_finish */
 }
};