xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-loop-distribution.c (revision b7b7574d3bf8eeb51a1fa3977b59142ec6434a55)
1 /* Loop distribution.
2    Copyright (C) 2006, 2007, 2008, 2009, 2010
3    Free Software Foundation, Inc.
4    Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
5    and Sebastian Pop <sebastian.pop@amd.com>.
6 
7 This file is part of GCC.
8 
9 GCC is free software; you can redistribute it and/or modify it
10 under the terms of the GNU General Public License as published by the
11 Free Software Foundation; either version 3, or (at your option) any
12 later version.
13 
14 GCC is distributed in the hope that it will be useful, but WITHOUT
15 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
17 for more details.
18 
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3.  If not see
21 <http://www.gnu.org/licenses/>.  */
22 
23 /* This pass performs loop distribution: for example, the loop
24 
25    |DO I = 2, N
26    |    A(I) = B(I) + C
27    |    D(I) = A(I-1)*E
28    |ENDDO
29 
30    is transformed to
31 
32    |DOALL I = 2, N
33    |   A(I) = B(I) + C
34    |ENDDO
35    |
36    |DOALL I = 2, N
37    |   D(I) = A(I-1)*E
38    |ENDDO
39 
40    This pass uses an RDG, Reduced Dependence Graph built on top of the
41    data dependence relations.  The RDG is then topologically sorted to
42    obtain a map of information producers/consumers based on which it
43    generates the new loops.  */
44 
45 #include "config.h"
46 #include "system.h"
47 #include "coretypes.h"
48 #include "tm.h"
49 #include "ggc.h"
50 #include "tree.h"
51 #include "target.h"
52 
53 #include "rtl.h"
54 #include "basic-block.h"
55 #include "diagnostic.h"
56 #include "tree-flow.h"
57 #include "tree-dump.h"
58 #include "timevar.h"
59 #include "cfgloop.h"
60 #include "expr.h"
61 #include "optabs.h"
62 #include "tree-chrec.h"
63 #include "tree-data-ref.h"
64 #include "tree-scalar-evolution.h"
65 #include "tree-pass.h"
66 #include "lambda.h"
67 #include "langhooks.h"
68 #include "tree-vectorizer.h"
69 
/* If bit I is not set, it means that this node represents an
   operation that has already been performed, and that should not be
   performed again.  This is the subgraph of remaining important
   computations that is passed to the DFS algorithm to avoid
   including the same stores several times in different loops.  */
static bitmap remaining_stmts;

/* A node of the RDG is marked in this bitmap when it has as a
   predecessor a node that writes to memory.  */
static bitmap upstream_mem_writes;
80 
/* Update the PHI nodes of NEW_LOOP.  NEW_LOOP is a duplicate of
   ORIG_LOOP, so the headers of the two loops contain the same phi
   nodes in the same order.  */

static void
update_phis_for_loop_copy (struct loop *orig_loop, struct loop *new_loop)
{
  tree new_ssa_name;
  gimple_stmt_iterator si_new, si_orig;
  edge orig_loop_latch = loop_latch_edge (orig_loop);
  edge orig_entry_e = loop_preheader_edge (orig_loop);
  edge new_loop_entry_e = loop_preheader_edge (new_loop);

  /* Scan the phis in the headers of the old and new loops
     (they are organized in exactly the same order).  */
  for (si_new = gsi_start_phis (new_loop->header),
       si_orig = gsi_start_phis (orig_loop->header);
       !gsi_end_p (si_new) && !gsi_end_p (si_orig);
       gsi_next (&si_new), gsi_next (&si_orig))
    {
      tree def;
      source_location locus;
      gimple phi_new = gsi_stmt (si_new);
      gimple phi_orig = gsi_stmt (si_orig);

      /* Add the first phi argument for the phi in NEW_LOOP (the one
	 associated with the entry of NEW_LOOP).  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_entry_e);
      locus = gimple_phi_arg_location_from_edge (phi_orig, orig_entry_e);
      add_phi_arg (phi_new, def, new_loop_entry_e, locus);

      /* Add the second phi argument for the phi in NEW_LOOP (the one
	 associated with the latch of NEW_LOOP).  */
      def = PHI_ARG_DEF_FROM_EDGE (phi_orig, orig_loop_latch);
      locus = gimple_phi_arg_location_from_edge (phi_orig, orig_loop_latch);

      if (TREE_CODE (def) == SSA_NAME)
	{
	  new_ssa_name = get_current_def (def);

	  if (!new_ssa_name)
	    /* This only happens if there are no definitions inside the
	       loop.  Use the invariant in the new loop as is.  */
	    new_ssa_name = def;
	}
      else
	/* Could be an integer.  */
	new_ssa_name = def;

      add_phi_arg (phi_new, new_ssa_name, loop_latch_edge (new_loop), locus);
    }
}
132 
133 /* Return a copy of LOOP placed before LOOP.  */
134 
135 static struct loop *
136 copy_loop_before (struct loop *loop)
137 {
138   struct loop *res;
139   edge preheader = loop_preheader_edge (loop);
140 
141   if (!single_exit (loop))
142     return NULL;
143 
144   initialize_original_copy_tables ();
145   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, preheader);
146   free_original_copy_tables ();
147 
148   if (!res)
149     return NULL;
150 
151   update_phis_for_loop_copy (loop, res);
152   rename_variables_in_loop (res);
153 
154   return res;
155 }
156 
157 /* Creates an empty basic block after LOOP.  */
158 
159 static void
160 create_bb_after_loop (struct loop *loop)
161 {
162   edge exit = single_exit (loop);
163 
164   if (!exit)
165     return;
166 
167   split_edge (exit);
168 }
169 
/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  Returns true when
   the code gen succeeded. */

static bool
generate_loops_for_partition (struct loop *loop, bitmap partition, bool copy_p)
{
  unsigned i, x;
  gimple_stmt_iterator bsi;
  basic_block *bbs;

  if (copy_p)
    {
      /* Duplicate LOOP before itself; the copy, not the original,
	 is the one pruned below.  */
      loop = copy_loop_before (loop);
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }

  /* copy_loop_before returns NULL when LOOP has no single exit.  */
  if (loop == NULL)
    return false;

  /* Remove stmts not in the PARTITION bitmap.  The order in which we
     visit the phi nodes and the statements is exactly as in
     stmts_from_loop, so the running index X stays in sync with the
     RDG vertex numbering used when PARTITION was built.  */
  bbs = get_loop_body_in_dom_order (loop);

  for (x = 0, i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* First the phi nodes of BB: a removed virtual phi needs its
	 remaining uses marked for renaming by a later SSA update.  */
      for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	if (!bitmap_bit_p (partition, x++))
	  {
	    gimple phi = gsi_stmt (bsi);
	    if (!is_gimple_reg (gimple_phi_result (phi)))
	      mark_virtual_phi_result_for_renaming (phi);
	    remove_phi_node (&bsi, true);
	  }
	else
	  gsi_next (&bsi);

      /* Then the regular statements of BB.  Labels do not consume an
	 index: they are not numbered in the RDG.  */
      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple stmt = gsi_stmt (bsi);
	  if (gimple_code (gsi_stmt (bsi)) != GIMPLE_LABEL
	      && !bitmap_bit_p (partition, x++))
	    {
	      /* Drop the statement together with its virtual def and
		 any now-dead SSA names it defined.  */
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&bsi, true);
	      release_defs (stmt);
	    }
	  else
	    gsi_next (&bsi);
	}
    }

  free (bbs);
  return true;
}
232 
233 /* Build the size argument for a memset call.  */
234 
235 static inline tree
236 build_size_arg_loc (location_t loc, tree nb_iter, tree op,
237 		    gimple_seq *stmt_list)
238 {
239   gimple_seq stmts;
240   tree x;
241 
242   x = fold_build2_loc (loc, MULT_EXPR, size_type_node,
243 		       fold_convert_loc (loc, size_type_node, nb_iter),
244 		       fold_convert_loc (loc, size_type_node,
245 					 TYPE_SIZE_UNIT (TREE_TYPE (op))));
246   x = force_gimple_operand (x, &stmts, true, NULL);
247   gimple_seq_add_seq (stmt_list, stmts);
248 
249   return x;
250 }
251 
/* Generate a call to memset zeroing the memory written by STMT, which
   stores to OP0 once per iteration, NB_ITER times.  The call is
   inserted after iterator BSI.  (Note: despite the old comment, this
   function returns nothing; the caller has already validated the
   store pattern.)  */

static void
generate_memset_zero (gimple stmt, tree op0, tree nb_iter,
		      gimple_stmt_iterator bsi)
{
  tree addr_base, nb_bytes;
  bool res = false;
  gimple_seq stmt_list = NULL, stmts;
  gimple fn_call;
  tree mem, fn;
  struct data_reference *dr = XCNEW (struct data_reference);
  location_t loc = gimple_location (stmt);

  /* Analyze OP0 to obtain its base address, offset, init and step.
     The caller guarantees the access has a unit-type stride.  */
  DR_STMT (dr) = stmt;
  DR_REF (dr) = op0;
  res = dr_analyze_innermost (dr);
  gcc_assert (res && stride_of_unit_type_p (DR_STEP (dr), TREE_TYPE (op0)));

  /* Bytes to clear: NB_ITER * sizeof (*OP0).  */
  nb_bytes = build_size_arg_loc (loc, nb_iter, op0, &stmt_list);
  addr_base = size_binop_loc (loc, PLUS_EXPR, DR_OFFSET (dr), DR_INIT (dr));
  addr_base = fold_convert_loc (loc, sizetype, addr_base);

  /* Test for a negative stride, iterating over every element.  In
     that case the loop walks backwards and the lowest written address
     is ADDR_BASE - NB_BYTES + sizeof (*OP0).  */
  if (integer_zerop (size_binop (PLUS_EXPR,
				 TYPE_SIZE_UNIT (TREE_TYPE (op0)),
				 fold_convert (sizetype, DR_STEP (dr)))))
    {
      addr_base = size_binop_loc (loc, MINUS_EXPR, addr_base,
				  fold_convert_loc (loc, sizetype, nb_bytes));
      addr_base = size_binop_loc (loc, PLUS_EXPR, addr_base,
				  TYPE_SIZE_UNIT (TREE_TYPE (op0)));
    }

  /* Build and gimplify the start address of the region.  */
  addr_base = fold_build2_loc (loc, POINTER_PLUS_EXPR,
			       TREE_TYPE (DR_BASE_ADDRESS (dr)),
			       DR_BASE_ADDRESS (dr), addr_base);
  mem = force_gimple_operand (addr_base, &stmts, true, NULL);
  gimple_seq_add_seq (&stmt_list, stmts);

  /* Emit memset (MEM, 0, NB_BYTES) after BSI.  */
  fn = build_fold_addr_expr (implicit_built_in_decls [BUILT_IN_MEMSET]);
  fn_call = gimple_build_call (fn, 3, mem, integer_zero_node, nb_bytes);
  gimple_seq_add_stmt (&stmt_list, fn_call);
  gsi_insert_seq_after (&bsi, stmt_list, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "generated memset zero\n");

  free_data_ref (dr);
}
302 
/* Tries to generate a builtin function for the instructions of LOOP
   pointed to by the bits set in PARTITION.  Returns true when the
   operation succeeded.  Currently the only recognized pattern is a
   single adjacent zero-store, replaced by a call to memset.  When
   COPY_P is false this was the last partition, and LOOP itself is
   destroyed after the builtin is emitted.  */

static bool
generate_builtin (struct loop *loop, bitmap partition, bool copy_p)
{
  bool res = false;
  unsigned i, x = 0;
  basic_block *bbs;
  gimple write = NULL;
  gimple_stmt_iterator bsi;
  tree nb_iter = number_of_exit_cond_executions (loop);

  /* The memset length depends on the iteration count; give up when
     it cannot be determined.  */
  if (!nb_iter || nb_iter == chrec_dont_know)
    return false;

  bbs = get_loop_body_in_dom_order (loop);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* Keep X in sync with the RDG vertex numbering: phis of a block
	 are numbered before its statements.  */
      for (bsi = gsi_start_phis (bb); !gsi_end_p (bsi); gsi_next (&bsi))
	x++;

      for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
	{
	  gimple stmt = gsi_stmt (bsi);

	  /* A statement of the partition that writes to memory: an
	     assignment whose lhs is not a scalar register.  */
	  if (bitmap_bit_p (partition, x++)
	      && is_gimple_assign (stmt)
	      && !is_gimple_reg (gimple_assign_lhs (stmt)))
	    {
	      /* Don't generate the builtins when there are more than
		 one memory write.  */
	      if (write != NULL)
		goto end;

	      write = stmt;
	      /* For a write in the latch, use the latch execution
		 count rather than the exit-condition count.  */
	      if (bb == loop->latch)
		nb_iter = number_of_latch_executions (loop);
	    }
	}
    }

  /* Only "A[i] = 0" stores to adjacent locations are handled.
     NOTE(review): WRITE is NULL when the partition contains no memory
     write; this relies on stmt_with_adjacent_zero_store_dr_p
     rejecting a NULL statement -- verify in tree-data-ref.c.  */
  if (!stmt_with_adjacent_zero_store_dr_p (write))
    goto end;

  /* The new statements will be placed before LOOP.  */
  bsi = gsi_last_bb (loop_preheader_edge (loop)->src);
  generate_memset_zero (write, gimple_assign_lhs (write), nb_iter, bsi);
  res = true;

  /* If this is the last partition for which we generate code, we have
     to destroy the loop.  */
  if (!copy_p)
    {
      unsigned nbbs = loop->num_nodes;
      edge exit = single_exit (loop);
      basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
      /* Route the preheader's predecessor block straight to the exit
	 destination, then delete the loop body.  */
      redirect_edge_pred (exit, src);
      exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
      exit->flags |= EDGE_FALLTHRU;
      cancel_loop_tree (loop);
      rescan_loop_exit (exit, false, true);

      for (i = 0; i < nbbs; i++)
	delete_basic_block (bbs[i]);

      set_immediate_dominator (CDI_DOMINATORS, dest,
			       recompute_dominator (CDI_DOMINATORS, dest));
    }

 end:
  free (bbs);
  return res;
}
381 
382 /* Generates code for PARTITION.  For simple loops, this function can
383    generate a built-in.  */
384 
385 static bool
386 generate_code_for_partition (struct loop *loop, bitmap partition, bool copy_p)
387 {
388   if (generate_builtin (loop, partition, copy_p))
389     return true;
390 
391   return generate_loops_for_partition (loop, partition, copy_p);
392 }
393 
394 
395 /* Returns true if the node V of RDG cannot be recomputed.  */
396 
397 static bool
398 rdg_cannot_recompute_vertex_p (struct graph *rdg, int v)
399 {
400   if (RDG_MEM_WRITE_STMT (rdg, v))
401     return true;
402 
403   return false;
404 }
405 
406 /* Returns true when the vertex V has already been generated in the
407    current partition (V is in PROCESSED), or when V belongs to another
408    partition and cannot be recomputed (V is not in REMAINING_STMTS).  */
409 
410 static inline bool
411 already_processed_vertex_p (bitmap processed, int v)
412 {
413   return (bitmap_bit_p (processed, v)
414 	  || !bitmap_bit_p (remaining_stmts, v));
415 }
416 
417 /* Returns NULL when there is no anti-dependence among the successors
418    of vertex V, otherwise returns the edge with the anti-dep.  */
419 
420 static struct graph_edge *
421 has_anti_dependence (struct vertex *v)
422 {
423   struct graph_edge *e;
424 
425   if (v->succ)
426     for (e = v->succ; e; e = e->succ_next)
427       if (RDGE_TYPE (e) == anti_dd)
428 	return e;
429 
430   return NULL;
431 }
432 
/* Returns true when V has a predecessor that is marked in
   UPSTREAM_MEM_WRITES and is not itself a memory write.  (The old
   comment here described has_anti_dependence, not this function.)  */

static bool
predecessor_has_mem_write (struct graph *rdg, struct vertex *v)
{
  struct graph_edge *e;

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      if (bitmap_bit_p (upstream_mem_writes, e->src)
	  /* Don't consider flow channels: a write to memory followed
	     by a read from memory.  These channels allow the split of
	     the RDG in different partitions.  */
	  && !RDG_MEM_WRITE_STMT (rdg, e->src))
	return true;

  return false;
}
451 
/* Initializes the upstream_mem_writes bitmap following the
   information from RDG: bit X gets set when vertex X is a memory
   write, has a predecessor flagged as having upstream writes, or has
   an anti-dependence among its successors.  */

static void
mark_nodes_having_upstream_mem_writes (struct graph *rdg)
{
  int v, x;
  bitmap seen = BITMAP_ALLOC (NULL);

  /* Visit the vertices from the last to the first; SEEN guarantees
     every vertex is classified exactly once even though the DFS below
     can reach a vertex from several roots.  */
  for (v = rdg->n_vertices - 1; v >= 0; v--)
    if (!bitmap_bit_p (seen, v))
      {
	unsigned i;
	VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);

	/* Collect into NODES every vertex reachable from V.  */
	graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

	for (i = 0; VEC_iterate (int, nodes, i, x); i++)
	  {
	    if (bitmap_bit_p (seen, x))
	      continue;

	    bitmap_set_bit (seen, x);

	    if (RDG_MEM_WRITE_STMT (rdg, x)
		|| predecessor_has_mem_write (rdg, &(rdg->vertices[x]))
		/* In anti dependences the read should occur before
		   the write, this is why both the read and the write
		   should be placed in the same partition.  */
		|| has_anti_dependence (&(rdg->vertices[x])))
	      {
		bitmap_set_bit (upstream_mem_writes, x);
	      }
	  }

	VEC_free (int, heap, nodes);
      }
}
490 
491 /* Returns true when vertex u has a memory write node as a predecessor
492    in RDG.  */
493 
494 static bool
495 has_upstream_mem_writes (int u)
496 {
497   return bitmap_bit_p (upstream_mem_writes, u);
498 }
499 
static void rdg_flag_vertex_and_dependent (struct graph *, int, bitmap, bitmap,
					   bitmap, bool *);

/* Flag the uses of U stopping following the information from
   upstream_mem_writes.  Pulls into the current PARTITION the
   destination of any anti-dependence, the definition of U's virtual
   use, and -- when U is an assignment downstream of a memory write --
   every statement using its scalar result.  */

static void
rdg_flag_uses (struct graph *rdg, int u, bitmap partition, bitmap loops,
	       bitmap processed, bool *part_has_writes)
{
  use_operand_p use_p;
  struct vertex *x = &(rdg->vertices[u]);
  gimple stmt = RDGV_STMT (x);
  struct graph_edge *anti_dep = has_anti_dependence (x);

  /* Keep in the same partition the destination of an antidependence,
     because this is a store to the exact same location.  Putting this
     in another partition is bad for cache locality.  */
  if (anti_dep)
    {
      int v = anti_dep->dest;

      if (!already_processed_vertex_p (processed, v))
	rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
				       processed, part_has_writes);
    }

  if (gimple_code (stmt) != GIMPLE_PHI)
    {
      /* Follow the virtual use of STMT, if any: the defining
	 statement must end up in the same partition.  */
      if ((use_p = gimple_vuse_op (stmt)) != NULL_USE_OPERAND_P)
	{
	  tree use = USE_FROM_PTR (use_p);

	  if (TREE_CODE (use) == SSA_NAME)
	    {
	      gimple def_stmt = SSA_NAME_DEF_STMT (use);
	      int v = rdg_vertex_for_stmt (rdg, def_stmt);

	      /* rdg_vertex_for_stmt returns a negative value when
		 DEF_STMT is not part of the RDG.  */
	      if (v >= 0
		  && !already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed, part_has_writes);
	    }
	}
    }

  if (is_gimple_assign (stmt) && has_upstream_mem_writes (u))
    {
      tree op0 = gimple_assign_lhs (stmt);

      /* Scalar channels don't have enough space for transmitting data
	 between tasks, unless we add more storage by privatizing.  */
      if (is_gimple_reg (op0))
	{
	  use_operand_p use_p;
	  imm_use_iterator iter;

	  /* Pull every statement that uses OP0 into the partition.  */
	  FOR_EACH_IMM_USE_FAST (use_p, iter, op0)
	    {
	      int v = rdg_vertex_for_stmt (rdg, USE_STMT (use_p));

	      if (!already_processed_vertex_p (processed, v))
		rdg_flag_vertex_and_dependent (rdg, v, partition, loops,
					       processed, part_has_writes);
	    }
	}
    }
}
568 
569 /* Flag V from RDG as part of PARTITION, and also flag its loop number
570    in LOOPS.  */
571 
572 static void
573 rdg_flag_vertex (struct graph *rdg, int v, bitmap partition, bitmap loops,
574 		 bool *part_has_writes)
575 {
576   struct loop *loop;
577 
578   if (bitmap_bit_p (partition, v))
579     return;
580 
581   loop = loop_containing_stmt (RDG_STMT (rdg, v));
582   bitmap_set_bit (loops, loop->num);
583   bitmap_set_bit (partition, v);
584 
585   if (rdg_cannot_recompute_vertex_p (rdg, v))
586     {
587       *part_has_writes = true;
588       bitmap_clear_bit (remaining_stmts, v);
589     }
590 }
591 
/* Flag in the bitmap PARTITION the vertex V and all its predecessors.
   Also flag their loop number in LOOPS.  PROCESSED records the
   vertices already visited, cutting the mutual recursion with
   rdg_flag_uses.  */

static void
rdg_flag_vertex_and_dependent (struct graph *rdg, int v, bitmap partition,
			       bitmap loops, bitmap processed,
			       bool *part_has_writes)
{
  unsigned i;
  VEC (int, heap) *nodes = VEC_alloc (int, heap, 3);
  int x;

  /* Mark V before flagging its uses: rdg_flag_uses may recurse back
     into this function.  */
  bitmap_set_bit (processed, v);
  rdg_flag_uses (rdg, v, partition, loops, processed, part_has_writes);
  /* Collect the vertices reachable from V, restricted to the
     statements still remaining to be distributed.  */
  graphds_dfs (rdg, &v, 1, &nodes, false, remaining_stmts);
  rdg_flag_vertex (rdg, v, partition, loops, part_has_writes);

  for (i = 0; VEC_iterate (int, nodes, i, x); i++)
    if (!already_processed_vertex_p (processed, x))
      rdg_flag_vertex_and_dependent (rdg, x, partition, loops, processed,
				     part_has_writes);

  VEC_free (int, heap, nodes);
}
616 
617 /* Initialize CONDS with all the condition statements from the basic
618    blocks of LOOP.  */
619 
620 static void
621 collect_condition_stmts (struct loop *loop, VEC (gimple, heap) **conds)
622 {
623   unsigned i;
624   edge e;
625   VEC (edge, heap) *exits = get_loop_exit_edges (loop);
626 
627   for (i = 0; VEC_iterate (edge, exits, i, e); i++)
628     {
629       gimple cond = last_stmt (e->src);
630 
631       if (cond)
632 	VEC_safe_push (gimple, heap, *conds, cond);
633     }
634 
635   VEC_free (edge, heap, exits);
636 }
637 
638 /* Add to PARTITION all the exit condition statements for LOOPS
639    together with all their dependent statements determined from
640    RDG.  */
641 
642 static void
643 rdg_flag_loop_exits (struct graph *rdg, bitmap loops, bitmap partition,
644 		     bitmap processed, bool *part_has_writes)
645 {
646   unsigned i;
647   bitmap_iterator bi;
648   VEC (gimple, heap) *conds = VEC_alloc (gimple, heap, 3);
649 
650   EXECUTE_IF_SET_IN_BITMAP (loops, 0, i, bi)
651     collect_condition_stmts (get_loop (i), &conds);
652 
653   while (!VEC_empty (gimple, conds))
654     {
655       gimple cond = VEC_pop (gimple, conds);
656       int v = rdg_vertex_for_stmt (rdg, cond);
657       bitmap new_loops = BITMAP_ALLOC (NULL);
658 
659       if (!already_processed_vertex_p (processed, v))
660 	rdg_flag_vertex_and_dependent (rdg, v, partition, new_loops, processed,
661 				       part_has_writes);
662 
663       EXECUTE_IF_SET_IN_BITMAP (new_loops, 0, i, bi)
664 	if (!bitmap_bit_p (loops, i))
665 	  {
666 	    bitmap_set_bit (loops, i);
667 	    collect_condition_stmts (get_loop (i), &conds);
668 	  }
669 
670       BITMAP_FREE (new_loops);
671     }
672 }
673 
/* Returns a bitmap in which all the statements needed for computing
   the strongly connected component C of the RDG are flagged, also
   including the loop exit conditions.  The returned bitmap is
   allocated here; the caller is responsible for freeing it.  */

static bitmap
build_rdg_partition_for_component (struct graph *rdg, rdgc c,
				   bool *part_has_writes)
{
  int i, v;
  bitmap partition = BITMAP_ALLOC (NULL);
  bitmap loops = BITMAP_ALLOC (NULL);
  bitmap processed = BITMAP_ALLOC (NULL);

  /* Pull in every vertex of the component together with everything
     it depends on; LOOPS records which loops were touched.  */
  for (i = 0; VEC_iterate (int, c->vertices, i, v); i++)
    if (!already_processed_vertex_p (processed, v))
      rdg_flag_vertex_and_dependent (rdg, v, partition, loops, processed,
				     part_has_writes);

  /* The exit conditions of all the touched loops belong to the
     partition as well.  */
  rdg_flag_loop_exits (rdg, loops, partition, processed, part_has_writes);

  BITMAP_FREE (processed);
  BITMAP_FREE (loops);
  return partition;
}
698 
699 /* Free memory for COMPONENTS.  */
700 
701 static void
702 free_rdg_components (VEC (rdgc, heap) *components)
703 {
704   int i;
705   rdgc x;
706 
707   for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
708     {
709       VEC_free (int, heap, x->vertices);
710       free (x);
711     }
712 }
713 
/* Build the COMPONENTS vector with the strongly connected components
   of RDG in which the STARTING_VERTICES occur.  Ownership of the
   per-component vertex vectors is transferred to the rdg_component
   structs pushed on *COMPONENTS; vectors of unused components are
   freed here.  */

static void
rdg_build_components (struct graph *rdg, VEC (int, heap) *starting_vertices,
		      VEC (rdgc, heap) **components)
{
  int i, v;
  bitmap saved_components = BITMAP_ALLOC (NULL);
  /* graphds_scc annotates each vertex with its component number.  */
  int n_components = graphds_scc (rdg, NULL);
  VEC (int, heap) **all_components = XNEWVEC (VEC (int, heap) *, n_components);

  for (i = 0; i < n_components; i++)
    all_components[i] = VEC_alloc (int, heap, 3);

  /* Bucket every vertex into its component's vector.  */
  for (i = 0; i < rdg->n_vertices; i++)
    VEC_safe_push (int, heap, all_components[rdg->vertices[i].component], i);

  /* Keep only the components containing a starting vertex, each one
     at most once (SAVED_COMPONENTS tracks those already pushed).  */
  for (i = 0; VEC_iterate (int, starting_vertices, i, v); i++)
    {
      int c = rdg->vertices[v].component;

      if (!bitmap_bit_p (saved_components, c))
	{
	  rdgc x = XCNEW (struct rdg_component);
	  x->num = c;
	  x->vertices = all_components[c];

	  VEC_safe_push (rdgc, heap, *components, x);
	  bitmap_set_bit (saved_components, c);
	}
    }

  /* Free the vertex vectors of the components that were not saved;
     the saved ones are now owned by *COMPONENTS.  */
  for (i = 0; i < n_components; i++)
    if (!bitmap_bit_p (saved_components, i))
      VEC_free (int, heap, all_components[i]);

  free (all_components);
  BITMAP_FREE (saved_components);
}
754 
755 /* Returns true when it is possible to generate a builtin pattern for
756    the PARTITION of RDG.  For the moment we detect only the memset
757    zero pattern.  */
758 
759 static bool
760 can_generate_builtin (struct graph *rdg, bitmap partition)
761 {
762   unsigned i;
763   bitmap_iterator bi;
764   int nb_reads = 0;
765   int nb_writes = 0;
766   int stores_zero = 0;
767 
768   EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, bi)
769     if (RDG_MEM_READS_STMT (rdg, i))
770       nb_reads++;
771     else if (RDG_MEM_WRITE_STMT (rdg, i))
772       {
773 	nb_writes++;
774 	if (stmt_with_adjacent_zero_store_dr_p (RDG_STMT (rdg, i)))
775 	  stores_zero++;
776       }
777 
778   return stores_zero == 1 && nb_writes == 1 && nb_reads == 0;
779 }
780 
781 /* Returns true when PARTITION1 and PARTITION2 have similar memory
782    accesses in RDG.  */
783 
784 static bool
785 similar_memory_accesses (struct graph *rdg, bitmap partition1,
786 			 bitmap partition2)
787 {
788   unsigned i, j;
789   bitmap_iterator bi, bj;
790 
791   EXECUTE_IF_SET_IN_BITMAP (partition1, 0, i, bi)
792     if (RDG_MEM_WRITE_STMT (rdg, i)
793 	|| RDG_MEM_READS_STMT (rdg, i))
794       EXECUTE_IF_SET_IN_BITMAP (partition2, 0, j, bj)
795 	if (RDG_MEM_WRITE_STMT (rdg, j)
796 	    || RDG_MEM_READS_STMT (rdg, j))
797 	  if (rdg_has_similar_memory_accesses (rdg, i, j))
798 	    return true;
799 
800   return false;
801 }
802 
803 /* Fuse all the partitions from PARTITIONS that contain similar memory
804    references, i.e., we're taking care of cache locality.  This
805    function does not fuse those partitions that contain patterns that
806    can be code generated with builtins.  */
807 
808 static void
809 fuse_partitions_with_similar_memory_accesses (struct graph *rdg,
810 					      VEC (bitmap, heap) **partitions)
811 {
812   int p1, p2;
813   bitmap partition1, partition2;
814 
815   for (p1 = 0; VEC_iterate (bitmap, *partitions, p1, partition1); p1++)
816     if (!can_generate_builtin (rdg, partition1))
817       for (p2 = 0; VEC_iterate (bitmap, *partitions, p2, partition2); p2++)
818 	if (p1 != p2
819 	    && !can_generate_builtin (rdg, partition2)
820 	    && similar_memory_accesses (rdg, partition1, partition2))
821 	  {
822 	    bitmap_ior_into (partition1, partition2);
823 	    VEC_ordered_remove (bitmap, *partitions, p2);
824 	    p2--;
825 	  }
826 }
827 
/* Aggregate several components into a useful partition that is
   registered in the PARTITIONS vector.  Partitions will be
   distributed in different loops.  The bitmaps pushed on *PARTITIONS
   are owned by the caller; PROCESSED accumulates the vertices already
   assigned to some partition.  */

static void
rdg_build_partitions (struct graph *rdg, VEC (rdgc, heap) *components,
		      VEC (int, heap) **other_stores,
		      VEC (bitmap, heap) **partitions, bitmap processed)
{
  int i;
  rdgc x;
  bitmap partition = BITMAP_ALLOC (NULL);

  for (i = 0; VEC_iterate (rdgc, components, i, x); i++)
    {
      bitmap np;
      bool part_has_writes = false;
      int v = VEC_index (int, x->vertices, 0);

      /* The whole component was already pulled into a previous
	 partition.  */
      if (bitmap_bit_p (processed, v))
	continue;

      np = build_rdg_partition_for_component (rdg, x, &part_has_writes);
      bitmap_ior_into (partition, np);
      bitmap_ior_into (processed, np);
      BITMAP_FREE (np);

      /* Only partitions containing a memory write are registered;
	 pure scalar computations keep accumulating into PARTITION
	 until a write is reached.  */
      if (part_has_writes)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      fprintf (dump_file, "ldist useful partition:\n");
	      dump_bitmap (dump_file, partition);
	    }

	  /* Ownership of PARTITION moves into *PARTITIONS; start a
	     fresh accumulator.  */
	  VEC_safe_push (bitmap, heap, *partitions, partition);
	  partition = BITMAP_ALLOC (NULL);
	}
    }

  /* Add the nodes from the RDG that were not marked as processed, and
     that are used outside the current loop.  These are scalar
     computations that are not yet part of previous partitions.  */
  for (i = 0; i < rdg->n_vertices; i++)
    if (!bitmap_bit_p (processed, i)
	&& rdg_defs_used_in_other_loops_p (rdg, i))
      VEC_safe_push (int, heap, *other_stores, i);

  /* If there are still statements left in the OTHER_STORES array,
     create other components and partitions with these stores and
     their dependences.  */
  if (VEC_length (int, *other_stores) > 0)
    {
      VEC (rdgc, heap) *comps = VEC_alloc (rdgc, heap, 3);
      VEC (int, heap) *foo = VEC_alloc (int, heap, 3);

      /* Recurse with the remaining stores as starting vertices; the
	 recursive call gets a fresh OTHER_STORES vector (FOO).  */
      rdg_build_components (rdg, *other_stores, &comps);
      rdg_build_partitions (rdg, comps, &foo, partitions, processed);

      VEC_free (int, heap, foo);
      free_rdg_components (comps);
    }

  /* If there is something left in the last partition, save it.  */
  if (bitmap_count_bits (partition) > 0)
    VEC_safe_push (bitmap, heap, *partitions, partition);
  else
    BITMAP_FREE (partition);

  fuse_partitions_with_similar_memory_accesses (rdg, partitions);
}
899 
900 /* Dump to FILE the PARTITIONS.  */
901 
902 static void
903 dump_rdg_partitions (FILE *file, VEC (bitmap, heap) *partitions)
904 {
905   int i;
906   bitmap partition;
907 
908   for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
909     debug_bitmap_file (file, partition);
910 }
911 
/* Debug PARTITIONS.  */
extern void debug_rdg_partitions (VEC (bitmap, heap) *);

/* Dump PARTITIONS to stderr.  Intended to be called from the
   debugger, hence extern and not static.  */

void
debug_rdg_partitions (VEC (bitmap, heap) *partitions)
{
  dump_rdg_partitions (stderr, partitions);
}
920 
921 /* Returns the number of read and write operations in the RDG.  */
922 
923 static int
924 number_of_rw_in_rdg (struct graph *rdg)
925 {
926   int i, res = 0;
927 
928   for (i = 0; i < rdg->n_vertices; i++)
929     {
930       if (RDG_MEM_WRITE_STMT (rdg, i))
931 	++res;
932 
933       if (RDG_MEM_READS_STMT (rdg, i))
934 	++res;
935     }
936 
937   return res;
938 }
939 
940 /* Returns the number of read and write operations in a PARTITION of
941    the RDG.  */
942 
943 static int
944 number_of_rw_in_partition (struct graph *rdg, bitmap partition)
945 {
946   int res = 0;
947   unsigned i;
948   bitmap_iterator ii;
949 
950   EXECUTE_IF_SET_IN_BITMAP (partition, 0, i, ii)
951     {
952       if (RDG_MEM_WRITE_STMT (rdg, i))
953 	++res;
954 
955       if (RDG_MEM_READS_STMT (rdg, i))
956 	++res;
957     }
958 
959   return res;
960 }
961 
962 /* Returns true when one of the PARTITIONS contains all the read or
963    write operations of RDG.  */
964 
965 static bool
966 partition_contains_all_rw (struct graph *rdg, VEC (bitmap, heap) *partitions)
967 {
968   int i;
969   bitmap partition;
970   int nrw = number_of_rw_in_rdg (rdg);
971 
972   for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
973     if (nrw == number_of_rw_in_partition (rdg, partition))
974       return true;
975 
976   return false;
977 }
978 
/* Generate code from STARTING_VERTICES in RDG.  Returns the number of
   distributed loops.  Note that NBP is returned even when code
   generation bails out partway through the partitions.  */

static int
ldist_gen (struct loop *loop, struct graph *rdg,
	   VEC (int, heap) *starting_vertices)
{
  int i, nbp;
  VEC (rdgc, heap) *components = VEC_alloc (rdgc, heap, 3);
  VEC (bitmap, heap) *partitions = VEC_alloc (bitmap, heap, 3);
  VEC (int, heap) *other_stores = VEC_alloc (int, heap, 3);
  bitmap partition, processed = BITMAP_ALLOC (NULL);

  /* Initialize the file-scope bitmaps used by the partitioning
     helpers; they live for the duration of this call.  */
  remaining_stmts = BITMAP_ALLOC (NULL);
  upstream_mem_writes = BITMAP_ALLOC (NULL);

  for (i = 0; i < rdg->n_vertices; i++)
    {
      bitmap_set_bit (remaining_stmts, i);

      /* Save in OTHER_STORES all the memory writes that are not in
	 STARTING_VERTICES.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	{
	  int v;
	  unsigned j;
	  bool found = false;

	  for (j = 0; VEC_iterate (int, starting_vertices, j, v); j++)
	    if (i == v)
	      {
		found = true;
		break;
	      }

	  if (!found)
	    VEC_safe_push (int, heap, other_stores, i);
	}
    }

  mark_nodes_having_upstream_mem_writes (rdg);
  rdg_build_components (rdg, starting_vertices, &components);
  rdg_build_partitions (rdg, components, &other_stores, &partitions,
			processed);
  BITMAP_FREE (processed);
  nbp = VEC_length (bitmap, partitions);

  /* Distributing is pointless with a single partition, or when some
     partition contains all the memory accesses anyway.  */
  if (nbp <= 1
      || partition_contains_all_rw (rdg, partitions))
    goto ldist_done;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg_partitions (dump_file, partitions);

  /* Generate a loop (or a builtin) per partition; every partition but
     the last works on a fresh copy of LOOP (COPY_P argument).  */
  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    if (!generate_code_for_partition (loop, partition, i < nbp - 1))
      goto ldist_done;

  rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
  update_ssa (TODO_update_ssa_only_virtuals | TODO_update_ssa);

 ldist_done:

  /* Release the pass-global bitmaps and all per-call vectors.  */
  BITMAP_FREE (remaining_stmts);
  BITMAP_FREE (upstream_mem_writes);

  for (i = 0; VEC_iterate (bitmap, partitions, i, partition); i++)
    BITMAP_FREE (partition);

  VEC_free (int, heap, other_stores);
  VEC_free (bitmap, heap, partitions);
  free_rdg_components (components);
  return nbp;
}
1053 
1054 /* Distributes the code from LOOP in such a way that producer
1055    statements are placed before consumer statements.  When STMTS is
1056    NULL, performs the maximal distribution, if STMTS is not NULL,
1057    tries to separate only these statements from the LOOP's body.
1058    Returns the number of distributed loops.  */
1059 
1060 static int
1061 distribute_loop (struct loop *loop, VEC (gimple, heap) *stmts)
1062 {
1063   int res = 0;
1064   struct graph *rdg;
1065   gimple s;
1066   unsigned i;
1067   VEC (int, heap) *vertices;
1068 
1069   if (loop->num_nodes > 2)
1070     {
1071       if (dump_file && (dump_flags & TDF_DETAILS))
1072 	fprintf (dump_file,
1073 		 "FIXME: Loop %d not distributed: it has more than two basic blocks.\n",
1074 		 loop->num);
1075 
1076       return res;
1077     }
1078 
1079   rdg = build_rdg (loop);
1080 
1081   if (!rdg)
1082     {
1083       if (dump_file && (dump_flags & TDF_DETAILS))
1084 	fprintf (dump_file,
1085 		 "FIXME: Loop %d not distributed: failed to build the RDG.\n",
1086 		 loop->num);
1087 
1088       return res;
1089     }
1090 
1091   vertices = VEC_alloc (int, heap, 3);
1092 
1093   if (dump_file && (dump_flags & TDF_DETAILS))
1094     dump_rdg (dump_file, rdg);
1095 
1096   for (i = 0; VEC_iterate (gimple, stmts, i, s); i++)
1097     {
1098       int v = rdg_vertex_for_stmt (rdg, s);
1099 
1100       if (v >= 0)
1101 	{
1102 	  VEC_safe_push (int, heap, vertices, v);
1103 
1104 	  if (dump_file && (dump_flags & TDF_DETAILS))
1105 	    fprintf (dump_file,
1106 		     "ldist asked to generate code for vertex %d\n", v);
1107 	}
1108     }
1109 
1110   res = ldist_gen (loop, rdg, vertices);
1111   VEC_free (int, heap, vertices);
1112   free_rdg (rdg);
1113 
1114   return res;
1115 }
1116 
1117 /* Distribute all loops in the current function.  */
1118 
1119 static unsigned int
1120 tree_loop_distribution (void)
1121 {
1122   struct loop *loop;
1123   loop_iterator li;
1124   int nb_generated_loops = 0;
1125 
1126   FOR_EACH_LOOP (li, loop, 0)
1127     {
1128       VEC (gimple, heap) *work_list = NULL;
1129 
1130       /* If the loop doesn't have a single exit we will fail anyway,
1131 	 so do that early.  */
1132       if (!single_exit (loop))
1133 	continue;
1134 
1135       /* With the following working list, we're asking distribute_loop
1136 	 to separate the stores of the loop: when dependences allow,
1137 	 it will end on having one store per loop.  */
1138       stores_from_loop (loop, &work_list);
1139 
1140       /* A simple heuristic for cache locality is to not split stores
1141 	 to the same array.  Without this call, an unrolled loop would
1142 	 be split into as many loops as unroll factor, each loop
1143 	 storing in the same array.  */
1144       remove_similar_memory_refs (&work_list);
1145 
1146       nb_generated_loops = distribute_loop (loop, work_list);
1147 
1148       if (dump_file && (dump_flags & TDF_DETAILS))
1149 	{
1150 	  if (nb_generated_loops > 1)
1151 	    fprintf (dump_file, "Loop %d distributed: split to %d loops.\n",
1152 		     loop->num, nb_generated_loops);
1153 	  else
1154 	    fprintf (dump_file, "Loop %d is the same.\n", loop->num);
1155 	}
1156 
1157       verify_loop_structure ();
1158 
1159       VEC_free (gimple, heap, work_list);
1160     }
1161 
1162   return 0;
1163 }
1164 
1165 static bool
1166 gate_tree_loop_distribution (void)
1167 {
1168   return flag_tree_loop_distribution != 0;
1169 }
1170 
/* Descriptor for the loop distribution pass ("ldist").  Gated on
   -ftree-loop-distribution; requires the CFG and SSA form to be
   available, and verifies loop structure when done.  */
struct gimple_opt_pass pass_loop_distribution =
{
 {
  GIMPLE_PASS,
  "ldist",			/* name */
  gate_tree_loop_distribution,  /* gate */
  tree_loop_distribution,       /* execute */
  NULL,				/* sub */
  NULL,				/* next */
  0,				/* static_pass_number */
  TV_TREE_LOOP_DISTRIBUTION,    /* tv_id */
  PROP_cfg | PROP_ssa,		/* properties_required */
  0,				/* properties_provided */
  0,				/* properties_destroyed */
  0,				/* todo_flags_start */
  TODO_dump_func | TODO_verify_loops            /* todo_flags_finish */
 }
};
1189