xref: /netbsd-src/external/gpl3/gcc.old/dist/gcc/tree-loop-distribution.c (revision 53b02e147d4ed531c0d2a5ca9b3e8026ba3e99b5)
1 /* Loop distribution.
2    Copyright (C) 2006-2019 Free Software Foundation, Inc.
3    Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4    and Sebastian Pop <sebastian.pop@amd.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 /* This pass performs loop distribution: for example, the loop
23 
24    |DO I = 2, N
25    |    A(I) = B(I) + C
26    |    D(I) = A(I-1)*E
27    |ENDDO
28 
29    is transformed to
30 
31    |DOALL I = 2, N
32    |   A(I) = B(I) + C
33    |ENDDO
34    |
35    |DOALL I = 2, N
36    |   D(I) = A(I-1)*E
37    |ENDDO
38 
39    Loop distribution is the dual of loop fusion.  It separates statements
40    of a loop (or loop nest) into multiple loops (or loop nests) with the
41    same loop header.  The major goal is to separate statements which may
42    be vectorized from those that can't.  This pass implements distribution
43    in the following steps:
44 
45      1) Seed partitions with specific types of statements.  For now we
46 	support two types of seed statements: statements defining a variable
47 	used outside of the loop, and statements storing to memory.
48      2) Build reduced dependence graph (RDG) for loop to be distributed.
49 	The vertices (RDG:V) model all statements in the loop and the edges
50 	(RDG:E) model flow and control dependencies between statements.
51      3) Apart from RDG, compute data dependencies between memory references.
52      4) Starting from a seed statement, build up a partition by adding
53 	dependent statements according to the RDG's dependence information.
54 	A partition is classified as parallel if it can be executed in
55 	parallel, or as sequential if it can't.  A parallel partition is
56 	further classified into different builtin kinds if it can be
57 	implemented as builtin function calls.
58      5) Build the partition dependence graph (PG) based on data dependencies.
59 	The vertices (PG:V) model all partitions and the edges (PG:E) model
60 	all data dependencies between every pair of partitions.  In general,
61 	a data dependence is either known or unknown at compilation time.
62 	In C family languages there are quite a few compilation time unknown
63 	dependencies because of possible alias relations of data references.
64 	We categorize the PG's edges into two types: "true" edges that
65 	represent compilation time known data dependencies, and "alias"
66 	edges for all other data dependencies.
67      6) Traverse a subgraph of the PG as if all "alias" edges didn't exist.
68 	Merge partitions in each strongly connected component (SCC)
69 	accordingly.  Build a new PG for the merged partitions.
70      7) Traverse the PG again, this time with both "true" and "alias" edges
71 	included.  We try to break SCCs by removing some edges.  Because
72 	SCCs formed by "true" edges were all fused in step 6), we can break
73 	SCCs by removing some "alias" edges.  Choosing the optimal edge set
74 	is NP-hard; fortunately a simple approximation is good enough for us
75 	given the small problem scale.
76      8) Collect all data dependencies of the removed "alias" edges.  Create
77 	runtime alias checks for the collected data dependencies.
78      9) Version the loop under the condition of the runtime alias checks,
79 	as sketched after this comment.  Since loop distribution generally
80 	adds overhead, it is only useful if vectorization is achieved in a
81 	distributed loop.  We version the loop with the internal function
82 	call IFN_LOOP_DIST_ALIAS.  If no distributed loop can be vectorized,
83 	we simply remove the distributed loops and revert to the original one.
84 
85    TODO:
86      1) We only distribute the innermost two-level loop nest now.  We should
87 	extend it to arbitrary loop nests in the future.
88      2) We only fuse partitions in SCCs now.  A better fusion algorithm is
89 	desired to minimize loop overhead, maximize parallelism and maximize
90 	data reuse.  */
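
/* As a rough illustration of steps 5)-9) above (a sketch only, not exact
   GIMPLE), consider a loop where the stores may alias through the
   pointers A and D:

   |for (i = 0; i < n; i++)
   |  {
   |    A[i] = B[i] + C;
   |    D[i] = A[i] * E;
   |  }

   The dependence between the A[] and D[] accesses is unknown at compile
   time, so it becomes an "alias" edge in the PG.  Removing that edge lets
   the two statements go to different partitions, guarded by a runtime
   alias check.  Schematically, the versioned code looks like

   |if (.LOOP_DIST_ALIAS (<A and D do not overlap>, <loop id>))
   |  {
   |    for (i = 0; i < n; i++)    distributed loops
   |      A[i] = B[i] + C;
   |    for (i = 0; i < n; i++)
   |      D[i] = A[i] * E;
   |  }
   |else
   |  {
   |    for (i = 0; i < n; i++)    original loop
   |      {
   |        A[i] = B[i] + C;
   |        D[i] = A[i] * E;
   |      }
   |  }

   If the vectorizer later finds that none of the distributed loops can be
   vectorized, the IFN_LOOP_DIST_ALIAS call is folded to false and only the
   original loop is kept.  */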
91 
92 #include "config.h"
93 #include "system.h"
94 #include "coretypes.h"
95 #include "backend.h"
96 #include "tree.h"
97 #include "gimple.h"
98 #include "cfghooks.h"
99 #include "tree-pass.h"
100 #include "ssa.h"
101 #include "gimple-pretty-print.h"
102 #include "fold-const.h"
103 #include "cfganal.h"
104 #include "gimple-iterator.h"
105 #include "gimplify-me.h"
106 #include "stor-layout.h"
107 #include "tree-cfg.h"
108 #include "tree-ssa-loop-manip.h"
109 #include "tree-ssa-loop-ivopts.h"
110 #include "tree-ssa-loop.h"
111 #include "tree-into-ssa.h"
112 #include "tree-ssa.h"
113 #include "cfgloop.h"
114 #include "tree-scalar-evolution.h"
115 #include "params.h"
116 #include "tree-vectorizer.h"
117 #include "tree-eh.h"
118 
119 
120 #define MAX_DATAREFS_NUM \
121 	((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
122 
123 /* Threshold controlling the number of distributed partitions.  Since it
124    may become unnecessary if a memory stream cost model is introduced in
125    the future, we define it as a temporary macro rather than a parameter.  */
126 #define NUM_PARTITION_THRESHOLD (4)
127 
128 /* Hashtable helpers.  */
129 
130 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
131 {
132   static inline hashval_t hash (const data_dependence_relation *);
133   static inline bool equal (const data_dependence_relation *,
134 			    const data_dependence_relation *);
135 };
136 
137 /* Hash function for data dependence.  */
138 
139 inline hashval_t
140 ddr_hasher::hash (const data_dependence_relation *ddr)
141 {
142   inchash::hash h;
143   h.add_ptr (DDR_A (ddr));
144   h.add_ptr (DDR_B (ddr));
145   return h.end ();
146 }
147 
148 /* Hash table equality function for data dependence.  */
149 
150 inline bool
151 ddr_hasher::equal (const data_dependence_relation *ddr1,
152 		   const data_dependence_relation *ddr2)
153 {
154   return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
155 }
156 
157 /* The loop (nest) to be distributed.  */
158 static vec<loop_p> loop_nest;
159 
160 /* Vector of data references in the loop to be distributed.  */
161 static vec<data_reference_p> datarefs_vec;
162 
163 /* True if there is a nonaddressable data reference in the above vector.  */
164 static bool has_nonaddressable_dataref_p;
165 
166 /* Store index of data reference in aux field.  */
167 #define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)
168 
169 /* Hash table for data dependence relation in the loop to be distributed.  */
170 static hash_table<ddr_hasher> *ddrs_table;
171 
172 /* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
173 struct rdg_vertex
174 {
175   /* The statement represented by this vertex.  */
176   gimple *stmt;
177 
178   /* Vector of data-references in this statement.  */
179   vec<data_reference_p> datarefs;
180 
181   /* True when the statement contains a write to memory.  */
182   bool has_mem_write;
183 
184   /* True when the statement contains a read from memory.  */
185   bool has_mem_reads;
186 };
187 
188 #define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
189 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
190 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
191 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
192 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
193 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
194 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
195 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
196 
197 /* Data dependence type.  */
198 
199 enum rdg_dep_type
200 {
201   /* Read After Write (RAW).  */
202   flow_dd = 'f',
203 
204   /* Control dependence (execute conditional on).  */
205   control_dd = 'c'
206 };
207 
208 /* Dependence information attached to an edge of the RDG.  */
209 
210 struct rdg_edge
211 {
212   /* Type of the dependence.  */
213   enum rdg_dep_type type;
214 };
215 
216 #define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type
217 
218 /* Dump vertex I in RDG to FILE.  */
219 
220 static void
221 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
222 {
223   struct vertex *v = &(rdg->vertices[i]);
224   struct graph_edge *e;
225 
226   fprintf (file, "(vertex %d: (%s%s) (in:", i,
227 	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
228 	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
229 
230   if (v->pred)
231     for (e = v->pred; e; e = e->pred_next)
232       fprintf (file, " %d", e->src);
233 
234   fprintf (file, ") (out:");
235 
236   if (v->succ)
237     for (e = v->succ; e; e = e->succ_next)
238       fprintf (file, " %d", e->dest);
239 
240   fprintf (file, ")\n");
241   print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
242   fprintf (file, ")\n");
243 }
244 
245 /* Call dump_rdg_vertex on stderr.  */
246 
247 DEBUG_FUNCTION void
248 debug_rdg_vertex (struct graph *rdg, int i)
249 {
250   dump_rdg_vertex (stderr, rdg, i);
251 }
252 
253 /* Dump the reduced dependence graph RDG to FILE.  */
254 
255 static void
256 dump_rdg (FILE *file, struct graph *rdg)
257 {
258   fprintf (file, "(rdg\n");
259   for (int i = 0; i < rdg->n_vertices; i++)
260     dump_rdg_vertex (file, rdg, i);
261   fprintf (file, ")\n");
262 }
263 
264 /* Call dump_rdg on stderr.  */
265 
266 DEBUG_FUNCTION void
267 debug_rdg (struct graph *rdg)
268 {
269   dump_rdg (stderr, rdg);
270 }
271 
272 static void
273 dot_rdg_1 (FILE *file, struct graph *rdg)
274 {
275   int i;
276   pretty_printer buffer;
277   pp_needs_newline (&buffer) = false;
278   buffer.buffer->stream = file;
279 
280   fprintf (file, "digraph RDG {\n");
281 
282   for (i = 0; i < rdg->n_vertices; i++)
283     {
284       struct vertex *v = &(rdg->vertices[i]);
285       struct graph_edge *e;
286 
287       fprintf (file, "%d [label=\"[%d] ", i, i);
288       pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
289       pp_flush (&buffer);
290       fprintf (file, "\"]\n");
291 
292       /* Highlight reads from memory.  */
293       if (RDG_MEM_READS_STMT (rdg, i))
294        fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
295 
296       /* Highlight stores to memory.  */
297       if (RDG_MEM_WRITE_STMT (rdg, i))
298        fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
299 
300       if (v->succ)
301        for (e = v->succ; e; e = e->succ_next)
302          switch (RDGE_TYPE (e))
303            {
304            case flow_dd:
305              /* These are the most common dependences; print them without a label.  */
306              fprintf (file, "%d -> %d \n", i, e->dest);
307              break;
308 
309 	   case control_dd:
310              fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
311              break;
312 
313            default:
314              gcc_unreachable ();
315            }
316     }
317 
318   fprintf (file, "}\n\n");
319 }
320 
321 /* Display the Reduced Dependence Graph using dotty.  */
322 
323 DEBUG_FUNCTION void
324 dot_rdg (struct graph *rdg)
325 {
326   /* When debugging, you may want to enable the following code.  */
327 #ifdef HAVE_POPEN
328   FILE *file = popen ("dot -Tx11", "w");
329   if (!file)
330     return;
331   dot_rdg_1 (file, rdg);
332   fflush (file);
333   close (fileno (file));
334   pclose (file);
335 #else
336   dot_rdg_1 (stderr, rdg);
337 #endif
338 }
339 
340 /* Returns the index of STMT in RDG.  */
341 
342 static int
343 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
344 {
345   int index = gimple_uid (stmt);
346   gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
347   return index;
348 }
349 
350 /* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
351    the index of DEF in RDG.  */
352 
353 static void
354 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
355 {
356   use_operand_p imm_use_p;
357   imm_use_iterator iterator;
358 
359   FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
360     {
361       struct graph_edge *e;
362       int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
363 
364       if (use < 0)
365 	continue;
366 
367       e = add_edge (rdg, idef, use);
368       e->data = XNEW (struct rdg_edge);
369       RDGE_TYPE (e) = flow_dd;
370     }
371 }
372 
373 /* Creates an edge for the control dependences of BB to the vertex V.  */
374 
375 static void
376 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
377 				    int v, control_dependences *cd)
378 {
379   bitmap_iterator bi;
380   unsigned edge_n;
381   EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
382 			    0, edge_n, bi)
383     {
384       basic_block cond_bb = cd->get_edge_src (edge_n);
385       gimple *stmt = last_stmt (cond_bb);
386       if (stmt && is_ctrl_stmt (stmt))
387 	{
388 	  struct graph_edge *e;
389 	  int c = rdg_vertex_for_stmt (rdg, stmt);
390 	  if (c < 0)
391 	    continue;
392 
393 	  e = add_edge (rdg, c, v);
394 	  e->data = XNEW (struct rdg_edge);
395 	  RDGE_TYPE (e) = control_dd;
396 	}
397     }
398 }
399 
400 /* Creates the edges of the reduced dependence graph RDG.  */
401 
402 static void
403 create_rdg_flow_edges (struct graph *rdg)
404 {
405   int i;
406   def_operand_p def_p;
407   ssa_op_iter iter;
408 
409   for (i = 0; i < rdg->n_vertices; i++)
410     FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
411 			      iter, SSA_OP_DEF)
412       create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
413 }
414 
415 /* Creates the control dependence edges of the reduced dependence graph RDG.  */
416 
417 static void
418 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
419 {
420   int i;
421 
422   for (i = 0; i < rdg->n_vertices; i++)
423     {
424       gimple *stmt = RDG_STMT (rdg, i);
425       if (gimple_code (stmt) == GIMPLE_PHI)
426 	{
427 	  edge_iterator ei;
428 	  edge e;
429 	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
430 	    if (flow_bb_inside_loop_p (loop, e->src))
431 	      create_edge_for_control_dependence (rdg, e->src, i, cd);
432 	}
433       else
434 	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
435     }
436 }
437 
438 /* Build the vertices of the reduced dependence graph RDG.  Return false
439    if that failed.  */
440 
441 static bool
442 create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
443 {
444   int i;
445   gimple *stmt;
446 
447   FOR_EACH_VEC_ELT (stmts, i, stmt)
448     {
449       struct vertex *v = &(rdg->vertices[i]);
450 
451       /* Record statement to vertex mapping.  */
452       gimple_set_uid (stmt, i);
453 
454       v->data = XNEW (struct rdg_vertex);
455       RDGV_STMT (v) = stmt;
456       RDGV_DATAREFS (v).create (0);
457       RDGV_HAS_MEM_WRITE (v) = false;
458       RDGV_HAS_MEM_READS (v) = false;
459       if (gimple_code (stmt) == GIMPLE_PHI)
460 	continue;
461 
462       unsigned drp = datarefs_vec.length ();
463       if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
464 	return false;
465       for (unsigned j = drp; j < datarefs_vec.length (); ++j)
466 	{
467 	  data_reference_p dr = datarefs_vec[j];
468 	  if (DR_IS_READ (dr))
469 	    RDGV_HAS_MEM_READS (v) = true;
470 	  else
471 	    RDGV_HAS_MEM_WRITE (v) = true;
472 	  RDGV_DATAREFS (v).safe_push (dr);
473 	  has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
474 	}
475     }
476   return true;
477 }
478 
479 /* Array mapping basic block's index to its topological order.  */
480 static int *bb_top_order_index;
481 /* And size of the array.  */
482 static int bb_top_order_index_size;
483 
484 /* If X has a smaller topological sort number than Y, returns -1;
485    if greater, returns 1.  */
486 
487 static int
488 bb_top_order_cmp (const void *x, const void *y)
489 {
490   basic_block bb1 = *(const basic_block *) x;
491   basic_block bb2 = *(const basic_block *) y;
492 
493   gcc_assert (bb1->index < bb_top_order_index_size
494 	      && bb2->index < bb_top_order_index_size);
495   gcc_assert (bb1 == bb2
496 	      || bb_top_order_index[bb1->index]
497 		 != bb_top_order_index[bb2->index]);
498 
499   return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
500 }
501 
502 /* Initialize STMTS with all the statements of LOOP.  We use topological
503    order to discover all statements.  The order is important because
504    generate_loops_for_partition is using the same traversal for identifying
505    statements in loop copies.  */
506 
507 static void
508 stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
509 {
510   unsigned int i;
511   basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);
512 
513   for (i = 0; i < loop->num_nodes; i++)
514     {
515       basic_block bb = bbs[i];
516 
517       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
518 	   gsi_next (&bsi))
519 	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
520 	  stmts->safe_push (bsi.phi ());
521 
522       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
523 	   gsi_next (&bsi))
524 	{
525 	  gimple *stmt = gsi_stmt (bsi);
526 	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
527 	    stmts->safe_push (stmt);
528 	}
529     }
530 
531   free (bbs);
532 }
533 
534 /* Free the reduced dependence graph RDG.  */
535 
536 static void
537 free_rdg (struct graph *rdg)
538 {
539   int i;
540 
541   for (i = 0; i < rdg->n_vertices; i++)
542     {
543       struct vertex *v = &(rdg->vertices[i]);
544       struct graph_edge *e;
545 
546       for (e = v->succ; e; e = e->succ_next)
547 	free (e->data);
548 
549       if (v->data)
550 	{
551 	  gimple_set_uid (RDGV_STMT (v), -1);
552 	  (RDGV_DATAREFS (v)).release ();
553 	  free (v->data);
554 	}
555     }
556 
557   free_graph (rdg);
558 }
559 
560 /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
561    LOOP, and one edge per flow dependence or control dependence from control
562    dependence CD.  While visiting each statement, data references are also
563    collected and recorded in global data DATAREFS_VEC.  */
564 
565 static struct graph *
566 build_rdg (struct loop *loop, control_dependences *cd)
567 {
568   struct graph *rdg;
569 
570   /* Create the RDG vertices from the stmts of the loop nest.  */
571   auto_vec<gimple *, 10> stmts;
572   stmts_from_loop (loop, &stmts);
573   rdg = new_graph (stmts.length ());
574   if (!create_rdg_vertices (rdg, stmts, loop))
575     {
576       free_rdg (rdg);
577       return NULL;
578     }
579   stmts.release ();
580 
581   create_rdg_flow_edges (rdg);
582   if (cd)
583     create_rdg_cd_edges (rdg, cd, loop);
584 
585   return rdg;
586 }
587 
588 
589 /* Kind of distributed loop.  */
590 enum partition_kind {
591     PKIND_NORMAL,
592     /* Partial memset describes a partition that could be distributed into
593        a loop of memset calls, rather than a single memset call.  It's
594        handled just like a normal partition, i.e., distributed as a separate
595        loop; no memset call is generated.
596 
597        Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
598        loop nest as deeply as possible, so that parloop can parallelize a
599        deeper loop nest.  This hack should be unnecessary and removed once
600        distributed memset can be understood and analyzed in data reference
601        analysis.  See PR82604 for more, and the sketch after this enum.  */
602     PKIND_PARTIAL_MEMSET,
603     PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
604 };
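
/* A sketch (illustrative only, not from a testcase) of the difference
   between PKIND_MEMSET and PKIND_PARTIAL_MEMSET:

   |int a[100][200];
   |
   |for (i = 0; i < 100; i++)      access is consecutive at every loop
   |  for (j = 0; j < 200; j++)    level, so the whole nest becomes a
   |    a[i][j] = 0;               single memset -> PKIND_MEMSET
   |
   |for (i = 0; i < 100; i++)      only part of each row is zeroed, so the
   |  for (j = 0; j < 100; j++)    access is consecutive only in the inner
   |    a[i][j] = 0;               loop -> PKIND_PARTIAL_MEMSET; the store
   |                               is distributed as its own loop nest and
   |                               no memset call is generated.  */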
605 
606 /* Type of distributed loop.  */
607 enum partition_type {
608     /* The distributed loop can be executed in parallel.  */
609     PTYPE_PARALLEL = 0,
610     /* The distributed loop has to be executed sequentially.  */
611     PTYPE_SEQUENTIAL
612 };
613 
614 /* Builtin info for loop distribution.  */
615 struct builtin_info
616 {
617   /* data-references a kind != PKIND_NORMAL partition is about.  */
618   data_reference_p dst_dr;
619   data_reference_p src_dr;
620   /* Base address and size of memory objects operated by the builtin.  Note
621      both dest and source memory objects must have the same size.  */
622   tree dst_base;
623   tree src_base;
624   tree size;
625   /* Base and offset part of dst_base after stripping constant offset.  This
626      is only used in memset builtin distribution for now.  */
627   tree dst_base_base;
628   unsigned HOST_WIDE_INT dst_base_offset;
629 };
630 
631 /* Partition for loop distribution.  */
632 struct partition
633 {
634   /* Statements of the partition.  */
635   bitmap stmts;
636   /* True if the partition defines a variable used outside of the loop.  */
637   bool reduction_p;
638   enum partition_kind kind;
639   enum partition_type type;
640   /* Data references in the partition.  */
641   bitmap datarefs;
642   /* Information about the builtin partition.  */
643   struct builtin_info *builtin;
644 };
645 
646 
647 /* Allocate and initialize a partition.  */
648 
649 static partition *
650 partition_alloc (void)
651 {
652   partition *partition = XCNEW (struct partition);
653   partition->stmts = BITMAP_ALLOC (NULL);
654   partition->reduction_p = false;
655   partition->kind = PKIND_NORMAL;
656   partition->datarefs = BITMAP_ALLOC (NULL);
657   return partition;
658 }
659 
660 /* Free PARTITION.  */
661 
662 static void
663 partition_free (partition *partition)
664 {
665   BITMAP_FREE (partition->stmts);
666   BITMAP_FREE (partition->datarefs);
667   if (partition->builtin)
668     free (partition->builtin);
669 
670   free (partition);
671 }
672 
673 /* Returns true if the partition can be generated as a builtin.  */
674 
675 static bool
676 partition_builtin_p (partition *partition)
677 {
678   return partition->kind > PKIND_PARTIAL_MEMSET;
679 }
680 
681 /* Returns true if the partition contains a reduction.  */
682 
683 static bool
684 partition_reduction_p (partition *partition)
685 {
686   return partition->reduction_p;
687 }
688 
689 /* Partitions are fused because of different reasons.  */
690 enum fuse_type
691 {
692   FUSE_NON_BUILTIN = 0,
693   FUSE_REDUCTION = 1,
694   FUSE_SHARE_REF = 2,
695   FUSE_SAME_SCC = 3,
696   FUSE_FINALIZE = 4
697 };
698 
699 /* Descriptions of the different fusing reasons.  */
700 static const char *fuse_message[] = {
701   "they are non-builtins",
702   "they have reductions",
703   "they have shared memory refs",
704   "they are in the same dependence scc",
705   "there is no point to distribute loop"};
706 
707 static void
708 update_type_for_merge (struct graph *, partition *, partition *);
709 
710 /* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
711    graph; if it is non-NULL, we update the type of the result partition.  */
712 
713 static void
714 partition_merge_into (struct graph *rdg, partition *dest,
715 		      partition *partition, enum fuse_type ft)
716 {
717   if (dump_file && (dump_flags & TDF_DETAILS))
718     {
719       fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
720       fprintf (dump_file, "  Part 1: ");
721       dump_bitmap (dump_file, dest->stmts);
722       fprintf (dump_file, "  Part 2: ");
723       dump_bitmap (dump_file, partition->stmts);
724     }
725 
726   dest->kind = PKIND_NORMAL;
727   if (dest->type == PTYPE_PARALLEL)
728     dest->type = partition->type;
729 
730   bitmap_ior_into (dest->stmts, partition->stmts);
731   if (partition_reduction_p (partition))
732     dest->reduction_p = true;
733 
734   /* Further check if any data dependence prevents us from executing the
735      new partition in parallel.  */
736   if (dest->type == PTYPE_PARALLEL && rdg != NULL)
737     update_type_for_merge (rdg, dest, partition);
738 
739   bitmap_ior_into (dest->datarefs, partition->datarefs);
740 }
741 
742 
743 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
744    the LOOP.  */
745 
746 static bool
747 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
748 {
749   imm_use_iterator imm_iter;
750   use_operand_p use_p;
751 
752   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
753     {
754       if (is_gimple_debug (USE_STMT (use_p)))
755 	continue;
756 
757       basic_block use_bb = gimple_bb (USE_STMT (use_p));
758       if (!flow_bb_inside_loop_p (loop, use_bb))
759 	return true;
760     }
761 
762   return false;
763 }
764 
765 /* Returns true when STMT defines a scalar variable used after the
766    loop LOOP.  */
767 
768 static bool
769 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
770 {
771   def_operand_p def_p;
772   ssa_op_iter op_iter;
773 
774   if (gimple_code (stmt) == GIMPLE_PHI)
775     return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
776 
777   FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
778     if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
779       return true;
780 
781   return false;
782 }
783 
784 /* Return a copy of LOOP placed before LOOP.  */
785 
786 static struct loop *
787 copy_loop_before (struct loop *loop)
788 {
789   struct loop *res;
790   edge preheader = loop_preheader_edge (loop);
791 
792   initialize_original_copy_tables ();
793   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
794   gcc_assert (res != NULL);
795   free_original_copy_tables ();
796   delete_update_ssa ();
797 
798   return res;
799 }
800 
801 /* Creates an empty basic block after LOOP.  */
802 
803 static void
804 create_bb_after_loop (struct loop *loop)
805 {
806   edge exit = single_exit (loop);
807 
808   if (!exit)
809     return;
810 
811   split_edge (exit);
812 }
813 
814 /* Generate code for PARTITION from the code in LOOP.  The loop is
815    copied when COPY_P is true.  All the statements not flagged in the
816    PARTITION bitmap are removed from the loop or from its copy.  The
817    statements are indexed in sequence inside a basic block, and the
818    basic blocks of a loop are taken in dom order.  */
819 
820 static void
821 generate_loops_for_partition (struct loop *loop, partition *partition,
822 			      bool copy_p)
823 {
824   unsigned i;
825   basic_block *bbs;
826 
827   if (copy_p)
828     {
829       int orig_loop_num = loop->orig_loop_num;
830       loop = copy_loop_before (loop);
831       gcc_assert (loop != NULL);
832       loop->orig_loop_num = orig_loop_num;
833       create_preheader (loop, CP_SIMPLE_PREHEADERS);
834       create_bb_after_loop (loop);
835     }
836   else
837     {
838       /* Origin number is set to the new versioned loop's num.  */
839       gcc_assert (loop->orig_loop_num != loop->num);
840     }
841 
842   /* Remove stmts not in the PARTITION bitmap.  */
843   bbs = get_loop_body_in_dom_order (loop);
844 
845   if (MAY_HAVE_DEBUG_BIND_STMTS)
846     for (i = 0; i < loop->num_nodes; i++)
847       {
848 	basic_block bb = bbs[i];
849 
850 	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
851 	     gsi_next (&bsi))
852 	  {
853 	    gphi *phi = bsi.phi ();
854 	    if (!virtual_operand_p (gimple_phi_result (phi))
855 		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
856 	      reset_debug_uses (phi);
857 	  }
858 
859 	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
860 	  {
861 	    gimple *stmt = gsi_stmt (bsi);
862 	    if (gimple_code (stmt) != GIMPLE_LABEL
863 		&& !is_gimple_debug (stmt)
864 		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
865 	      reset_debug_uses (stmt);
866 	  }
867       }
868 
869   for (i = 0; i < loop->num_nodes; i++)
870     {
871       basic_block bb = bbs[i];
872       edge inner_exit = NULL;
873 
874       if (loop != bb->loop_father)
875 	inner_exit = single_exit (bb->loop_father);
876 
877       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
878 	{
879 	  gphi *phi = bsi.phi ();
880 	  if (!virtual_operand_p (gimple_phi_result (phi))
881 	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
882 	    remove_phi_node (&bsi, true);
883 	  else
884 	    gsi_next (&bsi);
885 	}
886 
887       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
888 	{
889 	  gimple *stmt = gsi_stmt (bsi);
890 	  if (gimple_code (stmt) != GIMPLE_LABEL
891 	      && !is_gimple_debug (stmt)
892 	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
893 	    {
894 	      /* In distribution of loop nest, if bb is inner loop's exit_bb,
895 		 we choose its exit edge/path in order to avoid generating
896 		 an infinite loop.  For all other cases, we choose an arbitrary
897 		 path through the empty CFG part that this unnecessary
898 		 control stmt controls.  */
899 	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
900 		{
901 		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
902 		    gimple_cond_make_true (cond_stmt);
903 		  else
904 		    gimple_cond_make_false (cond_stmt);
905 		  update_stmt (stmt);
906 		}
907 	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
908 		{
909 		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
910 		  gimple_switch_set_index
911 		      (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
912 		  update_stmt (stmt);
913 		}
914 	      else
915 		{
916 		  unlink_stmt_vdef (stmt);
917 		  gsi_remove (&bsi, true);
918 		  release_defs (stmt);
919 		  continue;
920 		}
921 	    }
922 	  gsi_next (&bsi);
923 	}
924     }
925 
926   free (bbs);
927 }
928 
929 /* If VAL memory representation contains the same value in all bytes,
930    return that value, otherwise return -1.
931    E.g. for 0x24242424 return 0x24, for IEEE double
932    747708026454360457216.0 return 0x44, etc.  */
933 
934 static int
935 const_with_all_bytes_same (tree val)
936 {
937   unsigned char buf[64];
938   int i, len;
939 
940   if (integer_zerop (val)
941       || (TREE_CODE (val) == CONSTRUCTOR
942           && !TREE_CLOBBER_P (val)
943           && CONSTRUCTOR_NELTS (val) == 0))
944     return 0;
945 
946   if (real_zerop (val))
947     {
948       /* Only return 0 for +0.0, not for -0.0, which doesn't have
949 	 an all bytes same memory representation.  Don't transform
950 	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
951       switch (TREE_CODE (val))
952 	{
953 	case REAL_CST:
954 	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
955 	    return 0;
956 	  break;
957 	case COMPLEX_CST:
958 	  if (!const_with_all_bytes_same (TREE_REALPART (val))
959 	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
960 	    return 0;
961 	  break;
962 	case VECTOR_CST:
963 	  {
964 	    unsigned int count = vector_cst_encoded_nelts (val);
965 	    unsigned int j;
966 	    for (j = 0; j < count; ++j)
967 	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
968 		break;
969 	    if (j == count)
970 	      return 0;
971 	    break;
972 	  }
973 	default:
974 	  break;
975 	}
976     }
977 
978   if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
979     return -1;
980 
981   len = native_encode_expr (val, buf, sizeof (buf));
982   if (len == 0)
983     return -1;
984   for (i = 1; i < len; i++)
985     if (buf[i] != buf[0])
986       return -1;
987   return buf[0];
988 }
989 
990 /* Generate a call to memset for PARTITION in LOOP.  */
991 
992 static void
993 generate_memset_builtin (struct loop *loop, partition *partition)
994 {
995   gimple_stmt_iterator gsi;
996   tree mem, fn, nb_bytes;
997   tree val;
998   struct builtin_info *builtin = partition->builtin;
999   gimple *fn_call;
1000 
1001   /* The new statements will be placed before LOOP.  */
1002   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1003 
1004   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1005   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1006 				       false, GSI_CONTINUE_LINKING);
1007   mem = builtin->dst_base;
1008   mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1009 				  false, GSI_CONTINUE_LINKING);
1010 
1011   /* This exactly matches the pattern recognition in classify_partition.  */
1012   val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1013   /* Handle constants like 0x15151515 and similarly
1014      floating point constants etc. where all bytes are the same.  */
1015   int bytev = const_with_all_bytes_same (val);
1016   if (bytev != -1)
1017     val = build_int_cst (integer_type_node, bytev);
1018   else if (TREE_CODE (val) == INTEGER_CST)
1019     val = fold_convert (integer_type_node, val);
1020   else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1021     {
1022       tree tem = make_ssa_name (integer_type_node);
1023       gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1024       gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1025       val = tem;
1026     }
1027 
1028   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1029   fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1030   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1031 
1032   if (dump_file && (dump_flags & TDF_DETAILS))
1033     {
1034       fprintf (dump_file, "generated memset");
1035       if (bytev == 0)
1036 	fprintf (dump_file, " zero\n");
1037       else
1038 	fprintf (dump_file, "\n");
1039     }
1040 }
1041 
1042 /* Generate a call to memcpy for PARTITION in LOOP.  */
1043 
1044 static void
1045 generate_memcpy_builtin (struct loop *loop, partition *partition)
1046 {
1047   gimple_stmt_iterator gsi;
1048   gimple *fn_call;
1049   tree dest, src, fn, nb_bytes;
1050   enum built_in_function kind;
1051   struct builtin_info *builtin = partition->builtin;
1052 
1053   /* The new statements will be placed before LOOP.  */
1054   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1055 
1056   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1057   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1058 				       false, GSI_CONTINUE_LINKING);
1059   dest = builtin->dst_base;
1060   src = builtin->src_base;
1061   if (partition->kind == PKIND_MEMCPY
1062       || ! ptr_derefs_may_alias_p (dest, src))
1063     kind = BUILT_IN_MEMCPY;
1064   else
1065     kind = BUILT_IN_MEMMOVE;
1066 
1067   dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1068 				   false, GSI_CONTINUE_LINKING);
1069   src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1070 				  false, GSI_CONTINUE_LINKING);
1071   fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1072   fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1073   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1074 
1075   if (dump_file && (dump_flags & TDF_DETAILS))
1076     {
1077       if (kind == BUILT_IN_MEMCPY)
1078 	fprintf (dump_file, "generated memcpy\n");
1079       else
1080 	fprintf (dump_file, "generated memmove\n");
1081     }
1082 }
1083 
1084 /* Remove and destroy the loop LOOP.  */
1085 
1086 static void
1087 destroy_loop (struct loop *loop)
1088 {
1089   unsigned nbbs = loop->num_nodes;
1090   edge exit = single_exit (loop);
1091   basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1092   basic_block *bbs;
1093   unsigned i;
1094 
1095   bbs = get_loop_body_in_dom_order (loop);
1096 
1097   gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
1098   bool safe_p = single_pred_p (exit->dest);
1099   i = nbbs;
1100   do
1101     {
1102       /* We have made sure not to leave any dangling uses of SSA
1103          names defined in the loop, with the exception of virtuals.
1104 	 Make sure we replace all uses of virtual defs that will remain
1105 	 outside of the loop with the bare symbol as delete_basic_block
1106 	 will release them.  */
1107       --i;
1108       for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1109 	   gsi_next (&gsi))
1110 	{
1111 	  gphi *phi = gsi.phi ();
1112 	  if (virtual_operand_p (gimple_phi_result (phi)))
1113 	    mark_virtual_phi_result_for_renaming (phi);
1114 	}
1115       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
1116 	{
1117 	  gimple *stmt = gsi_stmt (gsi);
1118 	  tree vdef = gimple_vdef (stmt);
1119 	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
1120 	    mark_virtual_operand_for_renaming (vdef);
1121 	  /* Also move and eventually reset debug stmts.  We can leave
1122 	     constant values in place in case the stmt dominates the exit.
1123 	     ???  Non-constant values from the last iteration can be
1124 	     replaced with final values if we can compute them.  */
1125 	  if (gimple_debug_bind_p (stmt))
1126 	    {
1127 	      tree val = gimple_debug_bind_get_value (stmt);
1128 	      gsi_move_before (&gsi, &dst_gsi);
1129 	      if (val
1130 		  && (!safe_p
1131 		      || !is_gimple_min_invariant (val)
1132 		      || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
1133 		{
1134 		  gimple_debug_bind_reset_value (stmt);
1135 		  update_stmt (stmt);
1136 		}
1137 	    }
1138 	  else
1139 	    gsi_next (&gsi);
1140 	}
1141     }
1142   while (i != 0);
1143 
1144   redirect_edge_pred (exit, src);
1145   exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1146   exit->flags |= EDGE_FALLTHRU;
1147   cancel_loop_tree (loop);
1148   rescan_loop_exit (exit, false, true);
1149 
1150   i = nbbs;
1151   do
1152     {
1153       --i;
1154       delete_basic_block (bbs[i]);
1155     }
1156   while (i != 0);
1157 
1158   free (bbs);
1159 
1160   set_immediate_dominator (CDI_DOMINATORS, dest,
1161 			   recompute_dominator (CDI_DOMINATORS, dest));
1162 }
1163 
1164 /* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */
1165 
1166 static bool
1167 generate_code_for_partition (struct loop *loop,
1168 			     partition *partition, bool copy_p)
1169 {
1170   switch (partition->kind)
1171     {
1172     case PKIND_NORMAL:
1173     case PKIND_PARTIAL_MEMSET:
1174       /* Reductions all have to be in the last partition.  */
1175       gcc_assert (!partition_reduction_p (partition)
1176 		  || !copy_p);
1177       generate_loops_for_partition (loop, partition, copy_p);
1178       return false;
1179 
1180     case PKIND_MEMSET:
1181       generate_memset_builtin (loop, partition);
1182       break;
1183 
1184     case PKIND_MEMCPY:
1185     case PKIND_MEMMOVE:
1186       generate_memcpy_builtin (loop, partition);
1187       break;
1188 
1189     default:
1190       gcc_unreachable ();
1191     }
1192 
1193   /* Common tail for partitions we turn into a call.  If this was the last
1194      partition for which we generate code, we have to destroy the loop.  */
1195   if (!copy_p)
1196     return true;
1197   return false;
1198 }
1199 
1200 /* Return the data dependence relation for data references A and B.  The
1201    two data references must be in lexicographic order with respect to the
1202    reduced dependence graph RDG.  We first try to find the ddr in the global
1203    ddr hash table; if it doesn't exist, compute the ddr and cache it.  */
1204 
1205 static data_dependence_relation *
1206 get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
1207 {
1208   struct data_dependence_relation ent, **slot;
1209   struct data_dependence_relation *ddr;
1210 
1211   gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1212   gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1213 	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1214   ent.a = a;
1215   ent.b = b;
1216   slot = ddrs_table->find_slot (&ent, INSERT);
1217   if (*slot == NULL)
1218     {
1219       ddr = initialize_data_dependence_relation (a, b, loop_nest);
1220       compute_affine_dependence (ddr, loop_nest[0]);
1221       *slot = ddr;
1222     }
1223 
1224   return *slot;
1225 }
1226 
1227 /* In reduced dependence graph RDG for loop distribution, return true if
1228    the dependence between references DR1 and DR2 leads to a dependence
1229    cycle that can't be resolved by a runtime alias check.  */
1230 
1231 static bool
1232 data_dep_in_cycle_p (struct graph *rdg,
1233 		     data_reference_p dr1, data_reference_p dr2)
1234 {
1235   struct data_dependence_relation *ddr;
1236 
1237   /* Re-shuffle data-refs to be in topological order.  */
1238   if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1239       > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1240     std::swap (dr1, dr2);
1241 
1242   ddr = get_data_dependence (rdg, dr1, dr2);
1243 
1244   /* In case of no data dependence.  */
1245   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1246     return false;
1247   /* For an unknown data dependence, or a known data dependence which can't
1248      be expressed in a classic distance vector, check if it can be resolved
1249      by a runtime alias check.  If yes, we still consider the data dependence
1250      as one that won't introduce a data dependence cycle.  */
1251   else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1252 	   || DDR_NUM_DIST_VECTS (ddr) == 0)
1253     return !runtime_alias_check_p (ddr, NULL, true);
1254   else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1255     return true;
1256   else if (DDR_REVERSED_P (ddr)
1257 	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1258     return false;
1259 
1260   return true;
1261 }
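
/* As an illustration of how this predicate is used (see the example at the
   top of this file): roughly speaking, if the statements A(I) = B(I) + C
   and D(I) = A(I-1)*E ended up in the same partition, the loop-carried
   flow dependence from the write of A(I) to the read of A(I-1), with the
   single distance vector (1), would make data_dep_in_cycle_p return true
   and update_type_for_merge force the partition to PTYPE_SEQUENTIAL.
   Distributed into two partitions, neither contains such a loop-carried
   dependence, and both resulting loops can stay PTYPE_PARALLEL, matching
   the DOALL loops shown at the top of the file.  */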
1262 
1263 /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1264    PARTITION1's type after merging PARTITION2 into PARTITION1.  */
1265 
1266 static void
1267 update_type_for_merge (struct graph *rdg,
1268 		       partition *partition1, partition *partition2)
1269 {
1270   unsigned i, j;
1271   bitmap_iterator bi, bj;
1272   data_reference_p dr1, dr2;
1273 
1274   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1275     {
1276       unsigned start = (partition1 == partition2) ? i + 1 : 0;
1277 
1278       dr1 = datarefs_vec[i];
1279       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1280 	{
1281 	  dr2 = datarefs_vec[j];
1282 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1283 	    continue;
1284 
1285 	  /* Partition can only be executed sequentially if there is any
1286 	     data dependence cycle.  */
1287 	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
1288 	    {
1289 	      partition1->type = PTYPE_SEQUENTIAL;
1290 	      return;
1291 	    }
1292 	}
1293     }
1294 }
1295 
1296 /* Returns a partition with all the statements needed for computing
1297    the vertex V of the RDG, also including the loop exit conditions.  */
1298 
1299 static partition *
1300 build_rdg_partition_for_vertex (struct graph *rdg, int v)
1301 {
1302   partition *partition = partition_alloc ();
1303   auto_vec<int, 3> nodes;
1304   unsigned i, j;
1305   int x;
1306   data_reference_p dr;
1307 
1308   graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1309 
1310   FOR_EACH_VEC_ELT (nodes, i, x)
1311     {
1312       bitmap_set_bit (partition->stmts, x);
1313 
1314       for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1315 	{
1316 	  unsigned idx = (unsigned) DR_INDEX (dr);
1317 	  gcc_assert (idx < datarefs_vec.length ());
1318 
1319 	  /* Partition can only be executed sequentially if there is any
1320 	     unknown data reference.  */
1321 	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1322 	      || !DR_INIT (dr) || !DR_STEP (dr))
1323 	    partition->type = PTYPE_SEQUENTIAL;
1324 
1325 	  bitmap_set_bit (partition->datarefs, idx);
1326 	}
1327     }
1328 
1329   if (partition->type == PTYPE_SEQUENTIAL)
1330     return partition;
1331 
1332   /* Further check if any data dependence prevents us from executing the
1333      partition in parallel.  */
1334   update_type_for_merge (rdg, partition, partition);
1335 
1336   return partition;
1337 }
1338 
1339 /* Given PARTITION of LOOP and RDG, record single load/store data references
1340    for builtin partition in SRC_DR/DST_DR, return false if there are no
1341    such data references.  */
1342 
1343 static bool
1344 find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
1345 		 data_reference_p *dst_dr, data_reference_p *src_dr)
1346 {
1347   unsigned i;
1348   data_reference_p single_ld = NULL, single_st = NULL;
1349   bitmap_iterator bi;
1350 
1351   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1352     {
1353       gimple *stmt = RDG_STMT (rdg, i);
1354       data_reference_p dr;
1355 
1356       if (gimple_code (stmt) == GIMPLE_PHI)
1357 	continue;
1358 
1359       /* Any scalar stmts are ok.  */
1360       if (!gimple_vuse (stmt))
1361 	continue;
1362 
1363       /* Otherwise just regular loads/stores.  */
1364       if (!gimple_assign_single_p (stmt))
1365 	return false;
1366 
1367       /* But exactly one store and/or load.  */
1368       for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1369 	{
1370 	  tree type = TREE_TYPE (DR_REF (dr));
1371 
1372 	  /* The memset, memcpy and memmove library calls are only
1373 	     able to deal with the generic address space.  */
1374 	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1375 	    return false;
1376 
1377 	  if (DR_IS_READ (dr))
1378 	    {
1379 	      if (single_ld != NULL)
1380 		return false;
1381 	      single_ld = dr;
1382 	    }
1383 	  else
1384 	    {
1385 	      if (single_st != NULL)
1386 		return false;
1387 	      single_st = dr;
1388 	    }
1389 	}
1390     }
1391 
1392   if (!single_st)
1393     return false;
1394 
1395   /* Bail out if this is a bitfield memory reference.  */
1396   if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1397       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1398     return false;
1399 
1400   /* Data reference must be executed exactly once per iteration of each
1401      loop in the loop nest.  We only need to check dominance information
1402      against the outermost one in a perfect loop nest because a bb can't
1403      dominate outermost loop's latch without dominating inner loop's.  */
1404   basic_block bb_st = gimple_bb (DR_STMT (single_st));
1405   if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1406     return false;
1407 
1408   if (single_ld)
1409     {
1410       gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1411       /* Direct aggregate copy or via an SSA name temporary.  */
1412       if (load != store
1413 	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1414 	return false;
1415 
1416       /* Bail out if this is a bitfield memory reference.  */
1417       if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1418 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1419 	return false;
1420 
1421       /* Load and store must be in the same loop nest.  */
1422       basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1423       if (bb_st->loop_father != bb_ld->loop_father)
1424 	return false;
1425 
1426       /* Data reference must be executed exactly once per iteration.
1427 	 Same as single_st, we only need to check against the outermost
1428 	 loop.  */
1429       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1430 	return false;
1431 
1432       edge e = single_exit (bb_st->loop_father);
1433       bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1434       bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1435       if (dom_ld != dom_st)
1436 	return false;
1437     }
1438 
1439   *src_dr = single_ld;
1440   *dst_dr = single_st;
1441   return true;
1442 }
1443 
1444 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1445    loops from inner to outer to see if the loop's step equals the access
1446    size at each level of the loop nest.  Return 2 if we can prove this at
1447    all levels; record the access base and size in BASE and SIZE; save the
1448    loop's step at each level in STEPS if it is not null.  For example:
1449 
1450      int arr[100][100][100];
1451      for (i = 0; i < 100; i++)       ;steps[2] = 40000
1452        for (j = 100; j > 0; j--)     ;steps[1] = -400
1453 	 for (k = 0; k < 100; k++)   ;steps[0] = 4
1454 	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000
1455 
1456    Return 1 if we can prove the equality for the innermost loop, but not
1457    for all levels.  In this case, no information is recorded.
1458 
1459    Return 0 if no equality can be proven at any level loops.  */
1460 
1461 static int
1462 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1463 		      tree *size, vec<tree> *steps = NULL)
1464 {
1465   location_t loc = gimple_location (DR_STMT (dr));
1466   basic_block bb = gimple_bb (DR_STMT (dr));
1467   struct loop *loop = bb->loop_father;
1468   tree ref = DR_REF (dr);
1469   tree access_base = build_fold_addr_expr (ref);
1470   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1471   int res = 0;
1472 
1473   do {
1474       tree scev_fn = analyze_scalar_evolution (loop, access_base);
1475       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1476 	return res;
1477 
1478       access_base = CHREC_LEFT (scev_fn);
1479       if (tree_contains_chrecs (access_base, NULL))
1480 	return res;
1481 
1482       tree scev_step = CHREC_RIGHT (scev_fn);
1483       /* Only support constant steps.  */
1484       if (TREE_CODE (scev_step) != INTEGER_CST)
1485 	return res;
1486 
1487       enum ev_direction access_dir = scev_direction (scev_fn);
1488       if (access_dir == EV_DIR_UNKNOWN)
1489 	return res;
1490 
1491       if (steps != NULL)
1492 	steps->safe_push (scev_step);
1493 
1494       scev_step = fold_convert_loc (loc, sizetype, scev_step);
1495       /* Compute absolute value of scev step.  */
1496       if (access_dir == EV_DIR_DECREASES)
1497 	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1498 
1499       /* At each loop level, the scev step must equal the access size.  In
1500 	 other words, DR must access consecutive memory between iterations.  */
1501       if (!operand_equal_p (scev_step, access_size, 0))
1502 	return res;
1503 
1504       /* Access stride can be computed for data reference at least for the
1505 	 innermost loop.  */
1506       res = 1;
1507 
1508       /* Compute DR's execution times in loop.  */
1509       tree niters = number_of_latch_executions (loop);
1510       niters = fold_convert_loc (loc, sizetype, niters);
1511       if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1512 	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1513 
1514       /* Compute DR's overall access size in loop.  */
1515       access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1516 				     niters, scev_step);
1517       /* Adjust base address in case of negative step.  */
1518       if (access_dir == EV_DIR_DECREASES)
1519 	{
1520 	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1521 				      scev_step, access_size);
1522 	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1523 	}
1524   } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1525 
1526   *base = access_base;
1527   *size = access_size;
1528   /* The access stride can be computed for DR at each loop level.  */
1529   return 2;
1530 }
1531 
1532 /* Allocate and return builtin struct.  Record information like DST_DR,
1533    SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
1534 
1535 static struct builtin_info *
1536 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1537 	       tree dst_base, tree src_base, tree size)
1538 {
1539   struct builtin_info *builtin = XNEW (struct builtin_info);
1540   builtin->dst_dr = dst_dr;
1541   builtin->src_dr = src_dr;
1542   builtin->dst_base = dst_base;
1543   builtin->src_base = src_base;
1544   builtin->size = size;
1545   return builtin;
1546 }
1547 
1548 /* Given data reference DR in loop nest LOOP, classify whether it forms a
1549    builtin memset call (sketched after this function).  */
1550 
1551 static void
1552 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1553 {
1554   gimple *stmt = DR_STMT (dr);
1555   tree base, size, rhs = gimple_assign_rhs1 (stmt);
1556 
1557   if (const_with_all_bytes_same (rhs) == -1
1558       && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1559 	  || (TYPE_MODE (TREE_TYPE (rhs))
1560 	      != TYPE_MODE (unsigned_char_type_node))))
1561     return;
1562 
1563   if (TREE_CODE (rhs) == SSA_NAME
1564       && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1565       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1566     return;
1567 
1568   int res = compute_access_range (loop, dr, &base, &size);
1569   if (res == 0)
1570     return;
1571   if (res == 1)
1572     {
1573       partition->kind = PKIND_PARTIAL_MEMSET;
1574       return;
1575     }
1576 
1577   poly_uint64 base_offset;
1578   unsigned HOST_WIDE_INT const_base_offset;
1579   tree base_base = strip_offset (base, &base_offset);
1580   if (!base_offset.is_constant (&const_base_offset))
1581     return;
1582 
1583   struct builtin_info *builtin;
1584   builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1585   builtin->dst_base_base = base_base;
1586   builtin->dst_base_offset = const_base_offset;
1587   partition->builtin = builtin;
1588   partition->kind = PKIND_MEMSET;
1589 }
1590 
1591 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1592    if it forms builtin memcpy or memmove call.  */
1593 
1594 static void
1595 classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1596 		       data_reference_p dst_dr, data_reference_p src_dr)
1597 {
1598   tree base, size, src_base, src_size;
1599   auto_vec<tree> dst_steps, src_steps;
1600 
1601   /* Compute access range of both load and store.  */
1602   int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1603   if (res != 2)
1604     return;
1605   res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1606   if (res != 2)
1607     return;
1608 
1609   /* They must have the same access size.  */
1610   if (!operand_equal_p (size, src_size, 0))
1611     return;
1612 
1613   /* Load and store in the loop nest must access memory in the same way,
1614      i.e., they must have the same steps in each loop of the nest.  */
1615   if (dst_steps.length () != src_steps.length ())
1616     return;
1617   for (unsigned i = 0; i < dst_steps.length (); ++i)
1618     if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1619       return;
1620 
1621   /* Now check whether there is a dependence.  */
1622   ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1623 
1624   /* Classify as memcpy if no dependence between load and store.  */
1625   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1626     {
1627       partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1628       partition->kind = PKIND_MEMCPY;
1629       return;
1630     }
1631 
1632   /* Can't do memmove in case of unknown dependence or dependence without
1633      classical distance vector.  */
1634   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1635       || DDR_NUM_DIST_VECTS (ddr) == 0)
1636     return;
1637 
1638   unsigned i;
1639   lambda_vector dist_v;
1640   int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1641   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1642     {
1643       unsigned dep_lev = dependence_level (dist_v, num_lev);
1644       /* Can't do memmove if load depends on store.  */
1645       if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1646 	return;
1647     }
1648 
1649   partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1650   partition->kind = PKIND_MEMMOVE;
1651   return;
1652 }
1653 
1654 /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1655    For the moment we detect memset, memcpy and memmove patterns.  Bitmap
1656    STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.  */
1657 
1658 static void
1659 classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1660 		    bitmap stmt_in_all_partitions)
1661 {
1662   bitmap_iterator bi;
1663   unsigned i;
1664   data_reference_p single_ld = NULL, single_st = NULL;
1665   bool volatiles_p = false, has_reduction = false;
1666 
1667   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1668     {
1669       gimple *stmt = RDG_STMT (rdg, i);
1670 
1671       if (gimple_has_volatile_ops (stmt))
1672 	volatiles_p = true;
1673 
1674       /* If the stmt is not included by all partitions and there are uses
1675 	 outside of the loop, then mark the partition as reduction.  */
1676       if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1677 	{
1678 	  /* Due to limitations in the transform phase we have to fuse all
1679 	     reduction partitions.  As a result, this could cancel valid
1680 	     loop distribution, especially for loops whose induction variable
1681 	     is used outside of the loop.  To work around this issue, we skip
1682 	     marking the partition as reduction if the reduction stmt belongs
1683 	     to all partitions.  In such a case, the reduction will be computed
1684 	     correctly no matter how partitions are fused/distributed.  */
1685 	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
1686 	    {
1687 	      partition->reduction_p = true;
1688 	      return;
1689 	    }
1690 	  has_reduction = true;
1691 	}
1692     }
1693 
1694   /* Perform general partition disqualification for builtins.  */
1695   if (volatiles_p
1696       /* Simple workaround to prevent classifying the partition as builtin
1697 	 if it contains any use outside of loop.  */
1698       || has_reduction
1699       || !flag_tree_loop_distribute_patterns)
1700     return;
1701 
1702   /* Find single load/store data references for builtin partition.  */
1703   if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1704     return;
1705 
1706   /* Classify the builtin kind.  */
1707   if (single_ld == NULL)
1708     classify_builtin_st (loop, partition, single_st);
1709   else
1710     classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1711 }
1712 
1713 /* Returns true when PARTITION1 and PARTITION2 access the same memory
1714    object in RDG.  */
1715 
1716 static bool
1717 share_memory_accesses (struct graph *rdg,
1718 		       partition *partition1, partition *partition2)
1719 {
1720   unsigned i, j;
1721   bitmap_iterator bi, bj;
1722   data_reference_p dr1, dr2;
1723 
1724   /* First check whether there are any loads or stores in the
1725      intersection of the two partitions.  Common loads are the most
1726      frequent case.  */
1727   EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1728     if (RDG_MEM_WRITE_STMT (rdg, i)
1729 	|| RDG_MEM_READS_STMT (rdg, i))
1730       return true;
1731 
1732   /* Then check whether the two partitions access the same memory object.  */
1733   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1734     {
1735       dr1 = datarefs_vec[i];
1736 
1737       if (!DR_BASE_ADDRESS (dr1)
1738 	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1739 	continue;
1740 
1741       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1742 	{
1743 	  dr2 = datarefs_vec[j];
1744 
1745 	  if (!DR_BASE_ADDRESS (dr2)
1746 	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1747 	    continue;
1748 
1749 	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1750 	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1751 	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1752 	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1753 	    return true;
1754 	}
1755     }
1756 
1757   return false;
1758 }
1759 
1760 /* For each seed statement in STARTING_STMTS, this function builds a
1761    partition for it by adding dependent statements according to the RDG.
1762    All partitions are recorded in PARTITIONS.  */
1763 
1764 static void
1765 rdg_build_partitions (struct graph *rdg,
1766 		      vec<gimple *> starting_stmts,
1767 		      vec<partition *> *partitions)
1768 {
1769   auto_bitmap processed;
1770   int i;
1771   gimple *stmt;
1772 
1773   FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1774     {
1775       int v = rdg_vertex_for_stmt (rdg, stmt);
1776 
1777       if (dump_file && (dump_flags & TDF_DETAILS))
1778 	fprintf (dump_file,
1779 		 "ldist asked to generate code for vertex %d\n", v);
1780 
1781       /* If the vertex is already contained in another partition, so
1782          is the partition rooted at it.  */
1783       if (bitmap_bit_p (processed, v))
1784 	continue;
1785 
1786       partition *partition = build_rdg_partition_for_vertex (rdg, v);
1787       bitmap_ior_into (processed, partition->stmts);
1788 
1789       if (dump_file && (dump_flags & TDF_DETAILS))
1790 	{
1791 	  fprintf (dump_file, "ldist creates useful %s partition:\n",
1792 		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1793 	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
1794 	}
1795 
1796       partitions->safe_push (partition);
1797     }
1798 
1799   /* All vertices should have been assigned to at least one partition now,
1800      other than vertices belonging to dead code.  */
1801 }
1802 
1803 /* Dump to FILE the PARTITIONS.  */
1804 
1805 static void
1806 dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1807 {
1808   int i;
1809   partition *partition;
1810 
1811   FOR_EACH_VEC_ELT (partitions, i, partition)
1812     debug_bitmap_file (file, partition->stmts);
1813 }
1814 
1815 /* Debug PARTITIONS.  */
1816 extern void debug_rdg_partitions (vec<partition *> );
1817 
1818 DEBUG_FUNCTION void
1819 debug_rdg_partitions (vec<partition *> partitions)
1820 {
1821   dump_rdg_partitions (stderr, partitions);
1822 }
1823 
1824 /* Returns the number of read and write operations in the RDG.  */
1825 
1826 static int
1827 number_of_rw_in_rdg (struct graph *rdg)
1828 {
1829   int i, res = 0;
1830 
1831   for (i = 0; i < rdg->n_vertices; i++)
1832     {
1833       if (RDG_MEM_WRITE_STMT (rdg, i))
1834 	++res;
1835 
1836       if (RDG_MEM_READS_STMT (rdg, i))
1837 	++res;
1838     }
1839 
1840   return res;
1841 }
1842 
1843 /* Returns the number of read and write operations in a PARTITION of
1844    the RDG.  */
1845 
1846 static int
1847 number_of_rw_in_partition (struct graph *rdg, partition *partition)
1848 {
1849   int res = 0;
1850   unsigned i;
1851   bitmap_iterator ii;
1852 
1853   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1854     {
1855       if (RDG_MEM_WRITE_STMT (rdg, i))
1856 	++res;
1857 
1858       if (RDG_MEM_READS_STMT (rdg, i))
1859 	++res;
1860     }
1861 
1862   return res;
1863 }
1864 
1865 /* Returns true when one of the PARTITIONS contains all the read or
1866    write operations of RDG.  */
1867 
1868 static bool
1869 partition_contains_all_rw (struct graph *rdg,
1870 			   vec<partition *> partitions)
1871 {
1872   int i;
1873   partition *partition;
1874   int nrw = number_of_rw_in_rdg (rdg);
1875 
1876   FOR_EACH_VEC_ELT (partitions, i, partition)
1877     if (nrw == number_of_rw_in_partition (rdg, partition))
1878       return true;
1879 
1880   return false;
1881 }
1882 
1883 /* Compute partition dependence created by the data references in DRS1
1884    and DRS2, modify and return DIR according to that.  If ALIAS_DDRS is
1885    not NULL, we record dependences introduced by possible aliasing between
1886    two data references in ALIAS_DDRS; otherwise, we simply ignore such
1887    dependences as if they don't exist at all.  */
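/* Roughly speaking, if every dependence between the two sets goes from a
   statement in DRS1 to a later statement in DRS2 the result is 1 ("forth"),
   if every dependence goes the other way it is -1 ("back"), and 0 means no
   dependence at all.  A mix of both directions, a dependence that cannot be
   analyzed or checked at runtime, or a dependence without a single classical
   distance vector yields 2, which forces the two partitions to be fused.  */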
1888 
1889 static int
1890 pg_add_dependence_edges (struct graph *rdg, int dir,
1891 			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1892 {
1893   unsigned i, j;
1894   bitmap_iterator bi, bj;
1895   data_reference_p dr1, dr2, saved_dr1;
1896 
1897   /* dependence direction - 0 is no dependence, -1 is back,
1898      1 is forth, 2 is both (we can stop then, merging will occur).  */
1899   EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1900     {
1901       dr1 = datarefs_vec[i];
1902 
1903       EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1904 	{
1905 	  int res, this_dir = 1;
1906 	  ddr_p ddr;
1907 
1908 	  dr2 = datarefs_vec[j];
1909 
1910 	  /* Skip all <read, read> data dependence.  */
1911 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1912 	    continue;
1913 
1914 	  saved_dr1 = dr1;
1915 	  /* Re-shuffle data-refs to be in topological order.  */
1916 	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1917 	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1918 	    {
1919 	      std::swap (dr1, dr2);
1920 	      this_dir = -this_dir;
1921 	    }
1922 	  ddr = get_data_dependence (rdg, dr1, dr2);
1923 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1924 	    {
1925 	      this_dir = 0;
1926 	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1927 					   DR_BASE_ADDRESS (dr2));
1928 	      /* Be conservative.  If data references are not well analyzed,
1929 		 or the two data references have the same base address and
1930 		 offset, add a dependence and consider them aliases of each
1931 		 other.  In other words, the dependence cannot be resolved
1932 		 by runtime alias check.  */
1933 	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1934 		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1935 		  || !DR_INIT (dr1) || !DR_INIT (dr2)
1936 		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1937 		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1938 		  || res == 0)
1939 		this_dir = 2;
1940 	      /* Data dependence could be resolved by runtime alias check,
1941 		 record it in ALIAS_DDRS.  */
1942 	      else if (alias_ddrs != NULL)
1943 		alias_ddrs->safe_push (ddr);
1944 	      /* Or simply ignore it.  */
1945 	    }
1946 	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1947 	    {
1948 	      if (DDR_REVERSED_P (ddr))
1949 		this_dir = -this_dir;
1950 
1951 	      /* Known dependences can still be unordered throughout the
1952 		 iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
1953 	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
1954 		this_dir = 2;
1955 	      /* If the overlap is exact, preserve stmt order.  */
1956 	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
1957 					    DDR_NB_LOOPS (ddr)))
1958 		;
1959 	      /* Else, as the distance vector is lexicographically positive,
1960 		 swap the dependence direction.  */
1961 	      else
1962 		this_dir = -this_dir;
1963 	    }
1964 	  else
1965 	    this_dir = 0;
1966 	  if (this_dir == 2)
1967 	    return 2;
1968 	  else if (dir == 0)
1969 	    dir = this_dir;
1970 	  else if (this_dir != 0 && dir != this_dir)
1971 	    return 2;
1972 	  /* Shuffle "back" dr1.  */
1973 	  dr1 = saved_dr1;
1974 	}
1975     }
1976   return dir;
1977 }
1978 
1979 /* Compare postorder number of the partition graph vertices V1 and V2.  */
1980 
1981 static int
1982 pgcmp (const void *v1_, const void *v2_)
1983 {
1984   const vertex *v1 = (const vertex *)v1_;
1985   const vertex *v2 = (const vertex *)v2_;
1986   return v2->post - v1->post;
1987 }
1988 
1989 /* Data attached to vertices of partition dependence graph.  */
1990 struct pg_vdata
1991 {
1992   /* ID of the corresponding partition.  */
1993   int id;
1994   /* The partition.  */
1995   struct partition *partition;
1996 };
1997 
1998 /* Data attached to edges of partition dependence graph.  */
1999 struct pg_edata
2000 {
2001   /* If the dependence edge can be resolved by runtime alias check,
2002      this vector contains data dependence relations for runtime alias
2003      check.  On the other hand, if the dependence edge is introduced
2004      because of compilation time known data dependence, this vector
2005      contains nothing.  */
2006   vec<ddr_p> alias_ddrs;
2007 };
2008 
2009 /* Callback data for traversing edges in graph.  */
2010 struct pg_edge_callback_data
2011 {
2012   /* Bitmap of strongly connected components that should be merged.  */
2013   bitmap sccs_to_merge;
2014   /* Array containing component information for all vertices.  */
2015   int *vertices_component;
2016   /* Vector to record all data dependence relations which are needed
2017      to break strong connected components by runtime alias checks.  */
2018   vec<ddr_p> *alias_ddrs;
2019 };
2020 
2021 /* Initialize vertices' data for partition dependence graph PG with
2022    PARTITIONS.  */
2023 
2024 static void
2025 init_partition_graph_vertices (struct graph *pg,
2026 			       vec<struct partition *> *partitions)
2027 {
2028   int i;
2029   partition *partition;
2030   struct pg_vdata *data;
2031 
2032   for (i = 0; partitions->iterate (i, &partition); ++i)
2033     {
2034       data = new pg_vdata;
2035       pg->vertices[i].data = data;
2036       data->id = i;
2037       data->partition = partition;
2038     }
2039 }
2040 
2041 /* Add edge <I, J> to partition dependence graph PG.  Attach vector of data
2042    dependence relations to the EDGE if DDRS isn't NULL.  */
2043 
2044 static void
2045 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2046 {
2047   struct graph_edge *e = add_edge (pg, i, j);
2048 
2049   /* If the edge is attached with data dependence relations, it means this
2050      dependence edge can be resolved by runtime alias checks.  */
2051   if (ddrs != NULL)
2052     {
2053       struct pg_edata *data = new pg_edata;
2054 
2055       gcc_assert (ddrs->length () > 0);
2056       e->data = data;
2057       data->alias_ddrs = vNULL;
2058       data->alias_ddrs.safe_splice (*ddrs);
2059     }
2060 }
2061 
2062 /* Callback function for the graph traversal algorithm.  It returns true
2063    if edge E should be skipped when traversing the graph.  */
2064 
2065 static bool
2066 pg_skip_alias_edge (struct graph_edge *e)
2067 {
2068   struct pg_edata *data = (struct pg_edata *)e->data;
2069   return (data != NULL && data->alias_ddrs.length () > 0);
2070 }
2071 
2072 /* Callback function freeing data attached to edge E of graph.  */
2073 
2074 static void
2075 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2076 {
2077   if (e->data != NULL)
2078     {
2079       struct pg_edata *data = (struct pg_edata *)e->data;
2080       data->alias_ddrs.release ();
2081       delete data;
2082     }
2083 }
2084 
2085 /* Free data attached to vertices of partition dependence graph PG.  */
2086 
2087 static void
2088 free_partition_graph_vdata (struct graph *pg)
2089 {
2090   int i;
2091   struct pg_vdata *data;
2092 
2093   for (i = 0; i < pg->n_vertices; ++i)
2094     {
2095       data = (struct pg_vdata *)pg->vertices[i].data;
2096       delete data;
2097     }
2098 }
2099 
2100 /* Build and return partition dependence graph for PARTITIONS.  RDG is
2101    reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
2102    is true, data dependence caused by possible alias between references
2103    is ignored, as if it doesn't exist at all; otherwise all dependences
2104    are considered.  */
2105 
2106 static struct graph *
2107 build_partition_graph (struct graph *rdg,
2108 		       vec<struct partition *> *partitions,
2109 		       bool ignore_alias_p)
2110 {
2111   int i, j;
2112   struct partition *partition1, *partition2;
2113   graph *pg = new_graph (partitions->length ());
2114   auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2115 
2116   alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2117 
2118   init_partition_graph_vertices (pg, partitions);
2119 
2120   for (i = 0; partitions->iterate (i, &partition1); ++i)
2121     {
2122       for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2123 	{
2124 	  /* dependence direction - 0 is no dependence, -1 is back,
2125 	     1 is forth, 2 is both (we can stop then, merging will occur).  */
2126 	  int dir = 0;
2127 
2128 	  /* If the first partition has reduction, add back edge; if the
2129 	     second partition has reduction, add forth edge.  This makes
2130 	     sure that reduction partition will be sorted as the last one.  */
2131 	  if (partition_reduction_p (partition1))
2132 	    dir = -1;
2133 	  else if (partition_reduction_p (partition2))
2134 	    dir = 1;
2135 
2136 	  /* Cleanup the temporary vector.  */
2137 	  alias_ddrs.truncate (0);
2138 
2139 	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2140 					 partition2->datarefs, alias_ddrs_p);
2141 
2142 	  /* Add an edge to the partition graph if there exists a dependence.
2143 	     There are two types of edges.  One type of edge is caused by
2144 	     compilation time known dependence; this type cannot be resolved
2145 	     by runtime alias check.  The other type can be resolved by
2146 	     runtime alias check.  */
2147 	  if (dir == 1 || dir == 2
2148 	      || alias_ddrs.length () > 0)
2149 	    {
2150 	      /* Attach data dependence relations to edge that can be resolved
2151 		 by runtime alias check.  */
2152 	      bool alias_edge_p = (dir != 1 && dir != 2);
2153 	      add_partition_graph_edge (pg, i, j,
2154 					(alias_edge_p) ? &alias_ddrs : NULL);
2155 	    }
2156 	  if (dir == -1 || dir == 2
2157 	      || alias_ddrs.length () > 0)
2158 	    {
2159 	      /* Attach data dependence relations to edge that can be resolved
2160 		 by runtime alias check.  */
2161 	      bool alias_edge_p = (dir != -1 && dir != 2);
2162 	      add_partition_graph_edge (pg, j, i,
2163 					(alias_edge_p) ? &alias_ddrs : NULL);
2164 	    }
2165 	}
2166     }
2167   return pg;
2168 }
2169 
2170 /* Sort partitions in PG in descending post order and store them in
2171    PARTITIONS.  */
2172 
2173 static void
2174 sort_partitions_by_post_order (struct graph *pg,
2175 			       vec<struct partition *> *partitions)
2176 {
2177   int i;
2178   struct pg_vdata *data;
2179 
2180   /* Now order the remaining nodes in descending postorder.  */
2181   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2182   partitions->truncate (0);
2183   for (i = 0; i < pg->n_vertices; ++i)
2184     {
2185       data = (struct pg_vdata *)pg->vertices[i].data;
2186       if (data->partition)
2187 	partitions->safe_push (data->partition);
2188     }
2189 }
2190 
2191 /* Given reduced dependence graph RDG, merge strongly connected components
2192    of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
2193    possible alias between references is ignored, as if it doesn't exist
2194    at all; otherwise all dependences are considered.  */
2195 
2196 static void
2197 merge_dep_scc_partitions (struct graph *rdg,
2198 			  vec<struct partition *> *partitions,
2199 			  bool ignore_alias_p)
2200 {
2201   struct partition *partition1, *partition2;
2202   struct pg_vdata *data;
2203   graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2204   int i, j, num_sccs = graphds_scc (pg, NULL);
2205 
2206   /* A strongly connected component means a dependence cycle, which we
2207      cannot distribute.  So fuse the partitions together.  */
2208   if ((unsigned) num_sccs < partitions->length ())
2209     {
2210       for (i = 0; i < num_sccs; ++i)
2211 	{
2212 	  for (j = 0; partitions->iterate (j, &partition1); ++j)
2213 	    if (pg->vertices[j].component == i)
2214 	      break;
2215 	  for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2216 	    if (pg->vertices[j].component == i)
2217 	      {
2218 		partition_merge_into (NULL, partition1,
2219 				      partition2, FUSE_SAME_SCC);
2220 		partition1->type = PTYPE_SEQUENTIAL;
2221 		(*partitions)[j] = NULL;
2222 		partition_free (partition2);
2223 		data = (struct pg_vdata *)pg->vertices[j].data;
2224 		data->partition = NULL;
2225 	      }
2226 	}
2227     }
2228 
2229   sort_partitions_by_post_order (pg, partitions);
2230   gcc_assert (partitions->length () == (unsigned)num_sccs);
2231   free_partition_graph_vdata (pg);
2232   free_graph (pg);
2233 }
2234 
2235 /* Callback function for traversing edge E in graph G.  DATA is private
2236    callback data.  */
2237 
2238 static void
2239 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2240 {
2241   int i, j, component;
2242   struct pg_edge_callback_data *cbdata;
2243   struct pg_edata *edata = (struct pg_edata *) e->data;
2244 
2245   /* If the edge doesn't have an attached data dependence, it represents
2246      compilation time known dependences.  This type of dependence cannot
2247      be resolved by runtime alias check.  */
2248   if (edata == NULL || edata->alias_ddrs.length () == 0)
2249     return;
2250 
2251   cbdata = (struct pg_edge_callback_data *) data;
2252   i = e->src;
2253   j = e->dest;
2254   component = cbdata->vertices_component[i];
2255   /* Vertices are topologically sorted according to compilation time
2256      known dependences, so we can break strongly connected components
2257      by removing edges of the opposite direction, i.e., edges pointing
2258      from a vertex with a smaller post number to a vertex with a bigger
2259      post number.  */
2260   if (g->vertices[i].post < g->vertices[j].post
2261       /* We only need to remove edges connecting vertices in the same
2262 	 strong connected component to break it.  */
2263       && component == cbdata->vertices_component[j]
2264       /* Check if we want to break the strong connected component or not.  */
2265       && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2266     cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2267 }
2268 
2269 /* This is the main function breaking strongly connected components in
2270    PARTITIONS, given reduced dependence graph RDG.  Store data dependence
2271    relations for runtime alias check in ALIAS_DDRS.  */
2272 
2273 static void
2274 break_alias_scc_partitions (struct graph *rdg,
2275 			    vec<struct partition *> *partitions,
2276 			    vec<ddr_p> *alias_ddrs)
2277 {
2278   int i, j, k, num_sccs, num_sccs_no_alias;
2279   /* Build partition dependence graph.  */
2280   graph *pg = build_partition_graph (rdg, partitions, false);
2281 
2282   alias_ddrs->truncate (0);
2283   /* Find strong connected components in the graph, with all dependence edges
2284      considered.  */
2285   num_sccs = graphds_scc (pg, NULL);
2286   /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2287      compilation time known dependences are merged before this function.  */
2288   if ((unsigned) num_sccs < partitions->length ())
2289     {
2290       struct pg_edge_callback_data cbdata;
2291       auto_bitmap sccs_to_merge;
2292       auto_vec<enum partition_type> scc_types;
2293       struct partition *partition, *first;
2294 
2295       /* If all partitions in a SCC have the same type, we can simply merge the
2296 	 SCC.  This loop finds out such SCCs and records them in the bitmap.  */
2297       bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2298       for (i = 0; i < num_sccs; ++i)
2299 	{
2300 	  for (j = 0; partitions->iterate (j, &first); ++j)
2301 	    if (pg->vertices[j].component == i)
2302 	      break;
2303 
2304 	  bool same_type = true, all_builtins = partition_builtin_p (first);
2305 	  for (++j; partitions->iterate (j, &partition); ++j)
2306 	    {
2307 	      if (pg->vertices[j].component != i)
2308 		continue;
2309 
2310 	      if (first->type != partition->type)
2311 		{
2312 		  same_type = false;
2313 		  break;
2314 		}
2315 	      all_builtins &= partition_builtin_p (partition);
2316 	    }
2317 	  /* Merge the SCC if all partitions in the SCC have the same type, even
2318 	     though the result partition is sequential, because the vectorizer
2319 	     can do a better runtime alias check.  One exception is when all
2320 	     partitions in the SCC are builtins.  */
2321 	  if (!same_type || all_builtins)
2322 	    bitmap_clear_bit (sccs_to_merge, i);
2323 	}
2324 
2325       /* Initialize callback data for traversing.  */
2326       cbdata.sccs_to_merge = sccs_to_merge;
2327       cbdata.alias_ddrs = alias_ddrs;
2328       cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2329       /* Record the component information which will be corrupted by next
2330 	 graph scc finding call.  */
2331       for (i = 0; i < pg->n_vertices; ++i)
2332 	cbdata.vertices_component[i] = pg->vertices[i].component;
2333 
2334       /* Collect data dependences for runtime alias checks to break SCCs.  */
2335       if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2336 	{
2337 	  /* Run SCC finding algorithm again, with alias dependence edges
2338 	     skipped.  This is to topologically sort partitions according to
2339 	     compilation time known dependence.  Note the topological order
2340 	     is stored in the form of pg's post order number.  */
2341 	  num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2342 	  gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2343 	  /* With topological order, we can construct two subgraphs L and R.
2344 	     L contains edge <x, y> where x < y in terms of post order, while
2345 	     R contains edge <x, y> where x > y.  Edges for compilation time
2346 	     known dependence all fall in R, so we break SCCs by removing all
2347 	     (alias) edges in subgraph L.  */
2348 	  for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2349 	}
2350 
2351       /* For SCC that doesn't need to be broken, merge it.  */
2352       for (i = 0; i < num_sccs; ++i)
2353 	{
2354 	  if (!bitmap_bit_p (sccs_to_merge, i))
2355 	    continue;
2356 
2357 	  for (j = 0; partitions->iterate (j, &first); ++j)
2358 	    if (cbdata.vertices_component[j] == i)
2359 	      break;
2360 	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
2361 	    {
2362 	      struct pg_vdata *data;
2363 
2364 	      if (cbdata.vertices_component[k] != i)
2365 		continue;
2366 
2367 	      /* Update postorder number so that merged reduction partition is
2368 		 sorted after other partitions.  */
2369 	      if (!partition_reduction_p (first)
2370 		  && partition_reduction_p (partition))
2371 		{
2372 		  gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2373 		  pg->vertices[j].post = pg->vertices[k].post;
2374 		}
2375 	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2376 	      (*partitions)[k] = NULL;
2377 	      partition_free (partition);
2378 	      data = (struct pg_vdata *)pg->vertices[k].data;
2379 	      gcc_assert (data->id == k);
2380 	      data->partition = NULL;
2381 	      /* The result partition of merged SCC must be sequential.  */
2382 	      first->type = PTYPE_SEQUENTIAL;
2383 	    }
2384 	}
2385     }
2386 
2387   sort_partitions_by_post_order (pg, partitions);
2388   free_partition_graph_vdata (pg);
2389   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2390   free_graph (pg);
2391 
2392   if (dump_file && (dump_flags & TDF_DETAILS))
2393     {
2394       fprintf (dump_file, "Possible alias data dependence to break:\n");
2395       dump_data_dependence_relations (dump_file, *alias_ddrs);
2396     }
2397 }
2398 
2399 /* Compute and return an expression whose value is the segment length which
2400    will be accessed by DR in NITERS iterations.  */
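/* In other words, the returned segment length is (NITERS - 1) * DR_STEP (DR);
   the size of the final access itself is accounted for separately via the
   access_size recorded for each dr_with_seg_len in the caller below.  */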
2401 
2402 static tree
2403 data_ref_segment_size (struct data_reference *dr, tree niters)
2404 {
2405   niters = size_binop (MINUS_EXPR,
2406 		       fold_convert (sizetype, niters),
2407 		       size_one_node);
2408   return size_binop (MULT_EXPR,
2409 		     fold_convert (sizetype, DR_STEP (dr)),
2410 		     fold_convert (sizetype, niters));
2411 }
2412 
2413 /* Return true if LOOP's latch is dominated by statement for data reference
2414    DR.  */
2415 
2416 static inline bool
2417 latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2418 {
2419   return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2420 			 gimple_bb (DR_STMT (dr)));
2421 }
2422 
2423 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2424    data dependence relations ALIAS_DDRS.  */
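/* Each pair computed here later expands to a non-overlap test roughly of
   the form

     addr_a + seg_len_a + access_size_a <= addr_b
       || addr_b + seg_len_b + access_size_b <= addr_a

   The pairs are pruned by prune_runtime_alias_test_list below and turned
   into the actual condition by create_runtime_alias_checks in the caller.  */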
2425 
2426 static void
2427 compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2428 			   vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2429 {
2430   unsigned int i;
2431   unsigned HOST_WIDE_INT factor = 1;
2432   tree niters_plus_one, niters = number_of_latch_executions (loop);
2433 
2434   gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2435   niters = fold_convert (sizetype, niters);
2436   niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2437 
2438   if (dump_file && (dump_flags & TDF_DETAILS))
2439     fprintf (dump_file, "Creating alias check pairs:\n");
2440 
2441   /* Iterate all data dependence relations and compute alias check pairs.  */
2442   for (i = 0; i < alias_ddrs->length (); i++)
2443     {
2444       ddr_p ddr = (*alias_ddrs)[i];
2445       struct data_reference *dr_a = DDR_A (ddr);
2446       struct data_reference *dr_b = DDR_B (ddr);
2447       tree seg_length_a, seg_length_b;
2448       int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2449 					    DR_BASE_ADDRESS (dr_b));
2450 
2451       if (comp_res == 0)
2452 	comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2453       gcc_assert (comp_res != 0);
2454 
2455       if (latch_dominated_by_data_ref (loop, dr_a))
2456 	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2457       else
2458 	seg_length_a = data_ref_segment_size (dr_a, niters);
2459 
2460       if (latch_dominated_by_data_ref (loop, dr_b))
2461 	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2462       else
2463 	seg_length_b = data_ref_segment_size (dr_b, niters);
2464 
2465       unsigned HOST_WIDE_INT access_size_a
2466 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2467       unsigned HOST_WIDE_INT access_size_b
2468 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2469       unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2470       unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2471 
2472       dr_with_seg_len_pair_t dr_with_seg_len_pair
2473 	(dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2474 	 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2475 
2476       /* Canonicalize pairs by sorting the two DR members.  */
2477       if (comp_res > 0)
2478 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2479 
2480       comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2481     }
2482 
2483   if (tree_fits_uhwi_p (niters))
2484     factor = tree_to_uhwi (niters);
2485 
2486   /* Prune alias check pairs.  */
2487   prune_runtime_alias_test_list (comp_alias_pairs, factor);
2488   if (dump_file && (dump_flags & TDF_DETAILS))
2489     fprintf (dump_file,
2490 	     "Improved number of alias checks from %d to %d\n",
2491 	     alias_ddrs->length (), comp_alias_pairs->length ());
2492 }
2493 
2494 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2495    checks and version LOOP under condition of these runtime alias checks.  */
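/* When the check is made cancelable via IFN_LOOP_DIST_ALIAS (see below),
   the versioned code looks roughly like

     _c = IFN_LOOP_DIST_ALIAS (<num of kept loop>, <runtime alias checks>);
     if (_c)
       <LOOP, subsequently distributed>
     else
       <copy of LOOP, kept undistributed with dont_vectorize set>

   Folding IFN_LOOP_DIST_ALIAS to either the real alias-check condition or
   to a constant is left to the vectorizer.  */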
2496 
2497 static void
2498 version_loop_by_alias_check (vec<struct partition *> *partitions,
2499 			     struct loop *loop, vec<ddr_p> *alias_ddrs)
2500 {
2501   profile_probability prob;
2502   basic_block cond_bb;
2503   struct loop *nloop;
2504   tree lhs, arg0, cond_expr = NULL_TREE;
2505   gimple_seq cond_stmts = NULL;
2506   gimple *call_stmt = NULL;
2507   auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2508 
2509   /* Generate code for runtime alias checks if necessary.  */
2510   gcc_assert (alias_ddrs->length () > 0);
2511 
2512   if (dump_file && (dump_flags & TDF_DETAILS))
2513     fprintf (dump_file,
2514 	     "Version loop <%d> with runtime alias check\n", loop->num);
2515 
2516   compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2517   create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2518   cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2519 				      is_gimple_val, NULL_TREE);
2520 
2521   /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
2522   bool cancelable_p = flag_tree_loop_vectorize;
2523   if (cancelable_p)
2524     {
2525       unsigned i = 0;
2526       struct partition *partition;
2527       for (; partitions->iterate (i, &partition); ++i)
2528 	if (!partition_builtin_p (partition))
2529 	  break;
2530 
2531      /* If all partitions are builtins, distributing them would be profitable
2532 	and we don't want to cancel the runtime alias checks.  */
2533       if (i == partitions->length ())
2534 	cancelable_p = false;
2535     }
2536 
2537   /* Generate internal function call for loop distribution alias check if the
2538      runtime alias check should be cancelable.  */
2539   if (cancelable_p)
2540     {
2541       call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2542 					      2, NULL_TREE, cond_expr);
2543       lhs = make_ssa_name (boolean_type_node);
2544       gimple_call_set_lhs (call_stmt, lhs);
2545     }
2546   else
2547     lhs = cond_expr;
2548 
2549   prob = profile_probability::guessed_always ().apply_scale (9, 10);
2550   initialize_original_copy_tables ();
2551   nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2552 			prob, prob.invert (), true);
2553   free_original_copy_tables ();
2554   /* Record the original loop number in newly generated loops.  In case of
2555      distribution, the original loop will be distributed and the new loop
2556      is kept.  */
2557   loop->orig_loop_num = nloop->num;
2558   nloop->orig_loop_num = nloop->num;
2559   nloop->dont_vectorize = true;
2560   nloop->force_vectorize = false;
2561 
2562   if (call_stmt)
2563     {
2564       /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2565 	 loop could be destroyed.  */
2566       arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2567       gimple_call_set_arg (call_stmt, 0, arg0);
2568       gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2569     }
2570 
2571   if (cond_stmts)
2572     {
2573       gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2574       gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2575     }
2576   update_ssa (TODO_update_ssa);
2577 }
2578 
2579 /* Return true if loop versioning is needed to distribute PARTITIONS.
2580    ALIAS_DDRS are data dependence relations for runtime alias check.  */
2581 
2582 static inline bool
2583 version_for_distribution_p (vec<struct partition *> *partitions,
2584 			    vec<ddr_p> *alias_ddrs)
2585 {
2586   /* No need to version loop if we have only one partition.  */
2587   if (partitions->length () == 1)
2588     return false;
2589 
2590   /* Need to version loop if runtime alias check is necessary.  */
2591   return (alias_ddrs->length () > 0);
2592 }
2593 
2594 /* Compare base offset of builtin mem* partitions P1 and P2.  */
2595 
2596 static int
2597 offset_cmp (const void *vp1, const void *vp2)
2598 {
2599   struct partition *p1 = *(struct partition *const *) vp1;
2600   struct partition *p2 = *(struct partition *const *) vp2;
2601   unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2602   unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
2603   return (o2 < o1) - (o1 < o2);
2604 }
2605 
2606 /* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
2607    case optimization transforming below code:
2608 
2609      __builtin_memset (&obj, 0, 100);
2610      _1 = &obj + 100;
2611      __builtin_memset (_1, 0, 200);
2612      _2 = &obj + 300;
2613      __builtin_memset (_2, 0, 100);
2614 
2615    into:
2616 
2617      __builtin_memset (&obj, 0, 400);
2618 
2619    Note we don't have dependence information between different partitions
2620    at this point; as a result, we can't handle nonadjacent memset builtin
2621    partitions since dependence might be broken.  */
2622 
2623 static void
2624 fuse_memset_builtins (vec<struct partition *> *partitions)
2625 {
2626   unsigned i, j;
2627   struct partition *part1, *part2;
2628   tree rhs1, rhs2;
2629 
2630   for (i = 0; partitions->iterate (i, &part1);)
2631     {
2632       if (part1->kind != PKIND_MEMSET)
2633 	{
2634 	  i++;
2635 	  continue;
2636 	}
2637 
2638       /* Find sub-array of memset builtins of the same base.  Index range
2639 	 of the sub-array is [i, j) with "j > i".  */
2640       for (j = i + 1; partitions->iterate (j, &part2); ++j)
2641 	{
2642 	  if (part2->kind != PKIND_MEMSET
2643 	      || !operand_equal_p (part1->builtin->dst_base_base,
2644 				   part2->builtin->dst_base_base, 0))
2645 	    break;
2646 
2647 	  /* Memset calls setting different values can't be merged.  */
2648 	  rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2649 	  rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2650 	  if (!operand_equal_p (rhs1, rhs2, 0))
2651 	    break;
2652 	}
2653 
2654       /* Stable sort is required in order to avoid breaking dependence.  */
2655       gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
2656 		      offset_cmp);
2657       /* Continue with next partition.  */
2658       i = j;
2659     }
2660 
2661   /* Merge all consecutive memset builtin partitions.  */
2662   for (i = 0; i < partitions->length () - 1;)
2663     {
2664       part1 = (*partitions)[i];
2665       if (part1->kind != PKIND_MEMSET)
2666 	{
2667 	  i++;
2668 	  continue;
2669 	}
2670 
2671       part2 = (*partitions)[i + 1];
2672       /* Only merge memset partitions of the same base and with constant
2673 	 access sizes.  */
2674       if (part2->kind != PKIND_MEMSET
2675 	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
2676 	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
2677 	  || !operand_equal_p (part1->builtin->dst_base_base,
2678 			       part2->builtin->dst_base_base, 0))
2679 	{
2680 	  i++;
2681 	  continue;
2682 	}
2683       rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2684       rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2685       int bytev1 = const_with_all_bytes_same (rhs1);
2686       int bytev2 = const_with_all_bytes_same (rhs2);
2687       /* Only merge memset partitions of the same value.  */
2688       if (bytev1 != bytev2 || bytev1 == -1)
2689 	{
2690 	  i++;
2691 	  continue;
2692 	}
2693       wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2694 			       wi::to_wide (part1->builtin->size));
2695       /* Only merge adjacent memset partitions.  */
2696       if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2697 	{
2698 	  i++;
2699 	  continue;
2700 	}
2701       /* Merge partitions[i] and partitions[i+1].  */
2702       part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2703 					  part1->builtin->size,
2704 					  part2->builtin->size);
2705       partition_free (part2);
2706       partitions->ordered_remove (i + 1);
2707     }
2708 }
2709 
2710 /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
2711    ALIAS_DDRS contains ddrs which need runtime alias check.  */
2712 
2713 static void
2714 finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
2715 		     vec<ddr_p> *alias_ddrs)
2716 {
2717   unsigned i;
2718   struct partition *partition, *a;
2719 
2720   if (partitions->length () == 1
2721       || alias_ddrs->length () > 0)
2722     return;
2723 
2724   unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2725   bool same_type_p = true;
2726   enum partition_type type = ((*partitions)[0])->type;
2727   for (i = 0; partitions->iterate (i, &partition); ++i)
2728     {
2729       same_type_p &= (type == partition->type);
2730       if (partition_builtin_p (partition))
2731 	{
2732 	  num_builtin++;
2733 	  continue;
2734 	}
2735       num_normal++;
2736       if (partition->kind == PKIND_PARTIAL_MEMSET)
2737 	num_partial_memset++;
2738     }
2739 
2740   /* Don't distribute the current loop into too many loops given we don't
2741      have a memory stream cost model.  Be even more conservative in case
2742      of loop nest distribution.  */
2743   if ((same_type_p && num_builtin == 0
2744        && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2745       || (loop->inner != NULL
2746 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2747       || (loop->inner == NULL
2748 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2749     {
2750       a = (*partitions)[0];
2751       for (i = 1; partitions->iterate (i, &partition); ++i)
2752 	{
2753 	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2754 	  partition_free (partition);
2755 	}
2756       partitions->truncate (1);
2757     }
2758 
2759   /* Fuse memset builtins if possible.  */
2760   if (partitions->length () > 1)
2761     fuse_memset_builtins (partitions);
2762 }
2763 
2764 /* Distributes the code from LOOP in such a way that producer statements
2765    are placed before consumer statements.  Tries to separate only the
2766    statements from STMTS into separate loops.  Returns the number of
2767    distributed loops.  Set NB_CALLS to number of generated builtin calls.
2768    Set *DESTROY_P to whether LOOP needs to be destroyed.  */
2769 
2770 static int
2771 distribute_loop (struct loop *loop, vec<gimple *> stmts,
2772 		 control_dependences *cd, int *nb_calls, bool *destroy_p)
2773 {
2774   ddrs_table = new hash_table<ddr_hasher> (389);
2775   struct graph *rdg;
2776   partition *partition;
2777   bool any_builtin;
2778   int i, nbp;
2779 
2780   *destroy_p = false;
2781   *nb_calls = 0;
2782   loop_nest.create (0);
2783   if (!find_loop_nest (loop, &loop_nest))
2784     {
2785       loop_nest.release ();
2786       delete ddrs_table;
2787       return 0;
2788     }
2789 
2790   datarefs_vec.create (20);
2791   has_nonaddressable_dataref_p = false;
2792   rdg = build_rdg (loop, cd);
2793   if (!rdg)
2794     {
2795       if (dump_file && (dump_flags & TDF_DETAILS))
2796 	fprintf (dump_file,
2797 		 "Loop %d not distributed: failed to build the RDG.\n",
2798 		 loop->num);
2799 
2800       loop_nest.release ();
2801       free_data_refs (datarefs_vec);
2802       delete ddrs_table;
2803       return 0;
2804     }
2805 
2806   if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2807     {
2808       if (dump_file && (dump_flags & TDF_DETAILS))
2809 	fprintf (dump_file,
2810 		 "Loop %d not distributed: too many memory references.\n",
2811 		 loop->num);
2812 
2813       free_rdg (rdg);
2814       loop_nest.release ();
2815       free_data_refs (datarefs_vec);
2816       delete ddrs_table;
2817       return 0;
2818     }
2819 
2820   data_reference_p dref;
2821   for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2822     dref->aux = (void *) (uintptr_t) i;
2823 
2824   if (dump_file && (dump_flags & TDF_DETAILS))
2825     dump_rdg (dump_file, rdg);
2826 
2827   auto_vec<struct partition *, 3> partitions;
2828   rdg_build_partitions (rdg, stmts, &partitions);
2829 
2830   auto_vec<ddr_p> alias_ddrs;
2831 
2832   auto_bitmap stmt_in_all_partitions;
2833   bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2834   for (i = 1; partitions.iterate (i, &partition); ++i)
2835     bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2836 
2837   any_builtin = false;
2838   FOR_EACH_VEC_ELT (partitions, i, partition)
2839     {
2840       classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2841       any_builtin |= partition_builtin_p (partition);
2842     }
2843 
2844   /* If we are only distributing patterns but did not detect any,
2845      simply bail out.  */
2846   if (!flag_tree_loop_distribution
2847       && !any_builtin)
2848     {
2849       nbp = 0;
2850       goto ldist_done;
2851     }
2852 
2853   /* If we are only distributing patterns fuse all partitions that
2854      were not classified as builtins.  This also avoids chopping
2855      a loop into pieces, separated by builtin calls.  That is, we
2856      only want no or a single loop body remaining.  */
2857   struct partition *into;
2858   if (!flag_tree_loop_distribution)
2859     {
2860       for (i = 0; partitions.iterate (i, &into); ++i)
2861 	if (!partition_builtin_p (into))
2862 	  break;
2863       for (++i; partitions.iterate (i, &partition); ++i)
2864 	if (!partition_builtin_p (partition))
2865 	  {
2866 	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2867 	    partitions.unordered_remove (i);
2868 	    partition_free (partition);
2869 	    i--;
2870 	  }
2871     }
2872 
2873   /* Due to limitations in the transform phase we have to fuse all
2874      reduction partitions into the last partition so the existing
2875      loop will contain all loop-closed PHI nodes.  */
2876   for (i = 0; partitions.iterate (i, &into); ++i)
2877     if (partition_reduction_p (into))
2878       break;
2879   for (i = i + 1; partitions.iterate (i, &partition); ++i)
2880     if (partition_reduction_p (partition))
2881       {
2882 	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2883 	partitions.unordered_remove (i);
2884 	partition_free (partition);
2885 	i--;
2886       }
2887 
2888   /* Apply our simple cost model - fuse partitions with similar
2889      memory accesses.  */
2890   for (i = 0; partitions.iterate (i, &into); ++i)
2891     {
2892       bool changed = false;
2893       if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
2894 	continue;
2895       for (int j = i + 1;
2896 	   partitions.iterate (j, &partition); ++j)
2897 	{
2898 	  if (share_memory_accesses (rdg, into, partition))
2899 	    {
2900 	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2901 	      partitions.unordered_remove (j);
2902 	      partition_free (partition);
2903 	      j--;
2904 	      changed = true;
2905 	    }
2906 	}
2907       /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar
2908          accesses when 1 and 2 have similar accesses but not 0 and 1
2909 	 then in the next iteration we will fail to consider merging
2910 	 1 into 0,2.  So try again if we did any merging into 0.  */
2911       if (changed)
2912 	i--;
2913     }
2914 
2915   /* Build the partition dependency graph and fuse partitions in strongly
2916      connected components.  */
2917   if (partitions.length () > 1)
2918     {
2919       /* Don't support loop nest distribution under runtime alias check
2920 	 since it's not likely to enable many vectorization opportunities.
2921 	 Also don't do it if the loop has any data reference which may not be
2922 	 addressable; the alias check needs to take and compare addresses.  */
2923       if (loop->inner || has_nonaddressable_dataref_p)
2924 	merge_dep_scc_partitions (rdg, &partitions, false);
2925       else
2926 	{
2927 	  merge_dep_scc_partitions (rdg, &partitions, true);
2928 	  if (partitions.length () > 1)
2929 	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2930 	}
2931     }
2932 
2933   finalize_partitions (loop, &partitions, &alias_ddrs);
2934 
2935   nbp = partitions.length ();
2936   if (nbp == 0
2937       || (nbp == 1 && !partition_builtin_p (partitions[0]))
2938       || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2939     {
2940       nbp = 0;
2941       goto ldist_done;
2942     }
2943 
2944   if (version_for_distribution_p (&partitions, &alias_ddrs))
2945     version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
2946 
2947   if (dump_file && (dump_flags & TDF_DETAILS))
2948     {
2949       fprintf (dump_file,
2950 	       "distribute loop <%d> into partitions:\n", loop->num);
2951       dump_rdg_partitions (dump_file, partitions);
2952     }
2953 
2954   FOR_EACH_VEC_ELT (partitions, i, partition)
2955     {
2956       if (partition_builtin_p (partition))
2957 	(*nb_calls)++;
2958       *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2959     }
2960 
2961  ldist_done:
2962   loop_nest.release ();
2963   free_data_refs (datarefs_vec);
2964   for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2965        iter != ddrs_table->end (); ++iter)
2966     {
2967       free_dependence_relation (*iter);
2968       *iter = NULL;
2969     }
2970   delete ddrs_table;
2971 
2972   FOR_EACH_VEC_ELT (partitions, i, partition)
2973     partition_free (partition);
2974 
2975   free_rdg (rdg);
2976   return nbp - *nb_calls;
2977 }
2978 
2979 /* Distribute all loops in the current function.  */
2980 
2981 namespace {
2982 
2983 const pass_data pass_data_loop_distribution =
2984 {
2985   GIMPLE_PASS, /* type */
2986   "ldist", /* name */
2987   OPTGROUP_LOOP, /* optinfo_flags */
2988   TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2989   ( PROP_cfg | PROP_ssa ), /* properties_required */
2990   0, /* properties_provided */
2991   0, /* properties_destroyed */
2992   0, /* todo_flags_start */
2993   0, /* todo_flags_finish */
2994 };
2995 
2996 class pass_loop_distribution : public gimple_opt_pass
2997 {
2998 public:
2999   pass_loop_distribution (gcc::context *ctxt)
3000     : gimple_opt_pass (pass_data_loop_distribution, ctxt)
3001   {}
3002 
3003   /* opt_pass methods: */
3004   virtual bool gate (function *)
3005     {
3006       return flag_tree_loop_distribution
3007 	|| flag_tree_loop_distribute_patterns;
3008     }
3009 
3010   virtual unsigned int execute (function *);
3011 
3012 }; // class pass_loop_distribution
3013 
3014 
3015 /* Given LOOP, this function records seed statements for distribution in
3016    WORK_LIST.  Return false if there is nothing for distribution.  */
3017 
3018 static bool
3019 find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
3020 {
3021   basic_block *bbs = get_loop_body_in_dom_order (loop);
3022 
3023   /* Initialize the worklist with stmts we seed the partitions with.  */
3024   for (unsigned i = 0; i < loop->num_nodes; ++i)
3025     {
3026       for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
3027 	   !gsi_end_p (gsi); gsi_next (&gsi))
3028 	{
3029 	  gphi *phi = gsi.phi ();
3030 	  if (virtual_operand_p (gimple_phi_result (phi)))
3031 	    continue;
3032 	  /* Distribute stmts which have defs that are used outside of
3033 	     the loop.  */
3034 	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
3035 	    continue;
3036 	  work_list->safe_push (phi);
3037 	}
3038       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3039 	   !gsi_end_p (gsi); gsi_next (&gsi))
3040 	{
3041 	  gimple *stmt = gsi_stmt (gsi);
3042 
3043 	  /* If there is a stmt with side-effects bail out - we
3044 	     cannot and should not distribute this loop.  */
3045 	  if (gimple_has_side_effects (stmt))
3046 	    {
3047 	      free (bbs);
3048 	      return false;
3049 	    }
3050 
3051 	  /* Distribute stmts which have defs that are used outside of
3052 	     the loop.  */
3053 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3054 	    ;
3055 	  /* Otherwise only distribute stores for now.  */
3056 	  else if (!gimple_vdef (stmt))
3057 	    continue;
3058 
3059 	  work_list->safe_push (stmt);
3060 	}
3061     }
3062   free (bbs);
3063   return work_list->length () > 0;
3064 }
3065 
3066 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3067    perfect loop nest.  */
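/* For example, given the innermost loop of

     for (i = 0; i < m; i++)
       for (j = 0; j < n; j++)
	 a[i][j] = 0.0;

   the outer loop can be returned (provided it has a single exit and a known
   number of iterations), so that the whole nest is considered for
   distribution; here it can typically be turned into a single memset when
   the rows are contiguous in memory.  */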
3068 
3069 static struct loop *
3070 prepare_perfect_loop_nest (struct loop *loop)
3071 {
3072   struct loop *outer = loop_outer (loop);
3073   tree niters = number_of_latch_executions (loop);
3074 
3075   /* TODO: We only support the innermost 3-level loop nest distribution
3076      because of compilation time issues for now.  This should be relaxed
3077      in the future.  Note we only allow 3-level loop nest distribution
3078      when parallelizing loops.  */
3079   while ((loop->inner == NULL
3080 	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3081 	 && loop_outer (outer)
3082 	 && outer->inner == loop && loop->next == NULL
3083 	 && single_exit (outer)
3084 	 && optimize_loop_for_speed_p (outer)
3085 	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3086 	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3087 	 && niters != chrec_dont_know)
3088     {
3089       loop = outer;
3090       outer = loop_outer (loop);
3091     }
3092 
3093   return loop;
3094 }
3095 
3096 unsigned int
3097 pass_loop_distribution::execute (function *fun)
3098 {
3099   struct loop *loop;
3100   bool changed = false;
3101   basic_block bb;
3102   control_dependences *cd = NULL;
3103   auto_vec<loop_p> loops_to_be_destroyed;
3104 
3105   if (number_of_loops (fun) <= 1)
3106     return 0;
3107 
3108   /* Compute topological order for basic blocks.  Topological order is
3109      needed because data dependence is computed for data references in
3110      lexicographical order.  */
3111   if (bb_top_order_index == NULL)
3112     {
3113       int rpo_num;
3114       int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
3115 
3116       bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3117       bb_top_order_index_size = last_basic_block_for_fn (cfun);
3118       rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
3119       for (int i = 0; i < rpo_num; i++)
3120 	bb_top_order_index[rpo[i]] = i;
3121 
3122       free (rpo);
3123     }
3124 
3125   FOR_ALL_BB_FN (bb, fun)
3126     {
3127       gimple_stmt_iterator gsi;
3128       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3129 	gimple_set_uid (gsi_stmt (gsi), -1);
3130       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3131 	gimple_set_uid (gsi_stmt (gsi), -1);
3132     }
3133 
3134   /* We can at the moment only distribute non-nested loops, thus restrict
3135      walking to innermost loops.  */
3136   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
3137     {
3138       /* Don't distribute loops with multiple exit edges, or cold loops.  */
3139       if (!single_exit (loop)
3140 	  || !optimize_loop_for_speed_p (loop))
3141 	continue;
3142 
3143       /* Don't distribute loop if niters is unknown.  */
3144       tree niters = number_of_latch_executions (loop);
3145       if (niters == NULL_TREE || niters == chrec_dont_know)
3146 	continue;
3147 
3148       /* Get the perfect loop nest for distribution.  */
3149       loop = prepare_perfect_loop_nest (loop);
3150       for (; loop; loop = loop->inner)
3151 	{
3152 	  auto_vec<gimple *> work_list;
3153 	  if (!find_seed_stmts_for_distribution (loop, &work_list))
3154 	    break;
3155 
3156 	  const char *str = loop->inner ? " nest" : "";
3157 	  dump_user_location_t loc = find_loop_location (loop);
3158 	  if (!cd)
3159 	    {
3160 	      calculate_dominance_info (CDI_DOMINATORS);
3161 	      calculate_dominance_info (CDI_POST_DOMINATORS);
3162 	      cd = new control_dependences ();
3163 	      free_dominance_info (CDI_POST_DOMINATORS);
3164 	    }
3165 
3166 	  bool destroy_p;
3167 	  int nb_generated_loops, nb_generated_calls;
3168 	  nb_generated_loops = distribute_loop (loop, work_list, cd,
3169 						&nb_generated_calls,
3170 						&destroy_p);
3171 	  if (destroy_p)
3172 	    loops_to_be_destroyed.safe_push (loop);
3173 
3174 	  if (nb_generated_loops + nb_generated_calls > 0)
3175 	    {
3176 	      changed = true;
3177 	      if (dump_enabled_p ())
3178 		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3179 				 loc, "Loop%s %d distributed: split to %d loops "
3180 				 "and %d library calls.\n", str, loop->num,
3181 				 nb_generated_loops, nb_generated_calls);
3182 
3183 	      break;
3184 	    }
3185 
3186 	  if (dump_file && (dump_flags & TDF_DETAILS))
3187 	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3188 	}
3189     }
3190 
3191   if (cd)
3192     delete cd;
3193 
3194   if (bb_top_order_index != NULL)
3195     {
3196       free (bb_top_order_index);
3197       bb_top_order_index = NULL;
3198       bb_top_order_index_size = 0;
3199     }
3200 
3201   if (changed)
3202     {
3203       /* Destroy loop bodies that could not be reused.  Do this late as we
3204 	 otherwise can end up referring to stale data in control dependences.  */
3205       unsigned i;
3206       FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3207 	destroy_loop (loop);
3208 
3209       /* Cached scalar evolutions now may refer to wrong or non-existing
3210 	 loops.  */
3211       scev_reset_htab ();
3212       mark_virtual_operands_for_renaming (fun);
3213       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3214     }
3215 
3216   checking_verify_loop_structure ();
3217 
3218   return changed ? TODO_cleanup_cfg : 0;
3219 }
3220 
3221 } // anon namespace
3222 
3223 gimple_opt_pass *
3224 make_pass_loop_distribution (gcc::context *ctxt)
3225 {
3226   return new pass_loop_distribution (ctxt);
3227 }
3228