/* Loop distribution.
   Copyright (C) 2006-2018 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |   A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |   D(I) = A(I-1)*E
   |ENDDO

   Loop distribution is the dual of loop fusion.  It separates statements
   of a loop (or loop nest) into multiple loops (or loop nests) with the
   same loop header.  The major goal is to separate statements which may
   be vectorized from those that can't.  This pass implements distribution
   in the following steps:

     1) Seed partitions with specific types of statements.  For now we
	support two types of seed statements: statements defining variables
	used outside of the loop, and statements storing to memory.
     2) Build a reduced dependence graph (RDG) for the loop to be
	distributed.  The vertices (RDG:V) model all statements in the loop
	and the edges (RDG:E) model flow and control dependencies between
	statements.
     3) Apart from the RDG, compute data dependencies between memory
	references.
     4) Starting from a seed statement, build up a partition by adding
	dependent statements according to the RDG's dependence information.
	A partition is classified as parallel type if it can be executed in
	parallel, or as sequential type if it can't.  A parallel partition
	is further classified as a builtin kind if it can be implemented as
	builtin function calls.
     5) Build a partition dependence graph (PG) based on data dependencies.
	The vertices (PG:V) model all partitions and the edges (PG:E) model
	all data dependencies between each pair of partitions.  In general,
	a data dependence is either known or unknown at compilation time.
	In C family languages, many dependencies are unknown at compilation
	time because of possible alias relations between data references.
	We categorize PG's edges into two types: "true" edges, which
	represent data dependencies known at compilation time, and "alias"
	edges for all other data dependencies.
     6) Traverse the subgraph of PG formed by ignoring all "alias" edges.
	Merge the partitions in each strongly connected component (SCC)
	correspondingly.  Build a new PG for the merged partitions.
     7) Traverse PG again, this time with both "true" and "alias" edges
	included.  We try to break SCCs by removing some edges.  Because
	SCCs formed by "true" edges were all fused in step 6), we can break
	SCCs by removing some "alias" edges.  Choosing an optimal edge set
	is NP-hard; fortunately, a simple approximation is good enough for
	us given the small problem scale.
     8) Collect all data dependencies of the removed "alias" edges.  Create
	runtime alias checks for the collected data dependencies.
     9) Version the loop under the condition of the runtime alias checks.
	Since loop distribution generally introduces additional overhead,
	it is only useful if vectorization is achieved in a distributed
	loop.  We version the loop with the internal function call
	IFN_LOOP_DIST_ALIAS.  If no distributed loop can be vectorized, we
	simply remove the distributed loops and recover the original one.

   TODO:
     1) We only distribute innermost two-level loop nests now.  We should
	extend this to arbitrary loop nests in the future.
     2) We only fuse partitions in an SCC now.  A better fusion algorithm
	is desired to minimize loop overhead, maximize parallelism and
	maximize data reuse.  */
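
/* For illustration only (a sketch of the intended shape, not the exact
   GIMPLE this pass emits), step 9) versions the loop roughly like:

   | if (IFN_LOOP_DIST_ALIAS (<runtime alias checks from step 8)>))
   |   <distributed loops>
   | else
   |   <original loop>

   so that, as described above, the distributed loops can later be removed
   in favor of the original one if none of them gets vectorized.  */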

#include "config.h"
#define INCLUDE_ALGORITHM /* stable_sort */
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "params.h"
#include "tree-vectorizer.h"


#define MAX_DATAREFS_NUM \
	((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))

/* Threshold controlling the number of distributed partitions.  Given that
   it may become unnecessary if a memory stream cost model is invented in
   the future, we define it as a temporary macro rather than a parameter.  */
#define NUM_PARTITION_THRESHOLD (4)

/* Hashtable helpers.  */

struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
{
  static inline hashval_t hash (const data_dependence_relation *);
  static inline bool equal (const data_dependence_relation *,
			    const data_dependence_relation *);
};

/* Hash function for data dependence.  */

inline hashval_t
ddr_hasher::hash (const data_dependence_relation *ddr)
{
  inchash::hash h;
  h.add_ptr (DDR_A (ddr));
  h.add_ptr (DDR_B (ddr));
  return h.end ();
}

/* Hash table equality function for data dependence.  */

inline bool
ddr_hasher::equal (const data_dependence_relation *ddr1,
		   const data_dependence_relation *ddr2)
{
  return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
}
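
/* Note both the hash and the equality function key on the (DR_A, DR_B)
   pointer pair, so relations for (a, b) and (b, a) would occupy different
   slots.  Callers therefore canonicalize the order of the pair before
   lookup; see get_data_dependence and data_dep_in_cycle_p below.  */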

/* The loop (nest) to be distributed.  */
static vec<loop_p> loop_nest;

/* Vector of data references in the loop to be distributed.  */
static vec<data_reference_p> datarefs_vec;

/* Store index of data reference in aux field.  */
#define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)

/* Hash table for data dependence relation in the loop to be distributed.  */
static hash_table<ddr_hasher> *ddrs_table;

/* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
struct rdg_vertex
{
  /* The statement represented by this vertex.  */
  gimple *stmt;

  /* Vector of data-references in this statement.  */
  vec<data_reference_p> datarefs;

  /* True when the statement contains a write to memory.  */
  bool has_mem_write;

  /* True when the statement contains a read from memory.  */
  bool has_mem_reads;
};

#define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))

/* Data dependence type.  */

enum rdg_dep_type
{
  /* Read After Write (RAW).  */
  flow_dd = 'f',

  /* Control dependence (execute conditional on).  */
  control_dd = 'c'
};

/* Dependence information attached to an edge of the RDG.  */

struct rdg_edge
{
  /* Type of the dependence.  */
  enum rdg_dep_type type;
};

#define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type

/* Dump vertex I in RDG to FILE.  */

static void
dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
{
  struct vertex *v = &(rdg->vertices[i]);
  struct graph_edge *e;

  fprintf (file, "(vertex %d: (%s%s) (in:", i,
	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      fprintf (file, " %d", e->src);

  fprintf (file, ") (out:");

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      fprintf (file, " %d", e->dest);

  fprintf (file, ")\n");
  print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
  fprintf (file, ")\n");
}

/* Call dump_rdg_vertex on stderr.  */

DEBUG_FUNCTION void
debug_rdg_vertex (struct graph *rdg, int i)
{
  dump_rdg_vertex (stderr, rdg, i);
}

/* Dump the reduced dependence graph RDG to FILE.  */

static void
dump_rdg (FILE *file, struct graph *rdg)
{
  fprintf (file, "(rdg\n");
  for (int i = 0; i < rdg->n_vertices; i++)
    dump_rdg_vertex (file, rdg, i);
  fprintf (file, ")\n");
}

/* Call dump_rdg on stderr.  */

DEBUG_FUNCTION void
debug_rdg (struct graph *rdg)
{
  dump_rdg (stderr, rdg);
}

static void
dot_rdg_1 (FILE *file, struct graph *rdg)
{
  int i;
  pretty_printer buffer;
  pp_needs_newline (&buffer) = false;
  buffer.buffer->stream = file;

  fprintf (file, "digraph RDG {\n");

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      fprintf (file, "%d [label=\"[%d] ", i, i);
      pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
      pp_flush (&buffer);
      fprintf (file, "\"]\n");

      /* Highlight reads from memory.  */
      if (RDG_MEM_READS_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=green]\n", i);

      /* Highlight stores to memory.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=red]\n", i);

      if (v->succ)
	for (e = v->succ; e; e = e->succ_next)
	  switch (RDGE_TYPE (e))
	    {
	    case flow_dd:
	      /* These are the most common dependences: don't print these.  */
	      fprintf (file, "%d -> %d \n", i, e->dest);
	      break;

	    case control_dd:
	      fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
	      break;

	    default:
	      gcc_unreachable ();
	    }
    }

  fprintf (file, "}\n\n");
}

/* Display the Reduced Dependence Graph using dotty.  */

DEBUG_FUNCTION void
dot_rdg (struct graph *rdg)
{
  /* When debugging, you may want to enable the following code.  */
#ifdef HAVE_POPEN
  FILE *file = popen ("dot -Tx11", "w");
  if (!file)
    return;
  dot_rdg_1 (file, rdg);
  fflush (file);
  close (fileno (file));
  pclose (file);
#else
  dot_rdg_1 (stderr, rdg);
#endif
}

/* Returns the index of STMT in RDG.  */

static int
rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
{
  int index = gimple_uid (stmt);
  gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
  return index;
}
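
/* Note the UID is assigned in create_rdg_vertices and reset to -1 in
   free_rdg, so an index of -1 here means STMT has no vertex in RDG.  */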

/* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
   the index of DEF in RDG.  */

static void
create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
{
  use_operand_p imm_use_p;
  imm_use_iterator iterator;

  FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
    {
      struct graph_edge *e;
      int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));

      if (use < 0)
	continue;

      e = add_edge (rdg, idef, use);
      e->data = XNEW (struct rdg_edge);
      RDGE_TYPE (e) = flow_dd;
    }
}

/* Creates an edge for the control dependences of BB to the vertex V.  */

static void
create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
				    int v, control_dependences *cd)
{
  bitmap_iterator bi;
  unsigned edge_n;
  EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
			    0, edge_n, bi)
    {
      basic_block cond_bb = cd->get_edge_src (edge_n);
      gimple *stmt = last_stmt (cond_bb);
      if (stmt && is_ctrl_stmt (stmt))
	{
	  struct graph_edge *e;
	  int c = rdg_vertex_for_stmt (rdg, stmt);
	  if (c < 0)
	    continue;

	  e = add_edge (rdg, c, v);
	  e->data = XNEW (struct rdg_edge);
	  RDGE_TYPE (e) = control_dd;
	}
    }
}

/* Creates the flow dependence edges of the reduced dependence graph RDG.  */

static void
create_rdg_flow_edges (struct graph *rdg)
{
  int i;
  def_operand_p def_p;
  ssa_op_iter iter;

  for (i = 0; i < rdg->n_vertices; i++)
    FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
			      iter, SSA_OP_DEF)
      create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
}

/* Creates the control dependence edges of the reduced dependence graph
   RDG for LOOP, using the control dependence information CD.  */

static void
create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  edge_iterator ei;
	  edge e;
	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
	    if (flow_bb_inside_loop_p (loop, e->src))
	      create_edge_for_control_dependence (rdg, e->src, i, cd);
	}
      else
	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
    }
}

/* Build the vertices of the reduced dependence graph RDG.  Return false
   if that failed.  */

static bool
create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
{
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      struct vertex *v = &(rdg->vertices[i]);

      /* Record statement to vertex mapping.  */
      gimple_set_uid (stmt, i);

      v->data = XNEW (struct rdg_vertex);
      RDGV_STMT (v) = stmt;
      RDGV_DATAREFS (v).create (0);
      RDGV_HAS_MEM_WRITE (v) = false;
      RDGV_HAS_MEM_READS (v) = false;
      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      unsigned drp = datarefs_vec.length ();
      if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
	return false;
      for (unsigned j = drp; j < datarefs_vec.length (); ++j)
	{
	  data_reference_p dr = datarefs_vec[j];
	  if (DR_IS_READ (dr))
	    RDGV_HAS_MEM_READS (v) = true;
	  else
	    RDGV_HAS_MEM_WRITE (v) = true;
	  RDGV_DATAREFS (v).safe_push (dr);
	}
    }
  return true;
}

/* Array mapping basic block's index to its topological order.  */
static int *bb_top_order_index;
/* And size of the array.  */
static int bb_top_order_index_size;

/* Returns a negative value if X has a smaller topological sort number
   than Y, and a positive value if greater.  */

static int
bb_top_order_cmp (const void *x, const void *y)
{
  basic_block bb1 = *(const basic_block *) x;
  basic_block bb2 = *(const basic_block *) y;

  gcc_assert (bb1->index < bb_top_order_index_size
	      && bb2->index < bb_top_order_index_size);
  gcc_assert (bb1 == bb2
	      || bb_top_order_index[bb1->index]
		 != bb_top_order_index[bb2->index]);

  return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
}
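
/* bb_top_order_cmp is a qsort-style comparator; it is passed to
   get_loop_body_in_custom_order in stmts_from_loop below so that the
   loop's basic blocks are visited in topological order.  */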

/* Initialize STMTS with all the statements of LOOP.  We use topological
   order to discover all statements.  The order is important because
   generate_loops_for_partition uses the same traversal for identifying
   statements in loop copies.  */

static void
stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
{
  unsigned int i;
  basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
	  stmts->safe_push (bsi.phi ());

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
	    stmts->safe_push (stmt);
	}
    }

  free (bbs);
}

/* Free the reduced dependence graph RDG.  */

static void
free_rdg (struct graph *rdg)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      for (e = v->succ; e; e = e->succ_next)
	free (e->data);

      if (v->data)
	{
	  gimple_set_uid (RDGV_STMT (v), -1);
	  (RDGV_DATAREFS (v)).release ();
	  free (v->data);
	}
    }

  free_graph (rdg);
}

/* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
   LOOP, and one edge per flow dependence or control dependence from control
   dependence CD.  While visiting each statement, data references are also
   collected and recorded in the global data DATAREFS_VEC.  */

static struct graph *
build_rdg (struct loop *loop, control_dependences *cd)
{
  struct graph *rdg;

  /* Create the RDG vertices from the stmts of the loop nest.  */
  auto_vec<gimple *, 10> stmts;
  stmts_from_loop (loop, &stmts);
  rdg = new_graph (stmts.length ());
  if (!create_rdg_vertices (rdg, stmts, loop))
    {
      free_rdg (rdg);
      return NULL;
    }
  stmts.release ();

  create_rdg_flow_edges (rdg);
  if (cd)
    create_rdg_cd_edges (rdg, cd, loop);

  return rdg;
}


/* Kind of distributed loop.  */
enum partition_kind {
    PKIND_NORMAL,
    /* Partial memset describes a partition that can be distributed into a
       loop of memset calls, rather than a single memset call.  It's handled
       just like a normal partition, i.e., distributed as a separate loop;
       no memset call is generated.

       Note: this is a hack to distribute a zeroing stmt in a loop nest as
       deeply as possible.  As a result, parloop achieves better
       parallelization by parallelizing deeper loop nests.  This hack should
       become unnecessary and be removed once distributed memset can be
       understood and analyzed in data reference analysis.  See PR82604 for
       more.  */
    PKIND_PARTIAL_MEMSET,
    PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};
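
/* As a hypothetical example of PKIND_PARTIAL_MEMSET, zeroing a matrix
   whose rows need not be adjacent in memory:

   | for (i = 0; i < n; i++)
   |   for (j = 0; j < m; j++)
   |     a[i][j] = 0;

   cannot become one big memset call, but each inner loop could become one;
   such a partition is nevertheless distributed as a separate loop, as
   described above, rather than as memset calls.  */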

/* Type of distributed loop.  */
enum partition_type {
    /* The distributed loop can be executed in parallel.  */
    PTYPE_PARALLEL = 0,
    /* The distributed loop has to be executed sequentially.  */
    PTYPE_SEQUENTIAL
};

/* Builtin info for loop distribution.  */
struct builtin_info
{
  /* Data references that a partition of kind != PKIND_NORMAL is about.  */
  data_reference_p dst_dr;
  data_reference_p src_dr;
  /* Base address and size of memory objects operated on by the builtin.
     Note both dest and source memory objects must have the same size.  */
  tree dst_base;
  tree src_base;
  tree size;
  /* Base and offset part of dst_base after stripping constant offset.  This
     is only used in memset builtin distribution for now.  */
  tree dst_base_base;
  unsigned HOST_WIDE_INT dst_base_offset;
};

/* Partition for loop distribution.  */
struct partition
{
  /* Statements of the partition.  */
  bitmap stmts;
  /* True if the partition defines a variable which is used outside of the
     loop.  */
  bool reduction_p;
  enum partition_kind kind;
  enum partition_type type;
  /* Data references in the partition.  */
  bitmap datarefs;
  /* Information about a builtin partition.  */
  struct builtin_info *builtin;
};


/* Allocate and initialize a partition.  */

static partition *
partition_alloc (void)
{
  partition *partition = XCNEW (struct partition);
  partition->stmts = BITMAP_ALLOC (NULL);
  partition->reduction_p = false;
  partition->kind = PKIND_NORMAL;
  partition->datarefs = BITMAP_ALLOC (NULL);
  return partition;
}

/* Free PARTITION.  */

static void
partition_free (partition *partition)
{
  BITMAP_FREE (partition->stmts);
  BITMAP_FREE (partition->datarefs);
  if (partition->builtin)
    free (partition->builtin);

  free (partition);
}

/* Returns true if the partition can be generated as a builtin.  */

static bool
partition_builtin_p (partition *partition)
{
  return partition->kind > PKIND_PARTIAL_MEMSET;
}

/* Returns true if the partition contains a reduction.  */

static bool
partition_reduction_p (partition *partition)
{
  return partition->reduction_p;
}

/* Partitions are fused for different reasons.  */
enum fuse_type
{
  FUSE_NON_BUILTIN = 0,
  FUSE_REDUCTION = 1,
  FUSE_SHARE_REF = 2,
  FUSE_SAME_SCC = 3,
  FUSE_FINALIZE = 4
};

/* Descriptions of the different fusion reasons.  */
static const char *fuse_message[] = {
  "they are non-builtins",
  "they have reductions",
  "they have shared memory refs",
  "they are in the same dependence scc",
  "there is no point in distributing the loop"};

static void
update_type_for_merge (struct graph *, partition *, partition *);

/* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
   graph; if it is non-NULL, we also update the type of the result
   partition.  */

static void
partition_merge_into (struct graph *rdg, partition *dest,
		      partition *partition, enum fuse_type ft)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
      fprintf (dump_file, "  Part 1: ");
      dump_bitmap (dump_file, dest->stmts);
      fprintf (dump_file, "  Part 2: ");
      dump_bitmap (dump_file, partition->stmts);
    }

  dest->kind = PKIND_NORMAL;
  if (dest->type == PTYPE_PARALLEL)
    dest->type = partition->type;

  bitmap_ior_into (dest->stmts, partition->stmts);
  if (partition_reduction_p (partition))
    dest->reduction_p = true;

  /* Further check if any data dependence prevents us from executing the
     new partition in parallel.  */
  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    update_type_for_merge (rdg, dest, partition);

  bitmap_ior_into (dest->datarefs, partition->datarefs);
}


/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      if (is_gimple_debug (USE_STMT (use_p)))
	continue;

      basic_block use_bb = gimple_bb (USE_STMT (use_p));
      if (!flow_bb_inside_loop_p (loop, use_bb))
	return true;
    }

  return false;
}

/* Returns true when STMT defines a scalar variable used after the
   loop LOOP.  */

static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}

/* Return a copy of LOOP placed before LOOP.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
  gcc_assert (res != NULL);
  free_original_copy_tables ();
  delete_update_ssa ();

  return res;
}

/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}

/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  */

static void
generate_loops_for_partition (struct loop *loop, partition *partition,
			      bool copy_p)
{
  unsigned i;
  basic_block *bbs;

  if (copy_p)
    {
      int orig_loop_num = loop->orig_loop_num;
      loop = copy_loop_before (loop);
      gcc_assert (loop != NULL);
      loop->orig_loop_num = orig_loop_num;
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }
  else
    {
      /* Origin number is set to the new versioned loop's num.  */
      gcc_assert (loop->orig_loop_num != loop->num);
    }

  /* Remove stmts not in the PARTITION bitmap.  */
  bbs = get_loop_body_in_dom_order (loop);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    for (i = 0; i < loop->num_nodes; i++)
      {
	basic_block bb = bbs[i];

	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gphi *phi = bsi.phi ();
	    if (!virtual_operand_p (gimple_phi_result (phi))
		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	      reset_debug_uses (phi);
	  }

	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gimple *stmt = gsi_stmt (bsi);
	    if (gimple_code (stmt) != GIMPLE_LABEL
		&& !is_gimple_debug (stmt)
		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	      reset_debug_uses (stmt);
	  }
      }

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      edge inner_exit = NULL;

      if (loop != bb->loop_father)
	inner_exit = single_exit (bb->loop_father);

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	{
	  gphi *phi = bsi.phi ();
	  if (!virtual_operand_p (gimple_phi_result (phi))
	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	    remove_phi_node (&bsi, true);
	  else
	    gsi_next (&bsi);
	}

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL
	      && !is_gimple_debug (stmt)
	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	    {
	      /* When distributing a loop nest, if BB is the inner loop's
		 exit_bb, we choose its exit edge/path in order to avoid
		 generating an infinite loop.  For all other cases, we
		 choose an arbitrary path through the empty CFG part that
		 this unnecessary control stmt controls.  */
	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
		{
		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
		    gimple_cond_make_true (cond_stmt);
		  else
		    gimple_cond_make_false (cond_stmt);
		  update_stmt (stmt);
		}
	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
		{
		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
		  gimple_switch_set_index
		      (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
		  update_stmt (stmt);
		}
	      else
		{
		  unlink_stmt_vdef (stmt);
		  gsi_remove (&bsi, true);
		  release_defs (stmt);
		  continue;
		}
	    }
	  gsi_next (&bsi);
	}
    }

  free (bbs);
}

/* If VAL's memory representation contains the same value in all bytes,
   return that value, otherwise return -1.
   E.g. for 0x24242424 return 0x24, for IEEE double
   747708026454360457216.0 return 0x44, etc.  */

static int
const_with_all_bytes_same (tree val)
{
  unsigned char buf[64];
  int i, len;

  if (integer_zerop (val)
      || (TREE_CODE (val) == CONSTRUCTOR
          && !TREE_CLOBBER_P (val)
          && CONSTRUCTOR_NELTS (val) == 0))
    return 0;

  if (real_zerop (val))
    {
      /* Only return 0 for +0.0, not for -0.0, which doesn't have
	 an all bytes same memory representation.  Don't transform
	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
      switch (TREE_CODE (val))
	{
	case REAL_CST:
	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
	    return 0;
	  break;
	case COMPLEX_CST:
	  if (!const_with_all_bytes_same (TREE_REALPART (val))
	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
	    return 0;
	  break;
	case VECTOR_CST:
	  {
	    unsigned int count = vector_cst_encoded_nelts (val);
	    unsigned int j;
	    for (j = 0; j < count; ++j)
	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
		break;
	    if (j == count)
	      return 0;
	    break;
	  }
	default:
	  break;
	}
    }

  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
    return -1;

  len = native_encode_expr (val, buf, sizeof (buf));
  if (len == 0)
    return -1;
  for (i = 1; i < len; i++)
    if (buf[i] != buf[0])
      return -1;
  return buf[0];
}

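/* As a sketch of the transform below (assuming a 4-byte element type for
   A; the emitted IL differs in detail), a PKIND_MEMSET partition like

   | for (i = 0; i < n; i++)
   |   a[i] = 0x24242424;

   is replaced by a single call placed before the loop:

   | __builtin_memset (&a[0], 0x24, n * 4);

   where the filler byte comes from const_with_all_bytes_same above.  */
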
/* Generate a call to memset for PARTITION in LOOP.  */

static void
generate_memset_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  tree mem, fn, nb_bytes;
  tree val;
  struct builtin_info *builtin = partition->builtin;
  gimple *fn_call;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = builtin->size;
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  mem = builtin->dst_base;
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
  /* Handle constants like 0x15151515 and similarly
     floating point constants etc. where all bytes are the same.  */
  int bytev = const_with_all_bytes_same (val);
  if (bytev != -1)
    val = build_int_cst (integer_type_node, bytev);
  else if (TREE_CODE (val) == INTEGER_CST)
    val = fold_convert (integer_type_node, val);
  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
    {
      tree tem = make_ssa_name (integer_type_node);
      gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
      val = tem;
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (bytev == 0)
	fprintf (dump_file, " zero\n");
      else
	fprintf (dump_file, "\n");
    }
}

/* Generate a call to memcpy for PARTITION in LOOP.  */

static void
generate_memcpy_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *fn_call;
  tree dest, src, fn, nb_bytes;
  enum built_in_function kind;
  struct builtin_info *builtin = partition->builtin;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = builtin->size;
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  dest = builtin->dst_base;
  src = builtin->src_base;
  if (partition->kind == PKIND_MEMCPY
      || ! ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMCPY;
  else
    kind = BUILT_IN_MEMMOVE;

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
				   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
	fprintf (dump_file, "generated memcpy\n");
      else
	fprintf (dump_file, "generated memmove\n");
    }
}
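
/* As a sketch of the transform above (not the exact IL), a copy partition

   | for (i = 0; i < n; i++)
   |   a[i] = b[i];

   becomes a single call placed before the loop,

   | __builtin_memcpy (&a[0], &b[0], n * sizeof (a[0]));

   using memcpy when the kind is PKIND_MEMCPY or the pointers provably
   cannot alias, and memmove otherwise.  */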

/* Remove and destroy the loop LOOP.  */

static void
destroy_loop (struct loop *loop)
{
  unsigned nbbs = loop->num_nodes;
  edge exit = single_exit (loop);
  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
  basic_block *bbs;
  unsigned i;

  bbs = get_loop_body_in_dom_order (loop);

  redirect_edge_pred (exit, src);
  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
  exit->flags |= EDGE_FALLTHRU;
  cancel_loop_tree (loop);
  rescan_loop_exit (exit, false, true);

  i = nbbs;
  do
    {
      /* We have made sure not to leave any dangling uses of SSA names
	 defined in the loop, with the exception of virtuals.  Make sure
	 we replace all uses of virtual defs that will remain outside of
	 the loop with the bare symbol, as delete_basic_block will release
	 them.  */
      --i;
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    mark_virtual_phi_result_for_renaming (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  tree vdef = gimple_vdef (stmt);
	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
	    mark_virtual_operand_for_renaming (vdef);
	}
      delete_basic_block (bbs[i]);
    }
  while (i != 0);

  free (bbs);

  set_immediate_dominator (CDI_DOMINATORS, dest,
			   recompute_dominator (CDI_DOMINATORS, dest));
}

/* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */

static bool
generate_code_for_partition (struct loop *loop,
			     partition *partition, bool copy_p)
{
  switch (partition->kind)
    {
    case PKIND_NORMAL:
    case PKIND_PARTIAL_MEMSET:
      /* Reductions all have to be in the last partition.  */
      gcc_assert (!partition_reduction_p (partition)
		  || !copy_p);
      generate_loops_for_partition (loop, partition, copy_p);
      return false;

    case PKIND_MEMSET:
      generate_memset_builtin (loop, partition);
      break;

    case PKIND_MEMCPY:
    case PKIND_MEMMOVE:
      generate_memcpy_builtin (loop, partition);
      break;

    default:
      gcc_unreachable ();
    }

  /* Common tail for partitions we turn into a call.  If this was the last
     partition for which we generate code, we have to destroy the loop.  */
  if (!copy_p)
    return true;
  return false;
}

/* Return the data dependence relation for data references A and B.  The two
   data references must be in lexicographic order with respect to the reduced
   dependence graph RDG.  We first try to find the ddr in the global hash
   table; if it doesn't exist there, we compute the ddr and cache it.  */

static data_dependence_relation *
get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
{
  struct data_dependence_relation ent, **slot;
  struct data_dependence_relation *ddr;

  gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
  gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
  ent.a = a;
  ent.b = b;
  slot = ddrs_table->find_slot (&ent, INSERT);
  if (*slot == NULL)
    {
      ddr = initialize_data_dependence_relation (a, b, loop_nest);
      compute_affine_dependence (ddr, loop_nest[0]);
      *slot = ddr;
    }

  return *slot;
}

/* In the reduced dependence graph RDG for loop distribution, return true if
   the dependence between references DR1 and DR2 leads to a dependence cycle
   and that cycle can't be resolved by a runtime alias check.  */

static bool
data_dep_in_cycle_p (struct graph *rdg,
		     data_reference_p dr1, data_reference_p dr2)
{
  struct data_dependence_relation *ddr;

  /* Re-shuffle data-refs to be in topological order.  */
  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
    std::swap (dr1, dr2);

  ddr = get_data_dependence (rdg, dr1, dr2);

  /* In case of no data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;
  /* For an unknown data dependence, or a known data dependence which can't
     be expressed as a classic distance vector, we check if it can be
     resolved by a runtime alias check.  If so, we consider the dependence
     as one that won't introduce a data dependence cycle.  */
  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
	   || DDR_NUM_DIST_VECTS (ddr) == 0)
    return !runtime_alias_check_p (ddr, NULL, true);
  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
    return true;
  else if (DDR_REVERSED_P (ddr)
	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
    return false;

  return true;
}
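
/* For instance, a self-dependence with distance vector (0), as for the
   reference A[i] in A[i] = A[i] + 1, does not force a cycle here, while
   a loop-carried dependence with distance vector (1), as when A[i] is
   written and A[i-1] is read in the same partition, does.  */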
1231*38fd1498Szrj 
1232*38fd1498Szrj /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
1233*38fd1498Szrj    PARTITION1's type after merging PARTITION2 into PARTITION1.  */
1234*38fd1498Szrj 
1235*38fd1498Szrj static void
1236*38fd1498Szrj update_type_for_merge (struct graph *rdg,
1237*38fd1498Szrj 		       partition *partition1, partition *partition2)
1238*38fd1498Szrj {
1239*38fd1498Szrj   unsigned i, j;
1240*38fd1498Szrj   bitmap_iterator bi, bj;
1241*38fd1498Szrj   data_reference_p dr1, dr2;
1242*38fd1498Szrj 
1243*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1244*38fd1498Szrj     {
1245*38fd1498Szrj       unsigned start = (partition1 == partition2) ? i + 1 : 0;
1246*38fd1498Szrj 
1247*38fd1498Szrj       dr1 = datarefs_vec[i];
1248*38fd1498Szrj       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1249*38fd1498Szrj 	{
1250*38fd1498Szrj 	  dr2 = datarefs_vec[j];
1251*38fd1498Szrj 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1252*38fd1498Szrj 	    continue;
1253*38fd1498Szrj 
1254*38fd1498Szrj 	  /* The partition can only be executed sequentially if there is
1255*38fd1498Szrj 	     any data dependence cycle.  */
1256*38fd1498Szrj 	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
1257*38fd1498Szrj 	    {
1258*38fd1498Szrj 	      partition1->type = PTYPE_SEQUENTIAL;
1259*38fd1498Szrj 	      return;
1260*38fd1498Szrj 	    }
1261*38fd1498Szrj 	}
1262*38fd1498Szrj     }
1263*38fd1498Szrj }
1264*38fd1498Szrj 
1265*38fd1498Szrj /* Returns a partition with all the statements needed for computing
1266*38fd1498Szrj    the vertex V of the RDG, also including the loop exit conditions.  */
1267*38fd1498Szrj 
1268*38fd1498Szrj static partition *
1269*38fd1498Szrj build_rdg_partition_for_vertex (struct graph *rdg, int v)
1270*38fd1498Szrj {
1271*38fd1498Szrj   partition *partition = partition_alloc ();
1272*38fd1498Szrj   auto_vec<int, 3> nodes;
1273*38fd1498Szrj   unsigned i, j;
1274*38fd1498Szrj   int x;
1275*38fd1498Szrj   data_reference_p dr;
1276*38fd1498Szrj 
1277*38fd1498Szrj   graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1278*38fd1498Szrj 
1279*38fd1498Szrj   FOR_EACH_VEC_ELT (nodes, i, x)
1280*38fd1498Szrj     {
1281*38fd1498Szrj       bitmap_set_bit (partition->stmts, x);
1282*38fd1498Szrj 
1283*38fd1498Szrj       for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1284*38fd1498Szrj 	{
1285*38fd1498Szrj 	  unsigned idx = (unsigned) DR_INDEX (dr);
1286*38fd1498Szrj 	  gcc_assert (idx < datarefs_vec.length ());
1287*38fd1498Szrj 
1288*38fd1498Szrj 	  /* The partition can only be executed sequentially if it contains
1289*38fd1498Szrj 	     any unknown (not fully analyzed) data reference.  */
1290*38fd1498Szrj 	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1291*38fd1498Szrj 	      || !DR_INIT (dr) || !DR_STEP (dr))
1292*38fd1498Szrj 	    partition->type = PTYPE_SEQUENTIAL;
1293*38fd1498Szrj 
1294*38fd1498Szrj 	  bitmap_set_bit (partition->datarefs, idx);
1295*38fd1498Szrj 	}
1296*38fd1498Szrj     }
1297*38fd1498Szrj 
1298*38fd1498Szrj   if (partition->type == PTYPE_SEQUENTIAL)
1299*38fd1498Szrj     return partition;
1300*38fd1498Szrj 
1301*38fd1498Szrj   /* Further check if any data dependence prevents us from executing the
1302*38fd1498Szrj      partition in parallel.  */
1303*38fd1498Szrj   update_type_for_merge (rdg, partition, partition);
1304*38fd1498Szrj 
1305*38fd1498Szrj   return partition;
1306*38fd1498Szrj }
1307*38fd1498Szrj 
1308*38fd1498Szrj /* Given PARTITION of LOOP and RDG, record the single load/store data
1309*38fd1498Szrj    references of a builtin partition in SRC_DR/DST_DR; return false if
1310*38fd1498Szrj    there are no such data references.  */
1311*38fd1498Szrj 
1312*38fd1498Szrj static bool
1313*38fd1498Szrj find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
1314*38fd1498Szrj 		 data_reference_p *dst_dr, data_reference_p *src_dr)
1315*38fd1498Szrj {
1316*38fd1498Szrj   unsigned i;
1317*38fd1498Szrj   data_reference_p single_ld = NULL, single_st = NULL;
1318*38fd1498Szrj   bitmap_iterator bi;
1319*38fd1498Szrj 
1320*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1321*38fd1498Szrj     {
1322*38fd1498Szrj       gimple *stmt = RDG_STMT (rdg, i);
1323*38fd1498Szrj       data_reference_p dr;
1324*38fd1498Szrj 
1325*38fd1498Szrj       if (gimple_code (stmt) == GIMPLE_PHI)
1326*38fd1498Szrj 	continue;
1327*38fd1498Szrj 
1328*38fd1498Szrj       /* Any scalar stmts are ok.  */
1329*38fd1498Szrj       if (!gimple_vuse (stmt))
1330*38fd1498Szrj 	continue;
1331*38fd1498Szrj 
1332*38fd1498Szrj       /* Otherwise just regular loads/stores.  */
1333*38fd1498Szrj       if (!gimple_assign_single_p (stmt))
1334*38fd1498Szrj 	return false;
1335*38fd1498Szrj 
1336*38fd1498Szrj       /* But exactly one store and/or load.  */
1337*38fd1498Szrj       for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1338*38fd1498Szrj 	{
1339*38fd1498Szrj 	  tree type = TREE_TYPE (DR_REF (dr));
1340*38fd1498Szrj 
1341*38fd1498Szrj 	  /* The memset, memcpy and memmove library calls are only
1342*38fd1498Szrj 	     able to deal with generic address space.  */
1343*38fd1498Szrj 	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1344*38fd1498Szrj 	    return false;
1345*38fd1498Szrj 
1346*38fd1498Szrj 	  if (DR_IS_READ (dr))
1347*38fd1498Szrj 	    {
1348*38fd1498Szrj 	      if (single_ld != NULL)
1349*38fd1498Szrj 		return false;
1350*38fd1498Szrj 	      single_ld = dr;
1351*38fd1498Szrj 	    }
1352*38fd1498Szrj 	  else
1353*38fd1498Szrj 	    {
1354*38fd1498Szrj 	      if (single_st != NULL)
1355*38fd1498Szrj 		return false;
1356*38fd1498Szrj 	      single_st = dr;
1357*38fd1498Szrj 	    }
1358*38fd1498Szrj 	}
1359*38fd1498Szrj     }
1360*38fd1498Szrj 
1361*38fd1498Szrj   if (!single_st)
1362*38fd1498Szrj     return false;
1363*38fd1498Szrj 
1364*38fd1498Szrj   /* Bail out if this is a bitfield memory reference.  */
1365*38fd1498Szrj   if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1366*38fd1498Szrj       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1367*38fd1498Szrj     return false;
1368*38fd1498Szrj 
1369*38fd1498Szrj   /* The data reference must be executed exactly once per iteration of
1370*38fd1498Szrj      each loop in the loop nest.  We only need to check dominance against
1371*38fd1498Szrj      the outermost loop in a perfect loop nest because a bb can't dominate
1372*38fd1498Szrj      the outermost loop's latch without dominating the inner loop's.  */
1373*38fd1498Szrj   basic_block bb_st = gimple_bb (DR_STMT (single_st));
1374*38fd1498Szrj   if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1375*38fd1498Szrj     return false;
1376*38fd1498Szrj 
1377*38fd1498Szrj   if (single_ld)
1378*38fd1498Szrj     {
1379*38fd1498Szrj       gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1380*38fd1498Szrj       /* Direct aggregate copy or via an SSA name temporary.  */
1381*38fd1498Szrj       if (load != store
1382*38fd1498Szrj 	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1383*38fd1498Szrj 	return false;
1384*38fd1498Szrj 
1385*38fd1498Szrj       /* Bail out if this is a bitfield memory reference.  */
1386*38fd1498Szrj       if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1387*38fd1498Szrj 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1388*38fd1498Szrj 	return false;
1389*38fd1498Szrj 
1390*38fd1498Szrj       /* Load and store must be in the same loop nest.  */
1391*38fd1498Szrj       basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1392*38fd1498Szrj       if (bb_st->loop_father != bb_ld->loop_father)
1393*38fd1498Szrj 	return false;
1394*38fd1498Szrj 
1395*38fd1498Szrj       /* Data reference must be executed exactly once per iteration.
1396*38fd1498Szrj 	 Same as single_st, we only need to check against the outermost
1397*38fd1498Szrj 	 loop.  */
1398*38fd1498Szrj       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1399*38fd1498Szrj 	return false;
1400*38fd1498Szrj 
1401*38fd1498Szrj       edge e = single_exit (bb_st->loop_father);
1402*38fd1498Szrj       bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1403*38fd1498Szrj       bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1404*38fd1498Szrj       if (dom_ld != dom_st)
1405*38fd1498Szrj 	return false;
1406*38fd1498Szrj     }
1407*38fd1498Szrj 
1408*38fd1498Szrj   *src_dr = single_ld;
1409*38fd1498Szrj   *dst_dr = single_st;
1410*38fd1498Szrj   return true;
1411*38fd1498Szrj }
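
/* Illustrative examples with hypothetical arrays A and B: the partition
   for

     for (i = 0; i < n; i++)
       a[i] = 0;

   yields *DST_DR = the store to a[i] and *SRC_DR = NULL, while

     for (i = 0; i < n; i++)
       a[i] = b[i];

   yields both the store and the load.  A partition whose statements
   contain two different stores, e.g. "a[i] = 0; b[i] = 0;", makes the
   function return false.  */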
1412*38fd1498Szrj 
1413*38fd1498Szrj /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1414*38fd1498Szrj    loops from inner to outer to see if the loop's step equals the access
1415*38fd1498Szrj    size at each level of the loop nest.  Return 2 if we can prove this at
1416*38fd1498Szrj    all loop levels; record the access base and size in BASE and SIZE; save
1417*38fd1498Szrj    the loop's step at each level in STEPS if it is not null.  For example:
1418*38fd1498Szrj 
1419*38fd1498Szrj      int arr[100][100][100];
1420*38fd1498Szrj      for (i = 0; i < 100; i++)       ;steps[2] = 40000
1421*38fd1498Szrj        for (j = 100; j > 0; j--)     ;steps[1] = -400
1422*38fd1498Szrj 	 for (k = 0; k < 100; k++)   ;steps[0] = 4
1423*38fd1498Szrj 	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000
1424*38fd1498Szrj 
1425*38fd1498Szrj    Return 1 if we can prove the equality for the innermost loop, but not
1426*38fd1498Szrj    for all loop levels.  In this case, no information is recorded.
1427*38fd1498Szrj 
1428*38fd1498Szrj    Return 0 if the equality can't be proven at any loop level.  */
1429*38fd1498Szrj 
1430*38fd1498Szrj static int
1431*38fd1498Szrj compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1432*38fd1498Szrj 		      tree *size, vec<tree> *steps = NULL)
1433*38fd1498Szrj {
1434*38fd1498Szrj   location_t loc = gimple_location (DR_STMT (dr));
1435*38fd1498Szrj   basic_block bb = gimple_bb (DR_STMT (dr));
1436*38fd1498Szrj   struct loop *loop = bb->loop_father;
1437*38fd1498Szrj   tree ref = DR_REF (dr);
1438*38fd1498Szrj   tree access_base = build_fold_addr_expr (ref);
1439*38fd1498Szrj   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1440*38fd1498Szrj   int res = 0;
1441*38fd1498Szrj 
1442*38fd1498Szrj   do {
1443*38fd1498Szrj       tree scev_fn = analyze_scalar_evolution (loop, access_base);
1444*38fd1498Szrj       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1445*38fd1498Szrj 	return res;
1446*38fd1498Szrj 
1447*38fd1498Szrj       access_base = CHREC_LEFT (scev_fn);
1448*38fd1498Szrj       if (tree_contains_chrecs (access_base, NULL))
1449*38fd1498Szrj 	return res;
1450*38fd1498Szrj 
1451*38fd1498Szrj       tree scev_step = CHREC_RIGHT (scev_fn);
1452*38fd1498Szrj       /* Only support constant steps.  */
1453*38fd1498Szrj       if (TREE_CODE (scev_step) != INTEGER_CST)
1454*38fd1498Szrj 	return res;
1455*38fd1498Szrj 
1456*38fd1498Szrj       enum ev_direction access_dir = scev_direction (scev_fn);
1457*38fd1498Szrj       if (access_dir == EV_DIR_UNKNOWN)
1458*38fd1498Szrj 	return res;
1459*38fd1498Szrj 
1460*38fd1498Szrj       if (steps != NULL)
1461*38fd1498Szrj 	steps->safe_push (scev_step);
1462*38fd1498Szrj 
1463*38fd1498Szrj       scev_step = fold_convert_loc (loc, sizetype, scev_step);
1464*38fd1498Szrj       /* Compute absolute value of scev step.  */
1465*38fd1498Szrj       if (access_dir == EV_DIR_DECREASES)
1466*38fd1498Szrj 	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1467*38fd1498Szrj 
1468*38fd1498Szrj       /* At each loop level, the scev step must equal the access size; in
1469*38fd1498Szrj 	 other words, DR must access consecutive memory across iterations.  */
1470*38fd1498Szrj       if (!operand_equal_p (scev_step, access_size, 0))
1471*38fd1498Szrj 	return res;
1472*38fd1498Szrj 
1473*38fd1498Szrj       /* The access stride can be computed for the data reference at least
1474*38fd1498Szrj 	 for the innermost loop.  */
1475*38fd1498Szrj       res = 1;
1476*38fd1498Szrj 
1477*38fd1498Szrj       /* Compute the number of times DR is executed in the loop.  */
1478*38fd1498Szrj       tree niters = number_of_latch_executions (loop);
1479*38fd1498Szrj       niters = fold_convert_loc (loc, sizetype, niters);
1480*38fd1498Szrj       if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1481*38fd1498Szrj 	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1482*38fd1498Szrj 
1483*38fd1498Szrj       /* Compute DR's overall access size in loop.  */
1484*38fd1498Szrj       access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1485*38fd1498Szrj 				     niters, scev_step);
1486*38fd1498Szrj       /* Adjust base address in case of negative step.  */
1487*38fd1498Szrj       if (access_dir == EV_DIR_DECREASES)
1488*38fd1498Szrj 	{
1489*38fd1498Szrj 	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1490*38fd1498Szrj 				      scev_step, access_size);
1491*38fd1498Szrj 	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1492*38fd1498Szrj 	}
1493*38fd1498Szrj   } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1494*38fd1498Szrj 
1495*38fd1498Szrj   *base = access_base;
1496*38fd1498Szrj   *size = access_size;
1497*38fd1498Szrj   /* The access stride can be computed for the data reference at every
1498*38fd1498Szrj      loop level.  */
1498*38fd1498Szrj   return 2;
1499*38fd1498Szrj }
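
/* Worked example (hypothetical array, assuming the store dominates the
   loop exit):

     int a[100];
     for (int i = 99; i >= 0; i--)
       a[i] = 0;

   The scev step is -4 and its absolute value equals the access size 4.
   The store executes 100 times, so the overall access size becomes
   100 * 4 = 400; because the step is negative, the base &a[99] is
   adjusted by 4 - 400 = -396 down to &a[0].  The function returns 2
   with *BASE = &a[0] and *SIZE = 400.  */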
1500*38fd1498Szrj 
1501*38fd1498Szrj /* Allocate and return builtin struct.  Record information like DST_DR,
1502*38fd1498Szrj    SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
1503*38fd1498Szrj 
1504*38fd1498Szrj static struct builtin_info *
1505*38fd1498Szrj alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1506*38fd1498Szrj 	       tree dst_base, tree src_base, tree size)
1507*38fd1498Szrj {
1508*38fd1498Szrj   struct builtin_info *builtin = XNEW (struct builtin_info);
1509*38fd1498Szrj   builtin->dst_dr = dst_dr;
1510*38fd1498Szrj   builtin->src_dr = src_dr;
1511*38fd1498Szrj   builtin->dst_base = dst_base;
1512*38fd1498Szrj   builtin->src_base = src_base;
1513*38fd1498Szrj   builtin->size = size;
1514*38fd1498Szrj   return builtin;
1515*38fd1498Szrj }
1516*38fd1498Szrj 
1517*38fd1498Szrj /* Given data reference DR in loop nest LOOP, classify whether it forms a
1518*38fd1498Szrj    builtin memset call.  */
1519*38fd1498Szrj 
1520*38fd1498Szrj static void
1521*38fd1498Szrj classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1522*38fd1498Szrj {
1523*38fd1498Szrj   gimple *stmt = DR_STMT (dr);
1524*38fd1498Szrj   tree base, size, rhs = gimple_assign_rhs1 (stmt);
1525*38fd1498Szrj 
1526*38fd1498Szrj   if (const_with_all_bytes_same (rhs) == -1
1527*38fd1498Szrj       && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1528*38fd1498Szrj 	  || (TYPE_MODE (TREE_TYPE (rhs))
1529*38fd1498Szrj 	      != TYPE_MODE (unsigned_char_type_node))))
1530*38fd1498Szrj     return;
1531*38fd1498Szrj 
1532*38fd1498Szrj   if (TREE_CODE (rhs) == SSA_NAME
1533*38fd1498Szrj       && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1534*38fd1498Szrj       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1535*38fd1498Szrj     return;
1536*38fd1498Szrj 
1537*38fd1498Szrj   int res = compute_access_range (loop, dr, &base, &size);
1538*38fd1498Szrj   if (res == 0)
1539*38fd1498Szrj     return;
1540*38fd1498Szrj   if (res == 1)
1541*38fd1498Szrj     {
1542*38fd1498Szrj       partition->kind = PKIND_PARTIAL_MEMSET;
1543*38fd1498Szrj       return;
1544*38fd1498Szrj     }
1545*38fd1498Szrj 
1546*38fd1498Szrj   poly_uint64 base_offset;
1547*38fd1498Szrj   unsigned HOST_WIDE_INT const_base_offset;
1548*38fd1498Szrj   tree base_base = strip_offset (base, &base_offset);
1549*38fd1498Szrj   if (!base_offset.is_constant (&const_base_offset))
1550*38fd1498Szrj     return;
1551*38fd1498Szrj 
1552*38fd1498Szrj   struct builtin_info *builtin;
1553*38fd1498Szrj   builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1554*38fd1498Szrj   builtin->dst_base_base = base_base;
1555*38fd1498Szrj   builtin->dst_base_offset = const_base_offset;
1556*38fd1498Szrj   partition->builtin = builtin;
1557*38fd1498Szrj   partition->kind = PKIND_MEMSET;
1558*38fd1498Szrj }
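
/* Illustrative examples with a hypothetical int array A:

     for (i = 0; i < n; i++)
       a[i] = 0;

   is classified as PKIND_MEMSET: the stored value has all bytes the
   same and the access covers n * 4 consecutive bytes.  A nest such as

     int a[100][100];
     for (i = 0; i < 100; i++)
       for (j = 0; j < 50; j++)
	 a[i][j] = 0;

   writes only half of each row, so step equals access size for the
   inner loop but not the outer one; compute_access_range returns 1 and
   the partition is classified as PKIND_PARTIAL_MEMSET.  A store such as
   a[i] = 0x12345678 is rejected up front because its bytes differ.  */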
1559*38fd1498Szrj 
1560*38fd1498Szrj /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
1561*38fd1498Szrj    classify whether they form a builtin memcpy or memmove call.  */
1562*38fd1498Szrj 
1563*38fd1498Szrj static void
1564*38fd1498Szrj classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1565*38fd1498Szrj 		       data_reference_p dst_dr, data_reference_p src_dr)
1566*38fd1498Szrj {
1567*38fd1498Szrj   tree base, size, src_base, src_size;
1568*38fd1498Szrj   auto_vec<tree> dst_steps, src_steps;
1569*38fd1498Szrj 
1570*38fd1498Szrj   /* Compute access range of both load and store.  */
1571*38fd1498Szrj   int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1572*38fd1498Szrj   if (res != 2)
1573*38fd1498Szrj     return;
1574*38fd1498Szrj   res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1575*38fd1498Szrj   if (res != 2)
1576*38fd1498Szrj     return;
1577*38fd1498Szrj 
1578*38fd1498Szrj   /* They must have the same access size.  */
1579*38fd1498Szrj   if (!operand_equal_p (size, src_size, 0))
1580*38fd1498Szrj     return;
1581*38fd1498Szrj 
1582*38fd1498Szrj   /* The load and store in the loop nest must access memory in the same
1583*38fd1498Szrj      way, i.e., they must have the same steps in each loop of the nest.  */
1584*38fd1498Szrj   if (dst_steps.length () != src_steps.length ())
1585*38fd1498Szrj     return;
1586*38fd1498Szrj   for (unsigned i = 0; i < dst_steps.length (); ++i)
1587*38fd1498Szrj     if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1588*38fd1498Szrj       return;
1589*38fd1498Szrj 
1590*38fd1498Szrj   /* Now check whether there is a dependence.  */
1591*38fd1498Szrj   ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1592*38fd1498Szrj 
1593*38fd1498Szrj   /* Classify as memcpy if no dependence between load and store.  */
1594*38fd1498Szrj   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1595*38fd1498Szrj     {
1596*38fd1498Szrj       partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1597*38fd1498Szrj       partition->kind = PKIND_MEMCPY;
1598*38fd1498Szrj       return;
1599*38fd1498Szrj     }
1600*38fd1498Szrj 
1601*38fd1498Szrj   /* Can't do memmove in case of an unknown dependence or a dependence
1602*38fd1498Szrj      without a classical distance vector.  */
1603*38fd1498Szrj   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1604*38fd1498Szrj       || DDR_NUM_DIST_VECTS (ddr) == 0)
1605*38fd1498Szrj     return;
1606*38fd1498Szrj 
1607*38fd1498Szrj   unsigned i;
1608*38fd1498Szrj   lambda_vector dist_v;
1609*38fd1498Szrj   int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1610*38fd1498Szrj   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1611*38fd1498Szrj     {
1612*38fd1498Szrj       unsigned dep_lev = dependence_level (dist_v, num_lev);
1613*38fd1498Szrj       /* Can't do memmove if load depends on store.  */
1614*38fd1498Szrj       if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1615*38fd1498Szrj 	return;
1616*38fd1498Szrj     }
1617*38fd1498Szrj 
1618*38fd1498Szrj   partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1619*38fd1498Szrj   partition->kind = PKIND_MEMMOVE;
1620*38fd1498Szrj   return;
1621*38fd1498Szrj }
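
/* Illustrative examples with hypothetical arrays and 0 < k < n:

     for (i = 0; i < n; i++)
       a[i] = b[i];

   has no dependence between load and store when A and B are known to be
   disjoint, so it is classified as PKIND_MEMCPY.  The overlapping copy

     for (i = 0; i < n - k; i++)
       a[i] = a[i + k];

   reads each element before it is overwritten, which matches memmove
   semantics, so it is classified as PKIND_MEMMOVE.  The reverse form
   a[i + k] = a[i] replicates values instead of copying the original
   ones, so no builtin is generated for it.  */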
1622*38fd1498Szrj 
1623*38fd1498Szrj /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1624*38fd1498Szrj    For the moment we detect memset, memcpy and memmove patterns.  Bitmap
1625*38fd1498Szrj    STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.  */
1626*38fd1498Szrj 
1627*38fd1498Szrj static void
1628*38fd1498Szrj classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1629*38fd1498Szrj 		    bitmap stmt_in_all_partitions)
1630*38fd1498Szrj {
1631*38fd1498Szrj   bitmap_iterator bi;
1632*38fd1498Szrj   unsigned i;
1633*38fd1498Szrj   data_reference_p single_ld = NULL, single_st = NULL;
1634*38fd1498Szrj   bool volatiles_p = false, has_reduction = false;
1635*38fd1498Szrj 
1636*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1637*38fd1498Szrj     {
1638*38fd1498Szrj       gimple *stmt = RDG_STMT (rdg, i);
1639*38fd1498Szrj 
1640*38fd1498Szrj       if (gimple_has_volatile_ops (stmt))
1641*38fd1498Szrj 	volatiles_p = true;
1642*38fd1498Szrj 
1643*38fd1498Szrj       /* If the stmt is not included in all partitions and has uses outside
1644*38fd1498Szrj 	 of the loop, then mark the partition as a reduction.  */
1645*38fd1498Szrj       if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1646*38fd1498Szrj 	{
1647*38fd1498Szrj 	  /* Due to a limitation in the transform phase we have to fuse all
1648*38fd1498Szrj 	     reduction partitions.  As a result, this could cancel valid
1649*38fd1498Szrj 	     loop distribution, especially for a loop whose induction
1650*38fd1498Szrj 	     variable is used outside of the loop.  To work around this
1651*38fd1498Szrj 	     issue, we skip marking the partition as a reduction if the
1652*38fd1498Szrj 	     reduction stmt belongs to all partitions; in that case the
1653*38fd1498Szrj 	     reduction is computed correctly however partitions are fused.  */
1654*38fd1498Szrj 	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
1655*38fd1498Szrj 	    {
1656*38fd1498Szrj 	      partition->reduction_p = true;
1657*38fd1498Szrj 	      return;
1658*38fd1498Szrj 	    }
1659*38fd1498Szrj 	  has_reduction = true;
1660*38fd1498Szrj 	}
1661*38fd1498Szrj     }
1662*38fd1498Szrj 
1663*38fd1498Szrj   /* Perform general partition disqualification for builtins.  */
1664*38fd1498Szrj   if (volatiles_p
1665*38fd1498Szrj       /* Simple workaround to prevent classifying the partition as builtin
1666*38fd1498Szrj 	 if it contains any use outside of loop.  */
1667*38fd1498Szrj       || has_reduction
1668*38fd1498Szrj       || !flag_tree_loop_distribute_patterns)
1669*38fd1498Szrj     return;
1670*38fd1498Szrj 
1671*38fd1498Szrj   /* Find single load/store data references for builtin partition.  */
1672*38fd1498Szrj   if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1673*38fd1498Szrj     return;
1674*38fd1498Szrj 
1675*38fd1498Szrj   /* Classify the builtin kind.  */
1676*38fd1498Szrj   if (single_ld == NULL)
1677*38fd1498Szrj     classify_builtin_st (loop, partition, single_st);
1678*38fd1498Szrj   else
1679*38fd1498Szrj     classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1680*38fd1498Szrj }
1681*38fd1498Szrj 
1682*38fd1498Szrj /* Returns true when PARTITION1 and PARTITION2 access the same memory
1683*38fd1498Szrj    object in RDG.  */
1684*38fd1498Szrj 
1685*38fd1498Szrj static bool
1686*38fd1498Szrj share_memory_accesses (struct graph *rdg,
1687*38fd1498Szrj 		       partition *partition1, partition *partition2)
1688*38fd1498Szrj {
1689*38fd1498Szrj   unsigned i, j;
1690*38fd1498Szrj   bitmap_iterator bi, bj;
1691*38fd1498Szrj   data_reference_p dr1, dr2;
1692*38fd1498Szrj 
1693*38fd1498Szrj   /* First check whether the intersection of the two partitions contains
1694*38fd1498Szrj      any loads or stores.  Common loads are the situation that happens
1695*38fd1498Szrj      most often.  */
1696*38fd1498Szrj   EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1697*38fd1498Szrj     if (RDG_MEM_WRITE_STMT (rdg, i)
1698*38fd1498Szrj 	|| RDG_MEM_READS_STMT (rdg, i))
1699*38fd1498Szrj       return true;
1700*38fd1498Szrj 
1701*38fd1498Szrj   /* Then check whether the two partitions access the same memory object.  */
1702*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1703*38fd1498Szrj     {
1704*38fd1498Szrj       dr1 = datarefs_vec[i];
1705*38fd1498Szrj 
1706*38fd1498Szrj       if (!DR_BASE_ADDRESS (dr1)
1707*38fd1498Szrj 	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1708*38fd1498Szrj 	continue;
1709*38fd1498Szrj 
1710*38fd1498Szrj       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1711*38fd1498Szrj 	{
1712*38fd1498Szrj 	  dr2 = datarefs_vec[j];
1713*38fd1498Szrj 
1714*38fd1498Szrj 	  if (!DR_BASE_ADDRESS (dr2)
1715*38fd1498Szrj 	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1716*38fd1498Szrj 	    continue;
1717*38fd1498Szrj 
1718*38fd1498Szrj 	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1719*38fd1498Szrj 	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1720*38fd1498Szrj 	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1721*38fd1498Szrj 	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1722*38fd1498Szrj 	    return true;
1723*38fd1498Szrj 	}
1724*38fd1498Szrj     }
1725*38fd1498Szrj 
1726*38fd1498Szrj   return false;
1727*38fd1498Szrj }
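
/* For example (hypothetical arrays), the partitions seeded by the two
   stores in

     for (i = 0; i < n; i++)
       {
	 a[i] = b[i] + 1;
	 c[i] = b[i] * 2;
       }

   both contain a load of b[i] with the same base address, offset, init
   and step, so the function returns true; the caller may use this to
   fuse partitions that access the same memory for better locality.  */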
1728*38fd1498Szrj 
1729*38fd1498Szrj /* For each seed statement in STARTING_STMTS, this function builds a
1730*38fd1498Szrj    partition for it by adding dependent statements according to RDG.
1731*38fd1498Szrj    All partitions are recorded in PARTITIONS.  */
1732*38fd1498Szrj 
1733*38fd1498Szrj static void
1734*38fd1498Szrj rdg_build_partitions (struct graph *rdg,
1735*38fd1498Szrj 		      vec<gimple *> starting_stmts,
1736*38fd1498Szrj 		      vec<partition *> *partitions)
1737*38fd1498Szrj {
1738*38fd1498Szrj   auto_bitmap processed;
1739*38fd1498Szrj   int i;
1740*38fd1498Szrj   gimple *stmt;
1741*38fd1498Szrj 
1742*38fd1498Szrj   FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1743*38fd1498Szrj     {
1744*38fd1498Szrj       int v = rdg_vertex_for_stmt (rdg, stmt);
1745*38fd1498Szrj 
1746*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1747*38fd1498Szrj 	fprintf (dump_file,
1748*38fd1498Szrj 		 "ldist asked to generate code for vertex %d\n", v);
1749*38fd1498Szrj 
1750*38fd1498Szrj       /* If the vertex is already contained in another partition, so
1751*38fd1498Szrj          is the partition rooted at it.  */
1752*38fd1498Szrj       if (bitmap_bit_p (processed, v))
1753*38fd1498Szrj 	continue;
1754*38fd1498Szrj 
1755*38fd1498Szrj       partition *partition = build_rdg_partition_for_vertex (rdg, v);
1756*38fd1498Szrj       bitmap_ior_into (processed, partition->stmts);
1757*38fd1498Szrj 
1758*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1759*38fd1498Szrj 	{
1760*38fd1498Szrj 	  fprintf (dump_file, "ldist creates useful %s partition:\n",
1761*38fd1498Szrj 		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequential");
1762*38fd1498Szrj 	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
1763*38fd1498Szrj 	}
1764*38fd1498Szrj 
1765*38fd1498Szrj       partitions->safe_push (partition);
1766*38fd1498Szrj     }
1767*38fd1498Szrj 
1768*38fd1498Szrj   /* All vertices should have been assigned to at least one partition now,
1769*38fd1498Szrj      other than vertices belonging to dead code.  */
1770*38fd1498Szrj }
1771*38fd1498Szrj 
1772*38fd1498Szrj /* Dump to FILE the PARTITIONS.  */
1773*38fd1498Szrj 
1774*38fd1498Szrj static void
1775*38fd1498Szrj dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1776*38fd1498Szrj {
1777*38fd1498Szrj   int i;
1778*38fd1498Szrj   partition *partition;
1779*38fd1498Szrj 
1780*38fd1498Szrj   FOR_EACH_VEC_ELT (partitions, i, partition)
1781*38fd1498Szrj     debug_bitmap_file (file, partition->stmts);
1782*38fd1498Szrj }
1783*38fd1498Szrj 
1784*38fd1498Szrj /* Debug PARTITIONS.  */
1785*38fd1498Szrj extern void debug_rdg_partitions (vec<partition *> );
1786*38fd1498Szrj 
1787*38fd1498Szrj DEBUG_FUNCTION void
1788*38fd1498Szrj debug_rdg_partitions (vec<partition *> partitions)
1789*38fd1498Szrj {
1790*38fd1498Szrj   dump_rdg_partitions (stderr, partitions);
1791*38fd1498Szrj }
1792*38fd1498Szrj 
1793*38fd1498Szrj /* Returns the number of read and write operations in the RDG.  */
1794*38fd1498Szrj 
1795*38fd1498Szrj static int
1796*38fd1498Szrj number_of_rw_in_rdg (struct graph *rdg)
1797*38fd1498Szrj {
1798*38fd1498Szrj   int i, res = 0;
1799*38fd1498Szrj 
1800*38fd1498Szrj   for (i = 0; i < rdg->n_vertices; i++)
1801*38fd1498Szrj     {
1802*38fd1498Szrj       if (RDG_MEM_WRITE_STMT (rdg, i))
1803*38fd1498Szrj 	++res;
1804*38fd1498Szrj 
1805*38fd1498Szrj       if (RDG_MEM_READS_STMT (rdg, i))
1806*38fd1498Szrj 	++res;
1807*38fd1498Szrj     }
1808*38fd1498Szrj 
1809*38fd1498Szrj   return res;
1810*38fd1498Szrj }
1811*38fd1498Szrj 
1812*38fd1498Szrj /* Returns the number of read and write operations in a PARTITION of
1813*38fd1498Szrj    the RDG.  */
1814*38fd1498Szrj 
1815*38fd1498Szrj static int
1816*38fd1498Szrj number_of_rw_in_partition (struct graph *rdg, partition *partition)
1817*38fd1498Szrj {
1818*38fd1498Szrj   int res = 0;
1819*38fd1498Szrj   unsigned i;
1820*38fd1498Szrj   bitmap_iterator ii;
1821*38fd1498Szrj 
1822*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1823*38fd1498Szrj     {
1824*38fd1498Szrj       if (RDG_MEM_WRITE_STMT (rdg, i))
1825*38fd1498Szrj 	++res;
1826*38fd1498Szrj 
1827*38fd1498Szrj       if (RDG_MEM_READS_STMT (rdg, i))
1828*38fd1498Szrj 	++res;
1829*38fd1498Szrj     }
1830*38fd1498Szrj 
1831*38fd1498Szrj   return res;
1832*38fd1498Szrj }
1833*38fd1498Szrj 
1834*38fd1498Szrj /* Returns true when one of the PARTITIONS contains all the read or
1835*38fd1498Szrj    write operations of RDG.  */
1836*38fd1498Szrj 
1837*38fd1498Szrj static bool
1838*38fd1498Szrj partition_contains_all_rw (struct graph *rdg,
1839*38fd1498Szrj 			   vec<partition *> partitions)
1840*38fd1498Szrj {
1841*38fd1498Szrj   int i;
1842*38fd1498Szrj   partition *partition;
1843*38fd1498Szrj   int nrw = number_of_rw_in_rdg (rdg);
1844*38fd1498Szrj 
1845*38fd1498Szrj   FOR_EACH_VEC_ELT (partitions, i, partition)
1846*38fd1498Szrj     if (nrw == number_of_rw_in_partition (rdg, partition))
1847*38fd1498Szrj       return true;
1848*38fd1498Szrj 
1849*38fd1498Szrj   return false;
1850*38fd1498Szrj }
1851*38fd1498Szrj 
1852*38fd1498Szrj /* Compute the partition dependence created by the data references in
1853*38fd1498Szrj    DRS1 and DRS2, and modify and return DIR according to it.  If
1854*38fd1498Szrj    ALIAS_DDRS is not NULL, we record dependences introduced by a possible
1855*38fd1498Szrj    alias between two data references in ALIAS_DDRS; otherwise, we simply
1856*38fd1498Szrj    ignore such dependences as if they don't exist at all.  */
1857*38fd1498Szrj 
1858*38fd1498Szrj static int
1859*38fd1498Szrj pg_add_dependence_edges (struct graph *rdg, int dir,
1860*38fd1498Szrj 			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1861*38fd1498Szrj {
1862*38fd1498Szrj   unsigned i, j;
1863*38fd1498Szrj   bitmap_iterator bi, bj;
1864*38fd1498Szrj   data_reference_p dr1, dr2, saved_dr1;
1865*38fd1498Szrj 
1866*38fd1498Szrj   /* dependence direction - 0 is no dependence, -1 is back,
1867*38fd1498Szrj      1 is forth, 2 is both (we can stop then, merging will occur).  */
1868*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1869*38fd1498Szrj     {
1870*38fd1498Szrj       dr1 = datarefs_vec[i];
1871*38fd1498Szrj 
1872*38fd1498Szrj       EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1873*38fd1498Szrj 	{
1874*38fd1498Szrj 	  int res, this_dir = 1;
1875*38fd1498Szrj 	  ddr_p ddr;
1876*38fd1498Szrj 
1877*38fd1498Szrj 	  dr2 = datarefs_vec[j];
1878*38fd1498Szrj 
1879*38fd1498Szrj 	  /* Skip all <read, read> data dependence.  */
1880*38fd1498Szrj 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1881*38fd1498Szrj 	    continue;
1882*38fd1498Szrj 
1883*38fd1498Szrj 	  saved_dr1 = dr1;
1884*38fd1498Szrj 	  /* Re-shuffle data-refs to be in topological order.  */
1885*38fd1498Szrj 	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1886*38fd1498Szrj 	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1887*38fd1498Szrj 	    {
1888*38fd1498Szrj 	      std::swap (dr1, dr2);
1889*38fd1498Szrj 	      this_dir = -this_dir;
1890*38fd1498Szrj 	    }
1891*38fd1498Szrj 	  ddr = get_data_dependence (rdg, dr1, dr2);
1892*38fd1498Szrj 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1893*38fd1498Szrj 	    {
1894*38fd1498Szrj 	      this_dir = 0;
1895*38fd1498Szrj 	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1896*38fd1498Szrj 					   DR_BASE_ADDRESS (dr2));
1897*38fd1498Szrj 	      /* Be conservative.  If the data references are not well
1898*38fd1498Szrj 		 analyzed, or the two data references have the same base
1899*38fd1498Szrj 		 address and offset, add the dependence and consider them
1900*38fd1498Szrj 		 to alias each other.  In other words, the dependence cannot
1901*38fd1498Szrj 		 be resolved by a runtime alias check.  */
1902*38fd1498Szrj 	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1903*38fd1498Szrj 		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1904*38fd1498Szrj 		  || !DR_INIT (dr1) || !DR_INIT (dr2)
1905*38fd1498Szrj 		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1906*38fd1498Szrj 		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1907*38fd1498Szrj 		  || res == 0)
1908*38fd1498Szrj 		this_dir = 2;
1909*38fd1498Szrj 	      /* Data dependence could be resolved by runtime alias check,
1910*38fd1498Szrj 		 record it in ALIAS_DDRS.  */
1911*38fd1498Szrj 	      else if (alias_ddrs != NULL)
1912*38fd1498Szrj 		alias_ddrs->safe_push (ddr);
1913*38fd1498Szrj 	      /* Or simply ignore it.  */
1914*38fd1498Szrj 	    }
1915*38fd1498Szrj 	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1916*38fd1498Szrj 	    {
1917*38fd1498Szrj 	      if (DDR_REVERSED_P (ddr))
1918*38fd1498Szrj 		this_dir = -this_dir;
1919*38fd1498Szrj 
1920*38fd1498Szrj 	      /* Known dependences can still be unordered throughout the
1921*38fd1498Szrj 		 iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
1922*38fd1498Szrj 	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
1923*38fd1498Szrj 		this_dir = 2;
1924*38fd1498Szrj 	      /* If the overlap is exact preserve stmt order.  */
1925*38fd1498Szrj 	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1926*38fd1498Szrj 		;
1927*38fd1498Szrj 	      /* Else, as the distance vector is lexicographically positive,
1928*38fd1498Szrj 		 swap the dependence direction.  */
1929*38fd1498Szrj 	      else
1930*38fd1498Szrj 		this_dir = -this_dir;
1931*38fd1498Szrj 	    }
1932*38fd1498Szrj 	  else
1933*38fd1498Szrj 	    this_dir = 0;
1934*38fd1498Szrj 	  if (this_dir == 2)
1935*38fd1498Szrj 	    return 2;
1936*38fd1498Szrj 	  else if (dir == 0)
1937*38fd1498Szrj 	    dir = this_dir;
1938*38fd1498Szrj 	  else if (this_dir != 0 && dir != this_dir)
1939*38fd1498Szrj 	    return 2;
1940*38fd1498Szrj 	  /* Shuffle "back" dr1.  */
1941*38fd1498Szrj 	  dr1 = saved_dr1;
1942*38fd1498Szrj 	}
1943*38fd1498Szrj     }
1944*38fd1498Szrj   return dir;
1945*38fd1498Szrj }
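
/* For illustration: if DRS1 contains a store to a[i] and DRS2 a load of
   the same a[i] in the same iteration (and the store is topologically
   first), the distance vector is zero, statement order is preserved and
   the function returns 1: the first partition must be scheduled before
   the second.  If other data reference pairs imply the opposite
   direction as well, the result is 2 and the caller has to merge the
   two partitions.  */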
1946*38fd1498Szrj 
1947*38fd1498Szrj /* Compare postorder number of the partition graph vertices V1 and V2.  */
1948*38fd1498Szrj 
1949*38fd1498Szrj static int
1950*38fd1498Szrj pgcmp (const void *v1_, const void *v2_)
1951*38fd1498Szrj {
1952*38fd1498Szrj   const vertex *v1 = (const vertex *)v1_;
1953*38fd1498Szrj   const vertex *v2 = (const vertex *)v2_;
1954*38fd1498Szrj   return v2->post - v1->post;
1955*38fd1498Szrj }
1956*38fd1498Szrj 
1957*38fd1498Szrj /* Data attached to vertices of partition dependence graph.  */
1958*38fd1498Szrj struct pg_vdata
1959*38fd1498Szrj {
1960*38fd1498Szrj   /* ID of the corresponding partition.  */
1961*38fd1498Szrj   int id;
1962*38fd1498Szrj   /* The partition.  */
1963*38fd1498Szrj   struct partition *partition;
1964*38fd1498Szrj };
1965*38fd1498Szrj 
1966*38fd1498Szrj /* Data attached to edges of partition dependence graph.  */
1967*38fd1498Szrj struct pg_edata
1968*38fd1498Szrj {
1969*38fd1498Szrj   /* If the dependence edge can be resolved by runtime alias check,
1970*38fd1498Szrj      this vector contains data dependence relations for runtime alias
1971*38fd1498Szrj      check.  On the other hand, if the dependence edge is introduced
1972*38fd1498Szrj      because of compilation time known data dependence, this vector
1973*38fd1498Szrj      contains nothing.  */
1974*38fd1498Szrj   vec<ddr_p> alias_ddrs;
1975*38fd1498Szrj };
1976*38fd1498Szrj 
1977*38fd1498Szrj /* Callback data for traversing edges in graph.  */
1978*38fd1498Szrj struct pg_edge_callback_data
1979*38fd1498Szrj {
1980*38fd1498Szrj   /* Bitmap of strongly connected components that should be merged.  */
1981*38fd1498Szrj   bitmap sccs_to_merge;
1982*38fd1498Szrj   /* Array containing component information for all vertices.  */
1983*38fd1498Szrj   int *vertices_component;
1984*38fd1498Szrj   /* Vector to record all data dependence relations which are needed
1985*38fd1498Szrj      to break strongly connected components by runtime alias checks.  */
1986*38fd1498Szrj   vec<ddr_p> *alias_ddrs;
1987*38fd1498Szrj };
1988*38fd1498Szrj 
1989*38fd1498Szrj /* Initialize the vertices' data for partition dependence graph PG with
1990*38fd1498Szrj    PARTITIONS.  */
1991*38fd1498Szrj 
1992*38fd1498Szrj static void
1993*38fd1498Szrj init_partition_graph_vertices (struct graph *pg,
1994*38fd1498Szrj 			       vec<struct partition *> *partitions)
1995*38fd1498Szrj {
1996*38fd1498Szrj   int i;
1997*38fd1498Szrj   partition *partition;
1998*38fd1498Szrj   struct pg_vdata *data;
1999*38fd1498Szrj 
2000*38fd1498Szrj   for (i = 0; partitions->iterate (i, &partition); ++i)
2001*38fd1498Szrj     {
2002*38fd1498Szrj       data = new pg_vdata;
2003*38fd1498Szrj       pg->vertices[i].data = data;
2004*38fd1498Szrj       data->id = i;
2005*38fd1498Szrj       data->partition = partition;
2006*38fd1498Szrj     }
2007*38fd1498Szrj }
2008*38fd1498Szrj 
2009*38fd1498Szrj /* Add edge <I, J> to partition dependence graph PG.  Attach vector of data
2010*38fd1498Szrj    dependence relations to the EDGE if DDRS isn't NULL.  */
2011*38fd1498Szrj 
2012*38fd1498Szrj static void
2013*38fd1498Szrj add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2014*38fd1498Szrj {
2015*38fd1498Szrj   struct graph_edge *e = add_edge (pg, i, j);
2016*38fd1498Szrj 
2017*38fd1498Szrj   /* If the edge has data dependence relations attached, this dependence
2018*38fd1498Szrj      edge can be resolved by runtime alias checks.  */
2019*38fd1498Szrj   if (ddrs != NULL)
2020*38fd1498Szrj     {
2021*38fd1498Szrj       struct pg_edata *data = new pg_edata;
2022*38fd1498Szrj 
2023*38fd1498Szrj       gcc_assert (ddrs->length () > 0);
2024*38fd1498Szrj       e->data = data;
2025*38fd1498Szrj       data->alias_ddrs = vNULL;
2026*38fd1498Szrj       data->alias_ddrs.safe_splice (*ddrs);
2027*38fd1498Szrj     }
2028*38fd1498Szrj }
2029*38fd1498Szrj 
2030*38fd1498Szrj /* Callback function for the graph traversal algorithm.  It returns true
2031*38fd1498Szrj    if edge E should be skipped when traversing the graph.  */
2032*38fd1498Szrj 
2033*38fd1498Szrj static bool
2034*38fd1498Szrj pg_skip_alias_edge (struct graph_edge *e)
2035*38fd1498Szrj {
2036*38fd1498Szrj   struct pg_edata *data = (struct pg_edata *)e->data;
2037*38fd1498Szrj   return (data != NULL && data->alias_ddrs.length () > 0);
2038*38fd1498Szrj }
2039*38fd1498Szrj 
2040*38fd1498Szrj /* Callback function freeing data attached to edge E of graph.  */
2041*38fd1498Szrj 
2042*38fd1498Szrj static void
2043*38fd1498Szrj free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2044*38fd1498Szrj {
2045*38fd1498Szrj   if (e->data != NULL)
2046*38fd1498Szrj     {
2047*38fd1498Szrj       struct pg_edata *data = (struct pg_edata *)e->data;
2048*38fd1498Szrj       data->alias_ddrs.release ();
2049*38fd1498Szrj       delete data;
2050*38fd1498Szrj     }
2051*38fd1498Szrj }
2052*38fd1498Szrj 
2053*38fd1498Szrj /* Free data attached to the vertices of partition dependence graph PG.  */
2054*38fd1498Szrj 
2055*38fd1498Szrj static void
2056*38fd1498Szrj free_partition_graph_vdata (struct graph *pg)
2057*38fd1498Szrj {
2058*38fd1498Szrj   int i;
2059*38fd1498Szrj   struct pg_vdata *data;
2060*38fd1498Szrj 
2061*38fd1498Szrj   for (i = 0; i < pg->n_vertices; ++i)
2062*38fd1498Szrj     {
2063*38fd1498Szrj       data = (struct pg_vdata *)pg->vertices[i].data;
2064*38fd1498Szrj       delete data;
2065*38fd1498Szrj     }
2066*38fd1498Szrj }
2067*38fd1498Szrj 
2068*38fd1498Szrj /* Build and return the partition dependence graph for PARTITIONS.  RDG is
2069*38fd1498Szrj    the reduced dependence graph for the loop to be distributed.  If
2070*38fd1498Szrj    IGNORE_ALIAS_P is true, data dependence caused by possible alias between
2071*38fd1498Szrj    references is ignored, as if it doesn't exist at all; otherwise all
2072*38fd1498Szrj    dependences are considered.  */
2073*38fd1498Szrj 
2074*38fd1498Szrj static struct graph *
2075*38fd1498Szrj build_partition_graph (struct graph *rdg,
2076*38fd1498Szrj 		       vec<struct partition *> *partitions,
2077*38fd1498Szrj 		       bool ignore_alias_p)
2078*38fd1498Szrj {
2079*38fd1498Szrj   int i, j;
2080*38fd1498Szrj   struct partition *partition1, *partition2;
2081*38fd1498Szrj   graph *pg = new_graph (partitions->length ());
2082*38fd1498Szrj   auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2083*38fd1498Szrj 
2084*38fd1498Szrj   alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2085*38fd1498Szrj 
2086*38fd1498Szrj   init_partition_graph_vertices (pg, partitions);
2087*38fd1498Szrj 
2088*38fd1498Szrj   for (i = 0; partitions->iterate (i, &partition1); ++i)
2089*38fd1498Szrj     {
2090*38fd1498Szrj       for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2091*38fd1498Szrj 	{
2092*38fd1498Szrj 	  /* dependence direction - 0 is no dependence, -1 is back,
2093*38fd1498Szrj 	     1 is forth, 2 is both (we can stop then, merging will occur).  */
2094*38fd1498Szrj 	  int dir = 0;
2095*38fd1498Szrj 
2096*38fd1498Szrj 	  /* If the first partition has a reduction, add a back edge; if
2097*38fd1498Szrj 	     the second partition has a reduction, add a forth edge.  This
2098*38fd1498Szrj 	     makes sure that the reduction partition is sorted last.  */
2099*38fd1498Szrj 	  if (partition_reduction_p (partition1))
2100*38fd1498Szrj 	    dir = -1;
2101*38fd1498Szrj 	  else if (partition_reduction_p (partition2))
2102*38fd1498Szrj 	    dir = 1;
2103*38fd1498Szrj 
2104*38fd1498Szrj 	  /* Cleanup the temporary vector.  */
2105*38fd1498Szrj 	  alias_ddrs.truncate (0);
2106*38fd1498Szrj 
2107*38fd1498Szrj 	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2108*38fd1498Szrj 					 partition2->datarefs, alias_ddrs_p);
2109*38fd1498Szrj 
2110*38fd1498Szrj 	  /* Add an edge to the partition graph if there exists a
2111*38fd1498Szrj 	     dependence.  There are two types of edges: one caused by a
2112*38fd1498Szrj 	     compilation time known dependence, which cannot be resolved
2113*38fd1498Szrj 	     by a runtime alias check, and one that can be resolved by a
2114*38fd1498Szrj 	     runtime alias check.  */
2115*38fd1498Szrj 	  if (dir == 1 || dir == 2
2116*38fd1498Szrj 	      || alias_ddrs.length () > 0)
2117*38fd1498Szrj 	    {
2118*38fd1498Szrj 	      /* Attach data dependence relations to edge that can be resolved
2119*38fd1498Szrj 		 by runtime alias check.  */
2120*38fd1498Szrj 	      bool alias_edge_p = (dir != 1 && dir != 2);
2121*38fd1498Szrj 	      add_partition_graph_edge (pg, i, j,
2122*38fd1498Szrj 					(alias_edge_p) ? &alias_ddrs : NULL);
2123*38fd1498Szrj 	    }
2124*38fd1498Szrj 	  if (dir == -1 || dir == 2
2125*38fd1498Szrj 	      || alias_ddrs.length () > 0)
2126*38fd1498Szrj 	    {
2127*38fd1498Szrj 	      /* Attach data dependence relations to edge that can be resolved
2128*38fd1498Szrj 		 by runtime alias check.  */
2129*38fd1498Szrj 	      bool alias_edge_p = (dir != -1 && dir != 2);
2130*38fd1498Szrj 	      add_partition_graph_edge (pg, j, i,
2131*38fd1498Szrj 					(alias_edge_p) ? &alias_ddrs : NULL);
2132*38fd1498Szrj 	    }
2133*38fd1498Szrj 	}
2134*38fd1498Szrj     }
2135*38fd1498Szrj   return pg;
2136*38fd1498Szrj }
2137*38fd1498Szrj 
2138*38fd1498Szrj /* Sort partitions in PG in descending post order and store them in
2139*38fd1498Szrj    PARTITIONS.  */
2140*38fd1498Szrj 
2141*38fd1498Szrj static void
2142*38fd1498Szrj sort_partitions_by_post_order (struct graph *pg,
2143*38fd1498Szrj 			       vec<struct partition *> *partitions)
2144*38fd1498Szrj {
2145*38fd1498Szrj   int i;
2146*38fd1498Szrj   struct pg_vdata *data;
2147*38fd1498Szrj 
2148*38fd1498Szrj   /* Now order the remaining nodes in descending postorder.  */
2149*38fd1498Szrj   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2150*38fd1498Szrj   partitions->truncate (0);
2151*38fd1498Szrj   for (i = 0; i < pg->n_vertices; ++i)
2152*38fd1498Szrj     {
2153*38fd1498Szrj       data = (struct pg_vdata *)pg->vertices[i].data;
2154*38fd1498Szrj       if (data->partition)
2155*38fd1498Szrj 	partitions->safe_push (data->partition);
2156*38fd1498Szrj     }
2157*38fd1498Szrj }
2158*38fd1498Szrj 
2159*38fd1498Szrj /* Given reduced dependence graph RDG, merge strongly connected components
2160*38fd1498Szrj    of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
2161*38fd1498Szrj    possible alias between references is ignored, as if it doesn't exist
2162*38fd1498Szrj    at all; otherwise all dependences are considered.  */
2163*38fd1498Szrj 
2164*38fd1498Szrj static void
2165*38fd1498Szrj merge_dep_scc_partitions (struct graph *rdg,
2166*38fd1498Szrj 			  vec<struct partition *> *partitions,
2167*38fd1498Szrj 			  bool ignore_alias_p)
2168*38fd1498Szrj {
2169*38fd1498Szrj   struct partition *partition1, *partition2;
2170*38fd1498Szrj   struct pg_vdata *data;
2171*38fd1498Szrj   graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2172*38fd1498Szrj   int i, j, num_sccs = graphds_scc (pg, NULL);
2173*38fd1498Szrj 
2174*38fd1498Szrj   /* A strongly connected component means a dependence cycle; we cannot
2175*38fd1498Szrj      distribute its partitions, so fuse them together.  */
2176*38fd1498Szrj   if ((unsigned) num_sccs < partitions->length ())
2177*38fd1498Szrj     {
2178*38fd1498Szrj       for (i = 0; i < num_sccs; ++i)
2179*38fd1498Szrj 	{
2180*38fd1498Szrj 	  for (j = 0; partitions->iterate (j, &partition1); ++j)
2181*38fd1498Szrj 	    if (pg->vertices[j].component == i)
2182*38fd1498Szrj 	      break;
2183*38fd1498Szrj 	  for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2184*38fd1498Szrj 	    if (pg->vertices[j].component == i)
2185*38fd1498Szrj 	      {
2186*38fd1498Szrj 		partition_merge_into (NULL, partition1,
2187*38fd1498Szrj 				      partition2, FUSE_SAME_SCC);
2188*38fd1498Szrj 		partition1->type = PTYPE_SEQUENTIAL;
2189*38fd1498Szrj 		(*partitions)[j] = NULL;
2190*38fd1498Szrj 		partition_free (partition2);
2191*38fd1498Szrj 		data = (struct pg_vdata *)pg->vertices[j].data;
2192*38fd1498Szrj 		data->partition = NULL;
2193*38fd1498Szrj 	      }
2194*38fd1498Szrj 	}
2195*38fd1498Szrj     }
2196*38fd1498Szrj 
2197*38fd1498Szrj   sort_partitions_by_post_order (pg, partitions);
2198*38fd1498Szrj   gcc_assert (partitions->length () == (unsigned)num_sccs);
2199*38fd1498Szrj   free_partition_graph_vdata (pg);
2200*38fd1498Szrj   free_graph (pg);
2201*38fd1498Szrj }
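
/* For example (hypothetical arrays), the partitions seeded by the two
   stores in

     for (i = 1; i < n; i++)
       {
	 a[i] = b[i - 1];
	 b[i] = a[i - 1];
       }

   depend on each other in both directions, so they form a strongly
   connected component in the partition graph; the loop cannot be
   distributed and the two partitions are fused into one sequential
   partition.  */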
2202*38fd1498Szrj 
2203*38fd1498Szrj /* Callback function for traversing edge E in graph G.  DATA is private
2204*38fd1498Szrj    callback data.  */
2205*38fd1498Szrj 
2206*38fd1498Szrj static void
2207*38fd1498Szrj pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2208*38fd1498Szrj {
2209*38fd1498Szrj   int i, j, component;
2210*38fd1498Szrj   struct pg_edge_callback_data *cbdata;
2211*38fd1498Szrj   struct pg_edata *edata = (struct pg_edata *) e->data;
2212*38fd1498Szrj 
2213*38fd1498Szrj   /* If the edge doesn't have attached data dependences, it represents
2214*38fd1498Szrj      compilation time known dependences.  This type of dependence cannot
2215*38fd1498Szrj      be resolved by a runtime alias check.  */
2216*38fd1498Szrj   if (edata == NULL || edata->alias_ddrs.length () == 0)
2217*38fd1498Szrj     return;
2218*38fd1498Szrj 
2219*38fd1498Szrj   cbdata = (struct pg_edge_callback_data *) data;
2220*38fd1498Szrj   i = e->src;
2221*38fd1498Szrj   j = e->dest;
2222*38fd1498Szrj   component = cbdata->vertices_component[i];
2223*38fd1498Szrj   /* Vertices are topologically sorted according to compilation time
2224*38fd1498Szrj      known dependences, so we can break strongly connected components
2225*38fd1498Szrj      by removing edges of the opposite direction, i.e., edges pointing
2226*38fd1498Szrj      from a vertex with a smaller post number to a vertex with a bigger
2227*38fd1498Szrj      post number.  */
2228*38fd1498Szrj   if (g->vertices[i].post < g->vertices[j].post
2229*38fd1498Szrj       /* We only need to remove edges connecting vertices in the same
2230*38fd1498Szrj 	 strong connected component to break it.  */
2231*38fd1498Szrj       && component == cbdata->vertices_component[j]
2232*38fd1498Szrj       /* Check if we want to break the strong connected component or not.  */
2233*38fd1498Szrj       && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2234*38fd1498Szrj     cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2235*38fd1498Szrj }
2236*38fd1498Szrj 
2237*38fd1498Szrj /* This is the main function breaking strongly connected components in
2238*38fd1498Szrj    PARTITIONS, given reduced dependence graph RDG.  Store data dependence
2239*38fd1498Szrj    relations for runtime alias checks in ALIAS_DDRS.  */
2240*38fd1498Szrj 
2241*38fd1498Szrj static void
2242*38fd1498Szrj break_alias_scc_partitions (struct graph *rdg,
2243*38fd1498Szrj 			    vec<struct partition *> *partitions,
2244*38fd1498Szrj 			    vec<ddr_p> *alias_ddrs)
2245*38fd1498Szrj {
2246*38fd1498Szrj   int i, j, k, num_sccs, num_sccs_no_alias;
2247*38fd1498Szrj   /* Build partition dependence graph.  */
2248*38fd1498Szrj   graph *pg = build_partition_graph (rdg, partitions, false);
2249*38fd1498Szrj 
2250*38fd1498Szrj   alias_ddrs->truncate (0);
2251*38fd1498Szrj   /* Find strongly connected components in the graph, with all dependence
2252*38fd1498Szrj      edges considered.  */
2253*38fd1498Szrj   num_sccs = graphds_scc (pg, NULL);
2254*38fd1498Szrj   /* All SCCs can now be broken by runtime alias checks because SCCs caused
2255*38fd1498Szrj      by compilation time known dependences were merged before this function.  */
2256*38fd1498Szrj   if ((unsigned) num_sccs < partitions->length ())
2257*38fd1498Szrj     {
2258*38fd1498Szrj       struct pg_edge_callback_data cbdata;
2259*38fd1498Szrj       auto_bitmap sccs_to_merge;
2260*38fd1498Szrj       auto_vec<enum partition_type> scc_types;
2261*38fd1498Szrj       struct partition *partition, *first;
2262*38fd1498Szrj 
2263*38fd1498Szrj       /* If all partitions in an SCC have the same type, we can simply merge
2264*38fd1498Szrj 	 the SCC.  This loop finds such SCCs and records them in the bitmap.  */
2265*38fd1498Szrj       bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2266*38fd1498Szrj       for (i = 0; i < num_sccs; ++i)
2267*38fd1498Szrj 	{
2268*38fd1498Szrj 	  for (j = 0; partitions->iterate (j, &first); ++j)
2269*38fd1498Szrj 	    if (pg->vertices[j].component == i)
2270*38fd1498Szrj 	      break;
2271*38fd1498Szrj 	  for (++j; partitions->iterate (j, &partition); ++j)
2272*38fd1498Szrj 	    {
2273*38fd1498Szrj 	      if (pg->vertices[j].component != i)
2274*38fd1498Szrj 		continue;
2275*38fd1498Szrj 
2276*38fd1498Szrj 	      /* Note we merge partitions of parallel type on purpose, even
2277*38fd1498Szrj 		 though the resulting partition is sequential.  The reason is
2278*38fd1498Szrj 		 that the vectorizer can do a more accurate runtime alias
2279*38fd1498Szrj 		 check in this case.  It also results in more conservative
2280*38fd1498Szrj 		 distribution.  */
2280*38fd1498Szrj 	      if (first->type != partition->type)
2281*38fd1498Szrj 		{
2282*38fd1498Szrj 		  bitmap_clear_bit (sccs_to_merge, i);
2283*38fd1498Szrj 		  break;
2284*38fd1498Szrj 		}
2285*38fd1498Szrj 	    }
2286*38fd1498Szrj 	}
2287*38fd1498Szrj 
2288*38fd1498Szrj       /* Initialize callback data for traversing.  */
2289*38fd1498Szrj       cbdata.sccs_to_merge = sccs_to_merge;
2290*38fd1498Szrj       cbdata.alias_ddrs = alias_ddrs;
2291*38fd1498Szrj       cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2292*38fd1498Szrj       /* Record the component information, which will be corrupted by the
2293*38fd1498Szrj 	 next graph SCC finding call.  */
2294*38fd1498Szrj       for (i = 0; i < pg->n_vertices; ++i)
2295*38fd1498Szrj 	cbdata.vertices_component[i] = pg->vertices[i].component;
2296*38fd1498Szrj 
2297*38fd1498Szrj       /* Collect data dependences for runtime alias checks to break SCCs.  */
2298*38fd1498Szrj       if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2299*38fd1498Szrj 	{
2300*38fd1498Szrj 	  /* Run SCC finding algorithm again, with alias dependence edges
2301*38fd1498Szrj 	     skipped.  This is to topologically sort partitions according to
2302*38fd1498Szrj 	     compilation time known dependence.  Note the topological order
2303*38fd1498Szrj 	     is stored in the form of pg's post order number.  */
2304*38fd1498Szrj 	  num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2305*38fd1498Szrj 	  gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2306*38fd1498Szrj 	  /* With topological order, we can construct two subgraphs L and R.
2307*38fd1498Szrj 	     L contains edge <x, y> where x < y in terms of post order, while
2308*38fd1498Szrj 	     R contains edge <x, y> where x > y.  Edges for compilation time
2309*38fd1498Szrj 	     known dependence all fall in R, so we break SCCs by removing all
2310*38fd1498Szrj 	     (alias) edges in subgraph L.  */
2311*38fd1498Szrj 	  for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2312*38fd1498Szrj 	}
2313*38fd1498Szrj 
2314*38fd1498Szrj       /* For SCC that doesn't need to be broken, merge it.  */
2315*38fd1498Szrj       for (i = 0; i < num_sccs; ++i)
2316*38fd1498Szrj 	{
2317*38fd1498Szrj 	  if (!bitmap_bit_p (sccs_to_merge, i))
2318*38fd1498Szrj 	    continue;
2319*38fd1498Szrj 
2320*38fd1498Szrj 	  for (j = 0; partitions->iterate (j, &first); ++j)
2321*38fd1498Szrj 	    if (cbdata.vertices_component[j] == i)
2322*38fd1498Szrj 	      break;
2323*38fd1498Szrj 	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
2324*38fd1498Szrj 	    {
2325*38fd1498Szrj 	      struct pg_vdata *data;
2326*38fd1498Szrj 
2327*38fd1498Szrj 	      if (cbdata.vertices_component[k] != i)
2328*38fd1498Szrj 		continue;
2329*38fd1498Szrj 
2330*38fd1498Szrj 	      /* Update postorder number so that merged reduction partition is
2331*38fd1498Szrj 		 sorted after other partitions.  */
2332*38fd1498Szrj 	      if (!partition_reduction_p (first)
2333*38fd1498Szrj 		  && partition_reduction_p (partition))
2334*38fd1498Szrj 		{
2335*38fd1498Szrj 		  gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2336*38fd1498Szrj 		  pg->vertices[j].post = pg->vertices[k].post;
2337*38fd1498Szrj 		}
2338*38fd1498Szrj 	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2339*38fd1498Szrj 	      (*partitions)[k] = NULL;
2340*38fd1498Szrj 	      partition_free (partition);
2341*38fd1498Szrj 	      data = (struct pg_vdata *)pg->vertices[k].data;
2342*38fd1498Szrj 	      gcc_assert (data->id == k);
2343*38fd1498Szrj 	      data->partition = NULL;
2344*38fd1498Szrj 	      /* The resulting partition of a merged SCC must be sequential.  */
2345*38fd1498Szrj 	      first->type = PTYPE_SEQUENTIAL;
2346*38fd1498Szrj 	    }
2347*38fd1498Szrj 	}
2348*38fd1498Szrj     }
2349*38fd1498Szrj 
2350*38fd1498Szrj   sort_partitions_by_post_order (pg, partitions);
2351*38fd1498Szrj   free_partition_graph_vdata (pg);
2352*38fd1498Szrj   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2353*38fd1498Szrj   free_graph (pg);
2354*38fd1498Szrj 
2355*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2356*38fd1498Szrj     {
2357*38fd1498Szrj       fprintf (dump_file, "Possible alias data dependence to break:\n");
2358*38fd1498Szrj       dump_data_dependence_relations (dump_file, *alias_ddrs);
2359*38fd1498Szrj     }
2360*38fd1498Szrj }
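
/* For example (hypothetical pointers P and Q, array X):

     for (i = 1; i < n; i++)
       {
	 p[i] = x[i];			// parallel partition
	 q[i] = q[i - 1] + x[i];	// sequential partition
       }

   the possible alias between P and Q adds dependence edges in both
   directions between the two partitions, creating an SCC.  Because the
   partition types differ, the SCC is not simply merged; instead, the
   ddrs attached to the alias edges in subgraph L are collected in
   ALIAS_DDRS so that the SCC can be broken by a runtime check that the
   segments accessed through P and Q do not overlap.  */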
2361*38fd1498Szrj 
2362*38fd1498Szrj /* Compute and return an expression whose value is the segment length which
2363*38fd1498Szrj    will be accessed by DR in NITERS iterations.  */
2364*38fd1498Szrj 
2365*38fd1498Szrj static tree
2366*38fd1498Szrj data_ref_segment_size (struct data_reference *dr, tree niters)
2367*38fd1498Szrj {
2368*38fd1498Szrj   niters = size_binop (MINUS_EXPR,
2369*38fd1498Szrj 		       fold_convert (sizetype, niters),
2370*38fd1498Szrj 		       size_one_node);
2371*38fd1498Szrj   return size_binop (MULT_EXPR,
2372*38fd1498Szrj 		     fold_convert (sizetype, DR_STEP (dr)),
2373*38fd1498Szrj 		     fold_convert (sizetype, niters));
2374*38fd1498Szrj }
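
/* For instance, with DR_STEP = 4 and NITERS = 100 the segment length is
   (100 - 1) * 4 = 396 bytes, the byte distance between the first and
   last accessed address.  The caller below passes NITERS + 1 instead of
   NITERS when the data reference is executed in the final, exiting
   iteration as well.  */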
2375*38fd1498Szrj 
2376*38fd1498Szrj /* Return true if LOOP's latch is dominated by statement for data reference
2377*38fd1498Szrj    DR.  */
2378*38fd1498Szrj 
2379*38fd1498Szrj static inline bool
2380*38fd1498Szrj latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2381*38fd1498Szrj {
2382*38fd1498Szrj   return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2383*38fd1498Szrj 			 gimple_bb (DR_STMT (dr)));
2384*38fd1498Szrj }
2385*38fd1498Szrj 
2386*38fd1498Szrj /* Given LOOP's data dependence relations in ALIAS_DDRS, compute alias
2387*38fd1498Szrj    check pairs and store them in COMP_ALIAS_PAIRS.  */
2388*38fd1498Szrj 
2389*38fd1498Szrj static void
2390*38fd1498Szrj compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2391*38fd1498Szrj 			   vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2392*38fd1498Szrj {
2393*38fd1498Szrj   unsigned int i;
2394*38fd1498Szrj   unsigned HOST_WIDE_INT factor = 1;
2395*38fd1498Szrj   tree niters_plus_one, niters = number_of_latch_executions (loop);
2396*38fd1498Szrj 
2397*38fd1498Szrj   gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2398*38fd1498Szrj   niters = fold_convert (sizetype, niters);
2399*38fd1498Szrj   niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2400*38fd1498Szrj 
2401*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2402*38fd1498Szrj     fprintf (dump_file, "Creating alias check pairs:\n");
2403*38fd1498Szrj 
2404*38fd1498Szrj   /* Iterate over all data dependence relations and compute alias check pairs.  */
2405*38fd1498Szrj   for (i = 0; i < alias_ddrs->length (); i++)
2406*38fd1498Szrj     {
2407*38fd1498Szrj       ddr_p ddr = (*alias_ddrs)[i];
2408*38fd1498Szrj       struct data_reference *dr_a = DDR_A (ddr);
2409*38fd1498Szrj       struct data_reference *dr_b = DDR_B (ddr);
2410*38fd1498Szrj       tree seg_length_a, seg_length_b;
2411*38fd1498Szrj       int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2412*38fd1498Szrj 					    DR_BASE_ADDRESS (dr_b));
2413*38fd1498Szrj 
2414*38fd1498Szrj       if (comp_res == 0)
2415*38fd1498Szrj 	comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2416*38fd1498Szrj       gcc_assert (comp_res != 0);
2417*38fd1498Szrj 
2418*38fd1498Szrj       if (latch_dominated_by_data_ref (loop, dr_a))
2419*38fd1498Szrj 	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2420*38fd1498Szrj       else
2421*38fd1498Szrj 	seg_length_a = data_ref_segment_size (dr_a, niters);
2422*38fd1498Szrj 
2423*38fd1498Szrj       if (latch_dominated_by_data_ref (loop, dr_b))
2424*38fd1498Szrj 	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2425*38fd1498Szrj       else
2426*38fd1498Szrj 	seg_length_b = data_ref_segment_size (dr_b, niters);
2427*38fd1498Szrj 
2428*38fd1498Szrj       unsigned HOST_WIDE_INT access_size_a
2429*38fd1498Szrj 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2430*38fd1498Szrj       unsigned HOST_WIDE_INT access_size_b
2431*38fd1498Szrj 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2432*38fd1498Szrj       unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2433*38fd1498Szrj       unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2434*38fd1498Szrj 
2435*38fd1498Szrj       dr_with_seg_len_pair_t dr_with_seg_len_pair
2436*38fd1498Szrj 	(dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2437*38fd1498Szrj 	 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2438*38fd1498Szrj 
2439*38fd1498Szrj       /* Canonicalize pairs by sorting the two DR members.  */
2440*38fd1498Szrj       if (comp_res > 0)
2441*38fd1498Szrj 	std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2442*38fd1498Szrj 
2443*38fd1498Szrj       comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2444*38fd1498Szrj     }
2445*38fd1498Szrj 
2446*38fd1498Szrj   if (tree_fits_uhwi_p (niters))
2447*38fd1498Szrj     factor = tree_to_uhwi (niters);
2448*38fd1498Szrj 
2449*38fd1498Szrj   /* Prune alias check pairs.  */
2450*38fd1498Szrj   prune_runtime_alias_test_list (comp_alias_pairs, factor);
2451*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2452*38fd1498Szrj     fprintf (dump_file,
2453*38fd1498Szrj 	     "Improved number of alias checks from %d to %d\n",
2454*38fd1498Szrj 	     alias_ddrs->length (), comp_alias_pairs->length ());
2455*38fd1498Szrj }
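
/* A sketch of the non-overlap test that one such pair is meant to
   express (the code actually emitted by create_runtime_alias_checks
   also folds in access sizes and alignment):

     addr_a + seg_len_a <= addr_b || addr_b + seg_len_b <= addr_a

   If the condition holds, the two segments cannot overlap and the
   corresponding dependence can be ignored at runtime.  */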
2456*38fd1498Szrj 
2457*38fd1498Szrj /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2458*38fd1498Szrj    checks and version LOOP under the condition of these checks.  */
2459*38fd1498Szrj 
2460*38fd1498Szrj static void
2461*38fd1498Szrj version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2462*38fd1498Szrj {
2463*38fd1498Szrj   profile_probability prob;
2464*38fd1498Szrj   basic_block cond_bb;
2465*38fd1498Szrj   struct loop *nloop;
2466*38fd1498Szrj   tree lhs, arg0, cond_expr = NULL_TREE;
2467*38fd1498Szrj   gimple_seq cond_stmts = NULL;
2468*38fd1498Szrj   gimple *call_stmt = NULL;
2469*38fd1498Szrj   auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2470*38fd1498Szrj 
2471*38fd1498Szrj   /* Generate code for runtime alias checks if necessary.  */
2472*38fd1498Szrj   gcc_assert (alias_ddrs->length () > 0);
2473*38fd1498Szrj 
2474*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2475*38fd1498Szrj     fprintf (dump_file,
2476*38fd1498Szrj 	     "Version loop <%d> with runtime alias check\n", loop->num);
2477*38fd1498Szrj 
2478*38fd1498Szrj   compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2479*38fd1498Szrj   create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2480*38fd1498Szrj   cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2481*38fd1498Szrj 				      is_gimple_val, NULL_TREE);
2482*38fd1498Szrj 
2483*38fd1498Szrj   /* Rely on the vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
2484*38fd1498Szrj   if (flag_tree_loop_vectorize)
2485*38fd1498Szrj     {
2486*38fd1498Szrj       /* Generate internal function call for loop distribution alias check.  */
2487*38fd1498Szrj       call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2488*38fd1498Szrj 					      2, NULL_TREE, cond_expr);
2489*38fd1498Szrj       lhs = make_ssa_name (boolean_type_node);
2490*38fd1498Szrj       gimple_call_set_lhs (call_stmt, lhs);
2491*38fd1498Szrj     }
2492*38fd1498Szrj   else
2493*38fd1498Szrj     lhs = cond_expr;
2494*38fd1498Szrj 
2495*38fd1498Szrj   prob = profile_probability::guessed_always ().apply_scale (9, 10);
2496*38fd1498Szrj   initialize_original_copy_tables ();
2497*38fd1498Szrj   nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2498*38fd1498Szrj 			prob, prob.invert (), true);
2499*38fd1498Szrj   free_original_copy_tables ();
2500*38fd1498Szrj   /* Record the original loop number in newly generated loops.  In case of
2501*38fd1498Szrj      distribution, the original loop will be distributed and the new loop
2502*38fd1498Szrj      is kept.  */
2503*38fd1498Szrj   loop->orig_loop_num = nloop->num;
2504*38fd1498Szrj   nloop->orig_loop_num = nloop->num;
2505*38fd1498Szrj   nloop->dont_vectorize = true;
2506*38fd1498Szrj   nloop->force_vectorize = false;
2507*38fd1498Szrj 
2508*38fd1498Szrj   if (call_stmt)
2509*38fd1498Szrj     {
2510*38fd1498Szrj       /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2511*38fd1498Szrj 	 loop could be destroyed.  */
2512*38fd1498Szrj       arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2513*38fd1498Szrj       gimple_call_set_arg (call_stmt, 0, arg0);
2514*38fd1498Szrj       gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2515*38fd1498Szrj     }
2516*38fd1498Szrj 
2517*38fd1498Szrj   if (cond_stmts)
2518*38fd1498Szrj     {
2519*38fd1498Szrj       gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2520*38fd1498Szrj       gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2521*38fd1498Szrj     }
2522*38fd1498Szrj   update_ssa (TODO_update_ssa);
2523*38fd1498Szrj }
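
/* After versioning, the CFG looks roughly like the sketch below:

     if (.LOOP_DIST_ALIAS (orig_loop_num, alias_check_cond))
       loop;	<- to be distributed, and hopefully vectorized
     else
       nloop;	<- fallback copy, marked dont_vectorize

   The vectorizer later folds the IFN_LOOP_DIST_ALIAS call to the alias
   check condition if the distributed loop was vectorized, or to false
   otherwise, in which case only the fallback copy survives.  */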
2524*38fd1498Szrj 
2525*38fd1498Szrj /* Return true if loop versioning is needed to distribute PARTITIONS.
2526*38fd1498Szrj    ALIAS_DDRS are data dependence relations for runtime alias check.  */
2527*38fd1498Szrj 
2528*38fd1498Szrj static inline bool
2529*38fd1498Szrj version_for_distribution_p (vec<struct partition *> *partitions,
2530*38fd1498Szrj 			    vec<ddr_p> *alias_ddrs)
2531*38fd1498Szrj {
2532*38fd1498Szrj   /* No need to version loop if we have only one partition.  */
2533*38fd1498Szrj   if (partitions->length () == 1)
2534*38fd1498Szrj     return false;
2535*38fd1498Szrj 
2536*38fd1498Szrj   /* Need to version loop if runtime alias check is necessary.  */
2537*38fd1498Szrj   return (alias_ddrs->length () > 0);
2538*38fd1498Szrj }
2539*38fd1498Szrj 
2540*38fd1498Szrj /* Compare base offset of builtin mem* partitions P1 and P2.  */
2541*38fd1498Szrj 
2542*38fd1498Szrj static bool
2543*38fd1498Szrj offset_cmp (struct partition *p1, struct partition *p2)
2544*38fd1498Szrj {
2545*38fd1498Szrj   gcc_assert (p1 != NULL && p1->builtin != NULL);
2546*38fd1498Szrj   gcc_assert (p2 != NULL && p2->builtin != NULL);
2547*38fd1498Szrj   return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset;
2548*38fd1498Szrj }
2549*38fd1498Szrj 
2550*38fd1498Szrj /* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
2551*38fd1498Szrj    case optimization transforming the code below:
2552*38fd1498Szrj 
2553*38fd1498Szrj      __builtin_memset (&obj, 0, 100);
2554*38fd1498Szrj      _1 = &obj + 100;
2555*38fd1498Szrj      __builtin_memset (_1, 0, 200);
2556*38fd1498Szrj      _2 = &obj + 300;
2557*38fd1498Szrj      __builtin_memset (_2, 0, 100);
2558*38fd1498Szrj 
2559*38fd1498Szrj    into:
2560*38fd1498Szrj 
2561*38fd1498Szrj      __builtin_memset (&obj, 0, 400);
2562*38fd1498Szrj 
2563*38fd1498Szrj    Note we don't have dependence information between different partitions
2564*38fd1498Szrj    at this point; as a result, we can't handle nonadjacent memset builtin
2565*38fd1498Szrj    partitions, since merging them might break a dependence.  */
2566*38fd1498Szrj 
2567*38fd1498Szrj static void
2568*38fd1498Szrj fuse_memset_builtins (vec<struct partition *> *partitions)
2569*38fd1498Szrj {
2570*38fd1498Szrj   unsigned i, j;
2571*38fd1498Szrj   struct partition *part1, *part2;
2572*38fd1498Szrj   tree rhs1, rhs2;
2573*38fd1498Szrj 
2574*38fd1498Szrj   for (i = 0; partitions->iterate (i, &part1);)
2575*38fd1498Szrj     {
2576*38fd1498Szrj       if (part1->kind != PKIND_MEMSET)
2577*38fd1498Szrj 	{
2578*38fd1498Szrj 	  i++;
2579*38fd1498Szrj 	  continue;
2580*38fd1498Szrj 	}
2581*38fd1498Szrj 
2582*38fd1498Szrj       /* Find the sub-array of memset builtins with the same base.  The
2583*38fd1498Szrj 	 index range of the sub-array is [i, j) with j > i.  */
2584*38fd1498Szrj       for (j = i + 1; partitions->iterate (j, &part2); ++j)
2585*38fd1498Szrj 	{
2586*38fd1498Szrj 	  if (part2->kind != PKIND_MEMSET
2587*38fd1498Szrj 	      || !operand_equal_p (part1->builtin->dst_base_base,
2588*38fd1498Szrj 				   part2->builtin->dst_base_base, 0))
2589*38fd1498Szrj 	    break;
2590*38fd1498Szrj 
2591*38fd1498Szrj 	  /* Memset calls setting different values can't be merged.  */
2592*38fd1498Szrj 	  rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2593*38fd1498Szrj 	  rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2594*38fd1498Szrj 	  if (!operand_equal_p (rhs1, rhs2, 0))
2595*38fd1498Szrj 	    break;
2596*38fd1498Szrj 	}
2597*38fd1498Szrj 
2598*38fd1498Szrj       /* A stable sort is required to avoid breaking dependences.  */
2599*38fd1498Szrj       std::stable_sort (&(*partitions)[i],
2600*38fd1498Szrj 			&(*partitions)[i] + j - i, offset_cmp);
2601*38fd1498Szrj       /* Continue with next partition.  */
2602*38fd1498Szrj       i = j;
2603*38fd1498Szrj     }
2604*38fd1498Szrj 
2605*38fd1498Szrj   /* Merge all consecutive memset builtin partitions.  */
2606*38fd1498Szrj   for (i = 0; i < partitions->length () - 1;)
2607*38fd1498Szrj     {
2608*38fd1498Szrj       part1 = (*partitions)[i];
2609*38fd1498Szrj       if (part1->kind != PKIND_MEMSET)
2610*38fd1498Szrj 	{
2611*38fd1498Szrj 	  i++;
2612*38fd1498Szrj 	  continue;
2613*38fd1498Szrj 	}
2614*38fd1498Szrj 
2615*38fd1498Szrj       part2 = (*partitions)[i + 1];
2616*38fd1498Szrj       /* Only merge memset partitions with the same base and with constant
2617*38fd1498Szrj 	 access sizes.  */
2618*38fd1498Szrj       if (part2->kind != PKIND_MEMSET
2619*38fd1498Szrj 	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
2620*38fd1498Szrj 	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
2621*38fd1498Szrj 	  || !operand_equal_p (part1->builtin->dst_base_base,
2622*38fd1498Szrj 			       part2->builtin->dst_base_base, 0))
2623*38fd1498Szrj 	{
2624*38fd1498Szrj 	  i++;
2625*38fd1498Szrj 	  continue;
2626*38fd1498Szrj 	}
2627*38fd1498Szrj       rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2628*38fd1498Szrj       rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2629*38fd1498Szrj       int bytev1 = const_with_all_bytes_same (rhs1);
2630*38fd1498Szrj       int bytev2 = const_with_all_bytes_same (rhs2);
2631*38fd1498Szrj       /* Only merge memset partitions storing the same value.  */
2632*38fd1498Szrj       if (bytev1 != bytev2 || bytev1 == -1)
2633*38fd1498Szrj 	{
2634*38fd1498Szrj 	  i++;
2635*38fd1498Szrj 	  continue;
2636*38fd1498Szrj 	}
2637*38fd1498Szrj       wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2638*38fd1498Szrj 			       wi::to_wide (part1->builtin->size));
2639*38fd1498Szrj       /* Only merge adjacent memset partitions.  */
2640*38fd1498Szrj       if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2641*38fd1498Szrj 	{
2642*38fd1498Szrj 	  i++;
2643*38fd1498Szrj 	  continue;
2644*38fd1498Szrj 	}
2645*38fd1498Szrj       /* Merge partitions[i] and partitions[i+1].  */
2646*38fd1498Szrj       part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2647*38fd1498Szrj 					  part1->builtin->size,
2648*38fd1498Szrj 					  part2->builtin->size);
2649*38fd1498Szrj       partition_free (part2);
2650*38fd1498Szrj       partitions->ordered_remove (i + 1);
2651*38fd1498Szrj     }
2652*38fd1498Szrj }
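
/* E.g. (a sketch reusing the numbers from the comment above): after
   sorting, partitions with (dst_base_offset, size) of (0, 100),
   (100, 200) and (300, 100) are processed pairwise; 0 + 100 == 100, so
   the first two merge into (0, 300), whose end 300 then matches the
   third, yielding the single (0, 400) memset.  */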
2653*38fd1498Szrj 
2654*38fd1498Szrj /* Fuse PARTITIONS of LOOP if necessary before finalizing the distribution.
2655*38fd1498Szrj    ALIAS_DDRS contains the ddrs which need a runtime alias check.  */
2656*38fd1498Szrj 
2657*38fd1498Szrj static void
2658*38fd1498Szrj finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
2659*38fd1498Szrj 		     vec<ddr_p> *alias_ddrs)
2660*38fd1498Szrj {
2661*38fd1498Szrj   unsigned i;
2662*38fd1498Szrj   struct partition *partition, *a;
2663*38fd1498Szrj 
2664*38fd1498Szrj   if (partitions->length () == 1
2665*38fd1498Szrj       || alias_ddrs->length () > 0)
2666*38fd1498Szrj     return;
2667*38fd1498Szrj 
2668*38fd1498Szrj   unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2669*38fd1498Szrj   bool same_type_p = true;
2670*38fd1498Szrj   enum partition_type type = ((*partitions)[0])->type;
2671*38fd1498Szrj   for (i = 0; partitions->iterate (i, &partition); ++i)
2672*38fd1498Szrj     {
2673*38fd1498Szrj       same_type_p &= (type == partition->type);
2674*38fd1498Szrj       if (partition_builtin_p (partition))
2675*38fd1498Szrj 	{
2676*38fd1498Szrj 	  num_builtin++;
2677*38fd1498Szrj 	  continue;
2678*38fd1498Szrj 	}
2679*38fd1498Szrj       num_normal++;
2680*38fd1498Szrj       if (partition->kind == PKIND_PARTIAL_MEMSET)
2681*38fd1498Szrj 	num_partial_memset++;
2682*38fd1498Szrj     }
2683*38fd1498Szrj 
2684*38fd1498Szrj   /* Don't distribute the current loop into too many loops, given that we
2685*38fd1498Szrj      don't have a memory stream cost model.  Be even more conservative in
2686*38fd1498Szrj      the case of loop nest distribution.  */
2687*38fd1498Szrj   if ((same_type_p && num_builtin == 0
2688*38fd1498Szrj        && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2689*38fd1498Szrj       || (loop->inner != NULL
2690*38fd1498Szrj 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2691*38fd1498Szrj       || (loop->inner == NULL
2692*38fd1498Szrj 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2693*38fd1498Szrj     {
2694*38fd1498Szrj       a = (*partitions)[0];
2695*38fd1498Szrj       for (i = 1; partitions->iterate (i, &partition); ++i)
2696*38fd1498Szrj 	{
2697*38fd1498Szrj 	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2698*38fd1498Szrj 	  partition_free (partition);
2699*38fd1498Szrj 	}
2700*38fd1498Szrj       partitions->truncate (1);
2701*38fd1498Szrj     }
2702*38fd1498Szrj 
2703*38fd1498Szrj   /* Fuse memset builtins if possible.  */
2704*38fd1498Szrj   if (partitions->length () > 1)
2705*38fd1498Szrj     fuse_memset_builtins (partitions);
2706*38fd1498Szrj }
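
/* Roughly (a summary of the heuristic above): everything is fused back
   into one partition when all partitions have the same type and none
   is a builtin, or when there are NUM_PARTITION_THRESHOLD or more
   partitions and normal (non-builtin) ones dominate; emitting many
   small loops without a memory stream cost model tends to hurt.  */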
2707*38fd1498Szrj 
2708*38fd1498Szrj /* Distributes the code from LOOP in such a way that producer statements
2709*38fd1498Szrj    are placed before consumer statements.  Tries to separate only the
2710*38fd1498Szrj    statements from STMTS into separate loops.  Returns the number of
2711*38fd1498Szrj    distributed loops.  Sets *NB_CALLS to the number of generated builtin
2712*38fd1498Szrj    calls and *DESTROY_P to whether LOOP needs to be destroyed.  */
2713*38fd1498Szrj 
2714*38fd1498Szrj static int
2715*38fd1498Szrj distribute_loop (struct loop *loop, vec<gimple *> stmts,
2716*38fd1498Szrj 		 control_dependences *cd, int *nb_calls, bool *destroy_p)
2717*38fd1498Szrj {
2718*38fd1498Szrj   ddrs_table = new hash_table<ddr_hasher> (389);
2719*38fd1498Szrj   struct graph *rdg;
2720*38fd1498Szrj   partition *partition;
2721*38fd1498Szrj   bool any_builtin;
2722*38fd1498Szrj   int i, nbp;
2723*38fd1498Szrj 
2724*38fd1498Szrj   *destroy_p = false;
2725*38fd1498Szrj   *nb_calls = 0;
2726*38fd1498Szrj   loop_nest.create (0);
2727*38fd1498Szrj   if (!find_loop_nest (loop, &loop_nest))
2728*38fd1498Szrj     {
2729*38fd1498Szrj       loop_nest.release ();
2730*38fd1498Szrj       delete ddrs_table;
2731*38fd1498Szrj       return 0;
2732*38fd1498Szrj     }
2733*38fd1498Szrj 
2734*38fd1498Szrj   datarefs_vec.create (20);
2735*38fd1498Szrj   rdg = build_rdg (loop, cd);
2736*38fd1498Szrj   if (!rdg)
2737*38fd1498Szrj     {
2738*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
2739*38fd1498Szrj 	fprintf (dump_file,
2740*38fd1498Szrj 		 "Loop %d not distributed: failed to build the RDG.\n",
2741*38fd1498Szrj 		 loop->num);
2742*38fd1498Szrj 
2743*38fd1498Szrj       loop_nest.release ();
2744*38fd1498Szrj       free_data_refs (datarefs_vec);
2745*38fd1498Szrj       delete ddrs_table;
2746*38fd1498Szrj       return 0;
2747*38fd1498Szrj     }
2748*38fd1498Szrj 
2749*38fd1498Szrj   if (datarefs_vec.length () > MAX_DATAREFS_NUM)
2750*38fd1498Szrj     {
2751*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
2752*38fd1498Szrj 	fprintf (dump_file,
2753*38fd1498Szrj 		 "Loop %d not distributed: too many memory references.\n",
2754*38fd1498Szrj 		 loop->num);
2755*38fd1498Szrj 
2756*38fd1498Szrj       free_rdg (rdg);
2757*38fd1498Szrj       loop_nest.release ();
2758*38fd1498Szrj       free_data_refs (datarefs_vec);
2759*38fd1498Szrj       delete ddrs_table;
2760*38fd1498Szrj       return 0;
2761*38fd1498Szrj     }
2762*38fd1498Szrj 
2763*38fd1498Szrj   data_reference_p dref;
2764*38fd1498Szrj   for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
2765*38fd1498Szrj     dref->aux = (void *) (uintptr_t) i;
2766*38fd1498Szrj 
2767*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2768*38fd1498Szrj     dump_rdg (dump_file, rdg);
2769*38fd1498Szrj 
2770*38fd1498Szrj   auto_vec<struct partition *, 3> partitions;
2771*38fd1498Szrj   rdg_build_partitions (rdg, stmts, &partitions);
2772*38fd1498Szrj 
2773*38fd1498Szrj   auto_vec<ddr_p> alias_ddrs;
2774*38fd1498Szrj 
2775*38fd1498Szrj   auto_bitmap stmt_in_all_partitions;
2776*38fd1498Szrj   bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
2777*38fd1498Szrj   for (i = 1; partitions.iterate (i, &partition); ++i)
2778*38fd1498Szrj     bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
2779*38fd1498Szrj 
2780*38fd1498Szrj   any_builtin = false;
2781*38fd1498Szrj   FOR_EACH_VEC_ELT (partitions, i, partition)
2782*38fd1498Szrj     {
2783*38fd1498Szrj       classify_partition (loop, rdg, partition, stmt_in_all_partitions);
2784*38fd1498Szrj       any_builtin |= partition_builtin_p (partition);
2785*38fd1498Szrj     }
2786*38fd1498Szrj 
2787*38fd1498Szrj   /* If we are only distributing patterns but did not detect any,
2788*38fd1498Szrj      simply bail out.  */
2789*38fd1498Szrj   if (!flag_tree_loop_distribution
2790*38fd1498Szrj       && !any_builtin)
2791*38fd1498Szrj     {
2792*38fd1498Szrj       nbp = 0;
2793*38fd1498Szrj       goto ldist_done;
2794*38fd1498Szrj     }
2795*38fd1498Szrj 
2796*38fd1498Szrj   /* If we are only distributing patterns, fuse all partitions that
2797*38fd1498Szrj      were not classified as builtins.  This also avoids chopping a
2798*38fd1498Szrj      loop into pieces separated by builtin calls.  That is, we want
2799*38fd1498Szrj      either no loop body or a single one remaining.  */
2800*38fd1498Szrj   struct partition *into;
2801*38fd1498Szrj   if (!flag_tree_loop_distribution)
2802*38fd1498Szrj     {
2803*38fd1498Szrj       for (i = 0; partitions.iterate (i, &into); ++i)
2804*38fd1498Szrj 	if (!partition_builtin_p (into))
2805*38fd1498Szrj 	  break;
2806*38fd1498Szrj       for (++i; partitions.iterate (i, &partition); ++i)
2807*38fd1498Szrj 	if (!partition_builtin_p (partition))
2808*38fd1498Szrj 	  {
2809*38fd1498Szrj 	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
2810*38fd1498Szrj 	    partitions.unordered_remove (i);
2811*38fd1498Szrj 	    partition_free (partition);
2812*38fd1498Szrj 	    i--;
2813*38fd1498Szrj 	  }
2814*38fd1498Szrj     }
2815*38fd1498Szrj 
2816*38fd1498Szrj   /* Due to limitations in the transform phase we have to fuse all
2817*38fd1498Szrj      reduction partitions into the last partition so the existing
2818*38fd1498Szrj      loop will contain all loop-closed PHI nodes.  */
2819*38fd1498Szrj   for (i = 0; partitions.iterate (i, &into); ++i)
2820*38fd1498Szrj     if (partition_reduction_p (into))
2821*38fd1498Szrj       break;
2822*38fd1498Szrj   for (i = i + 1; partitions.iterate (i, &partition); ++i)
2823*38fd1498Szrj     if (partition_reduction_p (partition))
2824*38fd1498Szrj       {
2825*38fd1498Szrj 	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
2826*38fd1498Szrj 	partitions.unordered_remove (i);
2827*38fd1498Szrj 	partition_free (partition);
2828*38fd1498Szrj 	i--;
2829*38fd1498Szrj       }
2830*38fd1498Szrj 
2831*38fd1498Szrj   /* Apply our simple cost model: fuse partitions with similar
2832*38fd1498Szrj      memory accesses.  */
2833*38fd1498Szrj   for (i = 0; partitions.iterate (i, &into); ++i)
2834*38fd1498Szrj     {
2835*38fd1498Szrj       bool changed = false;
2836*38fd1498Szrj       if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
2837*38fd1498Szrj 	continue;
2838*38fd1498Szrj       for (int j = i + 1;
2839*38fd1498Szrj 	   partitions.iterate (j, &partition); ++j)
2840*38fd1498Szrj 	{
2841*38fd1498Szrj 	  if (share_memory_accesses (rdg, into, partition))
2842*38fd1498Szrj 	    {
2843*38fd1498Szrj 	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
2844*38fd1498Szrj 	      partitions.unordered_remove (j);
2845*38fd1498Szrj 	      partition_free (partition);
2846*38fd1498Szrj 	      j--;
2847*38fd1498Szrj 	      changed = true;
2848*38fd1498Szrj 	    }
2849*38fd1498Szrj 	}
2850*38fd1498Szrj       /* Suppose partitions 0 and 2 have similar memory accesses, and so
2851*38fd1498Szrj          do 1 and 2, but 0 and 1 do not.  After fusing 2 into 0 we would
2852*38fd1498Szrj 	 fail to consider merging 1 into the combined partition {0,2}.
2853*38fd1498Szrj 	 So try again if we did any merging into partition I.  */
2854*38fd1498Szrj       if (changed)
2855*38fd1498Szrj 	i--;
2856*38fd1498Szrj     }
2857*38fd1498Szrj 
2858*38fd1498Szrj   /* Build the partition dependence graph and fuse partitions in the
2859*38fd1498Szrj      same strongly connected component.  */
2860*38fd1498Szrj   if (partitions.length () > 1)
2861*38fd1498Szrj     {
2862*38fd1498Szrj       /* Don't support loop nest distribution under runtime alias checks,
2863*38fd1498Szrj 	 since that is unlikely to enable many vectorization opportunities.  */
2864*38fd1498Szrj       if (loop->inner)
2865*38fd1498Szrj 	merge_dep_scc_partitions (rdg, &partitions, false);
2866*38fd1498Szrj       else
2867*38fd1498Szrj 	{
2868*38fd1498Szrj 	  merge_dep_scc_partitions (rdg, &partitions, true);
2869*38fd1498Szrj 	  if (partitions.length () > 1)
2870*38fd1498Szrj 	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
2871*38fd1498Szrj 	}
2872*38fd1498Szrj     }
2873*38fd1498Szrj 
2874*38fd1498Szrj   finalize_partitions (loop, &partitions, &alias_ddrs);
2875*38fd1498Szrj 
2876*38fd1498Szrj   nbp = partitions.length ();
2877*38fd1498Szrj   if (nbp == 0
2878*38fd1498Szrj       || (nbp == 1 && !partition_builtin_p (partitions[0]))
2879*38fd1498Szrj       || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
2880*38fd1498Szrj     {
2881*38fd1498Szrj       nbp = 0;
2882*38fd1498Szrj       goto ldist_done;
2883*38fd1498Szrj     }
2884*38fd1498Szrj 
2885*38fd1498Szrj   if (version_for_distribution_p (&partitions, &alias_ddrs))
2886*38fd1498Szrj     version_loop_by_alias_check (loop, &alias_ddrs);
2887*38fd1498Szrj 
2888*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2889*38fd1498Szrj     {
2890*38fd1498Szrj       fprintf (dump_file,
2891*38fd1498Szrj 	       "distribute loop <%d> into partitions:\n", loop->num);
2892*38fd1498Szrj       dump_rdg_partitions (dump_file, partitions);
2893*38fd1498Szrj     }
2894*38fd1498Szrj 
2895*38fd1498Szrj   FOR_EACH_VEC_ELT (partitions, i, partition)
2896*38fd1498Szrj     {
2897*38fd1498Szrj       if (partition_builtin_p (partition))
2898*38fd1498Szrj 	(*nb_calls)++;
2899*38fd1498Szrj       *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
2900*38fd1498Szrj     }
2901*38fd1498Szrj 
2902*38fd1498Szrj  ldist_done:
2903*38fd1498Szrj   loop_nest.release ();
2904*38fd1498Szrj   free_data_refs (datarefs_vec);
2905*38fd1498Szrj   for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
2906*38fd1498Szrj        iter != ddrs_table->end (); ++iter)
2907*38fd1498Szrj     {
2908*38fd1498Szrj       free_dependence_relation (*iter);
2909*38fd1498Szrj       *iter = NULL;
2910*38fd1498Szrj     }
2911*38fd1498Szrj   delete ddrs_table;
2912*38fd1498Szrj 
2913*38fd1498Szrj   FOR_EACH_VEC_ELT (partitions, i, partition)
2914*38fd1498Szrj     partition_free (partition);
2915*38fd1498Szrj 
2916*38fd1498Szrj   free_rdg (rdg);
2917*38fd1498Szrj   return nbp - *nb_calls;
2918*38fd1498Szrj }
2919*38fd1498Szrj 
2920*38fd1498Szrj /* Distribute all loops in the current function.  */
2921*38fd1498Szrj 
2922*38fd1498Szrj namespace {
2923*38fd1498Szrj 
2924*38fd1498Szrj const pass_data pass_data_loop_distribution =
2925*38fd1498Szrj {
2926*38fd1498Szrj   GIMPLE_PASS, /* type */
2927*38fd1498Szrj   "ldist", /* name */
2928*38fd1498Szrj   OPTGROUP_LOOP, /* optinfo_flags */
2929*38fd1498Szrj   TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
2930*38fd1498Szrj   ( PROP_cfg | PROP_ssa ), /* properties_required */
2931*38fd1498Szrj   0, /* properties_provided */
2932*38fd1498Szrj   0, /* properties_destroyed */
2933*38fd1498Szrj   0, /* todo_flags_start */
2934*38fd1498Szrj   0, /* todo_flags_finish */
2935*38fd1498Szrj };
2936*38fd1498Szrj 
2937*38fd1498Szrj class pass_loop_distribution : public gimple_opt_pass
2938*38fd1498Szrj {
2939*38fd1498Szrj public:
2940*38fd1498Szrj   pass_loop_distribution (gcc::context *ctxt)
2941*38fd1498Szrj     : gimple_opt_pass (pass_data_loop_distribution, ctxt)
2942*38fd1498Szrj   {}
2943*38fd1498Szrj 
2944*38fd1498Szrj   /* opt_pass methods: */
2945*38fd1498Szrj   virtual bool gate (function *)
2946*38fd1498Szrj     {
2947*38fd1498Szrj       return flag_tree_loop_distribution
2948*38fd1498Szrj 	|| flag_tree_loop_distribute_patterns;
2949*38fd1498Szrj     }
2950*38fd1498Szrj 
2951*38fd1498Szrj   virtual unsigned int execute (function *);
2952*38fd1498Szrj 
2953*38fd1498Szrj }; // class pass_loop_distribution
2954*38fd1498Szrj 
2955*38fd1498Szrj 
2956*38fd1498Szrj /* Given LOOP, this function records seed statements for distribution in
2957*38fd1498Szrj    WORK_LIST.  Return false if there is nothing to distribute.  */
2958*38fd1498Szrj 
2959*38fd1498Szrj static bool
2960*38fd1498Szrj find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
2961*38fd1498Szrj {
2962*38fd1498Szrj   basic_block *bbs = get_loop_body_in_dom_order (loop);
2963*38fd1498Szrj 
2964*38fd1498Szrj   /* Initialize the worklist with stmts we seed the partitions with.  */
2965*38fd1498Szrj   for (unsigned i = 0; i < loop->num_nodes; ++i)
2966*38fd1498Szrj     {
2967*38fd1498Szrj       for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
2968*38fd1498Szrj 	   !gsi_end_p (gsi); gsi_next (&gsi))
2969*38fd1498Szrj 	{
2970*38fd1498Szrj 	  gphi *phi = gsi.phi ();
2971*38fd1498Szrj 	  if (virtual_operand_p (gimple_phi_result (phi)))
2972*38fd1498Szrj 	    continue;
2973*38fd1498Szrj 	  /* Distribute stmts which have defs that are used outside of
2974*38fd1498Szrj 	     the loop.  */
2975*38fd1498Szrj 	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
2976*38fd1498Szrj 	    continue;
2977*38fd1498Szrj 	  work_list->safe_push (phi);
2978*38fd1498Szrj 	}
2979*38fd1498Szrj       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2980*38fd1498Szrj 	   !gsi_end_p (gsi); gsi_next (&gsi))
2981*38fd1498Szrj 	{
2982*38fd1498Szrj 	  gimple *stmt = gsi_stmt (gsi);
2983*38fd1498Szrj 
2984*38fd1498Szrj 	  /* If there is a stmt with side-effects, bail out: we cannot
2985*38fd1498Szrj 	     and should not distribute this loop.  */
2986*38fd1498Szrj 	  if (gimple_has_side_effects (stmt))
2987*38fd1498Szrj 	    {
2988*38fd1498Szrj 	      free (bbs);
2989*38fd1498Szrj 	      return false;
2990*38fd1498Szrj 	    }
2991*38fd1498Szrj 
2992*38fd1498Szrj 	  /* Distribute stmts which have defs that are used outside of
2993*38fd1498Szrj 	     the loop.  */
2994*38fd1498Szrj 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
2995*38fd1498Szrj 	    ;
2996*38fd1498Szrj 	  /* Otherwise only distribute stores for now.  */
2997*38fd1498Szrj 	  else if (!gimple_vdef (stmt))
2998*38fd1498Szrj 	    continue;
2999*38fd1498Szrj 
3000*38fd1498Szrj 	  work_list->safe_push (stmt);
3001*38fd1498Szrj 	}
3002*38fd1498Szrj     }
3003*38fd1498Szrj   free (bbs);
3004*38fd1498Szrj   return work_list->length () > 0;
3005*38fd1498Szrj }
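
/* For example (a sketch), given

     for (i = 0; i < n; i++)
       {
         a[i] = b[i] + c;	<- stores to memory: a seed
         sum = sum_1 + a[i];	<- sum is used after the loop: a seed
       }

   both statements are pushed onto WORK_LIST.  */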
3006*38fd1498Szrj 
3007*38fd1498Szrj /* Given innermost LOOP, return the outermost enclosing loop that forms a
3008*38fd1498Szrj    perfect loop nest.  */
3009*38fd1498Szrj 
3010*38fd1498Szrj static struct loop *
3011*38fd1498Szrj prepare_perfect_loop_nest (struct loop *loop)
3012*38fd1498Szrj {
3013*38fd1498Szrj   struct loop *outer = loop_outer (loop);
3014*38fd1498Szrj   tree niters = number_of_latch_executions (loop);
3015*38fd1498Szrj 
3016*38fd1498Szrj   /* TODO: We only support distribution of the innermost 3-level loop
3017*38fd1498Szrj      nest for now, because of compilation time issues.  This should be
3018*38fd1498Szrj      relaxed in the future.  Note we only allow 3-level loop nest
3019*38fd1498Szrj      distribution when parallelizing loops.  */
3020*38fd1498Szrj   while ((loop->inner == NULL
3021*38fd1498Szrj 	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3022*38fd1498Szrj 	 && loop_outer (outer)
3023*38fd1498Szrj 	 && outer->inner == loop && loop->next == NULL
3024*38fd1498Szrj 	 && single_exit (outer)
3025*38fd1498Szrj 	 && optimize_loop_for_speed_p (outer)
3026*38fd1498Szrj 	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3027*38fd1498Szrj 	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3028*38fd1498Szrj 	 && niters != chrec_dont_know)
3029*38fd1498Szrj     {
3030*38fd1498Szrj       loop = outer;
3031*38fd1498Szrj       outer = loop_outer (loop);
3032*38fd1498Szrj     }
3033*38fd1498Szrj 
3034*38fd1498Szrj   return loop;
3035*38fd1498Szrj }
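
/* A nest qualifying for the walk above looks like this sketch:

     for (i = 0; i < n; i++)	<- single exit, only child is the inner loop
       for (j = 0; j < m; j++)	<- innermost loop passed in as LOOP
         a[i][j] = 0;

   Starting from the innermost loop we move outwards as long as each
   enclosing loop has a single exit, contains exactly one loop with no
   siblings, and has a computable iteration count.  */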
3036*38fd1498Szrj 
3037*38fd1498Szrj unsigned int
3038*38fd1498Szrj pass_loop_distribution::execute (function *fun)
3039*38fd1498Szrj {
3040*38fd1498Szrj   struct loop *loop;
3041*38fd1498Szrj   bool changed = false;
3042*38fd1498Szrj   basic_block bb;
3043*38fd1498Szrj   control_dependences *cd = NULL;
3044*38fd1498Szrj   auto_vec<loop_p> loops_to_be_destroyed;
3045*38fd1498Szrj 
3046*38fd1498Szrj   if (number_of_loops (fun) <= 1)
3047*38fd1498Szrj     return 0;
3048*38fd1498Szrj 
3049*38fd1498Szrj   /* Compute a topological order for the basic blocks.  It is needed
3050*38fd1498Szrj      because data dependences are computed for data references in
3051*38fd1498Szrj      lexicographical order.  */
3052*38fd1498Szrj   if (bb_top_order_index == NULL)
3053*38fd1498Szrj     {
3054*38fd1498Szrj       int rpo_num;
3055*38fd1498Szrj       int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));
3056*38fd1498Szrj 
3057*38fd1498Szrj       bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3058*38fd1498Szrj       bb_top_order_index_size = last_basic_block_for_fn (cfun);
3059*38fd1498Szrj       rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
3060*38fd1498Szrj       for (int i = 0; i < rpo_num; i++)
3061*38fd1498Szrj 	bb_top_order_index[rpo[i]] = i;
3062*38fd1498Szrj 
3063*38fd1498Szrj       free (rpo);
3064*38fd1498Szrj     }
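
  /* E.g. (a sketch): if the computed RPO sequence is BB2, BB3, BB5, BB4,
     then bb_top_order_index maps block index 2 -> 0, 3 -> 1, 5 -> 2 and
     4 -> 3, so comparing mapped values orders blocks topologically.  */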
3065*38fd1498Szrj 
3066*38fd1498Szrj   FOR_ALL_BB_FN (bb, fun)
3067*38fd1498Szrj     {
3068*38fd1498Szrj       gimple_stmt_iterator gsi;
3069*38fd1498Szrj       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3070*38fd1498Szrj 	gimple_set_uid (gsi_stmt (gsi), -1);
3071*38fd1498Szrj       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3072*38fd1498Szrj 	gimple_set_uid (gsi_stmt (gsi), -1);
3073*38fd1498Szrj     }
3074*38fd1498Szrj 
3075*38fd1498Szrj   /* At the moment we can only distribute non-nested loops, thus we
3076*38fd1498Szrj      restrict the walk to innermost loops.  */
3077*38fd1498Szrj   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
3078*38fd1498Szrj     {
3079*38fd1498Szrj       /* Don't distribute loops with multiple exit edges, or cold loops.  */
3080*38fd1498Szrj       if (!single_exit (loop)
3081*38fd1498Szrj 	  || !optimize_loop_for_speed_p (loop))
3082*38fd1498Szrj 	continue;
3083*38fd1498Szrj 
3084*38fd1498Szrj       /* Don't distribute loop if niters is unknown.  */
3085*38fd1498Szrj       tree niters = number_of_latch_executions (loop);
3086*38fd1498Szrj       if (niters == NULL_TREE || niters == chrec_dont_know)
3087*38fd1498Szrj 	continue;
3088*38fd1498Szrj 
3089*38fd1498Szrj       /* Get the perfect loop nest for distribution.  */
3090*38fd1498Szrj       loop = prepare_perfect_loop_nest (loop);
3091*38fd1498Szrj       for (; loop; loop = loop->inner)
3092*38fd1498Szrj 	{
3093*38fd1498Szrj 	  auto_vec<gimple *> work_list;
3094*38fd1498Szrj 	  if (!find_seed_stmts_for_distribution (loop, &work_list))
3095*38fd1498Szrj 	    break;
3096*38fd1498Szrj 
3097*38fd1498Szrj 	  const char *str = loop->inner ? " nest" : "";
3098*38fd1498Szrj 	  location_t loc = find_loop_location (loop);
3099*38fd1498Szrj 	  if (!cd)
3100*38fd1498Szrj 	    {
3101*38fd1498Szrj 	      calculate_dominance_info (CDI_DOMINATORS);
3102*38fd1498Szrj 	      calculate_dominance_info (CDI_POST_DOMINATORS);
3103*38fd1498Szrj 	      cd = new control_dependences ();
3104*38fd1498Szrj 	      free_dominance_info (CDI_POST_DOMINATORS);
3105*38fd1498Szrj 	    }
3106*38fd1498Szrj 
3107*38fd1498Szrj 	  bool destroy_p;
3108*38fd1498Szrj 	  int nb_generated_loops, nb_generated_calls;
3109*38fd1498Szrj 	  nb_generated_loops = distribute_loop (loop, work_list, cd,
3110*38fd1498Szrj 						&nb_generated_calls,
3111*38fd1498Szrj 						&destroy_p);
3112*38fd1498Szrj 	  if (destroy_p)
3113*38fd1498Szrj 	    loops_to_be_destroyed.safe_push (loop);
3114*38fd1498Szrj 
3115*38fd1498Szrj 	  if (nb_generated_loops + nb_generated_calls > 0)
3116*38fd1498Szrj 	    {
3117*38fd1498Szrj 	      changed = true;
3118*38fd1498Szrj 	      dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3119*38fd1498Szrj 			       loc, "Loop%s %d distributed: split to %d loops "
3120*38fd1498Szrj 			       "and %d library calls.\n", str, loop->num,
3121*38fd1498Szrj 			       nb_generated_loops, nb_generated_calls);
3122*38fd1498Szrj 
3123*38fd1498Szrj 	      break;
3124*38fd1498Szrj 	    }
3125*38fd1498Szrj 
3126*38fd1498Szrj 	  if (dump_file && (dump_flags & TDF_DETAILS))
3127*38fd1498Szrj 	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3128*38fd1498Szrj 	}
3129*38fd1498Szrj     }
3130*38fd1498Szrj 
3131*38fd1498Szrj   if (cd)
3132*38fd1498Szrj     delete cd;
3133*38fd1498Szrj 
3134*38fd1498Szrj   if (bb_top_order_index != NULL)
3135*38fd1498Szrj     {
3136*38fd1498Szrj       free (bb_top_order_index);
3137*38fd1498Szrj       bb_top_order_index = NULL;
3138*38fd1498Szrj       bb_top_order_index_size = 0;
3139*38fd1498Szrj     }
3140*38fd1498Szrj 
3141*38fd1498Szrj   if (changed)
3142*38fd1498Szrj     {
3143*38fd1498Szrj       /* Destroy loop bodies that could not be reused.  Do this late, as we
3144*38fd1498Szrj 	 otherwise can end up referring to stale data in control dependences.  */
3145*38fd1498Szrj       unsigned i;
3146*38fd1498Szrj       FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3147*38fd1498Szrj 	destroy_loop (loop);
3148*38fd1498Szrj 
3149*38fd1498Szrj       /* Cached scalar evolutions now may refer to wrong or non-existing
3150*38fd1498Szrj 	 loops.  */
3151*38fd1498Szrj       scev_reset_htab ();
3152*38fd1498Szrj       mark_virtual_operands_for_renaming (fun);
3153*38fd1498Szrj       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3154*38fd1498Szrj     }
3155*38fd1498Szrj 
3156*38fd1498Szrj   checking_verify_loop_structure ();
3157*38fd1498Szrj 
3158*38fd1498Szrj   return changed ? TODO_cleanup_cfg : 0;
3159*38fd1498Szrj }
3160*38fd1498Szrj 
3161*38fd1498Szrj } // anon namespace
3162*38fd1498Szrj 
3163*38fd1498Szrj gimple_opt_pass *
3164*38fd1498Szrj make_pass_loop_distribution (gcc::context *ctxt)
3165*38fd1498Szrj {
3166*38fd1498Szrj   return new pass_loop_distribution (ctxt);
3167*38fd1498Szrj }