xref: /netbsd-src/external/gpl3/gcc/dist/gcc/tree-loop-distribution.cc (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1b1e83836Smrg /* Loop distribution.
2b1e83836Smrg    Copyright (C) 2006-2022 Free Software Foundation, Inc.
3b1e83836Smrg    Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4b1e83836Smrg    and Sebastian Pop <sebastian.pop@amd.com>.
5b1e83836Smrg 
6b1e83836Smrg This file is part of GCC.
7b1e83836Smrg 
8b1e83836Smrg GCC is free software; you can redistribute it and/or modify it
9b1e83836Smrg under the terms of the GNU General Public License as published by the
10b1e83836Smrg Free Software Foundation; either version 3, or (at your option) any
11b1e83836Smrg later version.
12b1e83836Smrg 
13b1e83836Smrg GCC is distributed in the hope that it will be useful, but WITHOUT
14b1e83836Smrg ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15b1e83836Smrg FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16b1e83836Smrg for more details.
17b1e83836Smrg 
18b1e83836Smrg You should have received a copy of the GNU General Public License
19b1e83836Smrg along with GCC; see the file COPYING3.  If not see
20b1e83836Smrg <http://www.gnu.org/licenses/>.  */
21b1e83836Smrg 
22b1e83836Smrg /* This pass performs loop distribution: for example, the loop
23b1e83836Smrg 
24b1e83836Smrg    |DO I = 2, N
25b1e83836Smrg    |    A(I) = B(I) + C
26b1e83836Smrg    |    D(I) = A(I-1)*E
27b1e83836Smrg    |ENDDO
28b1e83836Smrg 
29b1e83836Smrg    is transformed to
30b1e83836Smrg 
31b1e83836Smrg    |DOALL I = 2, N
32b1e83836Smrg    |   A(I) = B(I) + C
33b1e83836Smrg    |ENDDO
34b1e83836Smrg    |
35b1e83836Smrg    |DOALL I = 2, N
36b1e83836Smrg    |   D(I) = A(I-1)*E
37b1e83836Smrg    |ENDDO
38b1e83836Smrg 
39b1e83836Smrg    Loop distribution is the dual of loop fusion.  It separates statements
40b1e83836Smrg    of a loop (or loop nest) into multiple loops (or loop nests) with the
41b1e83836Smrg    same loop header.  The major goal is to separate statements which may
42b1e83836Smrg    be vectorized from those that can't.  This pass implements distribution
43b1e83836Smrg    in the following steps:
44b1e83836Smrg 
     1) Seed partitions with statements of specific types.  For now we
	support two types of seed statements: statements defining a variable
	used outside of the loop, and statements storing to memory.
48b1e83836Smrg      2) Build reduced dependence graph (RDG) for loop to be distributed.
49b1e83836Smrg 	The vertices (RDG:V) model all statements in the loop and the edges
50b1e83836Smrg 	(RDG:E) model flow and control dependencies between statements.
51b1e83836Smrg      3) Apart from RDG, compute data dependencies between memory references.
52b1e83836Smrg      4) Starting from seed statement, build up partition by adding depended
53b1e83836Smrg 	statements according to RDG's dependence information.  Partition is
	classified as parallel type if it can be executed in parallel; or as
	sequential type if it can't.  Parallel type partition is further
56b1e83836Smrg 	classified as different builtin kinds if it can be implemented as
57b1e83836Smrg 	builtin function calls.
58b1e83836Smrg      5) Build partition dependence graph (PG) based on data dependencies.
59b1e83836Smrg 	The vertices (PG:V) model all partitions and the edges (PG:E) model
60b1e83836Smrg 	all data dependencies between every partitions pair.  In general,
	data dependence is either compilation time known or unknown.  In C
	family languages, there exist quite a few compilation time unknown
	dependencies because of possible alias relations of data references.
64b1e83836Smrg 	We categorize PG's edge to two types: "true" edge that represents
65b1e83836Smrg 	compilation time known data dependencies; "alias" edge for all other
66b1e83836Smrg 	data dependencies.
67b1e83836Smrg      6) Traverse subgraph of PG as if all "alias" edges don't exist.  Merge
68b1e83836Smrg 	partitions in each strong connected component (SCC) correspondingly.
69b1e83836Smrg 	Build new PG for merged partitions.
70b1e83836Smrg      7) Traverse PG again and this time with both "true" and "alias" edges
71b1e83836Smrg 	included.  We try to break SCCs by removing some edges.  Because
72b1e83836Smrg 	SCCs by "true" edges are all fused in step 6), we can break SCCs
73b1e83836Smrg 	by removing some "alias" edges.  It's NP-hard to choose optimal
74b1e83836Smrg 	edge set, fortunately simple approximation is good enough for us
75b1e83836Smrg 	given the small problem scale.
76b1e83836Smrg      8) Collect all data dependencies of the removed "alias" edges.  Create
77b1e83836Smrg 	runtime alias checks for collected data dependencies.
78b1e83836Smrg      9) Version loop under the condition of runtime alias checks.  Given
79b1e83836Smrg 	loop distribution generally introduces additional overhead, it is
80b1e83836Smrg 	only useful if vectorization is achieved in distributed loop.  We
81b1e83836Smrg 	version loop with internal function call IFN_LOOP_DIST_ALIAS.  If
82b1e83836Smrg 	no distributed loop can be vectorized, we simply remove distributed
83b1e83836Smrg 	loops and recover to the original one.
84b1e83836Smrg 
85b1e83836Smrg    TODO:
86b1e83836Smrg      1) We only distribute innermost two-level loop nest now.  We should
87b1e83836Smrg 	extend it for arbitrary loop nests in the future.
88b1e83836Smrg      2) We only fuse partitions in SCC now.  A better fusion algorithm is
89b1e83836Smrg 	desired to minimize loop overhead, maximize parallelism and maximize
90b1e83836Smrg 	data reuse.  */
91b1e83836Smrg 
92b1e83836Smrg #include "config.h"
93b1e83836Smrg #include "system.h"
94b1e83836Smrg #include "coretypes.h"
95b1e83836Smrg #include "backend.h"
96b1e83836Smrg #include "tree.h"
97b1e83836Smrg #include "gimple.h"
98b1e83836Smrg #include "cfghooks.h"
99b1e83836Smrg #include "tree-pass.h"
100b1e83836Smrg #include "ssa.h"
101b1e83836Smrg #include "gimple-pretty-print.h"
102b1e83836Smrg #include "fold-const.h"
103b1e83836Smrg #include "cfganal.h"
104b1e83836Smrg #include "gimple-iterator.h"
105b1e83836Smrg #include "gimplify-me.h"
106b1e83836Smrg #include "stor-layout.h"
107b1e83836Smrg #include "tree-cfg.h"
108b1e83836Smrg #include "tree-ssa-loop-manip.h"
109b1e83836Smrg #include "tree-ssa-loop-ivopts.h"
110b1e83836Smrg #include "tree-ssa-loop.h"
111b1e83836Smrg #include "tree-into-ssa.h"
112b1e83836Smrg #include "tree-ssa.h"
113b1e83836Smrg #include "cfgloop.h"
114b1e83836Smrg #include "tree-scalar-evolution.h"
115b1e83836Smrg #include "tree-vectorizer.h"
116b1e83836Smrg #include "tree-eh.h"
117b1e83836Smrg #include "gimple-fold.h"
118b1e83836Smrg #include "tree-affine.h"
119b1e83836Smrg #include "intl.h"
120b1e83836Smrg #include "rtl.h"
121b1e83836Smrg #include "memmodel.h"
122b1e83836Smrg #include "optabs.h"
123b1e83836Smrg 
124b1e83836Smrg 
/* Upper bound on the number of data references considered for dependence
   analysis, taken from the --param loop-max-datarefs-for-datadeps knob.  */
#define MAX_DATAREFS_NUM \
	((unsigned) param_loop_max_datarefs_for_datadeps)

/* Threshold controlling number of distributed partitions.  Given it may
   be unnecessary if a memory stream cost model is invented in the future,
   we define it as a temporary macro, rather than a parameter.  */
#define NUM_PARTITION_THRESHOLD (4)
132b1e83836Smrg 
/* Hashtable helpers.  */

/* Hasher caching data_dependence_relation objects keyed on their
   (DDR_A, DDR_B) data-reference pair.  The table does not own or free
   the entries (nofree_ptr_hash).  */
struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
{
  static inline hashval_t hash (const data_dependence_relation *);
  static inline bool equal (const data_dependence_relation *,
			    const data_dependence_relation *);
};
141b1e83836Smrg 
142b1e83836Smrg /* Hash function for data dependence.  */
143b1e83836Smrg 
144b1e83836Smrg inline hashval_t
hash(const data_dependence_relation * ddr)145b1e83836Smrg ddr_hasher::hash (const data_dependence_relation *ddr)
146b1e83836Smrg {
147b1e83836Smrg   inchash::hash h;
148b1e83836Smrg   h.add_ptr (DDR_A (ddr));
149b1e83836Smrg   h.add_ptr (DDR_B (ddr));
150b1e83836Smrg   return h.end ();
151b1e83836Smrg }
152b1e83836Smrg 
153b1e83836Smrg /* Hash table equality function for data dependence.  */
154b1e83836Smrg 
155b1e83836Smrg inline bool
equal(const data_dependence_relation * ddr1,const data_dependence_relation * ddr2)156b1e83836Smrg ddr_hasher::equal (const data_dependence_relation *ddr1,
157b1e83836Smrg 		   const data_dependence_relation *ddr2)
158b1e83836Smrg {
159b1e83836Smrg   return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
160b1e83836Smrg }
161b1e83836Smrg 
162b1e83836Smrg 
163b1e83836Smrg 
/* Index of a data reference, stashed as an integer in its AUX field.  */
#define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)

/* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
struct rdg_vertex
{
  /* The statement represented by this vertex.  */
  gimple *stmt;

  /* Vector of data-references in this statement.  */
  vec<data_reference_p> datarefs;

  /* True when the statement contains a write to memory.  */
  bool has_mem_write;

  /* True when the statement contains a read from memory.  */
  bool has_mem_reads;
};

/* Accessors for the rdg_vertex stored in a graph vertex's DATA field,
   either through the vertex V itself or through vertex index I of RDG.  */
#define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
190b1e83836Smrg 
/* Data dependence type.  Only flow and control dependencies are modelled
   in the RDG (see the file header comment).  */

enum rdg_dep_type
{
  /* Read After Write (RAW).  */
  flow_dd = 'f',

  /* Control dependence (execute conditional on).  */
  control_dd = 'c'
};

/* Dependence information attached to an edge of the RDG.  */

struct rdg_edge
{
  /* Type of the dependence.  */
  enum rdg_dep_type type;
};

/* Accessor for the rdg_edge stored in a graph edge's DATA field.  */
#define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type
211b1e83836Smrg 
/* Kind of distributed loop.  */
enum partition_kind {
    /* Ordinary partition, distributed as a separate loop.  */
    PKIND_NORMAL,
    /* Partial memset stands for a partition that can be distributed into
       a loop of memset calls, rather than a single memset call.  It's
       handled just like a normal partition, i.e., distributed as a
       separate loop, no memset call is generated.

       Note: This is a hacking fix trying to distribute ZERO-ing stmt in a
       loop nest as deep as possible.  As a result, parloop achieves better
       parallelization by parallelizing deeper loop nest.  This hack should
       be unnecessary and removed once distributed memset can be understood
       and analyzed in data reference analysis.  See PR82604 for more.  */
    PKIND_PARTIAL_MEMSET,
    /* Partitions that can be implemented as a single builtin call.  */
    PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};

/* Type of distributed loop.  */
enum partition_type {
    /* The distributed loop can be executed in parallel.  */
    PTYPE_PARALLEL = 0,
    /* The distributed loop has to be executed sequentially.  */
    PTYPE_SEQUENTIAL
};
236b1e83836Smrg 
/* Builtin info for loop distribution.  Filled in when a partition is
   classified as one of the PKIND_MEM* kinds.  */
struct builtin_info
{
  /* Data-references a kind != PKIND_NORMAL partition is about; SRC_DR is
     only meaningful for memcpy/memmove-like partitions.  */
  data_reference_p dst_dr;
  data_reference_p src_dr;
  /* Base address and size of memory objects operated by the builtin.  Note
     both dest and source memory objects must have the same size.  */
  tree dst_base;
  tree src_base;
  tree size;
  /* Base and offset part of dst_base after stripping constant offset.  This
     is only used in memset builtin distribution for now.  */
  tree dst_base_base;
  unsigned HOST_WIDE_INT dst_base_offset;
};
253b1e83836Smrg 
/* Partition for loop distribution.  */
struct partition
{
  /* Statements of the partition (bitmap over RDG vertex indices).  */
  bitmap stmts;
  /* True if the partition defines variable which is used outside of loop.  */
  bool reduction_p;
  /* Source location associated with the partition.  NOTE(review):
     presumably used for the builtin call / diagnostics generated for it —
     confirm against the code materializing partitions.  */
  location_t loc;
  /* Builtin kind the partition can be distributed into, if any.  */
  enum partition_kind kind;
  /* Whether the partition can run in parallel or must stay sequential.  */
  enum partition_type type;
  /* Data references in the partition.  */
  bitmap datarefs;
  /* Information of builtin partition.  */
  struct builtin_info *builtin;
};
269b1e83836Smrg 
/* Partitions are fused because of different reasons.  */
enum fuse_type
{
  /* The partitions are not recognized builtins.  */
  FUSE_NON_BUILTIN = 0,
  /* The partitions contain reductions.  */
  FUSE_REDUCTION = 1,
  /* The partitions access the same memory references.  */
  FUSE_SHARE_REF = 2,
  /* The partitions belong to the same dependence SCC.  */
  FUSE_SAME_SCC = 3,
  /* Distribution is abandoned and everything is fused back.  */
  FUSE_FINALIZE = 4
};

/* Description on different fusing reason.  Indexed by enum fuse_type, so
   the order here must match the enumerators above.  */
static const char *fuse_message[] = {
  "they are non-builtins",
  "they have reductions",
  "they have shared memory refs",
  "they are in the same dependence scc",
  "there is no point to distribute loop"};
287b1e83836Smrg 
288b1e83836Smrg 
289b1e83836Smrg /* Dump vertex I in RDG to FILE.  */
290b1e83836Smrg 
291b1e83836Smrg static void
dump_rdg_vertex(FILE * file,struct graph * rdg,int i)292b1e83836Smrg dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
293b1e83836Smrg {
294b1e83836Smrg   struct vertex *v = &(rdg->vertices[i]);
295b1e83836Smrg   struct graph_edge *e;
296b1e83836Smrg 
297b1e83836Smrg   fprintf (file, "(vertex %d: (%s%s) (in:", i,
298b1e83836Smrg 	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
299b1e83836Smrg 	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
300b1e83836Smrg 
301b1e83836Smrg   if (v->pred)
302b1e83836Smrg     for (e = v->pred; e; e = e->pred_next)
303b1e83836Smrg       fprintf (file, " %d", e->src);
304b1e83836Smrg 
305b1e83836Smrg   fprintf (file, ") (out:");
306b1e83836Smrg 
307b1e83836Smrg   if (v->succ)
308b1e83836Smrg     for (e = v->succ; e; e = e->succ_next)
309b1e83836Smrg       fprintf (file, " %d", e->dest);
310b1e83836Smrg 
311b1e83836Smrg   fprintf (file, ")\n");
312b1e83836Smrg   print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
313b1e83836Smrg   fprintf (file, ")\n");
314b1e83836Smrg }
315b1e83836Smrg 
316b1e83836Smrg /* Call dump_rdg_vertex on stderr.  */
317b1e83836Smrg 
318b1e83836Smrg DEBUG_FUNCTION void
debug_rdg_vertex(struct graph * rdg,int i)319b1e83836Smrg debug_rdg_vertex (struct graph *rdg, int i)
320b1e83836Smrg {
321b1e83836Smrg   dump_rdg_vertex (stderr, rdg, i);
322b1e83836Smrg }
323b1e83836Smrg 
324b1e83836Smrg /* Dump the reduced dependence graph RDG to FILE.  */
325b1e83836Smrg 
326b1e83836Smrg static void
dump_rdg(FILE * file,struct graph * rdg)327b1e83836Smrg dump_rdg (FILE *file, struct graph *rdg)
328b1e83836Smrg {
329b1e83836Smrg   fprintf (file, "(rdg\n");
330b1e83836Smrg   for (int i = 0; i < rdg->n_vertices; i++)
331b1e83836Smrg     dump_rdg_vertex (file, rdg, i);
332b1e83836Smrg   fprintf (file, ")\n");
333b1e83836Smrg }
334b1e83836Smrg 
335b1e83836Smrg /* Call dump_rdg on stderr.  */
336b1e83836Smrg 
337b1e83836Smrg DEBUG_FUNCTION void
debug_rdg(struct graph * rdg)338b1e83836Smrg debug_rdg (struct graph *rdg)
339b1e83836Smrg {
340b1e83836Smrg   dump_rdg (stderr, rdg);
341b1e83836Smrg }
342b1e83836Smrg 
343b1e83836Smrg static void
dot_rdg_1(FILE * file,struct graph * rdg)344b1e83836Smrg dot_rdg_1 (FILE *file, struct graph *rdg)
345b1e83836Smrg {
346b1e83836Smrg   int i;
347b1e83836Smrg   pretty_printer buffer;
348b1e83836Smrg   pp_needs_newline (&buffer) = false;
349b1e83836Smrg   buffer.buffer->stream = file;
350b1e83836Smrg 
351b1e83836Smrg   fprintf (file, "digraph RDG {\n");
352b1e83836Smrg 
353b1e83836Smrg   for (i = 0; i < rdg->n_vertices; i++)
354b1e83836Smrg     {
355b1e83836Smrg       struct vertex *v = &(rdg->vertices[i]);
356b1e83836Smrg       struct graph_edge *e;
357b1e83836Smrg 
358b1e83836Smrg       fprintf (file, "%d [label=\"[%d] ", i, i);
359b1e83836Smrg       pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
360b1e83836Smrg       pp_flush (&buffer);
361b1e83836Smrg       fprintf (file, "\"]\n");
362b1e83836Smrg 
363b1e83836Smrg       /* Highlight reads from memory.  */
364b1e83836Smrg       if (RDG_MEM_READS_STMT (rdg, i))
365b1e83836Smrg        fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
366b1e83836Smrg 
367b1e83836Smrg       /* Highlight stores to memory.  */
368b1e83836Smrg       if (RDG_MEM_WRITE_STMT (rdg, i))
369b1e83836Smrg        fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
370b1e83836Smrg 
371b1e83836Smrg       if (v->succ)
372b1e83836Smrg        for (e = v->succ; e; e = e->succ_next)
373b1e83836Smrg          switch (RDGE_TYPE (e))
374b1e83836Smrg            {
375b1e83836Smrg            case flow_dd:
376b1e83836Smrg              /* These are the most common dependences: don't print these. */
377b1e83836Smrg              fprintf (file, "%d -> %d \n", i, e->dest);
378b1e83836Smrg              break;
379b1e83836Smrg 
380b1e83836Smrg 	   case control_dd:
381b1e83836Smrg              fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
382b1e83836Smrg              break;
383b1e83836Smrg 
384b1e83836Smrg            default:
385b1e83836Smrg              gcc_unreachable ();
386b1e83836Smrg            }
387b1e83836Smrg     }
388b1e83836Smrg 
389b1e83836Smrg   fprintf (file, "}\n\n");
390b1e83836Smrg }
391b1e83836Smrg 
/* Display the Reduced Dependence Graph using dotty.  */

DEBUG_FUNCTION void
dot_rdg (struct graph *rdg)
{
  /* When debugging, you may want to enable the following code.  */
#ifdef HAVE_POPEN
  FILE *file = popen ("dot -Tx11", "w");
  if (!file)
    return;
  dot_rdg_1 (file, rdg);
  fflush (file);
  /* Close the pipe's file descriptor so "dot" sees EOF and renders the
     graph before pclose waits for it.  NOTE(review): pclose's internal
     fclose will then close an already-closed descriptor; appears
     deliberate but worth confirming.  */
  close (fileno (file));
  pclose (file);
#else
  /* Without popen, fall back to emitting the dot text on stderr.  */
  dot_rdg_1 (stderr, rdg);
#endif
}
410b1e83836Smrg 
411b1e83836Smrg /* Returns the index of STMT in RDG.  */
412b1e83836Smrg 
413b1e83836Smrg static int
rdg_vertex_for_stmt(struct graph * rdg ATTRIBUTE_UNUSED,gimple * stmt)414b1e83836Smrg rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
415b1e83836Smrg {
416b1e83836Smrg   int index = gimple_uid (stmt);
417b1e83836Smrg   gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
418b1e83836Smrg   return index;
419b1e83836Smrg }
420b1e83836Smrg 
421b1e83836Smrg /* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
422b1e83836Smrg    the index of DEF in RDG.  */
423b1e83836Smrg 
424b1e83836Smrg static void
create_rdg_edges_for_scalar(struct graph * rdg,tree def,int idef)425b1e83836Smrg create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
426b1e83836Smrg {
427b1e83836Smrg   use_operand_p imm_use_p;
428b1e83836Smrg   imm_use_iterator iterator;
429b1e83836Smrg 
430b1e83836Smrg   FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
431b1e83836Smrg     {
432b1e83836Smrg       struct graph_edge *e;
433b1e83836Smrg       int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
434b1e83836Smrg 
435b1e83836Smrg       if (use < 0)
436b1e83836Smrg 	continue;
437b1e83836Smrg 
438b1e83836Smrg       e = add_edge (rdg, idef, use);
439b1e83836Smrg       e->data = XNEW (struct rdg_edge);
440b1e83836Smrg       RDGE_TYPE (e) = flow_dd;
441b1e83836Smrg     }
442b1e83836Smrg }
443b1e83836Smrg 
444b1e83836Smrg /* Creates an edge for the control dependences of BB to the vertex V.  */
445b1e83836Smrg 
446b1e83836Smrg static void
create_edge_for_control_dependence(struct graph * rdg,basic_block bb,int v,control_dependences * cd)447b1e83836Smrg create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
448b1e83836Smrg 				    int v, control_dependences *cd)
449b1e83836Smrg {
450b1e83836Smrg   bitmap_iterator bi;
451b1e83836Smrg   unsigned edge_n;
452b1e83836Smrg   EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
453b1e83836Smrg 			    0, edge_n, bi)
454b1e83836Smrg     {
455b1e83836Smrg       basic_block cond_bb = cd->get_edge_src (edge_n);
456b1e83836Smrg       gimple *stmt = last_stmt (cond_bb);
457b1e83836Smrg       if (stmt && is_ctrl_stmt (stmt))
458b1e83836Smrg 	{
459b1e83836Smrg 	  struct graph_edge *e;
460b1e83836Smrg 	  int c = rdg_vertex_for_stmt (rdg, stmt);
461b1e83836Smrg 	  if (c < 0)
462b1e83836Smrg 	    continue;
463b1e83836Smrg 
464b1e83836Smrg 	  e = add_edge (rdg, c, v);
465b1e83836Smrg 	  e->data = XNEW (struct rdg_edge);
466b1e83836Smrg 	  RDGE_TYPE (e) = control_dd;
467b1e83836Smrg 	}
468b1e83836Smrg     }
469b1e83836Smrg }
470b1e83836Smrg 
471b1e83836Smrg /* Creates the edges of the reduced dependence graph RDG.  */
472b1e83836Smrg 
473b1e83836Smrg static void
create_rdg_flow_edges(struct graph * rdg)474b1e83836Smrg create_rdg_flow_edges (struct graph *rdg)
475b1e83836Smrg {
476b1e83836Smrg   int i;
477b1e83836Smrg   def_operand_p def_p;
478b1e83836Smrg   ssa_op_iter iter;
479b1e83836Smrg 
480b1e83836Smrg   for (i = 0; i < rdg->n_vertices; i++)
481b1e83836Smrg     FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
482b1e83836Smrg 			      iter, SSA_OP_DEF)
483b1e83836Smrg       create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
484b1e83836Smrg }
485b1e83836Smrg 
486b1e83836Smrg /* Creates the edges of the reduced dependence graph RDG.  */
487b1e83836Smrg 
488b1e83836Smrg static void
create_rdg_cd_edges(struct graph * rdg,control_dependences * cd,loop_p loop)489b1e83836Smrg create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
490b1e83836Smrg {
491b1e83836Smrg   int i;
492b1e83836Smrg 
493b1e83836Smrg   for (i = 0; i < rdg->n_vertices; i++)
494b1e83836Smrg     {
495b1e83836Smrg       gimple *stmt = RDG_STMT (rdg, i);
496b1e83836Smrg       if (gimple_code (stmt) == GIMPLE_PHI)
497b1e83836Smrg 	{
498b1e83836Smrg 	  edge_iterator ei;
499b1e83836Smrg 	  edge e;
500b1e83836Smrg 	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
501b1e83836Smrg 	    if (flow_bb_inside_loop_p (loop, e->src))
502b1e83836Smrg 	      create_edge_for_control_dependence (rdg, e->src, i, cd);
503b1e83836Smrg 	}
504b1e83836Smrg       else
505b1e83836Smrg 	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
506b1e83836Smrg     }
507b1e83836Smrg }
508b1e83836Smrg 
509b1e83836Smrg 
510b1e83836Smrg class loop_distribution
511b1e83836Smrg {
512b1e83836Smrg   private:
513b1e83836Smrg   /* The loop (nest) to be distributed.  */
514b1e83836Smrg   vec<loop_p> loop_nest;
515b1e83836Smrg 
516b1e83836Smrg   /* Vector of data references in the loop to be distributed.  */
517b1e83836Smrg   vec<data_reference_p> datarefs_vec;
518b1e83836Smrg 
519b1e83836Smrg   /* If there is nonaddressable data reference in above vector.  */
520b1e83836Smrg   bool has_nonaddressable_dataref_p;
521b1e83836Smrg 
522b1e83836Smrg   /* Store index of data reference in aux field.  */
523b1e83836Smrg 
524b1e83836Smrg   /* Hash table for data dependence relation in the loop to be distributed.  */
525b1e83836Smrg   hash_table<ddr_hasher> *ddrs_table;
526b1e83836Smrg 
527b1e83836Smrg   /* Array mapping basic block's index to its topological order.  */
528b1e83836Smrg   int *bb_top_order_index;
529b1e83836Smrg   /* And size of the array.  */
530b1e83836Smrg   int bb_top_order_index_size;
531b1e83836Smrg 
532b1e83836Smrg   /* Build the vertices of the reduced dependence graph RDG.  Return false
533b1e83836Smrg      if that failed.  */
534b1e83836Smrg   bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts,
535b1e83836Smrg 			    loop_p loop);
536b1e83836Smrg 
537b1e83836Smrg   /* Initialize STMTS with all the statements of LOOP.  We use topological
538b1e83836Smrg      order to discover all statements.  The order is important because
539b1e83836Smrg      generate_loops_for_partition is using the same traversal for identifying
540b1e83836Smrg      statements in loop copies.  */
541b1e83836Smrg   void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);
542b1e83836Smrg 
543b1e83836Smrg 
544b1e83836Smrg   /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
545b1e83836Smrg      LOOP, and one edge per flow dependence or control dependence from control
546b1e83836Smrg      dependence CD.  During visiting each statement, data references are also
547b1e83836Smrg      collected and recorded in global data DATAREFS_VEC.  */
548b1e83836Smrg   struct graph * build_rdg (class loop *loop, control_dependences *cd);
549b1e83836Smrg 
550b1e83836Smrg /* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
551b1e83836Smrg    graph and we update type for result partition if it is non-NULL.  */
552b1e83836Smrg   void partition_merge_into (struct graph *rdg,
553b1e83836Smrg 			     partition *dest, partition *partition,
554b1e83836Smrg 			     enum fuse_type ft);
555b1e83836Smrg 
556b1e83836Smrg 
557b1e83836Smrg   /* Return data dependence relation for data references A and B.  The two
558b1e83836Smrg      data references must be in lexicographic order wrto reduced dependence
559b1e83836Smrg      graph RDG.  We firstly try to find ddr from global ddr hash table.  If
560b1e83836Smrg      it doesn't exist, compute the ddr and cache it.  */
561b1e83836Smrg   data_dependence_relation * get_data_dependence (struct graph *rdg,
562b1e83836Smrg 						  data_reference_p a,
563b1e83836Smrg 						  data_reference_p b);
564b1e83836Smrg 
565b1e83836Smrg 
566b1e83836Smrg   /* In reduced dependence graph RDG for loop distribution, return true if
567b1e83836Smrg      dependence between references DR1 and DR2 leads to a dependence cycle
568b1e83836Smrg      and such dependence cycle can't be resolved by runtime alias check.  */
569b1e83836Smrg   bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
570b1e83836Smrg 			    data_reference_p dr2);
571b1e83836Smrg 
572b1e83836Smrg 
573b1e83836Smrg   /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
574b1e83836Smrg      PARTITION1's type after merging PARTITION2 into PARTITION1.  */
575b1e83836Smrg   void update_type_for_merge (struct graph *rdg,
576b1e83836Smrg 			      partition *partition1, partition *partition2);
577b1e83836Smrg 
578b1e83836Smrg 
579b1e83836Smrg   /* Returns a partition with all the statements needed for computing
580b1e83836Smrg      the vertex V of the RDG, also including the loop exit conditions.  */
581b1e83836Smrg   partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);
582b1e83836Smrg 
583b1e83836Smrg   /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
584b1e83836Smrg      if it forms builtin memcpy or memmove call.  */
585b1e83836Smrg   void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
586b1e83836Smrg 			      data_reference_p dst_dr, data_reference_p src_dr);
587b1e83836Smrg 
588b1e83836Smrg   /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
589b1e83836Smrg      For the moment we detect memset, memcpy and memmove patterns.  Bitmap
590b1e83836Smrg      STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
591b1e83836Smrg      Returns true if there is a reduction in all partitions and we
592b1e83836Smrg      possibly did not mark PARTITION as having one for this reason.  */
593b1e83836Smrg 
594b1e83836Smrg   bool
595b1e83836Smrg   classify_partition (loop_p loop,
596b1e83836Smrg 		      struct graph *rdg, partition *partition,
597b1e83836Smrg 		      bitmap stmt_in_all_partitions);
598b1e83836Smrg 
599b1e83836Smrg 
600b1e83836Smrg   /* Returns true when PARTITION1 and PARTITION2 access the same memory
601b1e83836Smrg      object in RDG.  */
602b1e83836Smrg   bool share_memory_accesses (struct graph *rdg,
603b1e83836Smrg 			      partition *partition1, partition *partition2);
604b1e83836Smrg 
605b1e83836Smrg   /* For each seed statement in STARTING_STMTS, this function builds
606b1e83836Smrg      partition for it by adding depended statements according to RDG.
607b1e83836Smrg      All partitions are recorded in PARTITIONS.  */
608b1e83836Smrg   void rdg_build_partitions (struct graph *rdg,
609b1e83836Smrg 			     vec<gimple *> starting_stmts,
610b1e83836Smrg 			     vec<partition *> *partitions);
611b1e83836Smrg 
612b1e83836Smrg   /* Compute partition dependence created by the data references in DRS1
613b1e83836Smrg      and DRS2, modify and return DIR according to that.  IF ALIAS_DDR is
614b1e83836Smrg      not NULL, we record dependence introduced by possible alias between
615b1e83836Smrg      two data references in ALIAS_DDRS; otherwise, we simply ignore such
616b1e83836Smrg      dependence as if it doesn't exist at all.  */
617b1e83836Smrg   int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
618b1e83836Smrg 			       bitmap drs2, vec<ddr_p> *alias_ddrs);
619b1e83836Smrg 
620b1e83836Smrg 
621b1e83836Smrg   /* Build and return partition dependence graph for PARTITIONS.  RDG is
622b1e83836Smrg      reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
623b1e83836Smrg      is true, data dependence caused by possible alias between references
624b1e83836Smrg      is ignored, as if it doesn't exist at all; otherwise all depdendences
625b1e83836Smrg      are considered.  */
626b1e83836Smrg   struct graph *build_partition_graph (struct graph *rdg,
627b1e83836Smrg 				       vec<struct partition *> *partitions,
628b1e83836Smrg 				       bool ignore_alias_p);
629b1e83836Smrg 
630b1e83836Smrg   /* Given reduced dependence graph RDG merge strong connected components
631b1e83836Smrg      of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
632b1e83836Smrg      possible alias between references is ignored, as if it doesn't exist
633b1e83836Smrg      at all; otherwise all depdendences are considered.  */
634b1e83836Smrg   void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
635b1e83836Smrg 				 *partitions, bool ignore_alias_p);
636b1e83836Smrg 
637b1e83836Smrg /* This is the main function breaking strong conected components in
638b1e83836Smrg    PARTITIONS giving reduced depdendence graph RDG.  Store data dependence
639b1e83836Smrg    relations for runtime alias check in ALIAS_DDRS.  */
640b1e83836Smrg   void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
641b1e83836Smrg 				   *partitions, vec<ddr_p> *alias_ddrs);
642b1e83836Smrg 
643b1e83836Smrg 
644b1e83836Smrg   /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
645b1e83836Smrg      ALIAS_DDRS contains ddrs which need runtime alias check.  */
646b1e83836Smrg   void finalize_partitions (class loop *loop, vec<struct partition *>
647b1e83836Smrg 			    *partitions, vec<ddr_p> *alias_ddrs);
648b1e83836Smrg 
649b1e83836Smrg   /* Distributes the code from LOOP in such a way that producer statements
650b1e83836Smrg      are placed before consumer statements.  Tries to separate only the
651b1e83836Smrg      statements from STMTS into separate loops.  Returns the number of
652b1e83836Smrg      distributed loops.  Set NB_CALLS to number of generated builtin calls.
653b1e83836Smrg      Set *DESTROY_P to whether LOOP needs to be destroyed.  */
654b1e83836Smrg   int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
655b1e83836Smrg 		       control_dependences *cd, int *nb_calls, bool *destroy_p,
656b1e83836Smrg 		       bool only_patterns_p);
657b1e83836Smrg 
658b1e83836Smrg   /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
659b1e83836Smrg      replace them accordingly.  */
660b1e83836Smrg   bool transform_reduction_loop (loop_p loop);
661b1e83836Smrg 
662b1e83836Smrg   /* Compute topological order for basic blocks.  Topological order is
663b1e83836Smrg      needed because data dependence is computed for data references in
664b1e83836Smrg      lexicographical order.  */
665b1e83836Smrg   void bb_top_order_init (void);
666b1e83836Smrg 
667b1e83836Smrg   void bb_top_order_destroy (void);
668b1e83836Smrg 
669b1e83836Smrg   public:
670b1e83836Smrg 
671b1e83836Smrg   /* Getter for bb_top_order.  */
672b1e83836Smrg 
get_bb_top_order_index_size(void)673b1e83836Smrg   inline int get_bb_top_order_index_size (void)
674b1e83836Smrg     {
675b1e83836Smrg       return bb_top_order_index_size;
676b1e83836Smrg     }
677b1e83836Smrg 
get_bb_top_order_index(int i)678b1e83836Smrg   inline int get_bb_top_order_index (int i)
679b1e83836Smrg     {
680b1e83836Smrg       return bb_top_order_index[i];
681b1e83836Smrg     }
682b1e83836Smrg 
683b1e83836Smrg   unsigned int execute (function *fun);
684b1e83836Smrg };
685b1e83836Smrg 
686b1e83836Smrg 
687b1e83836Smrg /* If X has a smaller topological sort number than Y, returns -1;
688b1e83836Smrg    if greater, returns 1.  */
689b1e83836Smrg static int
bb_top_order_cmp_r(const void * x,const void * y,void * loop)690b1e83836Smrg bb_top_order_cmp_r (const void *x, const void *y, void *loop)
691b1e83836Smrg {
692b1e83836Smrg   loop_distribution *_loop =
693b1e83836Smrg     (loop_distribution *) loop;
694b1e83836Smrg 
695b1e83836Smrg   basic_block bb1 = *(const basic_block *) x;
696b1e83836Smrg   basic_block bb2 = *(const basic_block *) y;
697b1e83836Smrg 
698b1e83836Smrg   int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();
699b1e83836Smrg 
700b1e83836Smrg   gcc_assert (bb1->index < bb_top_order_index_size
701b1e83836Smrg 	      && bb2->index < bb_top_order_index_size);
702b1e83836Smrg   gcc_assert (bb1 == bb2
703b1e83836Smrg 	      || _loop->get_bb_top_order_index(bb1->index)
704b1e83836Smrg 		 != _loop->get_bb_top_order_index(bb2->index));
705b1e83836Smrg 
706b1e83836Smrg   return (_loop->get_bb_top_order_index(bb1->index) -
707b1e83836Smrg 	  _loop->get_bb_top_order_index(bb2->index));
708b1e83836Smrg }
709b1e83836Smrg 
/* Initialize the vertices of RDG, one per statement in STMTS, and
   collect the data references found in LOOP for each statement.
   Returns false (leaving RDG partially initialized) when data-reference
   analysis fails for some statement.  */
bool
loop_distribution::create_rdg_vertices (struct graph *rdg,
					const vec<gimple *> &stmts,
					loop_p loop)
{
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      struct vertex *v = &(rdg->vertices[i]);

      /* Record statement to vertex mapping.  The statement's uid is its
	 vertex index in RDG.  */
      gimple_set_uid (stmt, i);

      v->data = XNEW (struct rdg_vertex);
      RDGV_STMT (v) = stmt;
      RDGV_DATAREFS (v).create (0);
      RDGV_HAS_MEM_WRITE (v) = false;
      RDGV_HAS_MEM_READS (v) = false;
      /* PHI nodes carry no memory references; nothing more to do.  */
      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      /* find_data_references_in_stmt appends to datarefs_vec; remember
	 where this statement's references start.  */
      unsigned drp = datarefs_vec.length ();
      if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
	return false;
      for (unsigned j = drp; j < datarefs_vec.length (); ++j)
	{
	  data_reference_p dr = datarefs_vec[j];
	  if (DR_IS_READ (dr))
	    RDGV_HAS_MEM_READS (v) = true;
	  else
	    RDGV_HAS_MEM_WRITE (v) = true;
	  RDGV_DATAREFS (v).safe_push (dr);
	  /* Track whether any reference may be non-addressable, which
	     limits builtin replacement later on.  */
	  has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
	}
    }
  return true;
}
749b1e83836Smrg 
750b1e83836Smrg void
stmts_from_loop(class loop * loop,vec<gimple * > * stmts)751b1e83836Smrg loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
752b1e83836Smrg {
753b1e83836Smrg   unsigned int i;
754b1e83836Smrg   basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
755b1e83836Smrg 
756b1e83836Smrg   for (i = 0; i < loop->num_nodes; i++)
757b1e83836Smrg     {
758b1e83836Smrg       basic_block bb = bbs[i];
759b1e83836Smrg 
760b1e83836Smrg       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
761b1e83836Smrg 	   gsi_next (&bsi))
762b1e83836Smrg 	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
763b1e83836Smrg 	  stmts->safe_push (bsi.phi ());
764b1e83836Smrg 
765b1e83836Smrg       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
766b1e83836Smrg 	   gsi_next (&bsi))
767b1e83836Smrg 	{
768b1e83836Smrg 	  gimple *stmt = gsi_stmt (bsi);
769b1e83836Smrg 	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
770b1e83836Smrg 	    stmts->safe_push (stmt);
771b1e83836Smrg 	}
772b1e83836Smrg     }
773b1e83836Smrg 
774b1e83836Smrg   free (bbs);
775b1e83836Smrg }
776b1e83836Smrg 
777b1e83836Smrg /* Free the reduced dependence graph RDG.  */
778b1e83836Smrg 
779b1e83836Smrg static void
free_rdg(struct graph * rdg)780b1e83836Smrg free_rdg (struct graph *rdg)
781b1e83836Smrg {
782b1e83836Smrg   int i;
783b1e83836Smrg 
784b1e83836Smrg   for (i = 0; i < rdg->n_vertices; i++)
785b1e83836Smrg     {
786b1e83836Smrg       struct vertex *v = &(rdg->vertices[i]);
787b1e83836Smrg       struct graph_edge *e;
788b1e83836Smrg 
789b1e83836Smrg       for (e = v->succ; e; e = e->succ_next)
790b1e83836Smrg 	free (e->data);
791b1e83836Smrg 
792b1e83836Smrg       if (v->data)
793b1e83836Smrg 	{
794b1e83836Smrg 	  gimple_set_uid (RDGV_STMT (v), -1);
795b1e83836Smrg 	  (RDGV_DATAREFS (v)).release ();
796b1e83836Smrg 	  free (v->data);
797b1e83836Smrg 	}
798b1e83836Smrg     }
799b1e83836Smrg 
800b1e83836Smrg   free_graph (rdg);
801b1e83836Smrg }
802b1e83836Smrg 
803b1e83836Smrg struct graph *
build_rdg(class loop * loop,control_dependences * cd)804b1e83836Smrg loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
805b1e83836Smrg {
806b1e83836Smrg   struct graph *rdg;
807b1e83836Smrg 
808b1e83836Smrg   /* Create the RDG vertices from the stmts of the loop nest.  */
809b1e83836Smrg   auto_vec<gimple *, 10> stmts;
810b1e83836Smrg   stmts_from_loop (loop, &stmts);
811b1e83836Smrg   rdg = new_graph (stmts.length ());
812b1e83836Smrg   if (!create_rdg_vertices (rdg, stmts, loop))
813b1e83836Smrg     {
814b1e83836Smrg       free_rdg (rdg);
815b1e83836Smrg       return NULL;
816b1e83836Smrg     }
817b1e83836Smrg   stmts.release ();
818b1e83836Smrg 
819b1e83836Smrg   create_rdg_flow_edges (rdg);
820b1e83836Smrg   if (cd)
821b1e83836Smrg     create_rdg_cd_edges (rdg, cd, loop);
822b1e83836Smrg 
823b1e83836Smrg   return rdg;
824b1e83836Smrg }
825b1e83836Smrg 
826b1e83836Smrg 
827b1e83836Smrg /* Allocate and initialize a partition from BITMAP.  */
828b1e83836Smrg 
829b1e83836Smrg static partition *
partition_alloc(void)830b1e83836Smrg partition_alloc (void)
831b1e83836Smrg {
832b1e83836Smrg   partition *partition = XCNEW (struct partition);
833b1e83836Smrg   partition->stmts = BITMAP_ALLOC (NULL);
834b1e83836Smrg   partition->reduction_p = false;
835b1e83836Smrg   partition->loc = UNKNOWN_LOCATION;
836b1e83836Smrg   partition->kind = PKIND_NORMAL;
837b1e83836Smrg   partition->type = PTYPE_PARALLEL;
838b1e83836Smrg   partition->datarefs = BITMAP_ALLOC (NULL);
839b1e83836Smrg   return partition;
840b1e83836Smrg }
841b1e83836Smrg 
842b1e83836Smrg /* Free PARTITION.  */
843b1e83836Smrg 
844b1e83836Smrg static void
partition_free(partition * partition)845b1e83836Smrg partition_free (partition *partition)
846b1e83836Smrg {
847b1e83836Smrg   BITMAP_FREE (partition->stmts);
848b1e83836Smrg   BITMAP_FREE (partition->datarefs);
849b1e83836Smrg   if (partition->builtin)
850b1e83836Smrg     free (partition->builtin);
851b1e83836Smrg 
852b1e83836Smrg   free (partition);
853b1e83836Smrg }
854b1e83836Smrg 
855b1e83836Smrg /* Returns true if the partition can be generated as a builtin.  */
856b1e83836Smrg 
857b1e83836Smrg static bool
partition_builtin_p(partition * partition)858b1e83836Smrg partition_builtin_p (partition *partition)
859b1e83836Smrg {
860b1e83836Smrg   return partition->kind > PKIND_PARTIAL_MEMSET;
861b1e83836Smrg }
862b1e83836Smrg 
/* Returns true if the partition contains a reduction, i.e. its
   reduction_p flag was set during classification.  */

static bool
partition_reduction_p (partition *partition)
{
  return partition->reduction_p;
}
870b1e83836Smrg 
/* Fuse PARTITION into DEST.  FT says why the fusion happens and is
   only used for dump output.  RDG, when non-NULL, is consulted to
   recheck whether the fused partition can still run in parallel.  */
void
loop_distribution::partition_merge_into (struct graph *rdg,
		      partition *dest, partition *partition, enum fuse_type ft)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
      fprintf (dump_file, "  Part 1: ");
      dump_bitmap (dump_file, dest->stmts);
      fprintf (dump_file, "  Part 2: ");
      dump_bitmap (dump_file, partition->stmts);
    }

  /* A fused partition is no longer a recognized builtin pattern.  */
  dest->kind = PKIND_NORMAL;
  /* Fusing with a sequential partition makes the result sequential.  */
  if (dest->type == PTYPE_PARALLEL)
    dest->type = partition->type;

  bitmap_ior_into (dest->stmts, partition->stmts);
  if (partition_reduction_p (partition))
    dest->reduction_p = true;

  /* Further check if any data dependence prevents us from executing the
     new partition parallelly.  */
  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    update_type_for_merge (rdg, dest, partition);

  bitmap_ior_into (dest->datarefs, partition->datarefs);
}
899b1e83836Smrg 
900b1e83836Smrg 
901b1e83836Smrg /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
902b1e83836Smrg    the LOOP.  */
903b1e83836Smrg 
904b1e83836Smrg static bool
ssa_name_has_uses_outside_loop_p(tree def,loop_p loop)905b1e83836Smrg ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
906b1e83836Smrg {
907b1e83836Smrg   imm_use_iterator imm_iter;
908b1e83836Smrg   use_operand_p use_p;
909b1e83836Smrg 
910b1e83836Smrg   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
911b1e83836Smrg     {
912b1e83836Smrg       if (is_gimple_debug (USE_STMT (use_p)))
913b1e83836Smrg 	continue;
914b1e83836Smrg 
915b1e83836Smrg       basic_block use_bb = gimple_bb (USE_STMT (use_p));
916b1e83836Smrg       if (!flow_bb_inside_loop_p (loop, use_bb))
917b1e83836Smrg 	return true;
918b1e83836Smrg     }
919b1e83836Smrg 
920b1e83836Smrg   return false;
921b1e83836Smrg }
922b1e83836Smrg 
923b1e83836Smrg /* Returns true when STMT defines a scalar variable used after the
924b1e83836Smrg    loop LOOP.  */
925b1e83836Smrg 
926b1e83836Smrg static bool
stmt_has_scalar_dependences_outside_loop(loop_p loop,gimple * stmt)927b1e83836Smrg stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
928b1e83836Smrg {
929b1e83836Smrg   def_operand_p def_p;
930b1e83836Smrg   ssa_op_iter op_iter;
931b1e83836Smrg 
932b1e83836Smrg   if (gimple_code (stmt) == GIMPLE_PHI)
933b1e83836Smrg     return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
934b1e83836Smrg 
935b1e83836Smrg   FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
936b1e83836Smrg     if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
937b1e83836Smrg       return true;
938b1e83836Smrg 
939b1e83836Smrg   return false;
940b1e83836Smrg }
941b1e83836Smrg 
942b1e83836Smrg /* Return a copy of LOOP placed before LOOP.  */
943b1e83836Smrg 
944b1e83836Smrg static class loop *
copy_loop_before(class loop * loop)945b1e83836Smrg copy_loop_before (class loop *loop)
946b1e83836Smrg {
947b1e83836Smrg   class loop *res;
948b1e83836Smrg   edge preheader = loop_preheader_edge (loop);
949b1e83836Smrg 
950b1e83836Smrg   initialize_original_copy_tables ();
951b1e83836Smrg   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
952b1e83836Smrg   gcc_assert (res != NULL);
953b1e83836Smrg   free_original_copy_tables ();
954b1e83836Smrg   delete_update_ssa ();
955b1e83836Smrg 
956b1e83836Smrg   return res;
957b1e83836Smrg }
958b1e83836Smrg 
959b1e83836Smrg /* Creates an empty basic block after LOOP.  */
960b1e83836Smrg 
961b1e83836Smrg static void
create_bb_after_loop(class loop * loop)962b1e83836Smrg create_bb_after_loop (class loop *loop)
963b1e83836Smrg {
964b1e83836Smrg   edge exit = single_exit (loop);
965b1e83836Smrg 
966b1e83836Smrg   if (!exit)
967b1e83836Smrg     return;
968b1e83836Smrg 
969b1e83836Smrg   split_edge (exit);
970b1e83836Smrg }
971b1e83836Smrg 
/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  */

static void
generate_loops_for_partition (class loop *loop, partition *partition,
			      bool copy_p)
{
  unsigned i;
  basic_block *bbs;

  if (copy_p)
    {
      /* Work on a fresh copy placed before LOOP, preserving the
	 original loop number across the copy.  */
      int orig_loop_num = loop->orig_loop_num;
      loop = copy_loop_before (loop);
      gcc_assert (loop != NULL);
      loop->orig_loop_num = orig_loop_num;
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }
  else
    {
      /* Origin number is set to the new versioned loop's num.  */
      gcc_assert (loop->orig_loop_num != loop->num);
    }

  /* Remove stmts not in the PARTITION bitmap.  */
  bbs = get_loop_body_in_dom_order (loop);

  /* First pass: reset debug uses of the statements about to be removed
     so debug binds do not reference released SSA names.  */
  if (MAY_HAVE_DEBUG_BIND_STMTS)
    for (i = 0; i < loop->num_nodes; i++)
      {
	basic_block bb = bbs[i];

	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gphi *phi = bsi.phi ();
	    if (!virtual_operand_p (gimple_phi_result (phi))
		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	      reset_debug_uses (phi);
	  }

	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
	  {
	    gimple *stmt = gsi_stmt (bsi);
	    if (gimple_code (stmt) != GIMPLE_LABEL
		&& !is_gimple_debug (stmt)
		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	      reset_debug_uses (stmt);
	  }
      }

  /* Second pass: delete (or neutralize) the unwanted statements.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      edge inner_exit = NULL;

      /* BB belongs to an inner loop of the nest; remember that loop's
	 exit so conditions can be redirected along it below.  */
      if (loop != bb->loop_father)
	inner_exit = single_exit (bb->loop_father);

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	{
	  gphi *phi = bsi.phi ();
	  if (!virtual_operand_p (gimple_phi_result (phi))
	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	    remove_phi_node (&bsi, true);
	  else
	    gsi_next (&bsi);
	}

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL
	      && !is_gimple_debug (stmt)
	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	    {
	      /* In distribution of loop nest, if bb is inner loop's exit_bb,
		 we choose its exit edge/path in order to avoid generating
		 infinite loop.  For all other cases, we choose an arbitrary
		 path through the empty CFG part that this unnecessary
		 control stmt controls.  */
	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
		{
		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
		    gimple_cond_make_true (cond_stmt);
		  else
		    gimple_cond_make_false (cond_stmt);
		  update_stmt (stmt);
		}
	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
		{
		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
		  /* Force the switch to its first non-default label.  */
		  gimple_switch_set_index
		      (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
		  update_stmt (stmt);
		}
	      else
		{
		  unlink_stmt_vdef (stmt);
		  gsi_remove (&bsi, true);
		  release_defs (stmt);
		  continue;
		}
	    }
	  gsi_next (&bsi);
	}
    }

  free (bbs);
}
1086b1e83836Smrg 
/* If VAL memory representation contains the same value in all bytes,
   return that value, otherwise return -1.
   E.g. for 0x24242424 return 0x24, for IEEE double
   747708026454360457216.0 return 0x44, etc.  */

static int
const_with_all_bytes_same (tree val)
{
  unsigned char buf[64];
  int i, len;

  /* Integer zero and an empty non-clobber CONSTRUCTOR are all-zero
     bytes.  */
  if (integer_zerop (val)
      || (TREE_CODE (val) == CONSTRUCTOR
          && !TREE_CLOBBER_P (val)
          && CONSTRUCTOR_NELTS (val) == 0))
    return 0;

  if (real_zerop (val))
    {
      /* Only return 0 for +0.0, not for -0.0, which doesn't have
	 an all bytes same memory representation.  Don't transform
	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
      switch (TREE_CODE (val))
	{
	case REAL_CST:
	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
	    return 0;
	  break;
	case COMPLEX_CST:
	  /* Both parts must themselves be all-zero bytes.  */
	  if (!const_with_all_bytes_same (TREE_REALPART (val))
	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
	    return 0;
	  break;
	case VECTOR_CST:
	  {
	    /* All encoded elements must be all-zero bytes; the loop
	       breaks early on the first element that is not.  */
	    unsigned int count = vector_cst_encoded_nelts (val);
	    unsigned int j;
	    for (j = 0; j < count; ++j)
	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
		break;
	    if (j == count)
	      return 0;
	    break;
	  }
	default:
	  break;
	}
    }

  /* The byte-wise comparison below assumes 8-bit bytes.  */
  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
    return -1;

  /* Encode VAL into its target memory image and check that every byte
     equals the first.  */
  len = native_encode_expr (val, buf, sizeof (buf));
  if (len == 0)
    return -1;
  for (i = 1; i < len; i++)
    if (buf[i] != buf[0])
      return -1;
  return buf[0];
}
1147b1e83836Smrg 
/* Generate a call to memset for PARTITION in LOOP, using the base
   address, size and stored value recorded in the partition's builtin
   info.  The call is emitted in LOOP's preheader block.  */

static void
generate_memset_builtin (class loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  tree mem, fn, nb_bytes;
  tree val;
  struct builtin_info *builtin = partition->builtin;
  gimple *fn_call;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  /* Gimplify the byte count and the destination address, rewriting
     expressions so they cannot trap on overflow.  */
  nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  mem = rewrite_to_non_trapping_overflow (builtin->dst_base);
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
  /* Handle constants like 0x15151515 and similarly
     floating point constants etc. where all bytes are the same.  */
  int bytev = const_with_all_bytes_same (val);
  if (bytev != -1)
    val = build_int_cst (integer_type_node, bytev);
  else if (TREE_CODE (val) == INTEGER_CST)
    val = fold_convert (integer_type_node, val);
  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
    {
      /* memset takes an int argument; convert the stored value.  */
      tree tem = make_ssa_name (integer_type_node);
      gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
      val = tem;
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gimple_set_location (fn_call, partition->loc);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
  fold_stmt (&gsi);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (bytev == 0)
	fprintf (dump_file, " zero\n");
      else
	fprintf (dump_file, "\n");
    }
}
1201b1e83836Smrg 
/* Generate a call to memcpy (or memmove, when source and destination
   may overlap) for PARTITION in LOOP.  The call is emitted in LOOP's
   preheader block.  */

static void
generate_memcpy_builtin (class loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *fn_call;
  tree dest, src, fn, nb_bytes;
  enum built_in_function kind;
  struct builtin_info *builtin = partition->builtin;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  /* Gimplify the byte count, rewriting it so it cannot trap on
     overflow.  */
  nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  dest = rewrite_to_non_trapping_overflow (builtin->dst_base);
  src = rewrite_to_non_trapping_overflow (builtin->src_base);
  /* Use memmove unless the partition was classified as memcpy or the
     pointers provably do not alias.  */
  if (partition->kind == PKIND_MEMCPY
      || ! ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMCPY;
  else
    kind = BUILT_IN_MEMMOVE;
  /* Try harder if we're copying a constant size.  */
  if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
    {
      /* Prove non-overlap affinely: if dest - src is at least nb_bytes
	 apart in both directions, memcpy is safe.  */
      aff_tree asrc, adest;
      tree_to_aff_combination (src, ptr_type_node, &asrc);
      tree_to_aff_combination (dest, ptr_type_node, &adest);
      aff_combination_scale (&adest, -1);
      aff_combination_add (&asrc, &adest);
      if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
				     wi::to_poly_widest (nb_bytes)))
	kind = BUILT_IN_MEMCPY;
    }

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
				   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gimple_set_location (fn_call, partition->loc);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
  fold_stmt (&gsi);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
	fprintf (dump_file, "generated memcpy\n");
      else
	fprintf (dump_file, "generated memmove\n");
    }
}
1257b1e83836Smrg 
/* Remove and destroy the loop LOOP, redirecting its preheader edge
   straight to the exit destination.  LOOP must have a single exit.  */

static void
destroy_loop (class loop *loop)
{
  unsigned nbbs = loop->num_nodes;
  edge exit = single_exit (loop);
  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
  basic_block *bbs;
  unsigned i;

  bbs = get_loop_body_in_dom_order (loop);

  /* Debug binds salvaged from the loop are moved here, right after the
     labels of the exit destination.  */
  gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
  bool safe_p = single_pred_p (exit->dest);
  for (unsigned i = 0; i < nbbs; ++i)
    {
      /* We have made sure to not leave any dangling uses of SSA
         names defined in the loop.  With the exception of virtuals.
	 Make sure we replace all uses of virtual defs that will remain
	 outside of the loop with the bare symbol as delete_basic_block
	 will release them.  */
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    mark_virtual_phi_result_for_renaming (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
	{
	  gimple *stmt = gsi_stmt (gsi);
	  tree vdef = gimple_vdef (stmt);
	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
	    mark_virtual_operand_for_renaming (vdef);
	  /* Also move and eventually reset debug stmts.  We can leave
	     constant values in place in case the stmt dominates the exit.
	     ???  Non-constant values from the last iteration can be
	     replaced with final values if we can compute them.  */
	  if (gimple_debug_bind_p (stmt))
	    {
	      tree val = gimple_debug_bind_get_value (stmt);
	      /* gsi_move_before advances GSI, so no gsi_next here.  */
	      gsi_move_before (&gsi, &dst_gsi);
	      if (val
		  && (!safe_p
		      || !is_gimple_min_invariant (val)
		      || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
		{
		  gimple_debug_bind_reset_value (stmt);
		  update_stmt (stmt);
		}
	    }
	  else
	    gsi_next (&gsi);
	}
    }

  /* Short-circuit the CFG around the loop: the preheader source now
     flows directly to the exit destination.  */
  redirect_edge_pred (exit, src);
  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
  exit->flags |= EDGE_FALLTHRU;
  cancel_loop_tree (loop);
  rescan_loop_exit (exit, false, true);

  /* Delete the blocks in reverse dominator order so dominated blocks
     go first.  */
  i = nbbs;
  do
    {
      --i;
      delete_basic_block (bbs[i]);
    }
  while (i != 0);

  free (bbs);

  set_immediate_dominator (CDI_DOMINATORS, dest,
			   recompute_dominator (CDI_DOMINATORS, dest));
}
1334b1e83836Smrg 
1335b1e83836Smrg /* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */
1336b1e83836Smrg 
1337b1e83836Smrg static bool
generate_code_for_partition(class loop * loop,partition * partition,bool copy_p)1338b1e83836Smrg generate_code_for_partition (class loop *loop,
1339b1e83836Smrg 			     partition *partition, bool copy_p)
1340b1e83836Smrg {
1341b1e83836Smrg   switch (partition->kind)
1342b1e83836Smrg     {
1343b1e83836Smrg     case PKIND_NORMAL:
1344b1e83836Smrg     case PKIND_PARTIAL_MEMSET:
1345b1e83836Smrg       /* Reductions all have to be in the last partition.  */
1346b1e83836Smrg       gcc_assert (!partition_reduction_p (partition)
1347b1e83836Smrg 		  || !copy_p);
1348b1e83836Smrg       generate_loops_for_partition (loop, partition, copy_p);
1349b1e83836Smrg       return false;
1350b1e83836Smrg 
1351b1e83836Smrg     case PKIND_MEMSET:
1352b1e83836Smrg       generate_memset_builtin (loop, partition);
1353b1e83836Smrg       break;
1354b1e83836Smrg 
1355b1e83836Smrg     case PKIND_MEMCPY:
1356b1e83836Smrg     case PKIND_MEMMOVE:
1357b1e83836Smrg       generate_memcpy_builtin (loop, partition);
1358b1e83836Smrg       break;
1359b1e83836Smrg 
1360b1e83836Smrg     default:
1361b1e83836Smrg       gcc_unreachable ();
1362b1e83836Smrg     }
1363b1e83836Smrg 
1364b1e83836Smrg   /* Common tail for partitions we turn into a call.  If this was the last
1365b1e83836Smrg      partition for which we generate code, we have to destroy the loop.  */
1366b1e83836Smrg   if (!copy_p)
1367b1e83836Smrg     return true;
1368b1e83836Smrg   return false;
1369b1e83836Smrg }
1370b1e83836Smrg 
1371b1e83836Smrg data_dependence_relation *
get_data_dependence(struct graph * rdg,data_reference_p a,data_reference_p b)1372b1e83836Smrg loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a,
1373b1e83836Smrg 					data_reference_p b)
1374b1e83836Smrg {
1375b1e83836Smrg   struct data_dependence_relation ent, **slot;
1376b1e83836Smrg   struct data_dependence_relation *ddr;
1377b1e83836Smrg 
1378b1e83836Smrg   gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1379b1e83836Smrg   gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1380b1e83836Smrg 	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1381b1e83836Smrg   ent.a = a;
1382b1e83836Smrg   ent.b = b;
1383b1e83836Smrg   slot = ddrs_table->find_slot (&ent, INSERT);
1384b1e83836Smrg   if (*slot == NULL)
1385b1e83836Smrg     {
1386b1e83836Smrg       ddr = initialize_data_dependence_relation (a, b, loop_nest);
1387b1e83836Smrg       compute_affine_dependence (ddr, loop_nest[0]);
1388b1e83836Smrg       *slot = ddr;
1389b1e83836Smrg     }
1390b1e83836Smrg 
1391b1e83836Smrg   return *slot;
1392b1e83836Smrg }
1393b1e83836Smrg 
/* Return true if the dependence between data references DR1 and DR2 would
   introduce a data dependence cycle, i.e. would force the containing
   partition to be executed sequentially.  */

bool
loop_distribution::data_dep_in_cycle_p (struct graph *rdg,
					data_reference_p dr1,
					data_reference_p dr2)
{
  struct data_dependence_relation *ddr;

  /* Re-shuffle data-refs to be in topological order.  */
  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
    std::swap (dr1, dr2);

  ddr = get_data_dependence (rdg, dr1, dr2);

  /* In case of no data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;
  /* For unknown data dependence or known data dependence which can't be
     expressed in classic distance vector, we check if it can be resolved
     by runtime alias check.  If yes, we still consider data dependence
     as won't introduce data dependence cycle.  */
  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
	   || DDR_NUM_DIST_VECTS (ddr) == 0)
    return !runtime_alias_check_p (ddr, NULL, true);
  /* Multiple distance vectors: conservatively treat as a cycle.  */
  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
    return true;
  /* A single reversed or all-zero distance vector does not form a cycle.  */
  else if (DDR_REVERSED_P (ddr)
	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
    return false;

  return true;
}
1426b1e83836Smrg 
1427b1e83836Smrg void
update_type_for_merge(struct graph * rdg,partition * partition1,partition * partition2)1428b1e83836Smrg loop_distribution::update_type_for_merge (struct graph *rdg,
1429b1e83836Smrg 					   partition *partition1,
1430b1e83836Smrg 					   partition *partition2)
1431b1e83836Smrg {
1432b1e83836Smrg   unsigned i, j;
1433b1e83836Smrg   bitmap_iterator bi, bj;
1434b1e83836Smrg   data_reference_p dr1, dr2;
1435b1e83836Smrg 
1436b1e83836Smrg   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1437b1e83836Smrg     {
1438b1e83836Smrg       unsigned start = (partition1 == partition2) ? i + 1 : 0;
1439b1e83836Smrg 
1440b1e83836Smrg       dr1 = datarefs_vec[i];
1441b1e83836Smrg       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1442b1e83836Smrg 	{
1443b1e83836Smrg 	  dr2 = datarefs_vec[j];
1444b1e83836Smrg 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1445b1e83836Smrg 	    continue;
1446b1e83836Smrg 
1447b1e83836Smrg 	  /* Partition can only be executed sequentially if there is any
1448b1e83836Smrg 	     data dependence cycle.  */
1449b1e83836Smrg 	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
1450b1e83836Smrg 	    {
1451b1e83836Smrg 	      partition1->type = PTYPE_SEQUENTIAL;
1452b1e83836Smrg 	      return;
1453b1e83836Smrg 	    }
1454b1e83836Smrg 	}
1455b1e83836Smrg     }
1456b1e83836Smrg }
1457b1e83836Smrg 
/* Build and return the partition rooted at vertex V of RDG: V together
   with every statement it transitively depends on.  The partition starts
   as PTYPE_PARALLEL and is demoted to PTYPE_SEQUENTIAL if it contains an
   unanalyzable data reference or a data dependence cycle.  */

partition *
loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
{
  partition *partition = partition_alloc ();
  auto_vec<int, 3> nodes;
  unsigned i, j;
  int x;
  data_reference_p dr;

  /* Collect V and all vertices reachable from it in the RDG.  */
  graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

  FOR_EACH_VEC_ELT (nodes, i, x)
    {
      bitmap_set_bit (partition->stmts, x);

      /* Record each data reference of the statement in the partition.  */
      for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
	{
	  unsigned idx = (unsigned) DR_INDEX (dr);
	  gcc_assert (idx < datarefs_vec.length ());

	  /* Partition can only be executed sequentially if there is any
	     unknown data reference.  */
	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
	      || !DR_INIT (dr) || !DR_STEP (dr))
	    partition->type = PTYPE_SEQUENTIAL;

	  bitmap_set_bit (partition->datarefs, idx);
	}
    }

  /* Already sequential: no point in the more expensive cycle check.  */
  if (partition->type == PTYPE_SEQUENTIAL)
    return partition;

  /* Further check if any data dependence prevents us from executing the
     partition parallelly.  */
  update_type_for_merge (rdg, partition, partition);

  return partition;
}
1497b1e83836Smrg 
/* Given PARTITION of LOOP and RDG, record single load/store data references
   for builtin partition in SRC_DR/DST_DR, return false if there is no such
   data references.  */

static bool
find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts,
		 data_reference_p *dst_dr, data_reference_p *src_dr)
{
  unsigned i;
  data_reference_p single_ld = NULL, single_st = NULL;
  bitmap_iterator bi;

  EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      data_reference_p dr;

      /* PHI nodes carry no memory references.  */
      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      /* Any scalar stmts are ok.  */
      if (!gimple_vuse (stmt))
	continue;

      /* Otherwise just regular loads/stores.  */
      if (!gimple_assign_single_p (stmt))
	return false;

      /* But exactly one store and/or load.  */
      for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
	{
	  tree type = TREE_TYPE (DR_REF (dr));

	  /* The memset, memcpy and memmove library calls are only
	     able to deal with generic address space.  */
	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
	    return false;

	  if (DR_IS_READ (dr))
	    {
	      /* Second load disqualifies the partition.  */
	      if (single_ld != NULL)
		return false;
	      single_ld = dr;
	    }
	  else
	    {
	      /* Second store disqualifies the partition.  */
	      if (single_st != NULL)
		return false;
	      single_st = dr;
	    }
	}
    }

  /* Nothing to form a builtin from.  */
  if (!single_ld && !single_st)
    return false;

  basic_block bb_ld = NULL;
  basic_block bb_st = NULL;

  if (single_ld)
    {
      /* Bail out if this is a bitfield memory reference.  */
      if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
	return false;

      /* Data reference must be executed exactly once per iteration of each
	 loop in the loop nest.  We only need to check dominance information
	 against the outermost one in a perfect loop nest because a bb can't
	 dominate outermost loop's latch without dominating inner loop's.  */
      bb_ld = gimple_bb (DR_STMT (single_ld));
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
	return false;
    }

  if (single_st)
    {
      /* Bail out if this is a bitfield memory reference.  */
      if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
	return false;

      /* Data reference must be executed exactly once per iteration.
	 Same as single_ld, we only need to check against the outermost
	 loop.  */
      bb_st = gimple_bb (DR_STMT (single_st));
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
	return false;
    }

  if (single_ld && single_st)
    {
      /* Load and store must be in the same loop nest.  */
      if (bb_st->loop_father != bb_ld->loop_father)
	return false;

      /* Load and store must be executed under the same condition: either
	 both dominate the single exit's source block or neither does.  */
      edge e = single_exit (bb_st->loop_father);
      bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
      bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
      if (dom_ld != dom_st)
	return false;
    }

  *src_dr = single_ld;
  *dst_dr = single_st;
  return true;
}
1605b1e83836Smrg 
/* Given data reference DR in LOOP_NEST, this function checks the enclosing
   loops from inner to outer to see if loop's step equals to access size at
   each level of loop.  Return 2 if we can prove this at all level loops;
   record access base and size in BASE and SIZE; save loop's step at each
   level of loop in STEPS if it is not null.  For example:

     int arr[100][100][100];
     for (i = 0; i < 100; i++)       ;steps[2] = 40000
       for (j = 100; j > 0; j--)     ;steps[1] = -400
	 for (k = 0; k < 100; k++)   ;steps[0] = 4
	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000

   Return 1 if we can prove the equality at the innermost loop, but not all
   level loops.  In this case, no information is recorded.

   Return 0 if no equality can be proven at any level loops.  */

static int
compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
		      tree *size, vec<tree> *steps = NULL)
{
  location_t loc = gimple_location (DR_STMT (dr));
  basic_block bb = gimple_bb (DR_STMT (dr));
  class loop *loop = bb->loop_father;
  tree ref = DR_REF (dr);
  /* Start from the address of the accessed object and the size of one
     element; both are widened as we walk outwards through the nest.  */
  tree access_base = build_fold_addr_expr (ref);
  tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
  int res = 0;

  /* Walk the nest from the innermost loop containing DR out to LOOP_NEST.  */
  do {
      tree scev_fn = analyze_scalar_evolution (loop, access_base);
      if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
	return res;

      /* The base at this level must itself be loop-invariant.  */
      access_base = CHREC_LEFT (scev_fn);
      if (tree_contains_chrecs (access_base, NULL))
	return res;

      tree scev_step = CHREC_RIGHT (scev_fn);
      /* Only support constant steps.  */
      if (TREE_CODE (scev_step) != INTEGER_CST)
	return res;

      enum ev_direction access_dir = scev_direction (scev_fn);
      if (access_dir == EV_DIR_UNKNOWN)
	return res;

      /* Record the (signed) step for this loop level if requested.  */
      if (steps != NULL)
	steps->safe_push (scev_step);

      scev_step = fold_convert_loc (loc, sizetype, scev_step);
      /* Compute absolute value of scev step.  */
      if (access_dir == EV_DIR_DECREASES)
	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);

      /* At each level of loop, scev step must equal to access size.  In other
	 words, DR must access consecutive memory between loop iterations.  */
      if (!operand_equal_p (scev_step, access_size, 0))
	return res;

      /* Access stride can be computed for data reference at least for the
	 innermost loop.  */
      res = 1;

      /* Compute DR's execution times in loop.  */
      tree niters = number_of_latch_executions (loop);
      niters = fold_convert_loc (loc, sizetype, niters);
      /* One more execution if DR is run on the iteration that leaves the
	 loop (its block dominates the exit source).  */
      if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);

      /* Compute DR's overall access size in loop.  */
      access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
				     niters, scev_step);
      /* Adjust base address in case of negative step.  */
      if (access_dir == EV_DIR_DECREASES)
	{
	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
				      scev_step, access_size);
	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
	}
  } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);

  *base = access_base;
  *size = access_size;
  /* Access stride can be computed for data reference at each level loop.  */
  return 2;
}
1693b1e83836Smrg 
1694b1e83836Smrg /* Allocate and return builtin struct.  Record information like DST_DR,
1695b1e83836Smrg    SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
1696b1e83836Smrg 
1697b1e83836Smrg static struct builtin_info *
alloc_builtin(data_reference_p dst_dr,data_reference_p src_dr,tree dst_base,tree src_base,tree size)1698b1e83836Smrg alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1699b1e83836Smrg 	       tree dst_base, tree src_base, tree size)
1700b1e83836Smrg {
1701b1e83836Smrg   struct builtin_info *builtin = XNEW (struct builtin_info);
1702b1e83836Smrg   builtin->dst_dr = dst_dr;
1703b1e83836Smrg   builtin->src_dr = src_dr;
1704b1e83836Smrg   builtin->dst_base = dst_base;
1705b1e83836Smrg   builtin->src_base = src_base;
1706b1e83836Smrg   builtin->size = size;
1707b1e83836Smrg   return builtin;
1708b1e83836Smrg }
1709b1e83836Smrg 
/* Given data reference DR in loop nest LOOP, classify if it forms builtin
   memset call.  */

static void
classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
{
  gimple *stmt = DR_STMT (dr);
  tree base, size, rhs = gimple_assign_rhs1 (stmt);

  /* The stored value must have all bytes equal, or be a single-byte
     integer, to be expressible as memset.  */
  if (const_with_all_bytes_same (rhs) == -1
      && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
	  || (TYPE_MODE (TREE_TYPE (rhs))
	      != TYPE_MODE (unsigned_char_type_node))))
    return;

  /* An SSA name defined inside the loop may vary per iteration, so it
     cannot serve as the memset value.  */
  if (TREE_CODE (rhs) == SSA_NAME
      && !SSA_NAME_IS_DEFAULT_DEF (rhs)
      && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
    return;

  int res = compute_access_range (loop, dr, &base, &size);
  if (res == 0)
    return;
  /* Consecutive access proven only for the innermost loop: record as a
     partial memset (no builtin emitted, but useful for cost decisions).  */
  if (res == 1)
    {
      partition->kind = PKIND_PARTIAL_MEMSET;
      return;
    }

  /* Split the base into base object plus constant offset; bail out when
     the offset is not a compile-time constant.  */
  poly_uint64 base_offset;
  unsigned HOST_WIDE_INT const_base_offset;
  tree base_base = strip_offset (base, &base_offset);
  if (!base_offset.is_constant (&const_base_offset))
    return;

  struct builtin_info *builtin;
  builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
  builtin->dst_base_base = base_base;
  builtin->dst_base_offset = const_base_offset;
  partition->builtin = builtin;
  partition->kind = PKIND_MEMSET;
}
1752b1e83836Smrg 
/* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
   if it forms builtin memcpy or memmove call.  */

void
loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg,
					  partition *partition,
					  data_reference_p dst_dr,
					  data_reference_p src_dr)
{
  tree base, size, src_base, src_size;
  auto_vec<tree> dst_steps, src_steps;

  /* Compute access range of both load and store.  Both must be provably
     consecutive at every level of the nest (result 2).  */
  int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
  if (res != 2)
    return;
  res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
  if (res != 2)
    return;

  /* They must have the same access size.  */
  if (!operand_equal_p (size, src_size, 0))
    return;

  /* They must have the same storage order.  */
  if (reverse_storage_order_for_component_p (DR_REF (dst_dr))
      != reverse_storage_order_for_component_p (DR_REF (src_dr)))
    return;

  /* Load and store in loop nest must access memory in the same way, i.e.,
     they must have the same steps in each loop of the nest.  */
  if (dst_steps.length () != src_steps.length ())
    return;
  for (unsigned i = 0; i < dst_steps.length (); ++i)
    if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
      return;

  /* Now check that if there is a dependence.  */
  ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);

  /* Classify as memmove if no dependence between load and store.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    {
      partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
      partition->kind = PKIND_MEMMOVE;
      return;
    }

  /* Can't do memmove in case of unknown dependence or dependence without
     classical distance vector.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
      || DDR_NUM_DIST_VECTS (ddr) == 0)
    return;

  unsigned i;
  lambda_vector dist_v;
  int num_lev = (DDR_LOOP_NEST (ddr)).length ();
  FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
    {
      unsigned dep_lev = dependence_level (dist_v, num_lev);
      /* Can't do memmove if load depends on store.  */
      if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
	return;
    }

  /* No load-after-store dependence at any level, so memmove semantics
     are safe here.  */
  partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
  partition->kind = PKIND_MEMMOVE;
  return;
}
1822b1e83836Smrg 
/* Classify PARTITION of LOOP and RDG, setting its kind to a builtin
   (memset/memcpy/memmove) where possible.  STMT_IN_ALL_PARTITIONS is the
   set of statements contained in every partition.  Return true if some
   statement with uses outside the loop belongs to all partitions (a
   reduction computed by every partition).  */

bool
loop_distribution::classify_partition (loop_p loop,
				       struct graph *rdg, partition *partition,
				       bitmap stmt_in_all_partitions)
{
  bitmap_iterator bi;
  unsigned i;
  data_reference_p single_ld = NULL, single_st = NULL;
  bool volatiles_p = false, has_reduction = false;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);

      /* Volatile accesses later disqualify builtin classification.  */
      if (gimple_has_volatile_ops (stmt))
	volatiles_p = true;

      /* If the stmt is not included by all partitions and there is uses
	 outside of the loop, then mark the partition as reduction.  */
      if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	{
	  /* Due to limitation in the transform phase we have to fuse all
	     reduction partitions.  As a result, this could cancel valid
	     loop distribution especially for loop that induction variable
	     is used outside of loop.  To workaround this issue, we skip
	     marking partition as reduction if the reduction stmt belongs
	     to all partitions.  In such case, reduction will be computed
	     correctly no matter how partitions are fused/distributed.  */
	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
	    partition->reduction_p = true;
	  else
	    has_reduction = true;
	}
    }

  /* Simple workaround to prevent classifying the partition as builtin
     if it contains any use outside of loop.  For the case where all
     partitions have the reduction this simple workaround is delayed
     to only affect the last partition.  */
  if (partition->reduction_p)
     return has_reduction;

  /* Perform general partition disqualification for builtins.  */
  if (volatiles_p
      || !flag_tree_loop_distribute_patterns)
    return has_reduction;

  /* Find single load/store data references for builtin partition.  A
     store is mandatory; a load is optional (memset vs memcpy/memmove).  */
  if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld)
      || !single_st)
    return has_reduction;

  if (single_ld && single_st)
    {
      gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
      /* Direct aggregate copy or via an SSA name temporary.  */
      if (load != store
	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
	return has_reduction;
    }

  partition->loc = gimple_location (DR_STMT (single_st));

  /* Classify the builtin kind.  */
  if (single_ld == NULL)
    classify_builtin_st (loop, partition, single_st);
  else
    classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
  return has_reduction;
}
1893b1e83836Smrg 
1894b1e83836Smrg bool
share_memory_accesses(struct graph * rdg,partition * partition1,partition * partition2)1895b1e83836Smrg loop_distribution::share_memory_accesses (struct graph *rdg,
1896b1e83836Smrg 		       partition *partition1, partition *partition2)
1897b1e83836Smrg {
1898b1e83836Smrg   unsigned i, j;
1899b1e83836Smrg   bitmap_iterator bi, bj;
1900b1e83836Smrg   data_reference_p dr1, dr2;
1901b1e83836Smrg 
1902b1e83836Smrg   /* First check whether in the intersection of the two partitions are
1903b1e83836Smrg      any loads or stores.  Common loads are the situation that happens
1904b1e83836Smrg      most often.  */
1905b1e83836Smrg   EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1906b1e83836Smrg     if (RDG_MEM_WRITE_STMT (rdg, i)
1907b1e83836Smrg 	|| RDG_MEM_READS_STMT (rdg, i))
1908b1e83836Smrg       return true;
1909b1e83836Smrg 
1910b1e83836Smrg   /* Then check whether the two partitions access the same memory object.  */
1911b1e83836Smrg   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1912b1e83836Smrg     {
1913b1e83836Smrg       dr1 = datarefs_vec[i];
1914b1e83836Smrg 
1915b1e83836Smrg       if (!DR_BASE_ADDRESS (dr1)
1916b1e83836Smrg 	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1917b1e83836Smrg 	continue;
1918b1e83836Smrg 
1919b1e83836Smrg       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1920b1e83836Smrg 	{
1921b1e83836Smrg 	  dr2 = datarefs_vec[j];
1922b1e83836Smrg 
1923b1e83836Smrg 	  if (!DR_BASE_ADDRESS (dr2)
1924b1e83836Smrg 	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1925b1e83836Smrg 	    continue;
1926b1e83836Smrg 
1927b1e83836Smrg 	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1928b1e83836Smrg 	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1929b1e83836Smrg 	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1930b1e83836Smrg 	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1931b1e83836Smrg 	    return true;
1932b1e83836Smrg 	}
1933b1e83836Smrg     }
1934b1e83836Smrg 
1935b1e83836Smrg   return false;
1936b1e83836Smrg }
1937b1e83836Smrg 
1938b1e83836Smrg /* For each seed statement in STARTING_STMTS, this function builds
1939b1e83836Smrg    partition for it by adding depended statements according to RDG.
1940b1e83836Smrg    All partitions are recorded in PARTITIONS.  */
1941b1e83836Smrg 
1942b1e83836Smrg void
rdg_build_partitions(struct graph * rdg,vec<gimple * > starting_stmts,vec<partition * > * partitions)1943b1e83836Smrg loop_distribution::rdg_build_partitions (struct graph *rdg,
1944b1e83836Smrg 					 vec<gimple *> starting_stmts,
1945b1e83836Smrg 					 vec<partition *> *partitions)
1946b1e83836Smrg {
1947b1e83836Smrg   auto_bitmap processed;
1948b1e83836Smrg   int i;
1949b1e83836Smrg   gimple *stmt;
1950b1e83836Smrg 
1951b1e83836Smrg   FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1952b1e83836Smrg     {
1953b1e83836Smrg       int v = rdg_vertex_for_stmt (rdg, stmt);
1954b1e83836Smrg 
1955b1e83836Smrg       if (dump_file && (dump_flags & TDF_DETAILS))
1956b1e83836Smrg 	fprintf (dump_file,
1957b1e83836Smrg 		 "ldist asked to generate code for vertex %d\n", v);
1958b1e83836Smrg 
1959b1e83836Smrg       /* If the vertex is already contained in another partition so
1960b1e83836Smrg          is the partition rooted at it.  */
1961b1e83836Smrg       if (bitmap_bit_p (processed, v))
1962b1e83836Smrg 	continue;
1963b1e83836Smrg 
1964b1e83836Smrg       partition *partition = build_rdg_partition_for_vertex (rdg, v);
1965b1e83836Smrg       bitmap_ior_into (processed, partition->stmts);
1966b1e83836Smrg 
1967b1e83836Smrg       if (dump_file && (dump_flags & TDF_DETAILS))
1968b1e83836Smrg 	{
1969b1e83836Smrg 	  fprintf (dump_file, "ldist creates useful %s partition:\n",
1970b1e83836Smrg 		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1971b1e83836Smrg 	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
1972b1e83836Smrg 	}
1973b1e83836Smrg 
1974b1e83836Smrg       partitions->safe_push (partition);
1975b1e83836Smrg     }
1976b1e83836Smrg 
1977b1e83836Smrg   /* All vertices should have been assigned to at least one partition now,
1978b1e83836Smrg      other than vertices belonging to dead code.  */
1979b1e83836Smrg }
1980b1e83836Smrg 
1981b1e83836Smrg /* Dump to FILE the PARTITIONS.  */
1982b1e83836Smrg 
1983b1e83836Smrg static void
dump_rdg_partitions(FILE * file,const vec<partition * > & partitions)1984b1e83836Smrg dump_rdg_partitions (FILE *file, const vec<partition *> &partitions)
1985b1e83836Smrg {
1986b1e83836Smrg   int i;
1987b1e83836Smrg   partition *partition;
1988b1e83836Smrg 
1989b1e83836Smrg   FOR_EACH_VEC_ELT (partitions, i, partition)
1990b1e83836Smrg     debug_bitmap_file (file, partition->stmts);
1991b1e83836Smrg }
1992b1e83836Smrg 
1993b1e83836Smrg /* Debug PARTITIONS.  */
1994b1e83836Smrg extern void debug_rdg_partitions (const vec<partition *> &);
1995b1e83836Smrg 
1996b1e83836Smrg DEBUG_FUNCTION void
debug_rdg_partitions(const vec<partition * > & partitions)1997b1e83836Smrg debug_rdg_partitions (const vec<partition *> &partitions)
1998b1e83836Smrg {
1999b1e83836Smrg   dump_rdg_partitions (stderr, partitions);
2000b1e83836Smrg }
2001b1e83836Smrg 
2002b1e83836Smrg /* Returns the number of read and write operations in the RDG.  */
2003b1e83836Smrg 
2004b1e83836Smrg static int
number_of_rw_in_rdg(struct graph * rdg)2005b1e83836Smrg number_of_rw_in_rdg (struct graph *rdg)
2006b1e83836Smrg {
2007b1e83836Smrg   int i, res = 0;
2008b1e83836Smrg 
2009b1e83836Smrg   for (i = 0; i < rdg->n_vertices; i++)
2010b1e83836Smrg     {
2011b1e83836Smrg       if (RDG_MEM_WRITE_STMT (rdg, i))
2012b1e83836Smrg 	++res;
2013b1e83836Smrg 
2014b1e83836Smrg       if (RDG_MEM_READS_STMT (rdg, i))
2015b1e83836Smrg 	++res;
2016b1e83836Smrg     }
2017b1e83836Smrg 
2018b1e83836Smrg   return res;
2019b1e83836Smrg }
2020b1e83836Smrg 
2021b1e83836Smrg /* Returns the number of read and write operations in a PARTITION of
2022b1e83836Smrg    the RDG.  */
2023b1e83836Smrg 
2024b1e83836Smrg static int
number_of_rw_in_partition(struct graph * rdg,partition * partition)2025b1e83836Smrg number_of_rw_in_partition (struct graph *rdg, partition *partition)
2026b1e83836Smrg {
2027b1e83836Smrg   int res = 0;
2028b1e83836Smrg   unsigned i;
2029b1e83836Smrg   bitmap_iterator ii;
2030b1e83836Smrg 
2031b1e83836Smrg   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
2032b1e83836Smrg     {
2033b1e83836Smrg       if (RDG_MEM_WRITE_STMT (rdg, i))
2034b1e83836Smrg 	++res;
2035b1e83836Smrg 
2036b1e83836Smrg       if (RDG_MEM_READS_STMT (rdg, i))
2037b1e83836Smrg 	++res;
2038b1e83836Smrg     }
2039b1e83836Smrg 
2040b1e83836Smrg   return res;
2041b1e83836Smrg }
2042b1e83836Smrg 
2043b1e83836Smrg /* Returns true when one of the PARTITIONS contains all the read or
2044b1e83836Smrg    write operations of RDG.  */
2045b1e83836Smrg 
2046b1e83836Smrg static bool
partition_contains_all_rw(struct graph * rdg,const vec<partition * > & partitions)2047b1e83836Smrg partition_contains_all_rw (struct graph *rdg,
2048b1e83836Smrg 			   const vec<partition *> &partitions)
2049b1e83836Smrg {
2050b1e83836Smrg   int i;
2051b1e83836Smrg   partition *partition;
2052b1e83836Smrg   int nrw = number_of_rw_in_rdg (rdg);
2053b1e83836Smrg 
2054b1e83836Smrg   FOR_EACH_VEC_ELT (partitions, i, partition)
2055b1e83836Smrg     if (nrw == number_of_rw_in_partition (rdg, partition))
2056b1e83836Smrg       return true;
2057b1e83836Smrg 
2058b1e83836Smrg   return false;
2059b1e83836Smrg }
2060b1e83836Smrg 
/* Compute the dependence direction between the sets of data references
   DRS1 and DRS2 (bitmaps indexing datarefs_vec), given incoming direction
   DIR.  Returns 0 for no dependence, -1 for a backward dependence, 1 for
   a forward dependence and 2 for both directions (the partitions must be
   merged).  Dependences that could be resolved by a runtime alias check
   are pushed onto *ALIAS_DDRS when it is non-NULL.  */

int
loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir,
			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2, saved_dr1;

  /* dependence direction - 0 is no dependence, -1 is back,
     1 is forth, 2 is both (we can stop then, merging will occur).  */
  EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
    {
      dr1 = datarefs_vec[i];

      EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
	{
	  int res, this_dir = 1;
	  ddr_p ddr;

	  dr2 = datarefs_vec[j];

	  /* Skip all <read, read> data dependence.  */
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  saved_dr1 = dr1;
	  /* Re-shuffle data-refs to be in topological order so the cached
	     DDR lookup sees a canonical <dr1, dr2> pair; remember the swap
	     by negating the provisional direction.  */
	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
	    {
	      std::swap (dr1, dr2);
	      this_dir = -this_dir;
	    }
	  ddr = get_data_dependence (rdg, dr1, dr2);
	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
	    {
	      /* Dependence analysis failed; decide between a hard merge
		 (direction 2) and a runtime alias check.  */
	      this_dir = 0;
	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
					   DR_BASE_ADDRESS (dr2));
	      /* Be conservative.  If data references are not well analyzed,
		 or the two data references have the same base address and
		 offset, add dependence and consider it alias to each other.
		 In other words, the dependence cannot be resolved by
		 runtime alias check.  */
	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
		  || !DR_INIT (dr1) || !DR_INIT (dr2)
		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
		  || res == 0)
		this_dir = 2;
	      /* Data dependence could be resolved by runtime alias check,
		 record it in ALIAS_DDRS.  */
	      else if (alias_ddrs != NULL)
		alias_ddrs->safe_push (ddr);
	      /* Or simply ignore it.  */
	    }
	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
	    {
	      /* Known dependences can still be unordered throughout the
		 iteration space, see gcc.dg/tree-ssa/ldist-16.c and
		 gcc.dg/tree-ssa/pr94969.c.  */
	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
		this_dir = 2;
	      /* If the overlap is exact preserve stmt order.  */
	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
					    DDR_NB_LOOPS (ddr)))
		;
	      /* Else as the distance vector is lexicographic positive swap
		 the dependence direction.  */
	      else
		{
		  if (DDR_REVERSED_P (ddr))
		    this_dir = -this_dir;
		  this_dir = -this_dir;

		  /* When the dependence distance of the innermost common
		     loop of the DRs is zero we have a conflict.  */
		  auto l1 = gimple_bb (DR_STMT (dr1))->loop_father;
		  auto l2 = gimple_bb (DR_STMT (dr2))->loop_father;
		  int idx = index_in_loop_nest (find_common_loop (l1, l2)->num,
						DDR_LOOP_NEST (ddr));
		  if (DDR_DIST_VECT (ddr, 0)[idx] == 0)
		    this_dir = 2;
		}
	    }
	  else
	    this_dir = 0;
	  /* Combine with the direction seen so far; conflicting non-zero
	     directions mean the partitions depend on each other both ways.  */
	  if (this_dir == 2)
	    return 2;
	  else if (dir == 0)
	    dir = this_dir;
	  else if (this_dir != 0 && dir != this_dir)
	    return 2;
	  /* Shuffle "back" dr1.  */
	  dr1 = saved_dr1;
	}
    }
  return dir;
}
2161b1e83836Smrg 
2162b1e83836Smrg /* Compare postorder number of the partition graph vertices V1 and V2.  */
2163b1e83836Smrg 
2164b1e83836Smrg static int
pgcmp(const void * v1_,const void * v2_)2165b1e83836Smrg pgcmp (const void *v1_, const void *v2_)
2166b1e83836Smrg {
2167b1e83836Smrg   const vertex *v1 = (const vertex *)v1_;
2168b1e83836Smrg   const vertex *v2 = (const vertex *)v2_;
2169b1e83836Smrg   return v2->post - v1->post;
2170b1e83836Smrg }
2171b1e83836Smrg 
2172b1e83836Smrg /* Data attached to vertices of partition dependence graph.  */
struct pg_vdata
{
  /* ID of the corresponding partition; equals the index of the vertex
     in the partition dependence graph.  */
  int id;
  /* The partition; set to NULL once the partition has been merged into
     another one.  */
  struct partition *partition;
};
2180b1e83836Smrg 
2181b1e83836Smrg /* Data attached to edges of partition dependence graph.  */
struct pg_edata
{
  /* If the dependence edge can be resolved by runtime alias check,
     this vector contains data dependence relations for runtime alias
     check.  On the other hand, if the dependence edge is introduced
     because of compilation time known data dependence, this vector
     contains nothing.  */
  vec<ddr_p> alias_ddrs;
};
2191b1e83836Smrg 
2192b1e83836Smrg /* Callback data for traversing edges in graph.  */
struct pg_edge_callback_data
{
  /* Bitmap of strongly connected components that should be merged.  */
  bitmap sccs_to_merge;
  /* Array containing component information for all vertices, saved
     before a later SCC computation overwrites it.  */
  int *vertices_component;
  /* Vector to record all data dependence relations which are needed
     to break strong connected components by runtime alias checks.  */
  vec<ddr_p> *alias_ddrs;
};
2203b1e83836Smrg 
/* Initialize the vertices' data for partition dependence graph PG with
   PARTITIONS.  */
2206b1e83836Smrg 
2207b1e83836Smrg static void
init_partition_graph_vertices(struct graph * pg,vec<struct partition * > * partitions)2208b1e83836Smrg init_partition_graph_vertices (struct graph *pg,
2209b1e83836Smrg 			       vec<struct partition *> *partitions)
2210b1e83836Smrg {
2211b1e83836Smrg   int i;
2212b1e83836Smrg   partition *partition;
2213b1e83836Smrg   struct pg_vdata *data;
2214b1e83836Smrg 
2215b1e83836Smrg   for (i = 0; partitions->iterate (i, &partition); ++i)
2216b1e83836Smrg     {
2217b1e83836Smrg       data = new pg_vdata;
2218b1e83836Smrg       pg->vertices[i].data = data;
2219b1e83836Smrg       data->id = i;
2220b1e83836Smrg       data->partition = partition;
2221b1e83836Smrg     }
2222b1e83836Smrg }
2223b1e83836Smrg 
2224b1e83836Smrg /* Add edge <I, J> to partition dependence graph PG.  Attach vector of data
2225b1e83836Smrg    dependence relations to the EDGE if DDRS isn't NULL.  */
2226b1e83836Smrg 
2227b1e83836Smrg static void
add_partition_graph_edge(struct graph * pg,int i,int j,vec<ddr_p> * ddrs)2228b1e83836Smrg add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2229b1e83836Smrg {
2230b1e83836Smrg   struct graph_edge *e = add_edge (pg, i, j);
2231b1e83836Smrg 
2232b1e83836Smrg   /* If the edge is attached with data dependence relations, it means this
2233b1e83836Smrg      dependence edge can be resolved by runtime alias checks.  */
2234b1e83836Smrg   if (ddrs != NULL)
2235b1e83836Smrg     {
2236b1e83836Smrg       struct pg_edata *data = new pg_edata;
2237b1e83836Smrg 
2238b1e83836Smrg       gcc_assert (ddrs->length () > 0);
2239b1e83836Smrg       e->data = data;
2240b1e83836Smrg       data->alias_ddrs = vNULL;
2241b1e83836Smrg       data->alias_ddrs.safe_splice (*ddrs);
2242b1e83836Smrg     }
2243b1e83836Smrg }
2244b1e83836Smrg 
/* Callback function for the graph traversal algorithm.  It returns true
   if edge E should be skipped when traversing the graph.  */
2247b1e83836Smrg 
2248b1e83836Smrg static bool
pg_skip_alias_edge(struct graph_edge * e)2249b1e83836Smrg pg_skip_alias_edge (struct graph_edge *e)
2250b1e83836Smrg {
2251b1e83836Smrg   struct pg_edata *data = (struct pg_edata *)e->data;
2252b1e83836Smrg   return (data != NULL && data->alias_ddrs.length () > 0);
2253b1e83836Smrg }
2254b1e83836Smrg 
2255b1e83836Smrg /* Callback function freeing data attached to edge E of graph.  */
2256b1e83836Smrg 
2257b1e83836Smrg static void
free_partition_graph_edata_cb(struct graph *,struct graph_edge * e,void *)2258b1e83836Smrg free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2259b1e83836Smrg {
2260b1e83836Smrg   if (e->data != NULL)
2261b1e83836Smrg     {
2262b1e83836Smrg       struct pg_edata *data = (struct pg_edata *)e->data;
2263b1e83836Smrg       data->alias_ddrs.release ();
2264b1e83836Smrg       delete data;
2265b1e83836Smrg     }
2266b1e83836Smrg }
2267b1e83836Smrg 
/* Free data attached to the vertices of partition dependence graph PG.  */
2269b1e83836Smrg 
2270b1e83836Smrg static void
free_partition_graph_vdata(struct graph * pg)2271b1e83836Smrg free_partition_graph_vdata (struct graph *pg)
2272b1e83836Smrg {
2273b1e83836Smrg   int i;
2274b1e83836Smrg   struct pg_vdata *data;
2275b1e83836Smrg 
2276b1e83836Smrg   for (i = 0; i < pg->n_vertices; ++i)
2277b1e83836Smrg     {
2278b1e83836Smrg       data = (struct pg_vdata *)pg->vertices[i].data;
2279b1e83836Smrg       delete data;
2280b1e83836Smrg     }
2281b1e83836Smrg }
2282b1e83836Smrg 
/* Build and return partition dependence graph for PARTITIONS.  RDG is
   reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
   is true, data dependence caused by possible alias between references
   is ignored, as if it doesn't exist at all; otherwise all dependences
   are considered.  */
2288b1e83836Smrg 
struct graph *
loop_distribution::build_partition_graph (struct graph *rdg,
					  vec<struct partition *> *partitions,
					  bool ignore_alias_p)
{
  int i, j;
  struct partition *partition1, *partition2;
  graph *pg = new_graph (partitions->length ());
  auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;

  /* When aliases are ignored, pass NULL so pg_add_dependence_edges does
     not collect runtime-checkable DDRs.  */
  alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;

  init_partition_graph_vertices (pg, partitions);

  /* Examine every unordered pair of partitions once.  */
  for (i = 0; partitions->iterate (i, &partition1); ++i)
    {
      for (j = i + 1; partitions->iterate (j, &partition2); ++j)
	{
	  /* dependence direction - 0 is no dependence, -1 is back,
	     1 is forth, 2 is both (we can stop then, merging will occur).  */
	  int dir = 0;

	  /* If the first partition has reduction, add back edge; if the
	     second partition has reduction, add forth edge.  This makes
	     sure that reduction partition will be sorted as the last one.  */
	  if (partition_reduction_p (partition1))
	    dir = -1;
	  else if (partition_reduction_p (partition2))
	    dir = 1;

	  /* Cleanup the temporary vector.  */
	  alias_ddrs.truncate (0);

	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
					 partition2->datarefs, alias_ddrs_p);

	  /* Add edge to partition graph if there exists dependence.  There
	     are two types of edges.  One type edge is caused by compilation
	     time known dependence, this type cannot be resolved by runtime
	     alias check.  The other type can be resolved by runtime alias
	     check.  */
	  if (dir == 1 || dir == 2
	      || alias_ddrs.length () > 0)
	    {
	      /* Attach data dependence relations to edge that can be resolved
		 by runtime alias check.  */
	      bool alias_edge_p = (dir != 1 && dir != 2);
	      add_partition_graph_edge (pg, i, j,
					(alias_edge_p) ? &alias_ddrs : NULL);
	    }
	  if (dir == -1 || dir == 2
	      || alias_ddrs.length () > 0)
	    {
	      /* Attach data dependence relations to edge that can be resolved
		 by runtime alias check.  */
	      bool alias_edge_p = (dir != -1 && dir != 2);
	      add_partition_graph_edge (pg, j, i,
					(alias_edge_p) ? &alias_ddrs : NULL);
	    }
	}
    }
  return pg;
}
2352b1e83836Smrg 
2353b1e83836Smrg /* Sort partitions in PG in descending post order and store them in
2354b1e83836Smrg    PARTITIONS.  */
2355b1e83836Smrg 
2356b1e83836Smrg static void
sort_partitions_by_post_order(struct graph * pg,vec<struct partition * > * partitions)2357b1e83836Smrg sort_partitions_by_post_order (struct graph *pg,
2358b1e83836Smrg 			       vec<struct partition *> *partitions)
2359b1e83836Smrg {
2360b1e83836Smrg   int i;
2361b1e83836Smrg   struct pg_vdata *data;
2362b1e83836Smrg 
2363b1e83836Smrg   /* Now order the remaining nodes in descending postorder.  */
2364b1e83836Smrg   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2365b1e83836Smrg   partitions->truncate (0);
2366b1e83836Smrg   for (i = 0; i < pg->n_vertices; ++i)
2367b1e83836Smrg     {
2368b1e83836Smrg       data = (struct pg_vdata *)pg->vertices[i].data;
2369b1e83836Smrg       if (data->partition)
2370b1e83836Smrg 	partitions->safe_push (data->partition);
2371b1e83836Smrg     }
2372b1e83836Smrg }
2373b1e83836Smrg 
/* Fuse PARTITIONS that belong to the same strongly connected component
   of the partition dependence graph built from RDG, since a dependence
   cycle prevents distributing them.  If IGNORE_ALIAS_P, possible-alias
   dependences are ignored when building the graph.  On return PARTITIONS
   is sorted in topological (descending postorder) order.  */

void
loop_distribution::merge_dep_scc_partitions (struct graph *rdg,
					     vec<struct partition *> *partitions,
					     bool ignore_alias_p)
{
  struct partition *partition1, *partition2;
  struct pg_vdata *data;
  graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
  int i, j, num_sccs = graphds_scc (pg, NULL);

  /* Strongly connected component means dependence cycle, we cannot
     distribute them.  So fuse them together.  */
  if ((unsigned) num_sccs < partitions->length ())
    {
      for (i = 0; i < num_sccs; ++i)
	{
	  /* Find the first partition of component I ...  */
	  for (j = 0; partitions->iterate (j, &partition1); ++j)
	    if (pg->vertices[j].component == i)
	      break;
	  /* ... and merge every later member of the component into it.  */
	  for (j = j + 1; partitions->iterate (j, &partition2); ++j)
	    if (pg->vertices[j].component == i)
	      {
		partition_merge_into (NULL, partition1,
				      partition2, FUSE_SAME_SCC);
		partition1->type = PTYPE_SEQUENTIAL;
		(*partitions)[j] = NULL;
		partition_free (partition2);
		data = (struct pg_vdata *)pg->vertices[j].data;
		data->partition = NULL;
	      }
	}
    }

  sort_partitions_by_post_order (pg, partitions);
  gcc_assert (partitions->length () == (unsigned)num_sccs);
  free_partition_graph_vdata (pg);
  for_each_edge (pg, free_partition_graph_edata_cb, NULL);
  free_graph (pg);
}
2413b1e83836Smrg 
2414b1e83836Smrg /* Callback function for traversing edge E in graph G.  DATA is private
2415b1e83836Smrg    callback data.  */
2416b1e83836Smrg 
2417b1e83836Smrg static void
pg_collect_alias_ddrs(struct graph * g,struct graph_edge * e,void * data)2418b1e83836Smrg pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2419b1e83836Smrg {
2420b1e83836Smrg   int i, j, component;
2421b1e83836Smrg   struct pg_edge_callback_data *cbdata;
2422b1e83836Smrg   struct pg_edata *edata = (struct pg_edata *) e->data;
2423b1e83836Smrg 
2424b1e83836Smrg   /* If the edge doesn't have attached data dependence, it represents
2425b1e83836Smrg      compilation time known dependences.  This type dependence cannot
2426b1e83836Smrg      be resolved by runtime alias check.  */
2427b1e83836Smrg   if (edata == NULL || edata->alias_ddrs.length () == 0)
2428b1e83836Smrg     return;
2429b1e83836Smrg 
2430b1e83836Smrg   cbdata = (struct pg_edge_callback_data *) data;
2431b1e83836Smrg   i = e->src;
2432b1e83836Smrg   j = e->dest;
2433b1e83836Smrg   component = cbdata->vertices_component[i];
2434b1e83836Smrg   /* Vertices are topologically sorted according to compilation time
2435b1e83836Smrg      known dependences, so we can break strong connected components
2436b1e83836Smrg      by removing edges of the opposite direction, i.e, edges pointing
2437b1e83836Smrg      from vertice with smaller post number to vertice with bigger post
2438b1e83836Smrg      number.  */
2439b1e83836Smrg   if (g->vertices[i].post < g->vertices[j].post
2440b1e83836Smrg       /* We only need to remove edges connecting vertices in the same
2441b1e83836Smrg 	 strong connected component to break it.  */
2442b1e83836Smrg       && component == cbdata->vertices_component[j]
2443b1e83836Smrg       /* Check if we want to break the strong connected component or not.  */
2444b1e83836Smrg       && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2445b1e83836Smrg     cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2446b1e83836Smrg }
2447b1e83836Smrg 
2448b1e83836Smrg /* Callback function for traversing edge E.  DATA is private
2449b1e83836Smrg    callback data.  */
2450b1e83836Smrg 
2451b1e83836Smrg static void
pg_unmark_merged_alias_ddrs(struct graph *,struct graph_edge * e,void * data)2452b1e83836Smrg pg_unmark_merged_alias_ddrs (struct graph *, struct graph_edge *e, void *data)
2453b1e83836Smrg {
2454b1e83836Smrg   int i, j, component;
2455b1e83836Smrg   struct pg_edge_callback_data *cbdata;
2456b1e83836Smrg   struct pg_edata *edata = (struct pg_edata *) e->data;
2457b1e83836Smrg 
2458b1e83836Smrg   if (edata == NULL || edata->alias_ddrs.length () == 0)
2459b1e83836Smrg     return;
2460b1e83836Smrg 
2461b1e83836Smrg   cbdata = (struct pg_edge_callback_data *) data;
2462b1e83836Smrg   i = e->src;
2463b1e83836Smrg   j = e->dest;
2464b1e83836Smrg   component = cbdata->vertices_component[i];
2465b1e83836Smrg   /* Make sure to not skip vertices inside SCCs we are going to merge.  */
2466b1e83836Smrg   if (component == cbdata->vertices_component[j]
2467b1e83836Smrg       && bitmap_bit_p (cbdata->sccs_to_merge, component))
2468b1e83836Smrg     {
2469b1e83836Smrg       edata->alias_ddrs.release ();
2470b1e83836Smrg       delete edata;
2471b1e83836Smrg       e->data = NULL;
2472b1e83836Smrg     }
2473b1e83836Smrg }
2474b1e83836Smrg 
/* This is the main function breaking strongly connected components in
   PARTITIONS given reduced dependence graph RDG.  Store data dependence
   relations for runtime alias check in ALIAS_DDRS.  */
void
loop_distribution::break_alias_scc_partitions (struct graph *rdg,
					       vec<struct partition *> *partitions,
					       vec<ddr_p> *alias_ddrs)
{
  int i, j, k, num_sccs, num_sccs_no_alias = 0;
  /* Build partition dependence graph.  */
  graph *pg = build_partition_graph (rdg, partitions, false);

  alias_ddrs->truncate (0);
  /* Find strong connected components in the graph, with all dependence edges
     considered.  */
  num_sccs = graphds_scc (pg, NULL);
  /* All SCCs now can be broken by runtime alias checks because SCCs caused by
     compilation time known dependences are merged before this function.  */
  if ((unsigned) num_sccs < partitions->length ())
    {
      struct pg_edge_callback_data cbdata;
      auto_bitmap sccs_to_merge;
      auto_vec<enum partition_type> scc_types;
      struct partition *partition, *first;

      /* If all partitions in a SCC have the same type, we can simply merge the
	 SCC.  This loop finds out such SCCs and records them in the bitmap.  */
      bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
      for (i = 0; i < num_sccs; ++i)
	{
	  /* FIRST is the first partition belonging to component I.  */
	  for (j = 0; partitions->iterate (j, &first); ++j)
	    if (pg->vertices[j].component == i)
	      break;

	  /* Check whether all other members of the component share FIRST's
	     type and whether they are all builtin partitions.  */
	  bool same_type = true, all_builtins = partition_builtin_p (first);
	  for (++j; partitions->iterate (j, &partition); ++j)
	    {
	      if (pg->vertices[j].component != i)
		continue;

	      if (first->type != partition->type)
		{
		  same_type = false;
		  break;
		}
	      all_builtins &= partition_builtin_p (partition);
	    }
	  /* Merge SCC if all partitions in SCC have the same type, though the
	     result partition is sequential, because vectorizer can do better
	     runtime alias check.  One exception is all partitions in SCC are
	     builtins.  */
	  if (!same_type || all_builtins)
	    bitmap_clear_bit (sccs_to_merge, i);
	}

      /* Initialize callback data for traversing.  */
      cbdata.sccs_to_merge = sccs_to_merge;
      cbdata.alias_ddrs = alias_ddrs;
      cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
      /* Record the component information which will be corrupted by next
	 graph scc finding call.  */
      for (i = 0; i < pg->n_vertices; ++i)
	cbdata.vertices_component[i] = pg->vertices[i].component;

      /* Collect data dependences for runtime alias checks to break SCCs.  */
      if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
	{
	  /* For SCCs we want to merge clear all alias_ddrs for edges
	     inside the component.  */
	  for_each_edge (pg, pg_unmark_merged_alias_ddrs, &cbdata);

	  /* Run SCC finding algorithm again, with alias dependence edges
	     skipped.  This is to topologically sort partitions according to
	     compilation time known dependence.  Note the topological order
	     is stored in the form of pg's post order number.  */
	  num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
	  /* We cannot assert partitions->length () == num_sccs_no_alias
	     since we are not ignoring alias edges in cycles we are
	     going to merge.  That's required to compute correct postorder.  */
	  /* With topological order, we can construct two subgraphs L and R.
	     L contains edge <x, y> where x < y in terms of post order, while
	     R contains edge <x, y> where x > y.  Edges for compilation time
	     known dependence all fall in R, so we break SCCs by removing all
	     (alias) edges of in subgraph L.  */
	  for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
	}

      /* For SCC that doesn't need to be broken, merge it.  */
      for (i = 0; i < num_sccs; ++i)
	{
	  if (!bitmap_bit_p (sccs_to_merge, i))
	    continue;

	  /* FIRST is the first surviving partition of component I; merge
	     every later member of the component into it.  */
	  for (j = 0; partitions->iterate (j, &first); ++j)
	    if (cbdata.vertices_component[j] == i)
	      break;
	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
	    {
	      struct pg_vdata *data;

	      if (cbdata.vertices_component[k] != i)
		continue;

	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
	      (*partitions)[k] = NULL;
	      partition_free (partition);
	      data = (struct pg_vdata *)pg->vertices[k].data;
	      gcc_assert (data->id == k);
	      data->partition = NULL;
	      /* The result partition of merged SCC must be sequential.  */
	      first->type = PTYPE_SEQUENTIAL;
	    }
	}
      /* If reduction partition's SCC is broken by runtime alias checks,
	 we force a negative post order to it making sure it will be scheduled
	 in the last.  */
      if (num_sccs_no_alias > 0)
	{
	  j = -1;
	  for (i = 0; i < pg->n_vertices; ++i)
	    {
	      struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data;
	      if (data->partition && partition_reduction_p (data->partition))
		{
		  /* There is at most one reduction partition.  */
		  gcc_assert (j == -1);
		  j = i;
		}
	    }
	  if (j >= 0)
	    pg->vertices[j].post = -1;
	}

      free (cbdata.vertices_component);
    }

  sort_partitions_by_post_order (pg, partitions);
  free_partition_graph_vdata (pg);
  for_each_edge (pg, free_partition_graph_edata_cb, NULL);
  free_graph (pg);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Possible alias data dependence to break:\n");
      dump_data_dependence_relations (dump_file, *alias_ddrs);
    }
}
2621b1e83836Smrg 
2622b1e83836Smrg /* Compute and return an expression whose value is the segment length which
2623b1e83836Smrg    will be accessed by DR in NITERS iterations.  */
2624b1e83836Smrg 
2625b1e83836Smrg static tree
data_ref_segment_size(struct data_reference * dr,tree niters)2626b1e83836Smrg data_ref_segment_size (struct data_reference *dr, tree niters)
2627b1e83836Smrg {
2628b1e83836Smrg   niters = size_binop (MINUS_EXPR,
2629b1e83836Smrg 		       fold_convert (sizetype, niters),
2630b1e83836Smrg 		       size_one_node);
2631b1e83836Smrg   return size_binop (MULT_EXPR,
2632b1e83836Smrg 		     fold_convert (sizetype, DR_STEP (dr)),
2633b1e83836Smrg 		     fold_convert (sizetype, niters));
2634b1e83836Smrg }
2635b1e83836Smrg 
2636b1e83836Smrg /* Return true if LOOP's latch is dominated by statement for data reference
2637b1e83836Smrg    DR.  */
2638b1e83836Smrg 
2639b1e83836Smrg static inline bool
latch_dominated_by_data_ref(class loop * loop,data_reference * dr)2640b1e83836Smrg latch_dominated_by_data_ref (class loop *loop, data_reference *dr)
2641b1e83836Smrg {
2642b1e83836Smrg   return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2643b1e83836Smrg 			 gimple_bb (DR_STMT (dr)));
2644b1e83836Smrg }
2645b1e83836Smrg 
2646b1e83836Smrg /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2647b1e83836Smrg    data dependence relations ALIAS_DDRS.  */
2648b1e83836Smrg 
2649b1e83836Smrg static void
compute_alias_check_pairs(class loop * loop,vec<ddr_p> * alias_ddrs,vec<dr_with_seg_len_pair_t> * comp_alias_pairs)2650b1e83836Smrg compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs,
2651b1e83836Smrg 			   vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2652b1e83836Smrg {
2653b1e83836Smrg   unsigned int i;
2654b1e83836Smrg   unsigned HOST_WIDE_INT factor = 1;
2655b1e83836Smrg   tree niters_plus_one, niters = number_of_latch_executions (loop);
2656b1e83836Smrg 
2657b1e83836Smrg   gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2658b1e83836Smrg   niters = fold_convert (sizetype, niters);
2659b1e83836Smrg   niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2660b1e83836Smrg 
2661b1e83836Smrg   if (dump_file && (dump_flags & TDF_DETAILS))
2662b1e83836Smrg     fprintf (dump_file, "Creating alias check pairs:\n");
2663b1e83836Smrg 
2664b1e83836Smrg   /* Iterate all data dependence relations and compute alias check pairs.  */
2665b1e83836Smrg   for (i = 0; i < alias_ddrs->length (); i++)
2666b1e83836Smrg     {
2667b1e83836Smrg       ddr_p ddr = (*alias_ddrs)[i];
2668b1e83836Smrg       struct data_reference *dr_a = DDR_A (ddr);
2669b1e83836Smrg       struct data_reference *dr_b = DDR_B (ddr);
2670b1e83836Smrg       tree seg_length_a, seg_length_b;
2671b1e83836Smrg 
2672b1e83836Smrg       if (latch_dominated_by_data_ref (loop, dr_a))
2673b1e83836Smrg 	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2674b1e83836Smrg       else
2675b1e83836Smrg 	seg_length_a = data_ref_segment_size (dr_a, niters);
2676b1e83836Smrg 
2677b1e83836Smrg       if (latch_dominated_by_data_ref (loop, dr_b))
2678b1e83836Smrg 	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2679b1e83836Smrg       else
2680b1e83836Smrg 	seg_length_b = data_ref_segment_size (dr_b, niters);
2681b1e83836Smrg 
2682b1e83836Smrg       unsigned HOST_WIDE_INT access_size_a
2683b1e83836Smrg 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2684b1e83836Smrg       unsigned HOST_WIDE_INT access_size_b
2685b1e83836Smrg 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2686b1e83836Smrg       unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2687b1e83836Smrg       unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2688b1e83836Smrg 
2689b1e83836Smrg       dr_with_seg_len_pair_t dr_with_seg_len_pair
2690b1e83836Smrg 	(dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2691b1e83836Smrg 	 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b),
2692b1e83836Smrg 	 /* ??? Would WELL_ORDERED be safe?  */
2693b1e83836Smrg 	 dr_with_seg_len_pair_t::REORDERED);
2694b1e83836Smrg 
2695b1e83836Smrg       comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2696b1e83836Smrg     }
2697b1e83836Smrg 
2698b1e83836Smrg   if (tree_fits_uhwi_p (niters))
2699b1e83836Smrg     factor = tree_to_uhwi (niters);
2700b1e83836Smrg 
2701b1e83836Smrg   /* Prune alias check pairs.  */
2702b1e83836Smrg   prune_runtime_alias_test_list (comp_alias_pairs, factor);
2703b1e83836Smrg   if (dump_file && (dump_flags & TDF_DETAILS))
2704b1e83836Smrg     fprintf (dump_file,
2705b1e83836Smrg 	     "Improved number of alias checks from %d to %d\n",
2706b1e83836Smrg 	     alias_ddrs->length (), comp_alias_pairs->length ());
2707b1e83836Smrg }
2708b1e83836Smrg 
2709b1e83836Smrg /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2710b1e83836Smrg    checks and version LOOP under condition of these runtime alias checks.  */
2711b1e83836Smrg 
static void
version_loop_by_alias_check (vec<struct partition *> *partitions,
			     class loop *loop, vec<ddr_p> *alias_ddrs)
{
  profile_probability prob;
  basic_block cond_bb;
  class loop *nloop;
  tree lhs, arg0, cond_expr = NULL_TREE;
  gimple_seq cond_stmts = NULL;
  gimple *call_stmt = NULL;
  auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;

  /* Generate code for runtime alias checks if necessary.  */
  gcc_assert (alias_ddrs->length () > 0);

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file,
	     "Version loop <%d> with runtime alias check\n", loop->num);

  /* Build the condition that is true when none of the potentially aliasing
     reference pairs overlap at runtime, then gimplify it to a value usable
     as the versioning condition.  */
  compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
  create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
  cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
				      is_gimple_val, NULL_TREE);

  /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
  bool cancelable_p = flag_tree_loop_vectorize;
  if (cancelable_p)
    {
      /* Look for at least one non-builtin partition.  */
      unsigned i = 0;
      struct partition *partition;
      for (; partitions->iterate (i, &partition); ++i)
	if (!partition_builtin_p (partition))
	  break;

     /* If all partitions are builtins, distributing it would be profitable and
	we don't want to cancel the runtime alias checks.  */
      if (i == partitions->length ())
	cancelable_p = false;
    }

  /* Generate internal function call for loop distribution alias check if the
     runtime alias check should be cancelable.  The condition is wrapped in
     IFN_LOOP_DIST_ALIAS so the vectorizer can later fold it to cancel the
     versioning; otherwise the bare condition is used directly.  */
  if (cancelable_p)
    {
      call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
					      2, NULL_TREE, cond_expr);
      lhs = make_ssa_name (boolean_type_node);
      gimple_call_set_lhs (call_stmt, lhs);
    }
  else
    lhs = cond_expr;

  /* Guess the no-alias (distributed) path to be taken 90% of the time.  */
  prob = profile_probability::guessed_always ().apply_scale (9, 10);
  initialize_original_copy_tables ();
  nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
			prob, prob.invert (), true);
  free_original_copy_tables ();
  /* Record the original loop number in newly generated loops.  In case of
     distribution, the original loop will be distributed and the new loop
     is kept.  */
  loop->orig_loop_num = nloop->num;
  nloop->orig_loop_num = nloop->num;
  /* The copy is the may-alias fallback; keep the vectorizer off it.  */
  nloop->dont_vectorize = true;
  nloop->force_vectorize = false;

  if (call_stmt)
    {
      /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
	 loop could be destroyed.  */
      arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
      gimple_call_set_arg (call_stmt, 0, arg0);
      gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
    }

  /* Insert the gimplified condition (and the IFN call, if any) right
     before the new condition block's last statement.  */
  if (cond_stmts)
    {
      gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
      gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
    }
  update_ssa (TODO_update_ssa);
}
2793b1e83836Smrg 
/* Return true if loop versioning is needed to distribute PARTITIONS.
2795b1e83836Smrg    ALIAS_DDRS are data dependence relations for runtime alias check.  */
2796b1e83836Smrg 
2797b1e83836Smrg static inline bool
version_for_distribution_p(vec<struct partition * > * partitions,vec<ddr_p> * alias_ddrs)2798b1e83836Smrg version_for_distribution_p (vec<struct partition *> *partitions,
2799b1e83836Smrg 			    vec<ddr_p> *alias_ddrs)
2800b1e83836Smrg {
2801b1e83836Smrg   /* No need to version loop if we have only one partition.  */
2802b1e83836Smrg   if (partitions->length () == 1)
2803b1e83836Smrg     return false;
2804b1e83836Smrg 
2805b1e83836Smrg   /* Need to version loop if runtime alias check is necessary.  */
2806b1e83836Smrg   return (alias_ddrs->length () > 0);
2807b1e83836Smrg }
2808b1e83836Smrg 
2809b1e83836Smrg /* Compare base offset of builtin mem* partitions P1 and P2.  */
2810b1e83836Smrg 
2811b1e83836Smrg static int
offset_cmp(const void * vp1,const void * vp2)2812b1e83836Smrg offset_cmp (const void *vp1, const void *vp2)
2813b1e83836Smrg {
2814b1e83836Smrg   struct partition *p1 = *(struct partition *const *) vp1;
2815b1e83836Smrg   struct partition *p2 = *(struct partition *const *) vp2;
2816b1e83836Smrg   unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2817b1e83836Smrg   unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
2818b1e83836Smrg   return (o2 < o1) - (o1 < o2);
2819b1e83836Smrg }
2820b1e83836Smrg 
2821b1e83836Smrg /* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
2822b1e83836Smrg    case optimization transforming below code:
2823b1e83836Smrg 
2824b1e83836Smrg      __builtin_memset (&obj, 0, 100);
2825b1e83836Smrg      _1 = &obj + 100;
2826b1e83836Smrg      __builtin_memset (_1, 0, 200);
2827b1e83836Smrg      _2 = &obj + 300;
2828b1e83836Smrg      __builtin_memset (_2, 0, 100);
2829b1e83836Smrg 
2830b1e83836Smrg    into:
2831b1e83836Smrg 
2832b1e83836Smrg      __builtin_memset (&obj, 0, 400);
2833b1e83836Smrg 
2834b1e83836Smrg    Note we don't have dependence information between different partitions
2835b1e83836Smrg    at this point, as a result, we can't handle nonadjacent memset builtin
2836b1e83836Smrg    partitions since dependence might be broken.  */
2837b1e83836Smrg 
static void
fuse_memset_builtins (vec<struct partition *> *partitions)
{
  unsigned i, j;
  struct partition *part1, *part2;
  tree rhs1, rhs2;

  /* Pass 1: within each maximal run [i, j) of memset partitions that share
     the same destination base and store the same value, sort the run by
     offset from that base so mergeable neighbors become adjacent.  */
  for (i = 0; partitions->iterate (i, &part1);)
    {
      if (part1->kind != PKIND_MEMSET)
	{
	  i++;
	  continue;
	}

      /* Find sub-array of memset builtins of the same base.  Index range
	 of the sub-array is [i, j) with "j > i".  */
      for (j = i + 1; partitions->iterate (j, &part2); ++j)
	{
	  if (part2->kind != PKIND_MEMSET
	      || !operand_equal_p (part1->builtin->dst_base_base,
				   part2->builtin->dst_base_base, 0))
	    break;

	  /* Memset calls setting different values can't be merged.  */
	  rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
	  rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
	  if (!operand_equal_p (rhs1, rhs2, 0))
	    break;
	}

      /* Stable sort is required in order to avoid breaking dependence.  */
      gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
		      offset_cmp);
      /* Continue with next partition.  */
      i = j;
    }

  /* Pass 2: merge all consecutive memset builtin partitions.  NOTE(review):
     "length () - 1" is an unsigned expression and would wrap for an empty
     vector; the only caller here (finalize_partitions) invokes this with
     more than one partition, so it cannot wrap — confirm before reusing
     elsewhere.  */
  for (i = 0; i < partitions->length () - 1;)
    {
      part1 = (*partitions)[i];
      if (part1->kind != PKIND_MEMSET)
	{
	  i++;
	  continue;
	}

      part2 = (*partitions)[i + 1];
      /* Only merge memset partitions of the same base and with constant
	 access sizes.  */
      if (part2->kind != PKIND_MEMSET
	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
	  || !operand_equal_p (part1->builtin->dst_base_base,
			       part2->builtin->dst_base_base, 0))
	{
	  i++;
	  continue;
	}
      rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
      rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
      int bytev1 = const_with_all_bytes_same (rhs1);
      int bytev2 = const_with_all_bytes_same (rhs2);
      /* Only merge memset partitions of the same value.  A -1 result means
	 the stored value is not a repeated single byte, so no memset merge
	 is possible.  */
      if (bytev1 != bytev2 || bytev1 == -1)
	{
	  i++;
	  continue;
	}
      wide_int end1 = wi::add (part1->builtin->dst_base_offset,
			       wi::to_wide (part1->builtin->size));
      /* Only merge adjacent memset partitions, i.e. when the first one ends
	 exactly where the second begins.  */
      if (wi::ne_p (end1, part2->builtin->dst_base_offset))
	{
	  i++;
	  continue;
	}
      /* Merge partitions[i] and partitions[i+1].  I is deliberately not
	 advanced so partitions[i] can absorb further adjacent memsets.  */
      part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
					  part1->builtin->size,
					  part2->builtin->size);
      partition_free (part2);
      partitions->ordered_remove (i + 1);
    }
}
2924b1e83836Smrg 
void
loop_distribution::finalize_partitions (class loop *loop,
					vec<struct partition *> *partitions,
					vec<ddr_p> *alias_ddrs)
{
  unsigned i;
  struct partition *partition, *a;

  /* Nothing to decide for a single partition; and when runtime alias checks
     will be emitted, keep the partitioning as-is.  */
  if (partitions->length () == 1
      || alias_ddrs->length () > 0)
    return;

  /* Classify the partitions: count builtins (memset/memcpy-like),
     normal loop bodies, and partial-memset candidates, and check whether
     all partitions have the same parallelism type.  */
  unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
  bool same_type_p = true;
  enum partition_type type = ((*partitions)[0])->type;
  for (i = 0; partitions->iterate (i, &partition); ++i)
    {
      same_type_p &= (type == partition->type);
      if (partition_builtin_p (partition))
	{
	  num_builtin++;
	  continue;
	}
      num_normal++;
      if (partition->kind == PKIND_PARTIAL_MEMSET)
	num_partial_memset++;
    }

  /* Don't distribute current loop into too many loops given we don't have
     memory stream cost model.  Be even more conservative in case of loop
     nest distribution.  When distribution is judged unprofitable, fuse
     everything back into partitions[0].  */
  if ((same_type_p && num_builtin == 0
       && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
      || (loop->inner != NULL
	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
      || (loop->inner == NULL
	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
    {
      a = (*partitions)[0];
      for (i = 1; partitions->iterate (i, &partition); ++i)
	{
	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
	  partition_free (partition);
	}
      partitions->truncate (1);
    }

  /* Fuse memset builtins if possible.  */
  if (partitions->length () > 1)
    fuse_memset_builtins (partitions);
}
2976b1e83836Smrg 
2977b1e83836Smrg /* Distributes the code from LOOP in such a way that producer statements
2978b1e83836Smrg    are placed before consumer statements.  Tries to separate only the
2979b1e83836Smrg    statements from STMTS into separate loops.  Returns the number of
2980b1e83836Smrg    distributed loops.  Set NB_CALLS to number of generated builtin calls.
2981b1e83836Smrg    Set *DESTROY_P to whether LOOP needs to be destroyed.  */
2982b1e83836Smrg 
int
loop_distribution::distribute_loop (class loop *loop,
		 const vec<gimple *> &stmts,
		 control_dependences *cd, int *nb_calls, bool *destroy_p,
		 bool only_patterns_p)
{
  ddrs_table = new hash_table<ddr_hasher> (389);
  struct graph *rdg;
  partition *partition;
  int i, nbp;

  *destroy_p = false;
  *nb_calls = 0;
  loop_nest.create (0);
  if (!find_loop_nest (loop, &loop_nest))
    {
      loop_nest.release ();
      delete ddrs_table;
      return 0;
    }

  /* Build the reduced dependence graph for LOOP; this also populates
     datarefs_vec and has_nonaddressable_dataref_p.  */
  datarefs_vec.create (20);
  has_nonaddressable_dataref_p = false;
  rdg = build_rdg (loop, cd);
  if (!rdg)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "Loop %d not distributed: failed to build the RDG.\n",
		 loop->num);

      loop_nest.release ();
      free_data_refs (datarefs_vec);
      delete ddrs_table;
      return 0;
    }

  /* Bail out when dependence analysis would be too expensive.  */
  if (datarefs_vec.length () > MAX_DATAREFS_NUM)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "Loop %d not distributed: too many memory references.\n",
		 loop->num);

      free_rdg (rdg);
      loop_nest.release ();
      free_data_refs (datarefs_vec);
      delete ddrs_table;
      return 0;
    }

  /* Number each data reference through its AUX field so it can be
     identified by index later.  */
  data_reference_p dref;
  for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
    dref->aux = (void *) (uintptr_t) i;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg (dump_file, rdg);

  /* Seed one partition per statement in STMTS.  */
  auto_vec<struct partition *, 3> partitions;
  rdg_build_partitions (rdg, stmts, &partitions);

  auto_vec<ddr_p> alias_ddrs;

  /* Compute the set of statements contained in every partition.  */
  auto_bitmap stmt_in_all_partitions;
  bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
  for (i = 1; partitions.iterate (i, &partition); ++i)
    bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);

  /* Classify each partition (memset/memcpy/normal), noting whether any
     is a builtin and whether a reduction appears in all partitions.  */
  bool any_builtin = false;
  bool reduction_in_all = false;
  FOR_EACH_VEC_ELT (partitions, i, partition)
    {
      reduction_in_all
	|= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
      any_builtin |= partition_builtin_p (partition);
    }

  /* If we are only distributing patterns but did not detect any,
     simply bail out.  */
  if (only_patterns_p
      && !any_builtin)
    {
      nbp = 0;
      goto ldist_done;
    }

  /* If we are only distributing patterns fuse all partitions that
     were not classified as builtins.  This also avoids chopping
     a loop into pieces, separated by builtin calls.  That is, we
     only want no or a single loop body remaining.  */
  struct partition *into;
  if (only_patterns_p)
    {
      for (i = 0; partitions.iterate (i, &into); ++i)
	if (!partition_builtin_p (into))
	  break;
      for (++i; partitions.iterate (i, &partition); ++i)
	if (!partition_builtin_p (partition))
	  {
	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
	    partitions.unordered_remove (i);
	    partition_free (partition);
	    i--;
	  }
    }

  /* Due to limitations in the transform phase we have to fuse all
     reduction partitions into the last partition so the existing
     loop will contain all loop-closed PHI nodes.  */
  for (i = 0; partitions.iterate (i, &into); ++i)
    if (partition_reduction_p (into))
      break;
  for (i = i + 1; partitions.iterate (i, &partition); ++i)
    if (partition_reduction_p (partition))
      {
	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
	partitions.unordered_remove (i);
	partition_free (partition);
	i--;
      }

  /* Apply our simple cost model - fuse partitions with similar
     memory accesses.  */
  for (i = 0; partitions.iterate (i, &into); ++i)
    {
      bool changed = false;
      if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
	continue;
      for (int j = i + 1;
	   partitions.iterate (j, &partition); ++j)
	{
	  if (share_memory_accesses (rdg, into, partition))
	    {
	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
	      partitions.unordered_remove (j);
	      partition_free (partition);
	      j--;
	      changed = true;
	    }
	}
      /* If we fused 0 1 2 in step 1 to 0,2 1 as 0 and 2 have similar
         accesses when 1 and 2 have similar accesses but not 0 and 1
	 then in the next iteration we will fail to consider merging
	 1 into 0,2.  So try again if we did any merging into 0.  */
      if (changed)
	i--;
    }

  /* Put a non-builtin partition last if we need to preserve a reduction.
     ???  This is a workaround that makes sort_partitions_by_post_order do
     the correct thing while in reality it should sort each component
     separately and then put the component with a reduction or a non-builtin
     last.  */
  if (reduction_in_all
      && partition_builtin_p (partitions.last()))
    FOR_EACH_VEC_ELT (partitions, i, partition)
      if (!partition_builtin_p (partition))
	{
	  partitions.unordered_remove (i);
	  partitions.quick_push (partition);
	  break;
	}

  /* Build the partition dependency graph and fuse partitions in strong
     connected component.  */
  if (partitions.length () > 1)
    {
      /* Don't support loop nest distribution under runtime alias check
	 since it's not likely to enable many vectorization opportunities.
	 Also if loop has any data reference which may be not addressable
	 since alias check needs to take, compare address of the object.  */
      if (loop->inner || has_nonaddressable_dataref_p)
	merge_dep_scc_partitions (rdg, &partitions, false);
      else
	{
	  merge_dep_scc_partitions (rdg, &partitions, true);
	  if (partitions.length () > 1)
	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
	}
    }

  finalize_partitions (loop, &partitions, &alias_ddrs);

  /* If there is a reduction in all partitions make sure the last one
     is not classified for builtin code generation.  */
  if (reduction_in_all)
    {
      partition = partitions.last ();
      if (only_patterns_p
	  && partition_builtin_p (partition)
	  && !partition_builtin_p (partitions[0]))
	{
	  nbp = 0;
	  goto ldist_done;
	}
      partition->kind = PKIND_NORMAL;
    }

  /* Give up when distribution degenerated to a single non-builtin
     partition or when some partitions share all reads/writes.  */
  nbp = partitions.length ();
  if (nbp == 0
      || (nbp == 1 && !partition_builtin_p (partitions[0]))
      || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
    {
      nbp = 0;
      goto ldist_done;
    }

  if (version_for_distribution_p (&partitions, &alias_ddrs))
    version_loop_by_alias_check (&partitions, loop, &alias_ddrs);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file,
	       "distribute loop <%d> into partitions:\n", loop->num);
      dump_rdg_partitions (dump_file, partitions);
    }

  /* Generate code: one loop or builtin call per remaining partition.  */
  FOR_EACH_VEC_ELT (partitions, i, partition)
    {
      if (partition_builtin_p (partition))
	(*nb_calls)++;
      *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
    }

  /* Common cleanup for both the early-bailout and success paths.  */
 ldist_done:
  loop_nest.release ();
  free_data_refs (datarefs_vec);
  for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
       iter != ddrs_table->end (); ++iter)
    {
      free_dependence_relation (*iter);
      *iter = NULL;
    }
  delete ddrs_table;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    partition_free (partition);

  free_rdg (rdg);
  return nbp - *nb_calls;
}
3224b1e83836Smrg 
3225b1e83836Smrg 
bb_top_order_init(void)3226b1e83836Smrg void loop_distribution::bb_top_order_init (void)
3227b1e83836Smrg {
3228b1e83836Smrg   int rpo_num;
3229b1e83836Smrg   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS);
3230b1e83836Smrg   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3231b1e83836Smrg   bitmap exit_bbs = BITMAP_ALLOC (NULL);
3232b1e83836Smrg 
3233b1e83836Smrg   bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3234b1e83836Smrg   bb_top_order_index_size = last_basic_block_for_fn (cfun);
3235b1e83836Smrg 
3236b1e83836Smrg   entry->flags &= ~EDGE_DFS_BACK;
3237b1e83836Smrg   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
3238b1e83836Smrg   rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true,
3239b1e83836Smrg 						   rpo, NULL);
3240b1e83836Smrg   BITMAP_FREE (exit_bbs);
3241b1e83836Smrg 
3242b1e83836Smrg   for (int i = 0; i < rpo_num; i++)
3243b1e83836Smrg     bb_top_order_index[rpo[i]] = i;
3244b1e83836Smrg 
3245b1e83836Smrg   free (rpo);
3246b1e83836Smrg }
3247b1e83836Smrg 
bb_top_order_destroy()3248b1e83836Smrg void loop_distribution::bb_top_order_destroy ()
3249b1e83836Smrg {
3250b1e83836Smrg   free (bb_top_order_index);
3251b1e83836Smrg   bb_top_order_index = NULL;
3252b1e83836Smrg   bb_top_order_index_size = 0;
3253b1e83836Smrg }
3254b1e83836Smrg 
3255b1e83836Smrg 
3256b1e83836Smrg /* Given LOOP, this function records seed statements for distribution in
3257b1e83836Smrg    WORK_LIST.  Return false if there is nothing for distribution.  */
3258b1e83836Smrg 
3259b1e83836Smrg static bool
find_seed_stmts_for_distribution(class loop * loop,vec<gimple * > * work_list)3260b1e83836Smrg find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
3261b1e83836Smrg {
3262b1e83836Smrg   basic_block *bbs = get_loop_body_in_dom_order (loop);
3263b1e83836Smrg 
3264b1e83836Smrg   /* Initialize the worklist with stmts we seed the partitions with.  */
3265b1e83836Smrg   for (unsigned i = 0; i < loop->num_nodes; ++i)
3266b1e83836Smrg     {
3267b1e83836Smrg       /* In irreducible sub-regions we don't know how to redirect
3268b1e83836Smrg 	 conditions, so fail.  See PR100492.  */
3269b1e83836Smrg       if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
3270b1e83836Smrg 	{
3271b1e83836Smrg 	  if (dump_file && (dump_flags & TDF_DETAILS))
3272b1e83836Smrg 	    fprintf (dump_file, "loop %d contains an irreducible region.\n",
3273b1e83836Smrg 		     loop->num);
3274b1e83836Smrg 	  work_list->truncate (0);
3275b1e83836Smrg 	  break;
3276b1e83836Smrg 	}
3277b1e83836Smrg       for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
3278b1e83836Smrg 	   !gsi_end_p (gsi); gsi_next (&gsi))
3279b1e83836Smrg 	{
3280b1e83836Smrg 	  gphi *phi = gsi.phi ();
3281b1e83836Smrg 	  if (virtual_operand_p (gimple_phi_result (phi)))
3282b1e83836Smrg 	    continue;
3283b1e83836Smrg 	  /* Distribute stmts which have defs that are used outside of
3284b1e83836Smrg 	     the loop.  */
3285b1e83836Smrg 	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
3286b1e83836Smrg 	    continue;
3287b1e83836Smrg 	  work_list->safe_push (phi);
3288b1e83836Smrg 	}
3289b1e83836Smrg       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3290b1e83836Smrg 	   !gsi_end_p (gsi); gsi_next (&gsi))
3291b1e83836Smrg 	{
3292b1e83836Smrg 	  gimple *stmt = gsi_stmt (gsi);
3293b1e83836Smrg 
3294b1e83836Smrg 	  /* Ignore clobbers, they do not have true side effects.  */
3295b1e83836Smrg 	  if (gimple_clobber_p (stmt))
3296b1e83836Smrg 	    continue;
3297b1e83836Smrg 
3298b1e83836Smrg 	  /* If there is a stmt with side-effects bail out - we
3299b1e83836Smrg 	     cannot and should not distribute this loop.  */
3300b1e83836Smrg 	  if (gimple_has_side_effects (stmt))
3301b1e83836Smrg 	    {
3302b1e83836Smrg 	      free (bbs);
3303b1e83836Smrg 	      return false;
3304b1e83836Smrg 	    }
3305b1e83836Smrg 
3306b1e83836Smrg 	  /* Distribute stmts which have defs that are used outside of
3307b1e83836Smrg 	     the loop.  */
3308b1e83836Smrg 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3309b1e83836Smrg 	    ;
3310b1e83836Smrg 	  /* Otherwise only distribute stores for now.  */
3311b1e83836Smrg 	  else if (!gimple_vdef (stmt))
3312b1e83836Smrg 	    continue;
3313b1e83836Smrg 
3314b1e83836Smrg 	  work_list->safe_push (stmt);
3315b1e83836Smrg 	}
3316b1e83836Smrg     }
3317b1e83836Smrg   bool res = work_list->length () > 0;
3318b1e83836Smrg   if (res && !can_copy_bbs_p (bbs, loop->num_nodes))
3319b1e83836Smrg     {
3320b1e83836Smrg       if (dump_file && (dump_flags & TDF_DETAILS))
3321b1e83836Smrg 	fprintf (dump_file, "cannot copy loop %d.\n", loop->num);
3322b1e83836Smrg       res = false;
3323b1e83836Smrg     }
3324b1e83836Smrg   free (bbs);
3325b1e83836Smrg   return res;
3326b1e83836Smrg }
3327b1e83836Smrg 
3328b1e83836Smrg /* A helper function for generate_{rawmemchr,strlen}_builtin functions in order
3329b1e83836Smrg    to place new statements SEQ before LOOP and replace the old reduction
3330b1e83836Smrg    variable with the new one.  */
3331b1e83836Smrg 
3332b1e83836Smrg static void
generate_reduction_builtin_1(loop_p loop,gimple_seq & seq,tree reduction_var_old,tree reduction_var_new,const char * info,machine_mode load_mode)3333b1e83836Smrg generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq,
3334b1e83836Smrg 			      tree reduction_var_old, tree reduction_var_new,
3335b1e83836Smrg 			      const char *info, machine_mode load_mode)
3336b1e83836Smrg {
3337b1e83836Smrg   gcc_assert (flag_tree_loop_distribute_patterns);
3338b1e83836Smrg 
3339b1e83836Smrg   /* Place new statements before LOOP.  */
3340b1e83836Smrg   gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
3341b1e83836Smrg   gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING);
3342b1e83836Smrg 
3343b1e83836Smrg   /* Replace old reduction variable with new one.  */
3344b1e83836Smrg   imm_use_iterator iter;
3345b1e83836Smrg   gimple *stmt;
3346b1e83836Smrg   use_operand_p use_p;
3347b1e83836Smrg   FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old)
3348b1e83836Smrg     {
3349b1e83836Smrg       FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
3350b1e83836Smrg 	SET_USE (use_p, reduction_var_new);
3351b1e83836Smrg 
3352b1e83836Smrg       update_stmt (stmt);
3353b1e83836Smrg     }
3354b1e83836Smrg 
3355b1e83836Smrg   if (dump_file && (dump_flags & TDF_DETAILS))
3356b1e83836Smrg     fprintf (dump_file, info, GET_MODE_NAME (load_mode));
3357b1e83836Smrg }
3358b1e83836Smrg 
3359b1e83836Smrg /* Generate a call to rawmemchr and place it before LOOP.  REDUCTION_VAR is
3360b1e83836Smrg    replaced with a fresh SSA name representing the result of the call.  */
3361b1e83836Smrg 
3362b1e83836Smrg static void
generate_rawmemchr_builtin(loop_p loop,tree reduction_var,data_reference_p store_dr,tree base,tree pattern,location_t loc)3363b1e83836Smrg generate_rawmemchr_builtin (loop_p loop, tree reduction_var,
3364b1e83836Smrg 			    data_reference_p store_dr, tree base, tree pattern,
3365b1e83836Smrg 			    location_t loc)
3366b1e83836Smrg {
3367b1e83836Smrg   gimple_seq seq = NULL;
3368b1e83836Smrg 
3369b1e83836Smrg   tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3370b1e83836Smrg   gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern);
3371b1e83836Smrg   tree reduction_var_new = copy_ssa_name (reduction_var);
3372b1e83836Smrg   gimple_call_set_lhs (fn_call, reduction_var_new);
3373b1e83836Smrg   gimple_set_location (fn_call, loc);
3374b1e83836Smrg   gimple_seq_add_stmt (&seq, fn_call);
3375b1e83836Smrg 
3376b1e83836Smrg   if (store_dr)
3377b1e83836Smrg     {
3378b1e83836Smrg       gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new);
3379b1e83836Smrg       gimple_seq_add_stmt (&seq, g);
3380b1e83836Smrg     }
3381b1e83836Smrg 
3382b1e83836Smrg   generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3383b1e83836Smrg 				"generated rawmemchr%s\n",
3384b1e83836Smrg 				TYPE_MODE (TREE_TYPE (TREE_TYPE (base))));
3385b1e83836Smrg }
3386b1e83836Smrg 
3387b1e83836Smrg /* Helper function for generate_strlen_builtin(,_using_rawmemchr)  */
3388b1e83836Smrg 
3389b1e83836Smrg static void
generate_strlen_builtin_1(loop_p loop,gimple_seq & seq,tree reduction_var_old,tree reduction_var_new,machine_mode mode,tree start_len)3390b1e83836Smrg generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq,
3391b1e83836Smrg 			   tree reduction_var_old, tree reduction_var_new,
3392b1e83836Smrg 			   machine_mode mode, tree start_len)
3393b1e83836Smrg {
3394b1e83836Smrg   /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be
3395b1e83836Smrg      converted if types of old and new reduction variable are not compatible. */
3396b1e83836Smrg   reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old),
3397b1e83836Smrg 				      reduction_var_new);
3398b1e83836Smrg 
3399b1e83836Smrg   /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start
3400b1e83836Smrg      length.  */
3401b1e83836Smrg   if (!integer_zerop (start_len))
3402b1e83836Smrg     {
3403b1e83836Smrg       tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new));
3404b1e83836Smrg       gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new,
3405b1e83836Smrg 				       start_len);
3406b1e83836Smrg       gimple_seq_add_stmt (&seq, g);
3407b1e83836Smrg       reduction_var_new = lhs;
3408b1e83836Smrg     }
3409b1e83836Smrg 
3410b1e83836Smrg   generate_reduction_builtin_1 (loop, seq, reduction_var_old, reduction_var_new,
3411b1e83836Smrg 				"generated strlen%s\n", mode);
3412b1e83836Smrg }
3413b1e83836Smrg 
3414b1e83836Smrg /* Generate a call to strlen and place it before LOOP.  REDUCTION_VAR is
3415b1e83836Smrg    replaced with a fresh SSA name representing the result of the call.  */
3416b1e83836Smrg 
3417b1e83836Smrg static void
generate_strlen_builtin(loop_p loop,tree reduction_var,tree base,tree start_len,location_t loc)3418b1e83836Smrg generate_strlen_builtin (loop_p loop, tree reduction_var, tree base,
3419b1e83836Smrg 			 tree start_len, location_t loc)
3420b1e83836Smrg {
3421b1e83836Smrg   gimple_seq seq = NULL;
3422b1e83836Smrg 
3423b1e83836Smrg   tree reduction_var_new = make_ssa_name (size_type_node);
3424b1e83836Smrg 
3425b1e83836Smrg   tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3426b1e83836Smrg   tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN));
3427b1e83836Smrg   gimple *fn_call = gimple_build_call (fn, 1, mem);
3428b1e83836Smrg   gimple_call_set_lhs (fn_call, reduction_var_new);
3429b1e83836Smrg   gimple_set_location (fn_call, loc);
3430b1e83836Smrg   gimple_seq_add_stmt (&seq, fn_call);
3431b1e83836Smrg 
3432b1e83836Smrg   generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3433b1e83836Smrg 			     QImode, start_len);
3434b1e83836Smrg }
3435b1e83836Smrg 
3436b1e83836Smrg /* Generate code in order to mimic the behaviour of strlen but this time over
3437b1e83836Smrg    an array of elements with mode different than QI.  REDUCTION_VAR is replaced
3438b1e83836Smrg    with a fresh SSA name representing the result, i.e., the length.  */
3439b1e83836Smrg 
3440b1e83836Smrg static void
generate_strlen_builtin_using_rawmemchr(loop_p loop,tree reduction_var,tree base,tree load_type,tree start_len,location_t loc)3441b1e83836Smrg generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var,
3442b1e83836Smrg 					 tree base, tree load_type,
3443b1e83836Smrg 					 tree start_len, location_t loc)
3444b1e83836Smrg {
3445b1e83836Smrg   gimple_seq seq = NULL;
3446b1e83836Smrg 
3447b1e83836Smrg   tree start = force_gimple_operand (base, &seq, true, NULL_TREE);
3448b1e83836Smrg   tree zero = build_zero_cst (load_type);
3449b1e83836Smrg   gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero);
3450b1e83836Smrg   tree end = make_ssa_name (TREE_TYPE (base));
3451b1e83836Smrg   gimple_call_set_lhs (fn_call, end);
3452b1e83836Smrg   gimple_set_location (fn_call, loc);
3453b1e83836Smrg   gimple_seq_add_stmt (&seq, fn_call);
3454b1e83836Smrg 
3455b1e83836Smrg   /* Determine the number of elements between START and END by
3456b1e83836Smrg      evaluating (END - START) / sizeof (*START).  */
3457b1e83836Smrg   tree diff = make_ssa_name (ptrdiff_type_node);
3458b1e83836Smrg   gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start);
3459b1e83836Smrg   gimple_seq_add_stmt (&seq, diff_stmt);
3460b1e83836Smrg   /* Let SIZE be the size of each character.  */
3461b1e83836Smrg   tree size = gimple_convert (&seq, ptrdiff_type_node,
3462b1e83836Smrg 			      TYPE_SIZE_UNIT (load_type));
3463b1e83836Smrg   tree count = make_ssa_name (ptrdiff_type_node);
3464b1e83836Smrg   gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size);
3465b1e83836Smrg   gimple_seq_add_stmt (&seq, count_stmt);
3466b1e83836Smrg 
3467b1e83836Smrg   generate_strlen_builtin_1 (loop, seq, reduction_var, count,
3468b1e83836Smrg 			     TYPE_MODE (load_type),
3469b1e83836Smrg 			     start_len);
3470b1e83836Smrg }
3471b1e83836Smrg 
3472b1e83836Smrg /* Return true if we can count at least as many characters by taking pointer
3473b1e83836Smrg    difference as we can count via reduction_var without an overflow.  Thus
3474b1e83836Smrg    compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var_type),
3475b1e83836Smrg    m = TYPE_PRECISION (ptrdiff_type_node), and s = size of each character.  */
3476b1e83836Smrg static bool
reduction_var_overflows_first(tree reduction_var_type,tree load_type)3477b1e83836Smrg reduction_var_overflows_first (tree reduction_var_type, tree load_type)
3478b1e83836Smrg {
3479b1e83836Smrg   widest_int n2 = wi::lshift (1, TYPE_PRECISION (reduction_var_type));;
3480b1e83836Smrg   widest_int m2 = wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node) - 1);
3481b1e83836Smrg   widest_int s = wi::to_widest (TYPE_SIZE_UNIT (load_type));
3482b1e83836Smrg   return wi::ltu_p (n2, wi::udiv_trunc (m2, s));
3483b1e83836Smrg }
3484b1e83836Smrg 
3485b1e83836Smrg static gimple *
determine_reduction_stmt_1(const loop_p loop,const basic_block * bbs)3486b1e83836Smrg determine_reduction_stmt_1 (const loop_p loop, const basic_block *bbs)
3487b1e83836Smrg {
3488b1e83836Smrg   gimple *reduction_stmt = NULL;
3489b1e83836Smrg 
3490b1e83836Smrg   for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i)
3491b1e83836Smrg     {
3492b1e83836Smrg       basic_block bb = bbs[i];
3493b1e83836Smrg 
3494b1e83836Smrg       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
3495b1e83836Smrg 	   gsi_next_nondebug (&bsi))
3496b1e83836Smrg 	{
3497b1e83836Smrg 	  gphi *phi = bsi.phi ();
3498b1e83836Smrg 	  if (virtual_operand_p (gimple_phi_result (phi)))
3499b1e83836Smrg 	    continue;
3500b1e83836Smrg 	  if (stmt_has_scalar_dependences_outside_loop (loop, phi))
3501b1e83836Smrg 	    {
3502b1e83836Smrg 	      if (reduction_stmt)
3503b1e83836Smrg 		return NULL;
3504b1e83836Smrg 	      reduction_stmt = phi;
3505b1e83836Smrg 	    }
3506b1e83836Smrg 	}
3507b1e83836Smrg 
3508b1e83836Smrg       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
3509b1e83836Smrg 	   gsi_next_nondebug (&bsi), ++ninsns)
3510b1e83836Smrg 	{
3511b1e83836Smrg 	  /* Bail out early for loops which are unlikely to match.  */
3512b1e83836Smrg 	  if (ninsns > 16)
3513b1e83836Smrg 	    return NULL;
3514b1e83836Smrg 	  gimple *stmt = gsi_stmt (bsi);
3515b1e83836Smrg 	  if (gimple_clobber_p (stmt))
3516b1e83836Smrg 	    continue;
3517b1e83836Smrg 	  if (gimple_code (stmt) == GIMPLE_LABEL)
3518b1e83836Smrg 	    continue;
3519b1e83836Smrg 	  if (gimple_has_volatile_ops (stmt))
3520b1e83836Smrg 	    return NULL;
3521b1e83836Smrg 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3522b1e83836Smrg 	    {
3523b1e83836Smrg 	      if (reduction_stmt)
3524b1e83836Smrg 		return NULL;
3525b1e83836Smrg 	      reduction_stmt = stmt;
3526b1e83836Smrg 	    }
3527b1e83836Smrg 	}
3528b1e83836Smrg     }
3529b1e83836Smrg 
3530b1e83836Smrg   return reduction_stmt;
3531b1e83836Smrg }
3532b1e83836Smrg 
3533b1e83836Smrg /* If LOOP has a single non-volatile reduction statement, then return a pointer
3534b1e83836Smrg    to it.  Otherwise return NULL.  */
3535b1e83836Smrg static gimple *
determine_reduction_stmt(const loop_p loop)3536b1e83836Smrg determine_reduction_stmt (const loop_p loop)
3537b1e83836Smrg {
3538b1e83836Smrg   basic_block *bbs = get_loop_body (loop);
3539b1e83836Smrg   gimple *reduction_stmt = determine_reduction_stmt_1 (loop, bbs);
3540b1e83836Smrg   XDELETEVEC (bbs);
3541b1e83836Smrg   return reduction_stmt;
3542b1e83836Smrg }
3543b1e83836Smrg 
3544b1e83836Smrg /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
3545b1e83836Smrg    replace them accordingly.  For example, a loop of the form
3546b1e83836Smrg 
3547b1e83836Smrg      for (; *p != 42; ++p);
3548b1e83836Smrg 
3549b1e83836Smrg    is replaced by
3550b1e83836Smrg 
3551b1e83836Smrg      p = rawmemchr<MODE> (p, 42);
3552b1e83836Smrg 
3553b1e83836Smrg    under the assumption that rawmemchr is available for a particular MODE.
3554b1e83836Smrg    Another example is
3555b1e83836Smrg 
3556b1e83836Smrg      int i;
3557b1e83836Smrg      for (i = 42; s[i]; ++i);
3558b1e83836Smrg 
3559b1e83836Smrg    which is replaced by
3560b1e83836Smrg 
3561b1e83836Smrg      i = (int)strlen (&s[42]) + 42;
3562b1e83836Smrg 
3563b1e83836Smrg    for some character array S.  In case array S is not of type character array
3564b1e83836Smrg    we end up with
3565b1e83836Smrg 
3566b1e83836Smrg      i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42;
3567b1e83836Smrg 
3568b1e83836Smrg    assuming that rawmemchr is available for a particular MODE.  */
3569b1e83836Smrg 
bool
loop_distribution::transform_reduction_loop (loop_p loop)
{
  gimple *reduction_stmt;
  data_reference_p load_dr = NULL, store_dr = NULL;

  /* The loop must exit through a conditional at the end of E->src.  */
  edge e = single_exit (loop);
  gcond *cond = safe_dyn_cast <gcond *> (last_stmt (e->src));
  if (!cond)
    return false;
  /* Ensure loop condition is an (in)equality test and loop is exited either if
     the inequality test fails or the equality test succeeds.  */
  if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR)
      && !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR))
    return false;
  /* A limitation of the current implementation is that we only support
     constant patterns in (in)equality tests.  */
  tree pattern = gimple_cond_rhs (cond);
  if (TREE_CODE (pattern) != INTEGER_CST)
    return false;

  reduction_stmt = determine_reduction_stmt (loop);

  /* A limitation of the current implementation is that we require a reduction
     statement.  Therefore, loops without a reduction statement as in the
     following are not recognized:
     int *p;
     void foo (void) { for (; *p; ++p); } */
  if (reduction_stmt == NULL)
    return false;

  /* Reduction variables are guaranteed to be SSA names.  */
  tree reduction_var;
  switch (gimple_code (reduction_stmt))
    {
    case GIMPLE_ASSIGN:
    case GIMPLE_PHI:
      reduction_var = gimple_get_lhs (reduction_stmt);
      break;
    default:
      /* Bail out e.g. for GIMPLE_CALL.  */
      return false;
    }

  struct graph *rdg = build_rdg (loop, NULL);
  if (rdg == NULL)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "Loop %d not transformed: failed to build the RDG.\n",
		 loop->num);

      return false;
    }
  /* Look for the single load and the optional single store considering
     all RDG vertices at once.  */
  auto_bitmap partition_stmts;
  bitmap_set_range (partition_stmts, 0, rdg->n_vertices);
  find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr);
  free_rdg (rdg);

  /* Bail out if there is no single load.  */
  if (load_dr == NULL)
    return false;

  /* Reaching this point we have a loop with a single reduction variable,
     a single load, and an optional single store.  */

  tree load_ref = DR_REF (load_dr);
  tree load_type = TREE_TYPE (load_ref);
  tree load_access_base = build_fold_addr_expr (load_ref);
  tree load_access_size = TYPE_SIZE_UNIT (load_type);
  affine_iv load_iv, reduction_iv;

  /* Only integral loads whose type precision matches the mode precision
     are handled.  */
  if (!INTEGRAL_TYPE_P (load_type)
      || !type_has_mode_precision_p (load_type))
    return false;

  /* We already ensured that the loop condition tests for (in)equality where the
     rhs is a constant pattern. Now ensure that the lhs is the result of the
     load.  */
  if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr)))
    return false;

  /* Bail out if no affine induction variable with constant step can be
     determined.  */
  if (!simple_iv (loop, loop, load_access_base, &load_iv, false))
    return false;

  /* Bail out if memory accesses are not consecutive or not growing.  */
  if (!operand_equal_p (load_iv.step, load_access_size, 0))
    return false;

  /* The reduction variable itself must also form a simple affine IV.  */
  if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false))
    return false;

  /* Handle rawmemchr like loops.  */
  if (operand_equal_p (load_iv.base, reduction_iv.base)
      && operand_equal_p (load_iv.step, reduction_iv.step))
    {
      if (store_dr)
	{
	  /* Ensure that we store to X and load from X+I where I>0.  */
	  if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR
	      || !integer_onep (TREE_OPERAND (load_iv.base, 1)))
	    return false;
	  tree ptr_base = TREE_OPERAND (load_iv.base, 0);
	  if (TREE_CODE (ptr_base) != SSA_NAME)
	    return false;
	  gimple *def = SSA_NAME_DEF_STMT (ptr_base);
	  if (!gimple_assign_single_p (def)
	      || gimple_assign_rhs1 (def) != DR_REF (store_dr))
	    return false;
	  /* Ensure that the reduction value is stored.  */
	  if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var)
	    return false;
	}
      /* Bail out if target does not provide rawmemchr for a certain mode.  */
      machine_mode mode = TYPE_MODE (load_type);
      if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing)
	return false;
      location_t loc = gimple_location (DR_STMT (load_dr));
      generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base,
				  pattern, loc);
      return true;
    }

  /* Handle strlen like loops.  */
  if (store_dr == NULL
      && integer_zerop (pattern)
      && INTEGRAL_TYPE_P (TREE_TYPE (reduction_var))
      && TREE_CODE (reduction_iv.base) == INTEGER_CST
      && TREE_CODE (reduction_iv.step) == INTEGER_CST
      && integer_onep (reduction_iv.step))
    {
      location_t loc = gimple_location (DR_STMT (load_dr));
      tree reduction_var_type = TREE_TYPE (reduction_var);
      /* While determining the length of a string an overflow might occur.
	 If an overflow only occurs in the loop implementation and not in the
	 strlen implementation, then either the overflow is undefined or the
	 truncated result of strlen equals the one of the loop.  Otherwise if
	 an overflow may also occur in the strlen implementation, then
	 replacing a loop by a call to strlen is sound whenever we ensure that
	 if an overflow occurs in the strlen implementation, then also an
	 overflow occurs in the loop implementation which is undefined.  It
	 seems reasonable to relax this and assume that the strlen
	 implementation cannot overflow in case sizetype is big enough in the
	 sense that an overflow can only happen for string objects which are
	 bigger than half of the address space; at least for 32-bit targets and
	 up.

	 For strlen which makes use of rawmemchr the maximal length of a string
	 which can be determined without an overflow is PTRDIFF_MAX / S where
	 each character has size S.  Since an overflow for ptrdiff type is
	 undefined we have to make sure that if an overflow occurs, then an
	 overflow occurs in the loop implementation, too, and this is
	 undefined, too.  Similar as before we relax this and assume that no
	 string object is larger than half of the address space; at least for
	 32-bit targets and up.  */
      if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node)
	  && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node)
	  && ((TYPE_PRECISION (sizetype) >= TYPE_PRECISION (ptr_type_node) - 1
	       && TYPE_PRECISION (ptr_type_node) >= 32)
	      || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
		  && TYPE_PRECISION (reduction_var_type) <= TYPE_PRECISION (sizetype)))
	  && builtin_decl_implicit (BUILT_IN_STRLEN))
	generate_strlen_builtin (loop, reduction_var, load_iv.base,
				 reduction_iv.base, loc);
      else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type))
	       != CODE_FOR_nothing
	       && ((TYPE_PRECISION (ptrdiff_type_node) == TYPE_PRECISION (ptr_type_node)
		    && TYPE_PRECISION (ptrdiff_type_node) >= 32)
		   || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
		       && reduction_var_overflows_first (reduction_var_type, load_type))))
	generate_strlen_builtin_using_rawmemchr (loop, reduction_var,
						 load_iv.base,
						 load_type,
						 reduction_iv.base, loc);
      else
	return false;
      return true;
    }

  return false;
}
3753b1e83836Smrg 
3754b1e83836Smrg /* Given innermost LOOP, return the outermost enclosing loop that forms a
3755b1e83836Smrg    perfect loop nest.  */
3756b1e83836Smrg 
static class loop *
prepare_perfect_loop_nest (class loop *loop)
{
  class loop *outer = loop_outer (loop);
  tree niters = number_of_latch_executions (loop);

  /* TODO: We only support the innermost 3-level loop nest distribution
     because of compilation time issue for now.  This should be relaxed
     in the future.  Note we only allow 3-level loop nest distribution
     when parallelizing loops.  */
  /* Climb outwards while OUTER forms a perfect nest around LOOP: LOOP is
     OUTER's only child and has no siblings, OUTER has a single exit, the
     latch count of LOOP does not depend on symbols defined in OUTER, and
     OUTER itself has a computable latch count.  Note the assignment to
     NITERS inside the condition: each accepted level's count is reused
     for the symbol check against the next candidate outer loop.  */
  while ((loop->inner == NULL
	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
	 && loop_outer (outer)
	 && outer->inner == loop && loop->next == NULL
	 && single_exit (outer)
	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
	 && niters != chrec_dont_know)
    {
      loop = outer;
      outer = loop_outer (loop);
    }

  return loop;
}
3782b1e83836Smrg 
3783b1e83836Smrg 
unsigned int
loop_distribution::execute (function *fun)
{
  bool changed = false;
  basic_block bb;
  control_dependences *cd = NULL;
  auto_vec<loop_p> loops_to_be_destroyed;

  /* Nothing to do for functions without loops.  */
  if (number_of_loops (fun) <= 1)
    return 0;

  bb_top_order_init ();

  /* Reset all statement UIDs to an invalid value.  */
  FOR_ALL_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
    }

  /* We can at the moment only distribute non-nested loops, thus restrict
     walking to innermost loops.  */
  for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
    {
      /* Don't distribute multiple exit edges loop, or cold loop when
         not doing pattern detection.  */
      if (!single_exit (loop)
	  || (!flag_tree_loop_distribute_patterns
	      && !optimize_loop_for_speed_p (loop)))
	continue;

      /* If niters is unknown don't distribute loop but rather try to transform
	 it to a call to a builtin.  */
      tree niters = number_of_latch_executions (loop);
      if (niters == NULL_TREE || niters == chrec_dont_know)
	{
	  datarefs_vec.create (20);
	  if (flag_tree_loop_distribute_patterns
	      && transform_reduction_loop (loop))
	    {
	      changed = true;
	      loops_to_be_destroyed.safe_push (loop);
	      if (dump_enabled_p ())
		{
		  dump_user_location_t loc = find_loop_location (loop);
		  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				   loc, "Loop %d transformed into a builtin.\n",
				   loop->num);
		}
	    }
	  free_data_refs (datarefs_vec);
	  continue;
	}

      /* Get the perfect loop nest for distribution.  */
      loop = prepare_perfect_loop_nest (loop);
      /* Try distribution at each nesting level, outermost first, and stop
	 at the first level that is distributed or cannot be seeded.  */
      for (; loop; loop = loop->inner)
	{
	  auto_vec<gimple *> work_list;
	  if (!find_seed_stmts_for_distribution (loop, &work_list))
	    break;

	  const char *str = loop->inner ? " nest" : "";
	  dump_user_location_t loc = find_loop_location (loop);
	  /* Compute control dependences lazily, once for all loops.
	     Post-dominators are only needed during construction.  */
	  if (!cd)
	    {
	      calculate_dominance_info (CDI_DOMINATORS);
	      calculate_dominance_info (CDI_POST_DOMINATORS);
	      cd = new control_dependences ();
	      free_dominance_info (CDI_POST_DOMINATORS);
	    }

	  bool destroy_p;
	  int nb_generated_loops, nb_generated_calls;
	  nb_generated_loops
	    = distribute_loop (loop, work_list, cd, &nb_generated_calls,
			       &destroy_p, (!optimize_loop_for_speed_p (loop)
					    || !flag_tree_loop_distribution));
	  if (destroy_p)
	    loops_to_be_destroyed.safe_push (loop);

	  if (nb_generated_loops + nb_generated_calls > 0)
	    {
	      changed = true;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				 loc, "Loop%s %d distributed: split to %d loops "
				 "and %d library calls.\n", str, loop->num,
				 nb_generated_loops, nb_generated_calls);

	      break;
	    }

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
	}
    }

  if (cd)
    delete cd;

  if (bb_top_order_index != NULL)
    bb_top_order_destroy ();

  if (changed)
    {
      /* Destroy loop bodies that could not be reused.  Do this late as we
	 otherwise can end up refering to stale data in control dependences.  */
      unsigned i;
      class loop *loop;
      FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
	destroy_loop (loop);

      /* Cached scalar evolutions now may refer to wrong or non-existing
	 loops.  */
      scev_reset ();
      mark_virtual_operands_for_renaming (fun);
      rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
    }

  checking_verify_loop_structure ();

  return changed ? TODO_cleanup_cfg : 0;
}
3910b1e83836Smrg 
3911b1e83836Smrg 
3912b1e83836Smrg /* Distribute all loops in the current function.  */
3913b1e83836Smrg 
3914b1e83836Smrg namespace {
3915b1e83836Smrg 
/* Static pass descriptor consumed by the gimple_opt_pass constructor
   below; registered with the pass manager via passes.def.  */

const pass_data pass_data_loop_distribution =
{
  GIMPLE_PASS, /* type */
  "ldist", /* name: selects -fdump-tree-ldist dumps */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required: runs on GIMPLE in SSA form */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
3928b1e83836Smrg 
3929b1e83836Smrg class pass_loop_distribution : public gimple_opt_pass
3930b1e83836Smrg {
3931b1e83836Smrg public:
pass_loop_distribution(gcc::context * ctxt)3932b1e83836Smrg   pass_loop_distribution (gcc::context *ctxt)
3933b1e83836Smrg     : gimple_opt_pass (pass_data_loop_distribution, ctxt)
3934b1e83836Smrg   {}
3935b1e83836Smrg 
3936b1e83836Smrg   /* opt_pass methods: */
gate(function *)3937b1e83836Smrg   virtual bool gate (function *)
3938b1e83836Smrg     {
3939b1e83836Smrg       return flag_tree_loop_distribution
3940b1e83836Smrg 	|| flag_tree_loop_distribute_patterns;
3941b1e83836Smrg     }
3942b1e83836Smrg 
3943b1e83836Smrg   virtual unsigned int execute (function *);
3944b1e83836Smrg 
3945b1e83836Smrg }; // class pass_loop_distribution
3946b1e83836Smrg 
3947b1e83836Smrg unsigned int
execute(function * fun)3948b1e83836Smrg pass_loop_distribution::execute (function *fun)
3949b1e83836Smrg {
3950b1e83836Smrg   return loop_distribution ().execute (fun);
3951b1e83836Smrg }
3952b1e83836Smrg 
3953b1e83836Smrg } // anon namespace
3954b1e83836Smrg 
3955b1e83836Smrg gimple_opt_pass *
make_pass_loop_distribution(gcc::context * ctxt)3956b1e83836Smrg make_pass_loop_distribution (gcc::context *ctxt)
3957b1e83836Smrg {
3958b1e83836Smrg   return new pass_loop_distribution (ctxt);
3959b1e83836Smrg }
3960