/* Loop distribution.
   Copyright (C) 2006-2020 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   |    A(I) = B(I) + C
   |    D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   |   A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   |   D(I) = A(I-1)*E
   |ENDDO

   Loop distribution is the dual of loop fusion.  It separates statements
   of a loop (or loop nest) into multiple loops (or loop nests) with the
   same loop header.  The major goal is to separate statements which may
   be vectorized from those that can't.  This pass implements distribution
   in the following steps:

     1) Seed partitions with specific types of statements.  For now we
        support two types of seed statements: statements defining variables
        used outside of the loop, and statements storing to memory.
     2) Build a reduced dependence graph (RDG) for the loop to be
        distributed.  The vertices (RDG:V) model all statements in the loop
        and the edges (RDG:E) model flow and control dependencies between
        statements.
     3) Apart from the RDG, compute data dependencies between memory
        references.
     4) Starting from a seed statement, build up a partition by adding
        dependent statements according to the RDG's dependence information.
        A partition is classified as parallel type if it can be executed in
        parallel, or as sequential type if it can't.  A parallel type
        partition is further classified as a builtin kind if it can be
        implemented as builtin function calls.
     5) Build a partition dependence graph (PG) based on data dependencies.
        The vertices (PG:V) model all partitions and the edges (PG:E) model
        all data dependencies between every pair of partitions.  In general,
        a data dependence is either known or unknown at compilation time.
        In the C family of languages, quite a few dependencies are unknown
        at compilation time because of possible aliasing between data
        references.  We categorize PG's edges into two types: "true" edges,
        which represent compilation-time-known data dependencies, and
        "alias" edges for all other data dependencies.
     6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
        Merge the partitions in each strongly connected component (SCC)
        correspondingly.  Build a new PG for the merged partitions.
     7) Traverse PG again, this time with both "true" and "alias" edges
        included.  We try to break SCCs by removing some edges.  Because
        SCCs formed by "true" edges are all fused in step 6), we can break
        the remaining SCCs by removing some "alias" edges.  Choosing an
        optimal edge set is NP-hard; fortunately, a simple approximation is
        good enough for us given the small problem scale.
     8) Collect all data dependencies of the removed "alias" edges.  Create
        runtime alias checks for the collected data dependencies.
     9) Version the loop under the condition of the runtime alias checks.
        Given that loop distribution generally introduces additional
        overhead, it is only useful if vectorization is achieved in a
        distributed loop.  We version the loop with the internal function
        call IFN_LOOP_DIST_ALIAS (see the sketch after this comment).  If
        no distributed loop can be vectorized, we simply remove the
        distributed loops and revert to the original loop.

   TODO:
     1) We only distribute innermost two-level loop nests now.  We should
        extend it to arbitrary loop nests in the future.
     2) We only fuse partitions within an SCC now.  A better fusion
        algorithm is desired to minimize loop overhead, maximize
        parallelism and maximize data reuse.  */
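
/* As an illustration of step 9) above, the versioned result looks
   roughly like the following (a sketch, not the exact GIMPLE):

   |IF (IFN_LOOP_DIST_ALIAS (<runtime alias checks>, ...))
   |  <distributed loops>
   |ELSE
   |  <original loop>

   If the vectorizer later fails on every distributed loop, the condition
   is resolved so that only the original loop remains.  */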

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "tree-eh.h"
#include "gimple-fold.h"
#include "tree-affine.h"


#define MAX_DATAREFS_NUM \
  ((unsigned) param_loop_max_datarefs_for_datadeps)

/* Threshold controlling number of distributed partitions.  Given it may
   be unnecessary if a memory stream cost model is invented in the future,
   we define it as a temporary macro, rather than a parameter.  */
#define NUM_PARTITION_THRESHOLD (4)

/* Hashtable helpers.  */

struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
{
  static inline hashval_t hash (const data_dependence_relation *);
  static inline bool equal (const data_dependence_relation *,
                            const data_dependence_relation *);
};

/* Hash function for data dependence.  */

inline hashval_t
ddr_hasher::hash (const data_dependence_relation *ddr)
{
  inchash::hash h;
  h.add_ptr (DDR_A (ddr));
  h.add_ptr (DDR_B (ddr));
  return h.end ();
}

/* Hash table equality function for data dependence.  */

inline bool
ddr_hasher::equal (const data_dependence_relation *ddr1,
                   const data_dependence_relation *ddr2)
{
  return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
}


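/* The index of a data reference, stashed in its ->aux field.  */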
#define DR_INDEX(dr) ((uintptr_t) (dr)->aux)

/* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
struct rdg_vertex
{
  /* The statement represented by this vertex.  */
  gimple *stmt;

  /* Vector of data-references in this statement.  */
  vec<data_reference_p> datarefs;

  /* True when the statement contains a write to memory.  */
  bool has_mem_write;

  /* True when the statement contains a read from memory.  */
  bool has_mem_reads;
};

#define RDGV_STMT(V) ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))

/* Data dependence type.  */

enum rdg_dep_type
{
  /* Read After Write (RAW).  */
  flow_dd = 'f',

  /* Control dependence (execute conditional on).  */
  control_dd = 'c'
};

/* Dependence information attached to an edge of the RDG.  */

struct rdg_edge
{
  /* Type of the dependence.  */
  enum rdg_dep_type type;
};

#define RDGE_TYPE(E) ((struct rdg_edge *) ((E)->data))->type

/* Kind of distributed loop.  */
enum partition_kind {
    PKIND_NORMAL,
    /* Partial memset stands for a partition that can be distributed into
       a loop of memset calls, rather than a single memset call.  It's
       handled just like a normal partition, i.e., distributed as a
       separate loop; no memset call is generated.  (See the example
       following this enum.)

       Note: This is a stopgap fix trying to distribute a ZERO-ing stmt
       in a loop nest as deeply as possible.  As a result, parloops
       achieves better parallelization by parallelizing a deeper loop
       nest.  This hack should be unnecessary and removed once distributed
       memset can be understood and analyzed in data reference analysis.
       See PR82604 for more.  */
    PKIND_PARTIAL_MEMSET,
    PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};
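
/* As an illustrative example for PKIND_PARTIAL_MEMSET, if only part of
   each row is zeroed in

   |DO I = 1, M
   |    DO J = 1, N
   |        A(I, J) = 0
   |    ENDDO
   |ENDDO

   (N smaller than the row length), the stores do not form one contiguous
   region, so the nest cannot become a single memset call; the zeroing
   statement is still split out into its own loop nest.  */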

/* Type of distributed loop.  */
enum partition_type {
    /* The distributed loop can be executed in parallel.  */
    PTYPE_PARALLEL = 0,
    /* The distributed loop has to be executed sequentially.  */
    PTYPE_SEQUENTIAL
};

/* Builtin info for loop distribution.  */
struct builtin_info
{
  /* The data references a kind != PKIND_NORMAL partition is about.  */
  data_reference_p dst_dr;
  data_reference_p src_dr;
  /* Base address and size of memory objects operated by the builtin.  Note
     both dest and source memory objects must have the same size.  */
  tree dst_base;
  tree src_base;
  tree size;
  /* Base and offset part of dst_base after stripping constant offset.  This
     is only used in memset builtin distribution for now.  */
  tree dst_base_base;
  unsigned HOST_WIDE_INT dst_base_offset;
};

/* Partition for loop distribution.  */
struct partition
{
  /* Statements of the partition.  */
  bitmap stmts;
  /* True if the partition defines a variable which is used outside of the
     loop.  */
  bool reduction_p;
  location_t loc;
  enum partition_kind kind;
  enum partition_type type;
  /* Data references in the partition.  */
  bitmap datarefs;
  /* Information of a builtin partition.  */
  struct builtin_info *builtin;
};

/* Partitions are fused for different reasons.  */
enum fuse_type
{
  FUSE_NON_BUILTIN = 0,
  FUSE_REDUCTION = 1,
  FUSE_SHARE_REF = 2,
  FUSE_SAME_SCC = 3,
  FUSE_FINALIZE = 4
};

/* Descriptions of the different fusing reasons.  */
static const char *fuse_message[] = {
  "they are non-builtins",
  "they have reductions",
  "they have shared memory refs",
  "they are in the same dependence scc",
  "there is no point to distribute loop"};


/* Dump vertex I in RDG to FILE.  */

static void
dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
{
  struct vertex *v = &(rdg->vertices[i]);
  struct graph_edge *e;

  fprintf (file, "(vertex %d: (%s%s) (in:", i,
           RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
           RDG_MEM_READS_STMT (rdg, i) ? "r" : "");

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      fprintf (file, " %d", e->src);

  fprintf (file, ") (out:");

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      fprintf (file, " %d", e->dest);

  fprintf (file, ")\n");
  print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
  fprintf (file, ")\n");
}

/* Call dump_rdg_vertex on stderr.  */

DEBUG_FUNCTION void
debug_rdg_vertex (struct graph *rdg, int i)
{
  dump_rdg_vertex (stderr, rdg, i);
}

/* Dump the reduced dependence graph RDG to FILE.  */

static void
dump_rdg (FILE *file, struct graph *rdg)
{
  fprintf (file, "(rdg\n");
  for (int i = 0; i < rdg->n_vertices; i++)
    dump_rdg_vertex (file, rdg, i);
  fprintf (file, ")\n");
}

/* Call dump_rdg on stderr.  */

DEBUG_FUNCTION void
debug_rdg (struct graph *rdg)
{
  dump_rdg (stderr, rdg);
}

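/* Dump RDG to FILE in the dot graph format; vertices that read from
   memory are filled green, vertices that write to memory are filled
   red.  */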
static void
dot_rdg_1 (FILE *file, struct graph *rdg)
{
  int i;
  pretty_printer buffer;
  pp_needs_newline (&buffer) = false;
  buffer.buffer->stream = file;

  fprintf (file, "digraph RDG {\n");

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      fprintf (file, "%d [label=\"[%d] ", i, i);
      pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
      pp_flush (&buffer);
      fprintf (file, "\"]\n");

      /* Highlight reads from memory.  */
      if (RDG_MEM_READS_STMT (rdg, i))
        fprintf (file, "%d [style=filled, fillcolor=green]\n", i);

      /* Highlight stores to memory.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
        fprintf (file, "%d [style=filled, fillcolor=red]\n", i);

      if (v->succ)
        for (e = v->succ; e; e = e->succ_next)
          switch (RDGE_TYPE (e))
            {
            case flow_dd:
              /* These are the most common dependences: don't print these. */
              fprintf (file, "%d -> %d \n", i, e->dest);
              break;

            case control_dd:
              fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
              break;

            default:
              gcc_unreachable ();
            }
    }

  fprintf (file, "}\n\n");
}

/* Display the Reduced Dependence Graph using dotty.  */

DEBUG_FUNCTION void
dot_rdg (struct graph *rdg)
{
  /* When debugging, you may want to enable the following code.  */
#ifdef HAVE_POPEN
  FILE *file = popen ("dot -Tx11", "w");
  if (!file)
    return;
  dot_rdg_1 (file, rdg);
  fflush (file);
  close (fileno (file));
  pclose (file);
#else
  dot_rdg_1 (stderr, rdg);
#endif
}

/* Returns the index of STMT in RDG.  */

static int
rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
{
  int index = gimple_uid (stmt);
  gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
  return index;
}

/* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
   the index of DEF in RDG.  */

static void
create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
{
  use_operand_p imm_use_p;
  imm_use_iterator iterator;

  FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
    {
      struct graph_edge *e;
      int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));

      if (use < 0)
        continue;

      e = add_edge (rdg, idef, use);
      e->data = XNEW (struct rdg_edge);
      RDGE_TYPE (e) = flow_dd;
    }
}

/* Creates an edge for the control dependences of BB to the vertex V.  */

static void
create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
                                    int v, control_dependences *cd)
{
  bitmap_iterator bi;
  unsigned edge_n;
  EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
                            0, edge_n, bi)
    {
      basic_block cond_bb = cd->get_edge_src (edge_n);
      gimple *stmt = last_stmt (cond_bb);
      if (stmt && is_ctrl_stmt (stmt))
        {
          struct graph_edge *e;
          int c = rdg_vertex_for_stmt (rdg, stmt);
          if (c < 0)
            continue;

          e = add_edge (rdg, c, v);
          e->data = XNEW (struct rdg_edge);
          RDGE_TYPE (e) = control_dd;
        }
    }
}

/* Creates the flow dependence edges of the reduced dependence graph
   RDG.  */

static void
create_rdg_flow_edges (struct graph *rdg)
{
  int i;
  def_operand_p def_p;
  ssa_op_iter iter;

  for (i = 0; i < rdg->n_vertices; i++)
    FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
                              iter, SSA_OP_DEF)
      create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
}

/* Creates the control dependence edges of the reduced dependence graph
   RDG.  */

static void
create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      if (gimple_code (stmt) == GIMPLE_PHI)
        {
          edge_iterator ei;
          edge e;
          FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
            if (flow_bb_inside_loop_p (loop, e->src))
              create_edge_for_control_dependence (rdg, e->src, i, cd);
        }
      else
        create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
    }
}


class loop_distribution
{
 private:
  /* The loop (nest) to be distributed.  */
  vec<loop_p> loop_nest;

  /* Vector of data references in the loop to be distributed.  */
  vec<data_reference_p> datarefs_vec;

  /* True if there is a nonaddressable data reference in the above
     vector.  */
  bool has_nonaddressable_dataref_p;

  /* Store index of data reference in aux field.  */

  /* Hash table for data dependence relations in the loop to be
     distributed.  */
  hash_table<ddr_hasher> *ddrs_table;

  /* Array mapping basic block's index to its topological order.  */
  int *bb_top_order_index;
  /* And size of the array.  */
  int bb_top_order_index_size;

  /* Build the vertices of the reduced dependence graph RDG.  Return false
     if that failed.  */
  bool create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts,
                            loop_p loop);

  /* Initialize STMTS with all the statements of LOOP.  We use topological
     order to discover all statements.  The order is important because
     generate_loops_for_partition uses the same traversal for identifying
     statements in loop copies.  */
  void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);


  /* Build the Reduced Dependence Graph (RDG) with one vertex per statement
     of LOOP, and one edge per flow dependence or control dependence from
     control dependence CD.  During visiting each statement, data references
     are also collected and recorded in global data DATAREFS_VEC.  */
  struct graph * build_rdg (class loop *loop, control_dependences *cd);

  /* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
     graph and we update the type of the result partition if RDG is
     non-NULL.  */
  void partition_merge_into (struct graph *rdg,
                             partition *dest, partition *partition,
                             enum fuse_type ft);


  /* Return the data dependence relation for data references A and B.  The
     two data references must be in lexicographic order with respect to the
     reduced dependence graph RDG.  We first try to find the ddr in the
     global ddr hash table; if it doesn't exist, compute the ddr and cache
     it.  */
  data_dependence_relation * get_data_dependence (struct graph *rdg,
                                                  data_reference_p a,
                                                  data_reference_p b);


  /* In reduced dependence graph RDG for loop distribution, return true if
     the dependence between references DR1 and DR2 leads to a dependence
     cycle and such a dependence cycle can't be resolved by a runtime alias
     check.  */
  bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
                            data_reference_p dr2);


  /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
     PARTITION1's type after merging PARTITION2 into PARTITION1.  */
  void update_type_for_merge (struct graph *rdg,
                              partition *partition1, partition *partition2);


  /* Returns a partition with all the statements needed for computing
     the vertex V of the RDG, also including the loop exit conditions.  */
  partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);

  /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
     classify whether they form a builtin memcpy or memmove call.  */
  void classify_builtin_ldst (loop_p loop, struct graph *rdg,
                              partition *partition,
                              data_reference_p dst_dr, data_reference_p src_dr);

  /* Classifies the builtin kind we can generate for PARTITION of RDG and
     LOOP.  For the moment we detect memset, memcpy and memmove patterns.
     Bitmap STMT_IN_ALL_PARTITIONS contains statements belonging to all
     partitions.  Returns true if there is a reduction in all partitions and
     we possibly did not mark PARTITION as having one for this reason.  */

  bool
  classify_partition (loop_p loop,
                      struct graph *rdg, partition *partition,
                      bitmap stmt_in_all_partitions);


  /* Returns true when PARTITION1 and PARTITION2 access the same memory
     object in RDG.  */
  bool share_memory_accesses (struct graph *rdg,
                              partition *partition1, partition *partition2);

  /* For each seed statement in STARTING_STMTS, this function builds a
     partition for it by adding dependent statements according to RDG.
     All partitions are recorded in PARTITIONS.  */
  void rdg_build_partitions (struct graph *rdg,
                             vec<gimple *> starting_stmts,
                             vec<partition *> *partitions);

  /* Compute the partition dependence created by the data references in DRS1
     and DRS2, and modify and return DIR according to that.  If ALIAS_DDRS
     is not NULL, we record dependences introduced by possible aliasing
     between two data references in ALIAS_DDRS; otherwise, we simply ignore
     such dependences as if they didn't exist at all.  */
  int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
                               bitmap drs2, vec<ddr_p> *alias_ddrs);


  /* Build and return the partition dependence graph for PARTITIONS.  RDG is
     the reduced dependence graph for the loop to be distributed.  If
     IGNORE_ALIAS_P is true, data dependences caused by possible aliasing
     between references are ignored, as if they didn't exist at all;
     otherwise all dependences are considered.  */
  struct graph *build_partition_graph (struct graph *rdg,
                                       vec<struct partition *> *partitions,
                                       bool ignore_alias_p);

  /* Given reduced dependence graph RDG, merge strongly connected components
     of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependences caused by
     possible aliasing between references are ignored, as if they didn't
     exist at all; otherwise all dependences are considered.  */
  void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
                                 *partitions, bool ignore_alias_p);

  /* This is the main function breaking strongly connected components in
     PARTITIONS given reduced dependence graph RDG.  Store data dependence
     relations for runtime alias check in ALIAS_DDRS.  */
  void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
                                   *partitions, vec<ddr_p> *alias_ddrs);


  /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
     ALIAS_DDRS contains ddrs which need runtime alias check.  */
  void finalize_partitions (class loop *loop, vec<struct partition *>
                            *partitions, vec<ddr_p> *alias_ddrs);

  /* Distributes the code from LOOP in such a way that producer statements
     are placed before consumer statements.  Tries to separate only the
     statements from STMTS into separate loops.  Returns the number of
     distributed loops.  Set NB_CALLS to the number of generated builtin
     calls.  Set *DESTROY_P to whether LOOP needs to be destroyed.  */
  int distribute_loop (class loop *loop, vec<gimple *> stmts,
                       control_dependences *cd, int *nb_calls, bool *destroy_p,
                       bool only_patterns_p);

  /* Compute topological order for basic blocks.  Topological order is
     needed because data dependence is computed for data references in
     lexicographical order.  */
  void bb_top_order_init (void);

  void bb_top_order_destroy (void);

 public:

  /* Getter for bb_top_order.  */

  inline int get_bb_top_order_index_size (void)
    {
      return bb_top_order_index_size;
    }

  inline int get_bb_top_order_index (int i)
    {
      return bb_top_order_index[i];
    }

  unsigned int execute (function *fun);
};


/* If X has a smaller topological sort number than Y, returns -1;
   if greater, returns 1.  */
static int
bb_top_order_cmp_r (const void *x, const void *y, void *loop)
{
  loop_distribution *_loop =
    (loop_distribution *) loop;

  basic_block bb1 = *(const basic_block *) x;
  basic_block bb2 = *(const basic_block *) y;

  int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();

  gcc_assert (bb1->index < bb_top_order_index_size
              && bb2->index < bb_top_order_index_size);
  gcc_assert (bb1 == bb2
              || _loop->get_bb_top_order_index(bb1->index)
                 != _loop->get_bb_top_order_index(bb2->index));

  return (_loop->get_bb_top_order_index(bb1->index) -
          _loop->get_bb_top_order_index(bb2->index));
}

bool
loop_distribution::create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts,
                                        loop_p loop)
{
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      struct vertex *v = &(rdg->vertices[i]);

      /* Record statement to vertex mapping.  */
      gimple_set_uid (stmt, i);

      v->data = XNEW (struct rdg_vertex);
      RDGV_STMT (v) = stmt;
      RDGV_DATAREFS (v).create (0);
      RDGV_HAS_MEM_WRITE (v) = false;
      RDGV_HAS_MEM_READS (v) = false;
      if (gimple_code (stmt) == GIMPLE_PHI)
        continue;

      unsigned drp = datarefs_vec.length ();
      if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
        return false;
      for (unsigned j = drp; j < datarefs_vec.length (); ++j)
        {
          data_reference_p dr = datarefs_vec[j];
          if (DR_IS_READ (dr))
            RDGV_HAS_MEM_READS (v) = true;
          else
            RDGV_HAS_MEM_WRITE (v) = true;
          RDGV_DATAREFS (v).safe_push (dr);
          has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
        }
    }
  return true;
}

void
loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
{
  unsigned int i;
  basic_block *bbs = get_loop_body_in_custom_order (loop, this,
                                                    bb_top_order_cmp_r);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
           gsi_next (&bsi))
        if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
          stmts->safe_push (bsi.phi ());

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
           gsi_next (&bsi))
        {
          gimple *stmt = gsi_stmt (bsi);
          if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
            stmts->safe_push (stmt);
        }
    }

  free (bbs);
}

/* Free the reduced dependence graph RDG.  */

static void
free_rdg (struct graph *rdg)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      for (e = v->succ; e; e = e->succ_next)
        free (e->data);

      if (v->data)
        {
          gimple_set_uid (RDGV_STMT (v), -1);
          (RDGV_DATAREFS (v)).release ();
          free (v->data);
        }
    }

  free_graph (rdg);
}

struct graph *
loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
{
  struct graph *rdg;

  /* Create the RDG vertices from the stmts of the loop nest.  */
  auto_vec<gimple *, 10> stmts;
  stmts_from_loop (loop, &stmts);
  rdg = new_graph (stmts.length ());
  if (!create_rdg_vertices (rdg, stmts, loop))
    {
      free_rdg (rdg);
      return NULL;
    }
  stmts.release ();

  create_rdg_flow_edges (rdg);
  if (cd)
    create_rdg_cd_edges (rdg, cd, loop);

  return rdg;
}


/* Allocate and initialize a partition.  */

static partition *
partition_alloc (void)
{
  partition *partition = XCNEW (struct partition);
  partition->stmts = BITMAP_ALLOC (NULL);
  partition->reduction_p = false;
  partition->loc = UNKNOWN_LOCATION;
  partition->kind = PKIND_NORMAL;
  partition->type = PTYPE_PARALLEL;
  partition->datarefs = BITMAP_ALLOC (NULL);
  return partition;
}

/* Free PARTITION.  */

static void
partition_free (partition *partition)
{
  BITMAP_FREE (partition->stmts);
  BITMAP_FREE (partition->datarefs);
  if (partition->builtin)
    free (partition->builtin);

  free (partition);
}

/* Returns true if the partition can be generated as a builtin.  */

static bool
partition_builtin_p (partition *partition)
{
  return partition->kind > PKIND_PARTIAL_MEMSET;
}

/* Returns true if the partition contains a reduction.  */

static bool
partition_reduction_p (partition *partition)
{
  return partition->reduction_p;
}

void
loop_distribution::partition_merge_into (struct graph *rdg,
                                         partition *dest, partition *partition,
                                         enum fuse_type ft)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
      fprintf (dump_file, "  Part 1: ");
      dump_bitmap (dump_file, dest->stmts);
      fprintf (dump_file, "  Part 2: ");
      dump_bitmap (dump_file, partition->stmts);
    }

  dest->kind = PKIND_NORMAL;
  if (dest->type == PTYPE_PARALLEL)
    dest->type = partition->type;

  bitmap_ior_into (dest->stmts, partition->stmts);
  if (partition_reduction_p (partition))
    dest->reduction_p = true;

  /* Further check if any data dependence prevents us from executing the
     new partition in parallel.  */
  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    update_type_for_merge (rdg, dest, partition);

  bitmap_ior_into (dest->datarefs, partition->datarefs);
}


/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      if (is_gimple_debug (USE_STMT (use_p)))
        continue;

      basic_block use_bb = gimple_bb (USE_STMT (use_p));
      if (!flow_bb_inside_loop_p (loop, use_bb))
        return true;
    }

  return false;
}

/* Returns true when STMT defines a scalar variable used after the
   loop LOOP.  */

static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}

/* Return a copy of LOOP placed before LOOP.  */

static class loop *
copy_loop_before (class loop *loop)
{
  class loop *res;
  edge preheader = loop_preheader_edge (loop);

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
  gcc_assert (res != NULL);
  free_original_copy_tables ();
  delete_update_ssa ();

  return res;
}

/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (class loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}

/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  */

static void
generate_loops_for_partition (class loop *loop, partition *partition,
                              bool copy_p)
{
  unsigned i;
  basic_block *bbs;

  if (copy_p)
    {
      int orig_loop_num = loop->orig_loop_num;
      loop = copy_loop_before (loop);
      gcc_assert (loop != NULL);
      loop->orig_loop_num = orig_loop_num;
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }
  else
    {
      /* Origin number is set to the new versioned loop's num.  */
      gcc_assert (loop->orig_loop_num != loop->num);
    }

  /* Remove stmts not in the PARTITION bitmap.  */
  bbs = get_loop_body_in_dom_order (loop);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    for (i = 0; i < loop->num_nodes; i++)
      {
        basic_block bb = bbs[i];

        for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
             gsi_next (&bsi))
          {
            gphi *phi = bsi.phi ();
            if (!virtual_operand_p (gimple_phi_result (phi))
                && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
              reset_debug_uses (phi);
          }

        for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
             gsi_next (&bsi))
          {
            gimple *stmt = gsi_stmt (bsi);
            if (gimple_code (stmt) != GIMPLE_LABEL
                && !is_gimple_debug (stmt)
                && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
              reset_debug_uses (stmt);
          }
      }

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      edge inner_exit = NULL;

      if (loop != bb->loop_father)
        inner_exit = single_exit (bb->loop_father);

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
        {
          gphi *phi = bsi.phi ();
          if (!virtual_operand_p (gimple_phi_result (phi))
              && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
            remove_phi_node (&bsi, true);
          else
            gsi_next (&bsi);
        }

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
        {
          gimple *stmt = gsi_stmt (bsi);
          if (gimple_code (stmt) != GIMPLE_LABEL
              && !is_gimple_debug (stmt)
              && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
            {
              /* In distribution of loop nest, if bb is inner loop's exit_bb,
                 we choose its exit edge/path in order to avoid generating
                 infinite loop.  For all other cases, we choose an arbitrary
                 path through the empty CFG part that this unnecessary
                 control stmt controls.  */
              if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
                {
                  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
                    gimple_cond_make_true (cond_stmt);
                  else
                    gimple_cond_make_false (cond_stmt);
                  update_stmt (stmt);
                }
              else if (gimple_code (stmt) == GIMPLE_SWITCH)
                {
                  gswitch *switch_stmt = as_a <gswitch *> (stmt);
                  gimple_switch_set_index
                    (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
                  update_stmt (stmt);
                }
              else
                {
                  unlink_stmt_vdef (stmt);
                  gsi_remove (&bsi, true);
                  release_defs (stmt);
                  continue;
                }
            }
          gsi_next (&bsi);
        }
    }

  free (bbs);
}

/* If VAL memory representation contains the same value in all bytes,
   return that value, otherwise return -1.
   E.g. for 0x24242424 return 0x24, for IEEE double
   747708026454360457216.0 return 0x44, etc.  */

static int
const_with_all_bytes_same (tree val)
{
  unsigned char buf[64];
  int i, len;

  if (integer_zerop (val)
      || (TREE_CODE (val) == CONSTRUCTOR
          && !TREE_CLOBBER_P (val)
          && CONSTRUCTOR_NELTS (val) == 0))
    return 0;

  if (real_zerop (val))
    {
      /* Only return 0 for +0.0, not for -0.0, which doesn't have
         an all bytes same memory representation.  Don't transform
         -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
      switch (TREE_CODE (val))
        {
        case REAL_CST:
          if (!real_isneg (TREE_REAL_CST_PTR (val)))
            return 0;
          break;
        case COMPLEX_CST:
          if (!const_with_all_bytes_same (TREE_REALPART (val))
              && !const_with_all_bytes_same (TREE_IMAGPART (val)))
            return 0;
          break;
        case VECTOR_CST:
          {
            unsigned int count = vector_cst_encoded_nelts (val);
            unsigned int j;
            for (j = 0; j < count; ++j)
              if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
                break;
            if (j == count)
              return 0;
            break;
          }
        default:
          break;
        }
    }

  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
    return -1;

  len = native_encode_expr (val, buf, sizeof (buf));
  if (len == 0)
    return -1;
  for (i = 1; i < len; i++)
    if (buf[i] != buf[0])
      return -1;
  return buf[0];
}

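/* As an illustrative sketch, a PKIND_MEMSET partition containing only

   |for (i = 0; i < n; i++)
   |  a[i] = 0;

   is replaced by a single call like memset (&a[0], 0, n * sizeof (a[0])),
   emitted before LOOP.  */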
/* Generate a call to memset for PARTITION in LOOP.  */

static void
generate_memset_builtin (class loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  tree mem, fn, nb_bytes;
  tree val;
  struct builtin_info *builtin = partition->builtin;
  gimple *fn_call;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
                                       false, GSI_CONTINUE_LINKING);
  mem = rewrite_to_non_trapping_overflow (builtin->dst_base);
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
                                  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
  /* Handle constants like 0x15151515 and similarly
     floating point constants etc. where all bytes are the same.  */
  int bytev = const_with_all_bytes_same (val);
  if (bytev != -1)
    val = build_int_cst (integer_type_node, bytev);
  else if (TREE_CODE (val) == INTEGER_CST)
    val = fold_convert (integer_type_node, val);
  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
    {
      tree tem = make_ssa_name (integer_type_node);
      gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
      val = tem;
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gimple_set_location (fn_call, partition->loc);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
  fold_stmt (&gsi);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (bytev == 0)
        fprintf (dump_file, " zero\n");
      else
        fprintf (dump_file, "\n");
    }
}

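/* As an illustrative sketch, a PKIND_MEMCPY partition containing only

   |for (i = 0; i < n; i++)
   |  a[i] = b[i];

   becomes a single call like memcpy (&a[0], &b[0], n * sizeof (a[0]));
   BUILT_IN_MEMMOVE is used instead when the references may overlap.  */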
/* Generate a call to memcpy for PARTITION in LOOP.  */

static void
generate_memcpy_builtin (class loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *fn_call;
  tree dest, src, fn, nb_bytes;
  enum built_in_function kind;
  struct builtin_info *builtin = partition->builtin;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
                                       false, GSI_CONTINUE_LINKING);
  dest = rewrite_to_non_trapping_overflow (builtin->dst_base);
  src = rewrite_to_non_trapping_overflow (builtin->src_base);
  if (partition->kind == PKIND_MEMCPY
      || ! ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMCPY;
  else
    kind = BUILT_IN_MEMMOVE;
  /* Try harder if we're copying a constant size.  */
  if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
    {
      aff_tree asrc, adest;
      tree_to_aff_combination (src, ptr_type_node, &asrc);
      tree_to_aff_combination (dest, ptr_type_node, &adest);
      aff_combination_scale (&adest, -1);
      aff_combination_add (&asrc, &adest);
      if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
                                     wi::to_poly_widest (nb_bytes)))
        kind = BUILT_IN_MEMCPY;
    }

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
                                   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
                                  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gimple_set_location (fn_call, partition->loc);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
  fold_stmt (&gsi);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
        fprintf (dump_file, "generated memcpy\n");
      else
        fprintf (dump_file, "generated memmove\n");
    }
}

/* Remove and destroy the loop LOOP.  */

static void
destroy_loop (class loop *loop)
{
  unsigned nbbs = loop->num_nodes;
  edge exit = single_exit (loop);
  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
  basic_block *bbs;
  unsigned i;

  bbs = get_loop_body_in_dom_order (loop);

  gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
  bool safe_p = single_pred_p (exit->dest);
  for (unsigned i = 0; i < nbbs; ++i)
    {
      /* We have made sure to not leave any dangling uses of SSA
         names defined in the loop.  With the exception of virtuals.
         Make sure we replace all uses of virtual defs that will remain
         outside of the loop with the bare symbol as delete_basic_block
         will release them.  */
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          gphi *phi = gsi.phi ();
          if (virtual_operand_p (gimple_phi_result (phi)))
            mark_virtual_phi_result_for_renaming (phi);
        }
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
        {
          gimple *stmt = gsi_stmt (gsi);
          tree vdef = gimple_vdef (stmt);
          if (vdef && TREE_CODE (vdef) == SSA_NAME)
            mark_virtual_operand_for_renaming (vdef);
          /* Also move and eventually reset debug stmts.  We can leave
             constant values in place in case the stmt dominates the exit.
             ???  Non-constant values from the last iteration can be
             replaced with final values if we can compute them.  */
          if (gimple_debug_bind_p (stmt))
            {
              tree val = gimple_debug_bind_get_value (stmt);
              gsi_move_before (&gsi, &dst_gsi);
              if (val
                  && (!safe_p
                      || !is_gimple_min_invariant (val)
                      || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
                {
                  gimple_debug_bind_reset_value (stmt);
                  update_stmt (stmt);
                }
            }
          else
            gsi_next (&gsi);
        }
    }

  redirect_edge_pred (exit, src);
  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
  exit->flags |= EDGE_FALLTHRU;
  cancel_loop_tree (loop);
  rescan_loop_exit (exit, false, true);

  i = nbbs;
  do
    {
      --i;
      delete_basic_block (bbs[i]);
    }
  while (i != 0);

  free (bbs);

  set_immediate_dominator (CDI_DOMINATORS, dest,
                           recompute_dominator (CDI_DOMINATORS, dest));
}
13241debfc3dSmrg
13251debfc3dSmrg /* Generates code for PARTITION. Return whether LOOP needs to be destroyed. */
13261debfc3dSmrg
13271debfc3dSmrg static bool
13288feb0f0bSmrg generate_code_for_partition (class loop *loop,
13291debfc3dSmrg partition *partition, bool copy_p)
13301debfc3dSmrg {
13311debfc3dSmrg switch (partition->kind)
13321debfc3dSmrg {
13331debfc3dSmrg case PKIND_NORMAL:
1334a2dc1f3fSmrg case PKIND_PARTIAL_MEMSET:
13351debfc3dSmrg /* Reductions all have to be in the last partition. */
13361debfc3dSmrg gcc_assert (!partition_reduction_p (partition)
13371debfc3dSmrg || !copy_p);
13381debfc3dSmrg generate_loops_for_partition (loop, partition, copy_p);
13391debfc3dSmrg return false;
13401debfc3dSmrg
13411debfc3dSmrg case PKIND_MEMSET:
13421debfc3dSmrg generate_memset_builtin (loop, partition);
13431debfc3dSmrg break;
13441debfc3dSmrg
13451debfc3dSmrg case PKIND_MEMCPY:
13461debfc3dSmrg case PKIND_MEMMOVE:
13471debfc3dSmrg generate_memcpy_builtin (loop, partition);
13481debfc3dSmrg break;
13491debfc3dSmrg
13501debfc3dSmrg default:
13511debfc3dSmrg gcc_unreachable ();
13521debfc3dSmrg }
13531debfc3dSmrg
13541debfc3dSmrg /* Common tail for partitions we turn into a call. If this was the last
13551debfc3dSmrg partition for which we generate code, we have to destroy the loop. */
13561debfc3dSmrg if (!copy_p)
13571debfc3dSmrg return true;
13581debfc3dSmrg return false;
13591debfc3dSmrg }
13601debfc3dSmrg
13618feb0f0bSmrg data_dependence_relation *
13628feb0f0bSmrg loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a,
13638feb0f0bSmrg data_reference_p b)
1364a2dc1f3fSmrg {
1365a2dc1f3fSmrg struct data_dependence_relation ent, **slot;
1366a2dc1f3fSmrg struct data_dependence_relation *ddr;
1367a2dc1f3fSmrg
1368a2dc1f3fSmrg gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1369a2dc1f3fSmrg gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1370a2dc1f3fSmrg <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1371a2dc1f3fSmrg ent.a = a;
1372a2dc1f3fSmrg ent.b = b;
1373a2dc1f3fSmrg slot = ddrs_table->find_slot (&ent, INSERT);
1374a2dc1f3fSmrg if (*slot == NULL)
1375a2dc1f3fSmrg {
1376a2dc1f3fSmrg ddr = initialize_data_dependence_relation (a, b, loop_nest);
1377a2dc1f3fSmrg compute_affine_dependence (ddr, loop_nest[0]);
1378a2dc1f3fSmrg *slot = ddr;
1379a2dc1f3fSmrg }
1380a2dc1f3fSmrg
1381a2dc1f3fSmrg return *slot;
1382a2dc1f3fSmrg }
1383a2dc1f3fSmrg
13848feb0f0bSmrg bool
13858feb0f0bSmrg loop_distribution::data_dep_in_cycle_p (struct graph *rdg,
13868feb0f0bSmrg data_reference_p dr1,
13878feb0f0bSmrg data_reference_p dr2)
1388a2dc1f3fSmrg {
1389a2dc1f3fSmrg struct data_dependence_relation *ddr;
1390a2dc1f3fSmrg
1391a2dc1f3fSmrg /* Re-shuffle data-refs to be in topological order. */
1392a2dc1f3fSmrg if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1393a2dc1f3fSmrg > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1394a2dc1f3fSmrg std::swap (dr1, dr2);
1395a2dc1f3fSmrg
1396a2dc1f3fSmrg ddr = get_data_dependence (rdg, dr1, dr2);
1397a2dc1f3fSmrg
1398a2dc1f3fSmrg /* In case of no data dependence. */
1399a2dc1f3fSmrg if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1400a2dc1f3fSmrg return false;
1401a2dc1f3fSmrg   /* For unknown data dependence or known data dependence which can't be
1402a2dc1f3fSmrg      expressed in classic distance vector, we check whether it can be
1403a2dc1f3fSmrg      resolved by runtime alias check.  If so, we still consider the data
1404a2dc1f3fSmrg      dependence as one that won't introduce a data dependence cycle.  */
1405a2dc1f3fSmrg else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1406a2dc1f3fSmrg || DDR_NUM_DIST_VECTS (ddr) == 0)
1407a2dc1f3fSmrg return !runtime_alias_check_p (ddr, NULL, true);
1408a2dc1f3fSmrg else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1409a2dc1f3fSmrg return true;
1410a2dc1f3fSmrg else if (DDR_REVERSED_P (ddr)
1411a2dc1f3fSmrg || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1412a2dc1f3fSmrg return false;
1413a2dc1f3fSmrg
1414a2dc1f3fSmrg return true;
1415a2dc1f3fSmrg }
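
/* Two illustrative cases for the checks above (sketches):

     for (i = 1; i < n; i++)
       a[i] = a[i-1] + 1;

   carries a flow dependence with distance 1, which forms a cycle and
   forces the containing partition to PTYPE_SEQUENTIAL, whereas

     for (i = 0; i < n; i++)
       a[i] = a[i] * 2;

   has a zero distance vector, so the dependence does not prevent parallel
   execution.  */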
1416a2dc1f3fSmrg
14178feb0f0bSmrg void
14188feb0f0bSmrg loop_distribution::update_type_for_merge (struct graph *rdg,
14198feb0f0bSmrg partition *partition1,
14208feb0f0bSmrg partition *partition2)
1421a2dc1f3fSmrg {
1422a2dc1f3fSmrg unsigned i, j;
1423a2dc1f3fSmrg bitmap_iterator bi, bj;
1424a2dc1f3fSmrg data_reference_p dr1, dr2;
1425a2dc1f3fSmrg
1426a2dc1f3fSmrg EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1427a2dc1f3fSmrg {
1428a2dc1f3fSmrg unsigned start = (partition1 == partition2) ? i + 1 : 0;
1429a2dc1f3fSmrg
1430a2dc1f3fSmrg dr1 = datarefs_vec[i];
1431a2dc1f3fSmrg EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1432a2dc1f3fSmrg {
1433a2dc1f3fSmrg dr2 = datarefs_vec[j];
1434a2dc1f3fSmrg if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1435a2dc1f3fSmrg continue;
1436a2dc1f3fSmrg
1437a2dc1f3fSmrg /* Partition can only be executed sequentially if there is any
1438a2dc1f3fSmrg data dependence cycle. */
1439a2dc1f3fSmrg if (data_dep_in_cycle_p (rdg, dr1, dr2))
1440a2dc1f3fSmrg {
1441a2dc1f3fSmrg partition1->type = PTYPE_SEQUENTIAL;
1442a2dc1f3fSmrg return;
1443a2dc1f3fSmrg }
1444a2dc1f3fSmrg }
1445a2dc1f3fSmrg }
1446a2dc1f3fSmrg }
14471debfc3dSmrg
14488feb0f0bSmrg partition *
14498feb0f0bSmrg loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
14501debfc3dSmrg {
1451a2dc1f3fSmrg partition *partition = partition_alloc ();
14521debfc3dSmrg auto_vec<int, 3> nodes;
1453a2dc1f3fSmrg unsigned i, j;
14541debfc3dSmrg int x;
1455a2dc1f3fSmrg data_reference_p dr;
14561debfc3dSmrg
14571debfc3dSmrg graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
14581debfc3dSmrg
14591debfc3dSmrg FOR_EACH_VEC_ELT (nodes, i, x)
14601debfc3dSmrg {
14611debfc3dSmrg bitmap_set_bit (partition->stmts, x);
1462a2dc1f3fSmrg
1463a2dc1f3fSmrg for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1464a2dc1f3fSmrg {
1465a2dc1f3fSmrg unsigned idx = (unsigned) DR_INDEX (dr);
1466a2dc1f3fSmrg gcc_assert (idx < datarefs_vec.length ());
1467a2dc1f3fSmrg
1468a2dc1f3fSmrg /* Partition can only be executed sequentially if there is any
1469a2dc1f3fSmrg unknown data reference. */
1470a2dc1f3fSmrg if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1471a2dc1f3fSmrg || !DR_INIT (dr) || !DR_STEP (dr))
1472a2dc1f3fSmrg partition->type = PTYPE_SEQUENTIAL;
1473a2dc1f3fSmrg
1474a2dc1f3fSmrg bitmap_set_bit (partition->datarefs, idx);
14751debfc3dSmrg }
1476a2dc1f3fSmrg }
1477a2dc1f3fSmrg
1478a2dc1f3fSmrg if (partition->type == PTYPE_SEQUENTIAL)
1479a2dc1f3fSmrg return partition;
1480a2dc1f3fSmrg
1481a2dc1f3fSmrg /* Further check if any data dependence prevents us from executing the
1482a2dc1f3fSmrg      partition in parallel.  */
1483a2dc1f3fSmrg update_type_for_merge (rdg, partition, partition);
14841debfc3dSmrg
14851debfc3dSmrg return partition;
14861debfc3dSmrg }
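
/* For example (a sketch): seeding the partition from the store a[i] = t in

     t = b[i] + c[i];
     a[i] = t;

   the DFS over the RDG pulls the computation of T into the same partition,
   since emitting the store alone would leave T undefined.  */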
14871debfc3dSmrg
1488a2dc1f3fSmrg /* Given PARTITION of LOOP and RDG, record single load/store data references
1489a2dc1f3fSmrg    for builtin partition in SRC_DR/DST_DR, return false if there are no such
1490a2dc1f3fSmrg data references. */
14911debfc3dSmrg
1492a2dc1f3fSmrg static bool
14938feb0f0bSmrg find_single_drs (class loop *loop, struct graph *rdg, partition *partition,
1494a2dc1f3fSmrg data_reference_p *dst_dr, data_reference_p *src_dr)
14951debfc3dSmrg {
14961debfc3dSmrg unsigned i;
1497a2dc1f3fSmrg data_reference_p single_ld = NULL, single_st = NULL;
1498a2dc1f3fSmrg bitmap_iterator bi;
14991debfc3dSmrg
15001debfc3dSmrg EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
15011debfc3dSmrg {
15021debfc3dSmrg gimple *stmt = RDG_STMT (rdg, i);
15031debfc3dSmrg data_reference_p dr;
15041debfc3dSmrg
15051debfc3dSmrg if (gimple_code (stmt) == GIMPLE_PHI)
15061debfc3dSmrg continue;
15071debfc3dSmrg
15081debfc3dSmrg /* Any scalar stmts are ok. */
15091debfc3dSmrg if (!gimple_vuse (stmt))
15101debfc3dSmrg continue;
15111debfc3dSmrg
15121debfc3dSmrg /* Otherwise just regular loads/stores. */
15131debfc3dSmrg if (!gimple_assign_single_p (stmt))
1514a2dc1f3fSmrg return false;
15151debfc3dSmrg
15161debfc3dSmrg /* But exactly one store and/or load. */
1517a2dc1f3fSmrg for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
15181debfc3dSmrg {
15191debfc3dSmrg tree type = TREE_TYPE (DR_REF (dr));
15201debfc3dSmrg
15211debfc3dSmrg /* The memset, memcpy and memmove library calls are only
15221debfc3dSmrg able to deal with generic address space. */
15231debfc3dSmrg if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1524a2dc1f3fSmrg return false;
15251debfc3dSmrg
15261debfc3dSmrg if (DR_IS_READ (dr))
15271debfc3dSmrg {
1528a2dc1f3fSmrg if (single_ld != NULL)
1529a2dc1f3fSmrg return false;
1530a2dc1f3fSmrg single_ld = dr;
15311debfc3dSmrg }
15321debfc3dSmrg else
15331debfc3dSmrg {
1534a2dc1f3fSmrg if (single_st != NULL)
1535a2dc1f3fSmrg return false;
1536a2dc1f3fSmrg single_st = dr;
15371debfc3dSmrg }
15381debfc3dSmrg }
15391debfc3dSmrg }
15401debfc3dSmrg
1541a2dc1f3fSmrg if (!single_st)
1542a2dc1f3fSmrg return false;
15431debfc3dSmrg
1544a2dc1f3fSmrg /* Bail out if this is a bitfield memory reference. */
1545a2dc1f3fSmrg if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1546a2dc1f3fSmrg && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1547a2dc1f3fSmrg return false;
15481debfc3dSmrg
1549a2dc1f3fSmrg /* Data reference must be executed exactly once per iteration of each
1550a2dc1f3fSmrg loop in the loop nest. We only need to check dominance information
1551a2dc1f3fSmrg against the outermost one in a perfect loop nest because a bb can't
1552a2dc1f3fSmrg dominate outermost loop's latch without dominating inner loop's. */
1553a2dc1f3fSmrg basic_block bb_st = gimple_bb (DR_STMT (single_st));
1554a2dc1f3fSmrg if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1555a2dc1f3fSmrg return false;
1556a2dc1f3fSmrg
1557a2dc1f3fSmrg if (single_ld)
15581debfc3dSmrg {
1559a2dc1f3fSmrg gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1560a2dc1f3fSmrg /* Direct aggregate copy or via an SSA name temporary. */
1561a2dc1f3fSmrg if (load != store
1562a2dc1f3fSmrg && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1563a2dc1f3fSmrg return false;
1564a2dc1f3fSmrg
1565a2dc1f3fSmrg /* Bail out if this is a bitfield memory reference. */
1566a2dc1f3fSmrg if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1567a2dc1f3fSmrg && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1568a2dc1f3fSmrg return false;
1569a2dc1f3fSmrg
1570a2dc1f3fSmrg /* Load and store must be in the same loop nest. */
1571a2dc1f3fSmrg basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1572a2dc1f3fSmrg if (bb_st->loop_father != bb_ld->loop_father)
1573a2dc1f3fSmrg return false;
1574a2dc1f3fSmrg
1575a2dc1f3fSmrg /* Data reference must be executed exactly once per iteration.
1576a2dc1f3fSmrg Same as single_st, we only need to check against the outermost
1577a2dc1f3fSmrg loop. */
1578a2dc1f3fSmrg if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1579a2dc1f3fSmrg return false;
1580a2dc1f3fSmrg
1581a2dc1f3fSmrg edge e = single_exit (bb_st->loop_father);
1582a2dc1f3fSmrg bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1583a2dc1f3fSmrg bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1584a2dc1f3fSmrg if (dom_ld != dom_st)
1585a2dc1f3fSmrg return false;
1586a2dc1f3fSmrg }
1587a2dc1f3fSmrg
1588a2dc1f3fSmrg *src_dr = single_ld;
1589a2dc1f3fSmrg *dst_dr = single_st;
1590a2dc1f3fSmrg return true;
1591a2dc1f3fSmrg }
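
/* For instance (a sketch): a partition whose only memory access is the
   store in

     for (i = 0; i < n; i++)
       a[i] = 0;

   yields only *DST_DR and is a memset candidate, while

     for (i = 0; i < n; i++)
       a[i] = b[i];

   yields both *DST_DR and *SRC_DR and is a memcpy/memmove candidate.
   Partitions with several stores, bitfield references, or accesses not
   executed exactly once per iteration are rejected above.  */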
1592a2dc1f3fSmrg
1593a2dc1f3fSmrg /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1594a2dc1f3fSmrg    loops from inner to outer to see if the loop's step equals the access
1595a2dc1f3fSmrg    size at each loop level.  Return 2 if we can prove this at all loop
1596a2dc1f3fSmrg    levels; record access base and size in BASE and SIZE; save the loop's
1597a2dc1f3fSmrg    step at each loop level in STEPS if it is not null.  For example:
1598a2dc1f3fSmrg
1599a2dc1f3fSmrg int arr[100][100][100];
1600a2dc1f3fSmrg for (i = 0; i < 100; i++) ;steps[2] = 40000
1601a2dc1f3fSmrg for (j = 100; j > 0; j--) ;steps[1] = -400
1602a2dc1f3fSmrg for (k = 0; k < 100; k++) ;steps[0] = 4
1603a2dc1f3fSmrg arr[i][j - 1][k] = 0; ;base = &arr, size = 4000000
1604a2dc1f3fSmrg
1605a2dc1f3fSmrg    Return 1 if we can prove the equality at the innermost loop, but not at
1606a2dc1f3fSmrg    all loop levels.  In this case, no information is recorded.
1607a2dc1f3fSmrg
1608a2dc1f3fSmrg    Return 0 if the equality cannot be proven at any loop level.  */
1609a2dc1f3fSmrg
1610a2dc1f3fSmrg static int
1611a2dc1f3fSmrg compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1612a2dc1f3fSmrg tree *size, vec<tree> *steps = NULL)
1613a2dc1f3fSmrg {
1614a2dc1f3fSmrg location_t loc = gimple_location (DR_STMT (dr));
1615a2dc1f3fSmrg basic_block bb = gimple_bb (DR_STMT (dr));
16168feb0f0bSmrg class loop *loop = bb->loop_father;
1617a2dc1f3fSmrg tree ref = DR_REF (dr);
1618a2dc1f3fSmrg tree access_base = build_fold_addr_expr (ref);
1619a2dc1f3fSmrg tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1620a2dc1f3fSmrg int res = 0;
1621a2dc1f3fSmrg
1622a2dc1f3fSmrg do {
1623a2dc1f3fSmrg tree scev_fn = analyze_scalar_evolution (loop, access_base);
1624a2dc1f3fSmrg if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1625a2dc1f3fSmrg return res;
1626a2dc1f3fSmrg
1627a2dc1f3fSmrg access_base = CHREC_LEFT (scev_fn);
1628a2dc1f3fSmrg if (tree_contains_chrecs (access_base, NULL))
1629a2dc1f3fSmrg return res;
1630a2dc1f3fSmrg
1631a2dc1f3fSmrg tree scev_step = CHREC_RIGHT (scev_fn);
1632a2dc1f3fSmrg /* Only support constant steps. */
1633a2dc1f3fSmrg if (TREE_CODE (scev_step) != INTEGER_CST)
1634a2dc1f3fSmrg return res;
1635a2dc1f3fSmrg
1636a2dc1f3fSmrg enum ev_direction access_dir = scev_direction (scev_fn);
1637a2dc1f3fSmrg if (access_dir == EV_DIR_UNKNOWN)
1638a2dc1f3fSmrg return res;
1639a2dc1f3fSmrg
1640a2dc1f3fSmrg if (steps != NULL)
1641a2dc1f3fSmrg steps->safe_push (scev_step);
1642a2dc1f3fSmrg
1643a2dc1f3fSmrg scev_step = fold_convert_loc (loc, sizetype, scev_step);
1644a2dc1f3fSmrg /* Compute absolute value of scev step. */
1645a2dc1f3fSmrg if (access_dir == EV_DIR_DECREASES)
1646a2dc1f3fSmrg scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1647a2dc1f3fSmrg
1648a2dc1f3fSmrg      /* At each loop level, the scev step must equal the access size.  In other
1649a2dc1f3fSmrg words, DR must access consecutive memory between loop iterations. */
1650a2dc1f3fSmrg if (!operand_equal_p (scev_step, access_size, 0))
1651a2dc1f3fSmrg return res;
1652a2dc1f3fSmrg
1653a2dc1f3fSmrg      /* Access stride can be computed for the data reference at least for the
1654a2dc1f3fSmrg innermost loop. */
1655a2dc1f3fSmrg res = 1;
1656a2dc1f3fSmrg
1657a2dc1f3fSmrg      /* Compute the number of times DR is executed in the loop.  */
1658a2dc1f3fSmrg tree niters = number_of_latch_executions (loop);
1659a2dc1f3fSmrg niters = fold_convert_loc (loc, sizetype, niters);
1660a2dc1f3fSmrg if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1661a2dc1f3fSmrg niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1662a2dc1f3fSmrg
1663a2dc1f3fSmrg /* Compute DR's overall access size in loop. */
1664a2dc1f3fSmrg access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1665a2dc1f3fSmrg niters, scev_step);
1666a2dc1f3fSmrg /* Adjust base address in case of negative step. */
1667a2dc1f3fSmrg if (access_dir == EV_DIR_DECREASES)
1668a2dc1f3fSmrg {
1669a2dc1f3fSmrg tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1670a2dc1f3fSmrg scev_step, access_size);
1671a2dc1f3fSmrg access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1672a2dc1f3fSmrg }
1673a2dc1f3fSmrg } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1674a2dc1f3fSmrg
1675a2dc1f3fSmrg *base = access_base;
1676a2dc1f3fSmrg *size = access_size;
1677a2dc1f3fSmrg   /* Access stride can be computed for the data reference at each loop level.  */
1678a2dc1f3fSmrg return 2;
1679a2dc1f3fSmrg }
1680a2dc1f3fSmrg
1681a2dc1f3fSmrg /* Allocate and return builtin struct. Record information like DST_DR,
1682a2dc1f3fSmrg SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct. */
1683a2dc1f3fSmrg
1684a2dc1f3fSmrg static struct builtin_info *
1685a2dc1f3fSmrg alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1686a2dc1f3fSmrg tree dst_base, tree src_base, tree size)
1687a2dc1f3fSmrg {
1688a2dc1f3fSmrg struct builtin_info *builtin = XNEW (struct builtin_info);
1689a2dc1f3fSmrg builtin->dst_dr = dst_dr;
1690a2dc1f3fSmrg builtin->src_dr = src_dr;
1691a2dc1f3fSmrg builtin->dst_base = dst_base;
1692a2dc1f3fSmrg builtin->src_base = src_base;
1693a2dc1f3fSmrg builtin->size = size;
1694a2dc1f3fSmrg return builtin;
1695a2dc1f3fSmrg }
1696a2dc1f3fSmrg
1697a2dc1f3fSmrg /* Given data reference DR in loop nest LOOP, classify if it forms builtin
1698a2dc1f3fSmrg memset call. */
1699a2dc1f3fSmrg
1700a2dc1f3fSmrg static void
1701a2dc1f3fSmrg classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1702a2dc1f3fSmrg {
1703a2dc1f3fSmrg gimple *stmt = DR_STMT (dr);
1704a2dc1f3fSmrg tree base, size, rhs = gimple_assign_rhs1 (stmt);
1705a2dc1f3fSmrg
17061debfc3dSmrg if (const_with_all_bytes_same (rhs) == -1
17071debfc3dSmrg && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
17081debfc3dSmrg || (TYPE_MODE (TREE_TYPE (rhs))
17091debfc3dSmrg != TYPE_MODE (unsigned_char_type_node))))
17101debfc3dSmrg return;
1711a2dc1f3fSmrg
17121debfc3dSmrg if (TREE_CODE (rhs) == SSA_NAME
17131debfc3dSmrg && !SSA_NAME_IS_DEFAULT_DEF (rhs)
17141debfc3dSmrg && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
17151debfc3dSmrg return;
1716a2dc1f3fSmrg
1717a2dc1f3fSmrg int res = compute_access_range (loop, dr, &base, &size);
1718a2dc1f3fSmrg if (res == 0)
17191debfc3dSmrg return;
1720a2dc1f3fSmrg if (res == 1)
1721a2dc1f3fSmrg {
1722a2dc1f3fSmrg partition->kind = PKIND_PARTIAL_MEMSET;
1723a2dc1f3fSmrg return;
1724a2dc1f3fSmrg }
1725a2dc1f3fSmrg
1726a2dc1f3fSmrg poly_uint64 base_offset;
1727a2dc1f3fSmrg unsigned HOST_WIDE_INT const_base_offset;
1728a2dc1f3fSmrg tree base_base = strip_offset (base, &base_offset);
1729a2dc1f3fSmrg if (!base_offset.is_constant (&const_base_offset))
1730a2dc1f3fSmrg return;
1731a2dc1f3fSmrg
1732a2dc1f3fSmrg struct builtin_info *builtin;
1733a2dc1f3fSmrg builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1734a2dc1f3fSmrg builtin->dst_base_base = base_base;
1735a2dc1f3fSmrg builtin->dst_base_offset = const_base_offset;
1736a2dc1f3fSmrg partition->builtin = builtin;
17371debfc3dSmrg partition->kind = PKIND_MEMSET;
17381debfc3dSmrg }
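
/* Examples of stores the checks above accept (sketches): in

     for (i = 0; i < n; i++)
       a[i] = 0;

   the constant 0 trivially has all bytes equal, and in

     for (i = 0; i < n; i++)
       p[i] = c;		;; with char *p and C invariant in the loop

   the stored value has the mode of unsigned char; both can therefore be
   implemented as memset provided compute_access_range proves consecutive
   accesses at every loop level.  */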
1739a2dc1f3fSmrg
1740a2dc1f3fSmrg /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1741a2dc1f3fSmrg if it forms builtin memcpy or memmove call. */
1742a2dc1f3fSmrg
17438feb0f0bSmrg void
17448feb0f0bSmrg loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg,
17458feb0f0bSmrg partition *partition,
17468feb0f0bSmrg data_reference_p dst_dr,
17478feb0f0bSmrg data_reference_p src_dr)
17481debfc3dSmrg {
1749a2dc1f3fSmrg tree base, size, src_base, src_size;
1750a2dc1f3fSmrg auto_vec<tree> dst_steps, src_steps;
1751a2dc1f3fSmrg
1752a2dc1f3fSmrg /* Compute access range of both load and store. */
1753a2dc1f3fSmrg int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1754a2dc1f3fSmrg if (res != 2)
17551debfc3dSmrg return;
1756a2dc1f3fSmrg res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1757a2dc1f3fSmrg if (res != 2)
17581debfc3dSmrg return;
1759a2dc1f3fSmrg
1760a2dc1f3fSmrg   /* They must have the same access size.  */
1761a2dc1f3fSmrg if (!operand_equal_p (size, src_size, 0))
1762a2dc1f3fSmrg return;
1763a2dc1f3fSmrg
1764a2dc1f3fSmrg   /* Load and store in loop nest must access memory in the same way, i.e.,
1765a2dc1f3fSmrg      they must have the same steps in each loop of the nest.  */
1766a2dc1f3fSmrg if (dst_steps.length () != src_steps.length ())
1767a2dc1f3fSmrg return;
1768a2dc1f3fSmrg for (unsigned i = 0; i < dst_steps.length (); ++i)
1769a2dc1f3fSmrg if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1770a2dc1f3fSmrg return;
1771a2dc1f3fSmrg
1772a2dc1f3fSmrg   /* Now check whether there is a dependence.  */
1773a2dc1f3fSmrg ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1774a2dc1f3fSmrg
17758feb0f0bSmrg /* Classify as memmove if no dependence between load and store. */
1776a2dc1f3fSmrg if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
17771debfc3dSmrg {
1778a2dc1f3fSmrg partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
17798feb0f0bSmrg partition->kind = PKIND_MEMMOVE;
17801debfc3dSmrg return;
17811debfc3dSmrg }
1782a2dc1f3fSmrg
1783a2dc1f3fSmrg /* Can't do memmove in case of unknown dependence or dependence without
1784a2dc1f3fSmrg classical distance vector. */
1785a2dc1f3fSmrg if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1786a2dc1f3fSmrg || DDR_NUM_DIST_VECTS (ddr) == 0)
17871debfc3dSmrg return;
1788a2dc1f3fSmrg
1789a2dc1f3fSmrg unsigned i;
17901debfc3dSmrg lambda_vector dist_v;
1791a2dc1f3fSmrg int num_lev = (DDR_LOOP_NEST (ddr)).length ();
17921debfc3dSmrg FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
17931debfc3dSmrg {
1794a2dc1f3fSmrg unsigned dep_lev = dependence_level (dist_v, num_lev);
1795a2dc1f3fSmrg /* Can't do memmove if load depends on store. */
1796a2dc1f3fSmrg if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
17971debfc3dSmrg return;
17981debfc3dSmrg }
1799a2dc1f3fSmrg
1800a2dc1f3fSmrg partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
18011debfc3dSmrg partition->kind = PKIND_MEMMOVE;
1802a2dc1f3fSmrg return;
18031debfc3dSmrg }
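
/* A sketch of the last rejection above: in

     for (i = 0; i < n; i++)
       a[i+1] = a[i];

   each load reads the value stored by the previous iteration, so the loop
   replicates a[0] over the whole region; memmove (a + 1, a, ...) would
   copy the original contents instead, hence the positive-distance
   dependence of the load on the store disqualifies the partition.  By
   contrast

     for (i = 0; i < n; i++)
       a[i] = a[i+1];

   only reads ahead of the store and maps to memmove safely.  */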
18041debfc3dSmrg
18058feb0f0bSmrg bool
18068feb0f0bSmrg loop_distribution::classify_partition (loop_p loop,
18078feb0f0bSmrg struct graph *rdg, partition *partition,
1808a2dc1f3fSmrg bitmap stmt_in_all_partitions)
18091debfc3dSmrg {
1810a2dc1f3fSmrg bitmap_iterator bi;
1811a2dc1f3fSmrg unsigned i;
1812a2dc1f3fSmrg data_reference_p single_ld = NULL, single_st = NULL;
1813a2dc1f3fSmrg bool volatiles_p = false, has_reduction = false;
18141debfc3dSmrg
1815a2dc1f3fSmrg EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1816a2dc1f3fSmrg {
1817a2dc1f3fSmrg gimple *stmt = RDG_STMT (rdg, i);
1818a2dc1f3fSmrg
1819a2dc1f3fSmrg if (gimple_has_volatile_ops (stmt))
1820a2dc1f3fSmrg volatiles_p = true;
1821a2dc1f3fSmrg
1822a2dc1f3fSmrg       /* If the stmt is not included by all partitions and there are uses
1823a2dc1f3fSmrg outside of the loop, then mark the partition as reduction. */
1824a2dc1f3fSmrg if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1825a2dc1f3fSmrg {
1826a2dc1f3fSmrg	  /* Due to a limitation in the transform phase we have to fuse all
1827a2dc1f3fSmrg	     reduction partitions.  As a result, this could cancel valid
1828a2dc1f3fSmrg	     loop distribution, especially for loops whose induction variable
1829a2dc1f3fSmrg	     is used outside of the loop.  To work around this issue, we skip
1830a2dc1f3fSmrg	     marking the partition as reduction if the reduction stmt belongs
1831a2dc1f3fSmrg	     to all partitions.  In that case, the reduction will be computed
1832a2dc1f3fSmrg	     correctly no matter how partitions are fused/distributed.  */
1833a2dc1f3fSmrg if (!bitmap_bit_p (stmt_in_all_partitions, i))
1834a2dc1f3fSmrg partition->reduction_p = true;
18358feb0f0bSmrg else
1836a2dc1f3fSmrg has_reduction = true;
1837a2dc1f3fSmrg }
18381debfc3dSmrg }
18391debfc3dSmrg
18408feb0f0bSmrg /* Simple workaround to prevent classifying the partition as builtin
18418feb0f0bSmrg      if it contains any use outside of the loop.  For the case where all
18428feb0f0bSmrg partitions have the reduction this simple workaround is delayed
18438feb0f0bSmrg to only affect the last partition. */
18448feb0f0bSmrg if (partition->reduction_p)
18458feb0f0bSmrg return has_reduction;
18468feb0f0bSmrg
1847a2dc1f3fSmrg /* Perform general partition disqualification for builtins. */
1848a2dc1f3fSmrg if (volatiles_p
1849a2dc1f3fSmrg || !flag_tree_loop_distribute_patterns)
18508feb0f0bSmrg return has_reduction;
1851a2dc1f3fSmrg
1852a2dc1f3fSmrg /* Find single load/store data references for builtin partition. */
1853a2dc1f3fSmrg if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
18548feb0f0bSmrg return has_reduction;
18558feb0f0bSmrg
18568feb0f0bSmrg partition->loc = gimple_location (DR_STMT (single_st));
1857a2dc1f3fSmrg
1858a2dc1f3fSmrg /* Classify the builtin kind. */
1859a2dc1f3fSmrg if (single_ld == NULL)
1860a2dc1f3fSmrg classify_builtin_st (loop, partition, single_st);
1861a2dc1f3fSmrg else
1862a2dc1f3fSmrg classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
18638feb0f0bSmrg return has_reduction;
1864a2dc1f3fSmrg }
1865a2dc1f3fSmrg
18668feb0f0bSmrg bool
18678feb0f0bSmrg loop_distribution::share_memory_accesses (struct graph *rdg,
1868a2dc1f3fSmrg partition *partition1, partition *partition2)
18691debfc3dSmrg {
1870a2dc1f3fSmrg unsigned i, j;
18711debfc3dSmrg bitmap_iterator bi, bj;
1872a2dc1f3fSmrg data_reference_p dr1, dr2;
18731debfc3dSmrg
18741debfc3dSmrg /* First check whether in the intersection of the two partitions are
18751debfc3dSmrg any loads or stores. Common loads are the situation that happens
18761debfc3dSmrg most often. */
18771debfc3dSmrg EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
18781debfc3dSmrg if (RDG_MEM_WRITE_STMT (rdg, i)
18791debfc3dSmrg || RDG_MEM_READS_STMT (rdg, i))
18801debfc3dSmrg return true;
18811debfc3dSmrg
1882a2dc1f3fSmrg /* Then check whether the two partitions access the same memory object. */
1883a2dc1f3fSmrg EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
18841debfc3dSmrg {
1885a2dc1f3fSmrg dr1 = datarefs_vec[i];
1886a2dc1f3fSmrg
1887a2dc1f3fSmrg if (!DR_BASE_ADDRESS (dr1)
1888a2dc1f3fSmrg || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1889a2dc1f3fSmrg continue;
1890a2dc1f3fSmrg
1891a2dc1f3fSmrg EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
18921debfc3dSmrg {
1893a2dc1f3fSmrg dr2 = datarefs_vec[j];
1894a2dc1f3fSmrg
1895a2dc1f3fSmrg if (!DR_BASE_ADDRESS (dr2)
1896a2dc1f3fSmrg || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1897a2dc1f3fSmrg continue;
1898a2dc1f3fSmrg
1899a2dc1f3fSmrg if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1900a2dc1f3fSmrg && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1901a2dc1f3fSmrg && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1902a2dc1f3fSmrg && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
19031debfc3dSmrg return true;
19041debfc3dSmrg }
19051debfc3dSmrg }
19061debfc3dSmrg
19071debfc3dSmrg return false;
19081debfc3dSmrg }
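
/* For example (a sketch): partitions seeded from the two stores in

     for (i = 0; i < n; i++)
       {
	 a[i] = b[i];
	 c[i] = a[i] + 1;
       }

   share the memory reference a[i], so fusing them avoids streaming the
   array A through the cache twice.  */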
19091debfc3dSmrg
1910a2dc1f3fSmrg /* For each seed statement in STARTING_STMTS, this function builds a
1911a2dc1f3fSmrg    partition for it by adding dependent statements according to RDG.
1912a2dc1f3fSmrg All partitions are recorded in PARTITIONS. */
19131debfc3dSmrg
19148feb0f0bSmrg void
19158feb0f0bSmrg loop_distribution::rdg_build_partitions (struct graph *rdg,
19161debfc3dSmrg vec<gimple *> starting_stmts,
19171debfc3dSmrg vec<partition *> *partitions)
19181debfc3dSmrg {
1919a2dc1f3fSmrg auto_bitmap processed;
19201debfc3dSmrg int i;
19211debfc3dSmrg gimple *stmt;
19221debfc3dSmrg
19231debfc3dSmrg FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
19241debfc3dSmrg {
19251debfc3dSmrg int v = rdg_vertex_for_stmt (rdg, stmt);
19261debfc3dSmrg
19271debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
19281debfc3dSmrg fprintf (dump_file,
19291debfc3dSmrg "ldist asked to generate code for vertex %d\n", v);
19301debfc3dSmrg
19311debfc3dSmrg       /* If the vertex is already contained in another partition, so
19321debfc3dSmrg is the partition rooted at it. */
19331debfc3dSmrg if (bitmap_bit_p (processed, v))
19341debfc3dSmrg continue;
19351debfc3dSmrg
19361debfc3dSmrg partition *partition = build_rdg_partition_for_vertex (rdg, v);
19371debfc3dSmrg bitmap_ior_into (processed, partition->stmts);
19381debfc3dSmrg
19391debfc3dSmrg if (dump_file && (dump_flags & TDF_DETAILS))
19401debfc3dSmrg {
1941a2dc1f3fSmrg fprintf (dump_file, "ldist creates useful %s partition:\n",
1942a2dc1f3fSmrg partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1943a2dc1f3fSmrg bitmap_print (dump_file, partition->stmts, " ", "\n");
19441debfc3dSmrg }
19451debfc3dSmrg
19461debfc3dSmrg partitions->safe_push (partition);
19471debfc3dSmrg }
19481debfc3dSmrg
19491debfc3dSmrg /* All vertices should have been assigned to at least one partition now,
19501debfc3dSmrg other than vertices belonging to dead code. */
19511debfc3dSmrg }
19521debfc3dSmrg
19531debfc3dSmrg /* Dump to FILE the PARTITIONS. */
19541debfc3dSmrg
19551debfc3dSmrg static void
19561debfc3dSmrg dump_rdg_partitions (FILE *file, vec<partition *> partitions)
19571debfc3dSmrg {
19581debfc3dSmrg int i;
19591debfc3dSmrg partition *partition;
19601debfc3dSmrg
19611debfc3dSmrg FOR_EACH_VEC_ELT (partitions, i, partition)
19621debfc3dSmrg debug_bitmap_file (file, partition->stmts);
19631debfc3dSmrg }
19641debfc3dSmrg
19651debfc3dSmrg /* Debug PARTITIONS. */
19661debfc3dSmrg extern void debug_rdg_partitions (vec<partition *> );
19671debfc3dSmrg
19681debfc3dSmrg DEBUG_FUNCTION void
19691debfc3dSmrg debug_rdg_partitions (vec<partition *> partitions)
19701debfc3dSmrg {
19711debfc3dSmrg dump_rdg_partitions (stderr, partitions);
19721debfc3dSmrg }
19731debfc3dSmrg
19741debfc3dSmrg /* Returns the number of read and write operations in the RDG. */
19751debfc3dSmrg
19761debfc3dSmrg static int
19771debfc3dSmrg number_of_rw_in_rdg (struct graph *rdg)
19781debfc3dSmrg {
19791debfc3dSmrg int i, res = 0;
19801debfc3dSmrg
19811debfc3dSmrg for (i = 0; i < rdg->n_vertices; i++)
19821debfc3dSmrg {
19831debfc3dSmrg if (RDG_MEM_WRITE_STMT (rdg, i))
19841debfc3dSmrg ++res;
19851debfc3dSmrg
19861debfc3dSmrg if (RDG_MEM_READS_STMT (rdg, i))
19871debfc3dSmrg ++res;
19881debfc3dSmrg }
19891debfc3dSmrg
19901debfc3dSmrg return res;
19911debfc3dSmrg }
19921debfc3dSmrg
19931debfc3dSmrg /* Returns the number of read and write operations in a PARTITION of
19941debfc3dSmrg the RDG. */
19951debfc3dSmrg
19961debfc3dSmrg static int
19971debfc3dSmrg number_of_rw_in_partition (struct graph *rdg, partition *partition)
19981debfc3dSmrg {
19991debfc3dSmrg int res = 0;
20001debfc3dSmrg unsigned i;
20011debfc3dSmrg bitmap_iterator ii;
20021debfc3dSmrg
20031debfc3dSmrg EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
20041debfc3dSmrg {
20051debfc3dSmrg if (RDG_MEM_WRITE_STMT (rdg, i))
20061debfc3dSmrg ++res;
20071debfc3dSmrg
20081debfc3dSmrg if (RDG_MEM_READS_STMT (rdg, i))
20091debfc3dSmrg ++res;
20101debfc3dSmrg }
20111debfc3dSmrg
20121debfc3dSmrg return res;
20131debfc3dSmrg }
20141debfc3dSmrg
20151debfc3dSmrg /* Returns true when one of the PARTITIONS contains all the read or
20161debfc3dSmrg write operations of RDG. */
20171debfc3dSmrg
20181debfc3dSmrg static bool
20191debfc3dSmrg partition_contains_all_rw (struct graph *rdg,
20201debfc3dSmrg vec<partition *> partitions)
20211debfc3dSmrg {
20221debfc3dSmrg int i;
20231debfc3dSmrg partition *partition;
20241debfc3dSmrg int nrw = number_of_rw_in_rdg (rdg);
20251debfc3dSmrg
20261debfc3dSmrg FOR_EACH_VEC_ELT (partitions, i, partition)
20271debfc3dSmrg if (nrw == number_of_rw_in_partition (rdg, partition))
20281debfc3dSmrg return true;
20291debfc3dSmrg
20301debfc3dSmrg return false;
20311debfc3dSmrg }
20321debfc3dSmrg
20338feb0f0bSmrg int
20348feb0f0bSmrg loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir,
2035a2dc1f3fSmrg bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
20361debfc3dSmrg {
2037a2dc1f3fSmrg unsigned i, j;
2038a2dc1f3fSmrg bitmap_iterator bi, bj;
2039a2dc1f3fSmrg data_reference_p dr1, dr2, saved_dr1;
20401debfc3dSmrg
20411debfc3dSmrg /* dependence direction - 0 is no dependence, -1 is back,
20421debfc3dSmrg 1 is forth, 2 is both (we can stop then, merging will occur). */
2043a2dc1f3fSmrg EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
20441debfc3dSmrg {
2045a2dc1f3fSmrg dr1 = datarefs_vec[i];
2046a2dc1f3fSmrg
2047a2dc1f3fSmrg EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
2048a2dc1f3fSmrg {
2049a2dc1f3fSmrg int res, this_dir = 1;
20501debfc3dSmrg ddr_p ddr;
2051a2dc1f3fSmrg
2052a2dc1f3fSmrg dr2 = datarefs_vec[j];
2053a2dc1f3fSmrg
2054a2dc1f3fSmrg	  /* Skip all <read, read> data dependences.  */
2055a2dc1f3fSmrg if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
2056a2dc1f3fSmrg continue;
2057a2dc1f3fSmrg
2058a2dc1f3fSmrg saved_dr1 = dr1;
2059a2dc1f3fSmrg /* Re-shuffle data-refs to be in topological order. */
20601debfc3dSmrg if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
20611debfc3dSmrg > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
20621debfc3dSmrg {
20631debfc3dSmrg std::swap (dr1, dr2);
20641debfc3dSmrg this_dir = -this_dir;
20651debfc3dSmrg }
2066a2dc1f3fSmrg ddr = get_data_dependence (rdg, dr1, dr2);
20671debfc3dSmrg if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2068a2dc1f3fSmrg {
2069a2dc1f3fSmrg this_dir = 0;
2070a2dc1f3fSmrg res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
2071a2dc1f3fSmrg DR_BASE_ADDRESS (dr2));
2072a2dc1f3fSmrg /* Be conservative. If data references are not well analyzed,
2073a2dc1f3fSmrg or the two data references have the same base address and
2074a2dc1f3fSmrg		 offset, add the dependence and consider them as aliasing.
2075a2dc1f3fSmrg In other words, the dependence cannot be resolved by
2076a2dc1f3fSmrg runtime alias check. */
2077a2dc1f3fSmrg if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
2078a2dc1f3fSmrg || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
2079a2dc1f3fSmrg || !DR_INIT (dr1) || !DR_INIT (dr2)
2080a2dc1f3fSmrg || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
2081a2dc1f3fSmrg || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
2082a2dc1f3fSmrg || res == 0)
20831debfc3dSmrg this_dir = 2;
2084a2dc1f3fSmrg /* Data dependence could be resolved by runtime alias check,
2085a2dc1f3fSmrg record it in ALIAS_DDRS. */
2086a2dc1f3fSmrg else if (alias_ddrs != NULL)
2087a2dc1f3fSmrg alias_ddrs->safe_push (ddr);
2088a2dc1f3fSmrg /* Or simply ignore it. */
2089a2dc1f3fSmrg }
20901debfc3dSmrg else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
20911debfc3dSmrg {
20921debfc3dSmrg if (DDR_REVERSED_P (ddr))
20931debfc3dSmrg this_dir = -this_dir;
2094a2dc1f3fSmrg
20951debfc3dSmrg	      /* Known dependences can still be unordered throughout the
20968feb0f0bSmrg iteration space, see gcc.dg/tree-ssa/ldist-16.c and
20978feb0f0bSmrg gcc.dg/tree-ssa/pr94969.c. */
20981debfc3dSmrg if (DDR_NUM_DIST_VECTS (ddr) != 1)
20991debfc3dSmrg this_dir = 2;
21001debfc3dSmrg /* If the overlap is exact preserve stmt order. */
2101a2dc1f3fSmrg else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
2102a2dc1f3fSmrg DDR_NB_LOOPS (ddr)))
21031debfc3dSmrg ;
2104a2dc1f3fSmrg /* Else as the distance vector is lexicographic positive swap
2105a2dc1f3fSmrg the dependence direction. */
21061debfc3dSmrg else
21071debfc3dSmrg this_dir = -this_dir;
21081debfc3dSmrg }
21091debfc3dSmrg else
21101debfc3dSmrg this_dir = 0;
21111debfc3dSmrg if (this_dir == 2)
21121debfc3dSmrg return 2;
21131debfc3dSmrg else if (dir == 0)
21141debfc3dSmrg dir = this_dir;
21151debfc3dSmrg else if (this_dir != 0 && dir != this_dir)
21161debfc3dSmrg return 2;
21171debfc3dSmrg /* Shuffle "back" dr1. */
21181debfc3dSmrg dr1 = saved_dr1;
21191debfc3dSmrg }
2120a2dc1f3fSmrg }
21211debfc3dSmrg return dir;
21221debfc3dSmrg }
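
/* A sketch of the return values: for partition1 containing a[i] = ... and
   partition2 containing ... = a[i-1], the flow dependence from the store
   to the load yields 1 (partition1 has to stay before partition2); with
   the statement order reversed it yields -1; and if different data
   reference pairs demand opposite orders, the result collapses to 2 and
   the caller has to fuse the partitions.  */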
21231debfc3dSmrg
21241debfc3dSmrg /* Compare postorder number of the partition graph vertices V1 and V2. */
21251debfc3dSmrg
21261debfc3dSmrg static int
21271debfc3dSmrg pgcmp (const void *v1_, const void *v2_)
21281debfc3dSmrg {
21291debfc3dSmrg const vertex *v1 = (const vertex *)v1_;
21301debfc3dSmrg const vertex *v2 = (const vertex *)v2_;
21311debfc3dSmrg return v2->post - v1->post;
21321debfc3dSmrg }
21331debfc3dSmrg
2134a2dc1f3fSmrg /* Data attached to vertices of partition dependence graph. */
2135a2dc1f3fSmrg struct pg_vdata
2136a2dc1f3fSmrg {
2137a2dc1f3fSmrg /* ID of the corresponding partition. */
2138a2dc1f3fSmrg int id;
2139a2dc1f3fSmrg /* The partition. */
2140a2dc1f3fSmrg struct partition *partition;
2141a2dc1f3fSmrg };
2142a2dc1f3fSmrg
2143a2dc1f3fSmrg /* Data attached to edges of partition dependence graph. */
2144a2dc1f3fSmrg struct pg_edata
2145a2dc1f3fSmrg {
2146a2dc1f3fSmrg /* If the dependence edge can be resolved by runtime alias check,
2147a2dc1f3fSmrg this vector contains data dependence relations for runtime alias
2148a2dc1f3fSmrg check. On the other hand, if the dependence edge is introduced
2149a2dc1f3fSmrg because of compilation time known data dependence, this vector
2150a2dc1f3fSmrg contains nothing. */
2151a2dc1f3fSmrg vec<ddr_p> alias_ddrs;
2152a2dc1f3fSmrg };
2153a2dc1f3fSmrg
2154a2dc1f3fSmrg /* Callback data for traversing edges in graph. */
2155a2dc1f3fSmrg struct pg_edge_callback_data
2156a2dc1f3fSmrg {
2157a2dc1f3fSmrg   /* Bitmap of strongly connected components that should be merged.  */
2158a2dc1f3fSmrg bitmap sccs_to_merge;
2159a2dc1f3fSmrg   /* Array containing component information for all vertices.  */
2160a2dc1f3fSmrg int *vertices_component;
2161a2dc1f3fSmrg /* Vector to record all data dependence relations which are needed
2162a2dc1f3fSmrg      to break strongly connected components by runtime alias checks.  */
2163a2dc1f3fSmrg vec<ddr_p> *alias_ddrs;
2164a2dc1f3fSmrg };
2165a2dc1f3fSmrg
2166a2dc1f3fSmrg /* Initialize vertices' data for partition dependence graph PG with
2167a2dc1f3fSmrg PARTITIONS. */
2168a2dc1f3fSmrg
2169a2dc1f3fSmrg static void
2170a2dc1f3fSmrg init_partition_graph_vertices (struct graph *pg,
2171a2dc1f3fSmrg vec<struct partition *> *partitions)
2172a2dc1f3fSmrg {
2173a2dc1f3fSmrg int i;
2174a2dc1f3fSmrg partition *partition;
2175a2dc1f3fSmrg struct pg_vdata *data;
2176a2dc1f3fSmrg
2177a2dc1f3fSmrg for (i = 0; partitions->iterate (i, &partition); ++i)
2178a2dc1f3fSmrg {
2179a2dc1f3fSmrg data = new pg_vdata;
2180a2dc1f3fSmrg pg->vertices[i].data = data;
2181a2dc1f3fSmrg data->id = i;
2182a2dc1f3fSmrg data->partition = partition;
2183a2dc1f3fSmrg }
2184a2dc1f3fSmrg }
2185a2dc1f3fSmrg
2186a2dc1f3fSmrg /* Add edge <I, J> to partition dependence graph PG. Attach vector of data
2187a2dc1f3fSmrg dependence relations to the EDGE if DDRS isn't NULL. */
2188a2dc1f3fSmrg
2189a2dc1f3fSmrg static void
2190a2dc1f3fSmrg add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2191a2dc1f3fSmrg {
2192a2dc1f3fSmrg struct graph_edge *e = add_edge (pg, i, j);
2193a2dc1f3fSmrg
2194a2dc1f3fSmrg /* If the edge is attached with data dependence relations, it means this
2195a2dc1f3fSmrg dependence edge can be resolved by runtime alias checks. */
2196a2dc1f3fSmrg if (ddrs != NULL)
2197a2dc1f3fSmrg {
2198a2dc1f3fSmrg struct pg_edata *data = new pg_edata;
2199a2dc1f3fSmrg
2200a2dc1f3fSmrg gcc_assert (ddrs->length () > 0);
2201a2dc1f3fSmrg e->data = data;
2202a2dc1f3fSmrg data->alias_ddrs = vNULL;
2203a2dc1f3fSmrg data->alias_ddrs.safe_splice (*ddrs);
2204a2dc1f3fSmrg }
2205a2dc1f3fSmrg }
2206a2dc1f3fSmrg
2207a2dc1f3fSmrg /* Callback function for graph traversal algorithm.  It returns true
2208a2dc1f3fSmrg    if edge E should be skipped when traversing the graph.  */
2209a2dc1f3fSmrg
2210a2dc1f3fSmrg static bool
2211a2dc1f3fSmrg pg_skip_alias_edge (struct graph_edge *e)
2212a2dc1f3fSmrg {
2213a2dc1f3fSmrg struct pg_edata *data = (struct pg_edata *)e->data;
2214a2dc1f3fSmrg return (data != NULL && data->alias_ddrs.length () > 0);
2215a2dc1f3fSmrg }
2216a2dc1f3fSmrg
2217a2dc1f3fSmrg /* Callback function freeing data attached to edge E of graph. */
2218a2dc1f3fSmrg
2219a2dc1f3fSmrg static void
2220a2dc1f3fSmrg free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2221a2dc1f3fSmrg {
2222a2dc1f3fSmrg if (e->data != NULL)
2223a2dc1f3fSmrg {
2224a2dc1f3fSmrg struct pg_edata *data = (struct pg_edata *)e->data;
2225a2dc1f3fSmrg data->alias_ddrs.release ();
2226a2dc1f3fSmrg delete data;
2227a2dc1f3fSmrg }
2228a2dc1f3fSmrg }
2229a2dc1f3fSmrg
2230a2dc1f3fSmrg /* Free data attached to the vertices of partition dependence graph PG.  */
2231a2dc1f3fSmrg
2232a2dc1f3fSmrg static void
2233a2dc1f3fSmrg free_partition_graph_vdata (struct graph *pg)
2234a2dc1f3fSmrg {
2235a2dc1f3fSmrg int i;
2236a2dc1f3fSmrg struct pg_vdata *data;
2237a2dc1f3fSmrg
2238a2dc1f3fSmrg for (i = 0; i < pg->n_vertices; ++i)
2239a2dc1f3fSmrg {
2240a2dc1f3fSmrg data = (struct pg_vdata *)pg->vertices[i].data;
2241a2dc1f3fSmrg delete data;
2242a2dc1f3fSmrg }
2243a2dc1f3fSmrg }
2244a2dc1f3fSmrg
2245a2dc1f3fSmrg /* Build and return partition dependence graph for PARTITIONS. RDG is
2246a2dc1f3fSmrg reduced dependence graph for the loop to be distributed. If IGNORE_ALIAS_P
2247a2dc1f3fSmrg is true, data dependence caused by possible alias between references
2248a2dc1f3fSmrg   is ignored, as if it doesn't exist at all; otherwise all dependences
2249a2dc1f3fSmrg are considered. */
2250a2dc1f3fSmrg
22518feb0f0bSmrg struct graph *
22528feb0f0bSmrg loop_distribution::build_partition_graph (struct graph *rdg,
2253a2dc1f3fSmrg vec<struct partition *> *partitions,
2254a2dc1f3fSmrg bool ignore_alias_p)
2255a2dc1f3fSmrg {
2256a2dc1f3fSmrg int i, j;
2257a2dc1f3fSmrg struct partition *partition1, *partition2;
2258a2dc1f3fSmrg graph *pg = new_graph (partitions->length ());
2259a2dc1f3fSmrg auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2260a2dc1f3fSmrg
2261a2dc1f3fSmrg alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2262a2dc1f3fSmrg
2263a2dc1f3fSmrg init_partition_graph_vertices (pg, partitions);
2264a2dc1f3fSmrg
2265a2dc1f3fSmrg for (i = 0; partitions->iterate (i, &partition1); ++i)
2266a2dc1f3fSmrg {
2267a2dc1f3fSmrg for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2268a2dc1f3fSmrg {
2269a2dc1f3fSmrg /* dependence direction - 0 is no dependence, -1 is back,
2270a2dc1f3fSmrg 1 is forth, 2 is both (we can stop then, merging will occur). */
2271a2dc1f3fSmrg int dir = 0;
2272a2dc1f3fSmrg
2273a2dc1f3fSmrg /* If the first partition has reduction, add back edge; if the
2274a2dc1f3fSmrg second partition has reduction, add forth edge. This makes
2275a2dc1f3fSmrg sure that reduction partition will be sorted as the last one. */
2276a2dc1f3fSmrg if (partition_reduction_p (partition1))
2277a2dc1f3fSmrg dir = -1;
2278a2dc1f3fSmrg else if (partition_reduction_p (partition2))
2279a2dc1f3fSmrg dir = 1;
2280a2dc1f3fSmrg
2281a2dc1f3fSmrg /* Cleanup the temporary vector. */
2282a2dc1f3fSmrg alias_ddrs.truncate (0);
2283a2dc1f3fSmrg
2284a2dc1f3fSmrg dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2285a2dc1f3fSmrg partition2->datarefs, alias_ddrs_p);
2286a2dc1f3fSmrg
2287a2dc1f3fSmrg /* Add edge to partition graph if there exists dependence. There
2288a2dc1f3fSmrg	     are two types of edges.  One type of edge is caused by compilation
2289a2dc1f3fSmrg	     time known dependence; this type cannot be resolved by runtime
2290a2dc1f3fSmrg alias check. The other type can be resolved by runtime alias
2291a2dc1f3fSmrg check. */
2292a2dc1f3fSmrg if (dir == 1 || dir == 2
2293a2dc1f3fSmrg || alias_ddrs.length () > 0)
2294a2dc1f3fSmrg {
2295a2dc1f3fSmrg /* Attach data dependence relations to edge that can be resolved
2296a2dc1f3fSmrg by runtime alias check. */
2297a2dc1f3fSmrg bool alias_edge_p = (dir != 1 && dir != 2);
2298a2dc1f3fSmrg add_partition_graph_edge (pg, i, j,
2299a2dc1f3fSmrg (alias_edge_p) ? &alias_ddrs : NULL);
2300a2dc1f3fSmrg }
2301a2dc1f3fSmrg if (dir == -1 || dir == 2
2302a2dc1f3fSmrg || alias_ddrs.length () > 0)
2303a2dc1f3fSmrg {
2304a2dc1f3fSmrg /* Attach data dependence relations to edge that can be resolved
2305a2dc1f3fSmrg by runtime alias check. */
2306a2dc1f3fSmrg bool alias_edge_p = (dir != -1 && dir != 2);
2307a2dc1f3fSmrg add_partition_graph_edge (pg, j, i,
2308a2dc1f3fSmrg (alias_edge_p) ? &alias_ddrs : NULL);
2309a2dc1f3fSmrg }
2310a2dc1f3fSmrg }
2311a2dc1f3fSmrg }
2312a2dc1f3fSmrg return pg;
2313a2dc1f3fSmrg }
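
/* For instance (a sketch): given partition P0 storing to a[] and partition
   P1 reading a[] through a possibly aliasing pointer,
   pg_add_dependence_edges finds no known direction but fills ALIAS_DDRS,
   so P0 and P1 end up connected by a pair of "alias" edges which a later
   phase may remove by versioning the loop under a runtime alias check; a
   compilation time known dependence instead produces a "true" edge that
   cannot be removed.  */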
2314a2dc1f3fSmrg
2315a2dc1f3fSmrg /* Sort partitions in PG in descending post order and store them in
2316a2dc1f3fSmrg PARTITIONS. */
2317a2dc1f3fSmrg
2318a2dc1f3fSmrg static void
2319a2dc1f3fSmrg sort_partitions_by_post_order (struct graph *pg,
2320a2dc1f3fSmrg vec<struct partition *> *partitions)
2321a2dc1f3fSmrg {
2322a2dc1f3fSmrg int i;
2323a2dc1f3fSmrg struct pg_vdata *data;
2324a2dc1f3fSmrg
2325a2dc1f3fSmrg /* Now order the remaining nodes in descending postorder. */
2326a2dc1f3fSmrg qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2327a2dc1f3fSmrg partitions->truncate (0);
2328a2dc1f3fSmrg for (i = 0; i < pg->n_vertices; ++i)
2329a2dc1f3fSmrg {
2330a2dc1f3fSmrg data = (struct pg_vdata *)pg->vertices[i].data;
2331a2dc1f3fSmrg if (data->partition)
2332a2dc1f3fSmrg partitions->safe_push (data->partition);
2333a2dc1f3fSmrg }
2334a2dc1f3fSmrg }
2335a2dc1f3fSmrg
23368feb0f0bSmrg void
23378feb0f0bSmrg loop_distribution::merge_dep_scc_partitions (struct graph *rdg,
2338a2dc1f3fSmrg vec<struct partition *> *partitions,
2339a2dc1f3fSmrg bool ignore_alias_p)
2340a2dc1f3fSmrg {
2341a2dc1f3fSmrg struct partition *partition1, *partition2;
2342a2dc1f3fSmrg struct pg_vdata *data;
2343a2dc1f3fSmrg graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2344a2dc1f3fSmrg int i, j, num_sccs = graphds_scc (pg, NULL);
2345a2dc1f3fSmrg
2346a2dc1f3fSmrg   /* A strongly connected component means a dependence cycle, so we cannot
2347a2dc1f3fSmrg      distribute it.  Fuse its partitions together instead.  */
2348a2dc1f3fSmrg if ((unsigned) num_sccs < partitions->length ())
2349a2dc1f3fSmrg {
2350a2dc1f3fSmrg for (i = 0; i < num_sccs; ++i)
2351a2dc1f3fSmrg {
2352a2dc1f3fSmrg for (j = 0; partitions->iterate (j, &partition1); ++j)
2353a2dc1f3fSmrg if (pg->vertices[j].component == i)
2354a2dc1f3fSmrg break;
2355a2dc1f3fSmrg for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2356a2dc1f3fSmrg if (pg->vertices[j].component == i)
2357a2dc1f3fSmrg {
2358a2dc1f3fSmrg partition_merge_into (NULL, partition1,
2359a2dc1f3fSmrg partition2, FUSE_SAME_SCC);
2360a2dc1f3fSmrg partition1->type = PTYPE_SEQUENTIAL;
2361a2dc1f3fSmrg (*partitions)[j] = NULL;
2362a2dc1f3fSmrg partition_free (partition2);
2363a2dc1f3fSmrg data = (struct pg_vdata *)pg->vertices[j].data;
2364a2dc1f3fSmrg data->partition = NULL;
2365a2dc1f3fSmrg }
2366a2dc1f3fSmrg }
2367a2dc1f3fSmrg }
2368a2dc1f3fSmrg
2369a2dc1f3fSmrg sort_partitions_by_post_order (pg, partitions);
2370a2dc1f3fSmrg gcc_assert (partitions->length () == (unsigned)num_sccs);
2371a2dc1f3fSmrg free_partition_graph_vdata (pg);
2372a2dc1f3fSmrg free_graph (pg);
2373a2dc1f3fSmrg }
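
/* As an example (a sketch): if P0 must precede P1 for one data reference
   pair but follow it for another, the two vertices form an SCC of the
   partition graph; no statement ordering can satisfy both constraints, so
   the loop above fuses P0 and P1 into a single sequential partition.  */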
2374a2dc1f3fSmrg
2375a2dc1f3fSmrg /* Callback function for traversing edge E in graph G. DATA is private
2376a2dc1f3fSmrg callback data. */
2377a2dc1f3fSmrg
2378a2dc1f3fSmrg static void
2379a2dc1f3fSmrg pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2380a2dc1f3fSmrg {
2381a2dc1f3fSmrg int i, j, component;
2382a2dc1f3fSmrg struct pg_edge_callback_data *cbdata;
2383a2dc1f3fSmrg struct pg_edata *edata = (struct pg_edata *) e->data;
2384a2dc1f3fSmrg
2385a2dc1f3fSmrg /* If the edge doesn't have attached data dependence, it represents
2386a2dc1f3fSmrg      compilation time known dependences.  This type of dependence cannot
2387a2dc1f3fSmrg      be resolved by runtime alias checks.  */
2388a2dc1f3fSmrg if (edata == NULL || edata->alias_ddrs.length () == 0)
2389a2dc1f3fSmrg return;
2390a2dc1f3fSmrg
2391a2dc1f3fSmrg cbdata = (struct pg_edge_callback_data *) data;
2392a2dc1f3fSmrg i = e->src;
2393a2dc1f3fSmrg j = e->dest;
2394a2dc1f3fSmrg component = cbdata->vertices_component[i];
2395a2dc1f3fSmrg /* Vertices are topologically sorted according to compilation time
2396a2dc1f3fSmrg      known dependences, so we can break strongly connected components
2397a2dc1f3fSmrg      by removing edges of the opposite direction, i.e., edges pointing
2398a2dc1f3fSmrg      from a vertex with smaller post number to a vertex with bigger post
2399a2dc1f3fSmrg      number.  */
2400a2dc1f3fSmrg if (g->vertices[i].post < g->vertices[j].post
2401a2dc1f3fSmrg /* We only need to remove edges connecting vertices in the same
2402a2dc1f3fSmrg strong connected component to break it. */
2403a2dc1f3fSmrg && component == cbdata->vertices_component[j]
2404a2dc1f3fSmrg /* Check if we want to break the strong connected component or not. */
2405a2dc1f3fSmrg && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2406a2dc1f3fSmrg cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2407a2dc1f3fSmrg }
2408a2dc1f3fSmrg
2409*23f5f463Smrg /* Callback function for traversing edge E. DATA is private
2410*23f5f463Smrg callback data. */
2411*23f5f463Smrg
2412*23f5f463Smrg static void
2413*23f5f463Smrg pg_unmark_merged_alias_ddrs (struct graph *, struct graph_edge *e, void *data)
2414*23f5f463Smrg {
2415*23f5f463Smrg int i, j, component;
2416*23f5f463Smrg struct pg_edge_callback_data *cbdata;
2417*23f5f463Smrg struct pg_edata *edata = (struct pg_edata *) e->data;
2418*23f5f463Smrg
2419*23f5f463Smrg if (edata == NULL || edata->alias_ddrs.length () == 0)
2420*23f5f463Smrg return;
2421*23f5f463Smrg
2422*23f5f463Smrg cbdata = (struct pg_edge_callback_data *) data;
2423*23f5f463Smrg i = e->src;
2424*23f5f463Smrg j = e->dest;
2425*23f5f463Smrg component = cbdata->vertices_component[i];
2426*23f5f463Smrg /* Make sure to not skip vertices inside SCCs we are going to merge. */
2427*23f5f463Smrg if (component == cbdata->vertices_component[j]
2428*23f5f463Smrg && bitmap_bit_p (cbdata->sccs_to_merge, component))
2429*23f5f463Smrg {
2430*23f5f463Smrg edata->alias_ddrs.release ();
2431*23f5f463Smrg delete edata;
2432*23f5f463Smrg e->data = NULL;
2433*23f5f463Smrg }
2434*23f5f463Smrg }
2435*23f5f463Smrg
2436a2dc1f3fSmrg /* This is the main function breaking strongly connected components in
2437a2dc1f3fSmrg    PARTITIONS, given reduced dependence graph RDG.  Store data dependence
2438a2dc1f3fSmrg relations for runtime alias check in ALIAS_DDRS. */
24398feb0f0bSmrg void
24408feb0f0bSmrg loop_distribution::break_alias_scc_partitions (struct graph *rdg,
2441a2dc1f3fSmrg vec<struct partition *> *partitions,
2442a2dc1f3fSmrg vec<ddr_p> *alias_ddrs)
2443a2dc1f3fSmrg {
24448feb0f0bSmrg int i, j, k, num_sccs, num_sccs_no_alias = 0;
2445a2dc1f3fSmrg /* Build partition dependence graph. */
2446a2dc1f3fSmrg graph *pg = build_partition_graph (rdg, partitions, false);
2447a2dc1f3fSmrg
2448a2dc1f3fSmrg alias_ddrs->truncate (0);
2449a2dc1f3fSmrg   /* Find strongly connected components in the graph, with all dependence edges
2450a2dc1f3fSmrg considered. */
2451a2dc1f3fSmrg num_sccs = graphds_scc (pg, NULL);
2452a2dc1f3fSmrg   /* All SCCs can now be broken by runtime alias checks because SCCs caused by
2453a2dc1f3fSmrg compilation time known dependences are merged before this function. */
2454a2dc1f3fSmrg if ((unsigned) num_sccs < partitions->length ())
2455a2dc1f3fSmrg {
2456a2dc1f3fSmrg struct pg_edge_callback_data cbdata;
2457a2dc1f3fSmrg auto_bitmap sccs_to_merge;
2458a2dc1f3fSmrg auto_vec<enum partition_type> scc_types;
2459a2dc1f3fSmrg struct partition *partition, *first;
2460a2dc1f3fSmrg
2461a2dc1f3fSmrg /* If all partitions in a SCC have the same type, we can simply merge the
2462a2dc1f3fSmrg	 SCC.  This loop finds such SCCs and records them in a bitmap.  */
2463a2dc1f3fSmrg bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2464a2dc1f3fSmrg for (i = 0; i < num_sccs; ++i)
2465a2dc1f3fSmrg {
2466a2dc1f3fSmrg for (j = 0; partitions->iterate (j, &first); ++j)
2467a2dc1f3fSmrg if (pg->vertices[j].component == i)
2468a2dc1f3fSmrg break;
2469c0a68be4Smrg
2470c0a68be4Smrg bool same_type = true, all_builtins = partition_builtin_p (first);
2471a2dc1f3fSmrg for (++j; partitions->iterate (j, &partition); ++j)
2472a2dc1f3fSmrg {
2473a2dc1f3fSmrg if (pg->vertices[j].component != i)
2474a2dc1f3fSmrg continue;
2475a2dc1f3fSmrg
2476a2dc1f3fSmrg if (first->type != partition->type)
2477a2dc1f3fSmrg {
2478c0a68be4Smrg same_type = false;
2479a2dc1f3fSmrg break;
2480a2dc1f3fSmrg }
2481c0a68be4Smrg all_builtins &= partition_builtin_p (partition);
2482a2dc1f3fSmrg }
2483c0a68be4Smrg /* Merge SCC if all partitions in SCC have the same type, though the
2484c0a68be4Smrg result partition is sequential, because vectorizer can do better
2485c0a68be4Smrg runtime alias check. One expecption is all partitions in SCC are
2486c0a68be4Smrg builtins. */
2487c0a68be4Smrg if (!same_type || all_builtins)
2488c0a68be4Smrg bitmap_clear_bit (sccs_to_merge, i);
2489a2dc1f3fSmrg }
2490a2dc1f3fSmrg
2491a2dc1f3fSmrg /* Initialize callback data for traversing. */
2492a2dc1f3fSmrg cbdata.sccs_to_merge = sccs_to_merge;
2493a2dc1f3fSmrg cbdata.alias_ddrs = alias_ddrs;
2494a2dc1f3fSmrg cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2495a2dc1f3fSmrg /* Record the component information which will be corrupted by next
2496a2dc1f3fSmrg graph scc finding call. */
2497a2dc1f3fSmrg for (i = 0; i < pg->n_vertices; ++i)
2498a2dc1f3fSmrg cbdata.vertices_component[i] = pg->vertices[i].component;
2499a2dc1f3fSmrg
2500a2dc1f3fSmrg /* Collect data dependences for runtime alias checks to break SCCs. */
2501a2dc1f3fSmrg if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2502a2dc1f3fSmrg {
2503*23f5f463Smrg /* For SCCs we want to merge clear all alias_ddrs for edges
2504*23f5f463Smrg inside the component. */
2505*23f5f463Smrg for_each_edge (pg, pg_unmark_merged_alias_ddrs, &cbdata);
25068feb0f0bSmrg
2507a2dc1f3fSmrg /* Run SCC finding algorithm again, with alias dependence edges
2508a2dc1f3fSmrg skipped. This is to topologically sort partitions according to
2509a2dc1f3fSmrg compilation time known dependence. Note the topological order
2510a2dc1f3fSmrg is stored in the form of pg's post order number. */
2511a2dc1f3fSmrg num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2512*23f5f463Smrg /* We cannot assert partitions->length () == num_sccs_no_alias
2513*23f5f463Smrg since we are not ignoring alias edges in cycles we are
2514*23f5f463Smrg going to merge. That's required to compute correct postorder. */
2515a2dc1f3fSmrg /* With topological order, we can construct two subgraphs L and R.
2516a2dc1f3fSmrg L contains edge <x, y> where x < y in terms of post order, while
2517a2dc1f3fSmrg R contains edge <x, y> where x > y. Edges for compilation time
2518a2dc1f3fSmrg known dependence all fall in R, so we break SCCs by removing all
2519a2dc1f3fSmrg (alias) edges of in subgraph L. */
2520a2dc1f3fSmrg for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2521a2dc1f3fSmrg }
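
      /* A sketch of the L and R subgraphs above, with hypothetical post
	 order numbers: suppose partitions P0 and P1 form an SCC through a
	 compilation time known edge <P1, P0> and an alias edge <P0, P1>,
	 and the second SCC pass assigns post order 0 to P0 and 1 to P1.
	 The known edge <P1, P0> has 1 > 0 and falls in R; the alias edge
	 <P0, P1> has 0 < 1, falls in L and is collected into ALIAS_DDRS,
	 so the SCC can be broken by a runtime alias check.  */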

      /* For each SCC that doesn't need to be broken, merge it.  */
      for (i = 0; i < num_sccs; ++i)
	{
	  if (!bitmap_bit_p (sccs_to_merge, i))
	    continue;

	  for (j = 0; partitions->iterate (j, &first); ++j)
	    if (cbdata.vertices_component[j] == i)
	      break;
	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
	    {
	      struct pg_vdata *data;

	      if (cbdata.vertices_component[k] != i)
		continue;

	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
	      (*partitions)[k] = NULL;
	      partition_free (partition);
	      data = (struct pg_vdata *)pg->vertices[k].data;
	      gcc_assert (data->id == k);
	      data->partition = NULL;
	      /* The result partition of a merged SCC must be sequential.  */
	      first->type = PTYPE_SEQUENTIAL;
	    }
	}
      /* If the reduction partition's SCC is broken by runtime alias checks,
	 we force a negative post order on it, making sure it will be
	 scheduled last.  */
      if (num_sccs_no_alias > 0)
	{
	  j = -1;
	  for (i = 0; i < pg->n_vertices; ++i)
	    {
	      struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data;
	      if (data->partition && partition_reduction_p (data->partition))
		{
		  gcc_assert (j == -1);
		  j = i;
		}
	    }
	  if (j >= 0)
	    pg->vertices[j].post = -1;
	}

      free (cbdata.vertices_component);
    }

  sort_partitions_by_post_order (pg, partitions);
  free_partition_graph_vdata (pg);
  for_each_edge (pg, free_partition_graph_edata_cb, NULL);
  free_graph (pg);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Possible alias data dependence to break:\n");
      dump_data_dependence_relations (dump_file, *alias_ddrs);
    }
}

/* Compute and return an expression whose value is the segment length which
   will be accessed by DR in NITERS iterations.  */

static tree
data_ref_segment_size (struct data_reference *dr, tree niters)
{
  niters = size_binop (MINUS_EXPR,
		       fold_convert (sizetype, niters),
		       size_one_node);
  return size_binop (MULT_EXPR,
		     fold_convert (sizetype, DR_STEP (dr)),
		     fold_convert (sizetype, niters));
}
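
/* For example (a sketch): for a data reference whose DR_STEP is 4 bytes,
   the expression built above evaluates to 4 * (NITERS - 1) bytes, the
   distance between the first and the last address accessed in NITERS
   iterations.  */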

/* Return true if LOOP's latch is dominated by statement for data reference
   DR.  */

static inline bool
latch_dominated_by_data_ref (class loop *loop, data_reference *dr)
{
  return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
			 gimple_bb (DR_STMT (dr)));
}

/* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
   data dependence relations ALIAS_DDRS.  */

static void
compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs,
			   vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
{
  unsigned int i;
  unsigned HOST_WIDE_INT factor = 1;
  tree niters_plus_one, niters = number_of_latch_executions (loop);

  gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
  niters = fold_convert (sizetype, niters);
  niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, "Creating alias check pairs:\n");

  /* Iterate all data dependence relations and compute alias check pairs.  */
  for (i = 0; i < alias_ddrs->length (); i++)
    {
      ddr_p ddr = (*alias_ddrs)[i];
      struct data_reference *dr_a = DDR_A (ddr);
      struct data_reference *dr_b = DDR_B (ddr);
      tree seg_length_a, seg_length_b;

      if (latch_dominated_by_data_ref (loop, dr_a))
	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
      else
	seg_length_a = data_ref_segment_size (dr_a, niters);

      if (latch_dominated_by_data_ref (loop, dr_b))
	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
      else
	seg_length_b = data_ref_segment_size (dr_b, niters);

      unsigned HOST_WIDE_INT access_size_a
	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
      unsigned HOST_WIDE_INT access_size_b
	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
      unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
      unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));

      dr_with_seg_len_pair_t dr_with_seg_len_pair
	(dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
	 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b),
	 /* ??? Would WELL_ORDERED be safe?  */
	 dr_with_seg_len_pair_t::REORDERED);

      comp_alias_pairs->safe_push (dr_with_seg_len_pair);
    }

  if (tree_fits_uhwi_p (niters))
    factor = tree_to_uhwi (niters);

  /* Prune alias check pairs.  */
  prune_runtime_alias_test_list (comp_alias_pairs, factor);
  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file,
	     "Improved number of alias checks from %d to %d\n",
	     alias_ddrs->length (), comp_alias_pairs->length ());
}
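
/* As an illustration (a sketch with hypothetical references): for two
   possibly aliasing references A[i] and B[i], the pair records each
   reference together with its segment length, access size and alignment.
   create_runtime_alias_checks later turns such pairs into a non-overlap
   condition roughly of the form

   |&A + seg_a <= &B || &B + seg_b <= &A

   proving at runtime that the two accessed segments are disjoint.  */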

/* Given data dependence relations in ALIAS_DDRS, generate runtime alias
   checks and version LOOP under condition of these runtime alias checks.  */

static void
version_loop_by_alias_check (vec<struct partition *> *partitions,
			     class loop *loop, vec<ddr_p> *alias_ddrs)
{
  profile_probability prob;
  basic_block cond_bb;
  class loop *nloop;
  tree lhs, arg0, cond_expr = NULL_TREE;
  gimple_seq cond_stmts = NULL;
  gimple *call_stmt = NULL;
  auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;

  /* Generate code for runtime alias checks if necessary.  */
  gcc_assert (alias_ddrs->length () > 0);

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file,
	     "Version loop <%d> with runtime alias check\n", loop->num);

  compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
  create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
  cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
				      is_gimple_val, NULL_TREE);

  /* Depend on the vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
  bool cancelable_p = flag_tree_loop_vectorize;
  if (cancelable_p)
    {
      unsigned i = 0;
      struct partition *partition;
      for (; partitions->iterate (i, &partition); ++i)
	if (!partition_builtin_p (partition))
	  break;

      /* If all partitions are builtins, distributing the loop would be
	 profitable and we don't want to cancel the runtime alias checks.  */
      if (i == partitions->length ())
	cancelable_p = false;
    }

  /* Generate an internal function call for the loop distribution alias
     check if the runtime alias check should be cancelable.  */
  if (cancelable_p)
    {
      call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
					      2, NULL_TREE, cond_expr);
      lhs = make_ssa_name (boolean_type_node);
      gimple_call_set_lhs (call_stmt, lhs);
    }
  else
    lhs = cond_expr;

  prob = profile_probability::guessed_always ().apply_scale (9, 10);
  initialize_original_copy_tables ();
  nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
			prob, prob.invert (), true);
  free_original_copy_tables ();
  /* Record the original loop number in newly generated loops.  In case of
     distribution, the original loop will be distributed and the new loop
     is kept.  */
  loop->orig_loop_num = nloop->num;
  nloop->orig_loop_num = nloop->num;
  nloop->dont_vectorize = true;
  nloop->force_vectorize = false;

  if (call_stmt)
    {
      /* Record the new loop's num in IFN_LOOP_DIST_ALIAS because the
	 original loop could be destroyed.  */
      arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
      gimple_call_set_arg (call_stmt, 0, arg0);
      gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
    }

  if (cond_stmts)
    {
      gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
      gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
    }
  update_ssa (TODO_update_ssa);
}
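
/* After versioning, the CFG looks roughly like (a sketch):

   |if (IFN_LOOP_DIST_ALIAS (<new loop num>, <alias check cond>))
   |  <LOOP, subsequently distributed>
   |else
   |  <nloop, the unmodified copy with dont_vectorize set>

   When the check is not cancelable, the plain alias check condition is
   used instead of the internal function call.  */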

/* Return true if loop versioning is needed to distribute PARTITIONS.
   ALIAS_DDRS are data dependence relations for runtime alias check.  */

static inline bool
version_for_distribution_p (vec<struct partition *> *partitions,
			    vec<ddr_p> *alias_ddrs)
{
  /* No need to version the loop if we have only one partition.  */
  if (partitions->length () == 1)
    return false;

  /* Need to version the loop if a runtime alias check is necessary.  */
  return (alias_ddrs->length () > 0);
}

/* Compare base offsets of builtin mem* partitions P1 and P2.  */

static int
offset_cmp (const void *vp1, const void *vp2)
{
  struct partition *p1 = *(struct partition *const *) vp1;
  struct partition *p2 = *(struct partition *const *) vp2;
  unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
  unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
  return (o2 < o1) - (o1 < o2);
}

/* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
   case optimization transforming the code below:

     __builtin_memset (&obj, 0, 100);
     _1 = &obj + 100;
     __builtin_memset (_1, 0, 200);
     _2 = &obj + 300;
     __builtin_memset (_2, 0, 100);

   into:

     __builtin_memset (&obj, 0, 400);

   Note we don't have dependence information between different partitions
   at this point; as a result, we can't handle nonadjacent memset builtin
   partitions since dependences might be broken.  */

static void
fuse_memset_builtins (vec<struct partition *> *partitions)
{
  unsigned i, j;
  struct partition *part1, *part2;
  tree rhs1, rhs2;

  for (i = 0; partitions->iterate (i, &part1);)
    {
      if (part1->kind != PKIND_MEMSET)
	{
	  i++;
	  continue;
	}

      /* Find a sub-array of memset builtins with the same base.  The index
	 range of the sub-array is [i, j) with "j > i".  */
      for (j = i + 1; partitions->iterate (j, &part2); ++j)
	{
	  if (part2->kind != PKIND_MEMSET
	      || !operand_equal_p (part1->builtin->dst_base_base,
				   part2->builtin->dst_base_base, 0))
	    break;

	  /* Memset calls setting different values can't be merged.  */
	  rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
	  rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
	  if (!operand_equal_p (rhs1, rhs2, 0))
	    break;
	}

      /* A stable sort is required in order to avoid breaking dependences.  */
      gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
		      offset_cmp);
      /* Continue with the next partition.  */
      i = j;
    }

  /* Merge all consecutive memset builtin partitions.  */
  for (i = 0; i < partitions->length () - 1;)
    {
      part1 = (*partitions)[i];
      if (part1->kind != PKIND_MEMSET)
	{
	  i++;
	  continue;
	}

      part2 = (*partitions)[i + 1];
      /* Only merge memset partitions of the same base and with constant
	 access sizes.  */
      if (part2->kind != PKIND_MEMSET
	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
	  || !operand_equal_p (part1->builtin->dst_base_base,
			       part2->builtin->dst_base_base, 0))
	{
	  i++;
	  continue;
	}
      rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
      rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
      int bytev1 = const_with_all_bytes_same (rhs1);
      int bytev2 = const_with_all_bytes_same (rhs2);
      /* Only merge memset partitions of the same value.  */
      if (bytev1 != bytev2 || bytev1 == -1)
	{
	  i++;
	  continue;
	}
      wide_int end1 = wi::add (part1->builtin->dst_base_offset,
			       wi::to_wide (part1->builtin->size));
      /* Only merge adjacent memset partitions.  */
      if (wi::ne_p (end1, part2->builtin->dst_base_offset))
	{
	  i++;
	  continue;
	}
      /* Merge partitions[i] and partitions[i+1].  */
      part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
					  part1->builtin->size,
					  part2->builtin->size);
      partition_free (part2);
      partitions->ordered_remove (i + 1);
    }
}
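
/* Conversely, a gap prevents fusion (a sketch):

     __builtin_memset (&obj, 0, 100);
     _1 = &obj + 200;
     __builtin_memset (_1, 0, 100);

   stays as two calls since the first region ends at offset 100 while the
   second starts at offset 200; partitions setting different byte values
   are kept separate in the same way.  */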

/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
   ALIAS_DDRS contains ddrs which need runtime alias check.  */

void
loop_distribution::finalize_partitions (class loop *loop,
					vec<struct partition *> *partitions,
					vec<ddr_p> *alias_ddrs)
{
  unsigned i;
  struct partition *partition, *a;

  if (partitions->length () == 1
      || alias_ddrs->length () > 0)
    return;

  unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
  bool same_type_p = true;
  enum partition_type type = ((*partitions)[0])->type;
  for (i = 0; partitions->iterate (i, &partition); ++i)
    {
      same_type_p &= (type == partition->type);
      if (partition_builtin_p (partition))
	{
	  num_builtin++;
	  continue;
	}
      num_normal++;
      if (partition->kind == PKIND_PARTIAL_MEMSET)
	num_partial_memset++;
    }

  /* Don't distribute the current loop into too many loops given we don't
     have a memory stream cost model.  Be even more conservative in case of
     loop nest distribution.  */
  if ((same_type_p && num_builtin == 0
       && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
      || (loop->inner != NULL
	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
      || (loop->inner == NULL
	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
    {
      a = (*partitions)[0];
      for (i = 1; partitions->iterate (i, &partition); ++i)
	{
	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
	  partition_free (partition);
	}
      partitions->truncate (1);
    }

  /* Fuse memset builtins if possible.  */
  if (partitions->length () > 1)
    fuse_memset_builtins (partitions);
}

/* Distributes the code from LOOP in such a way that producer statements
   are placed before consumer statements.  Tries to separate only the
   statements from STMTS into separate loops.  Returns the number of
   distributed loops.  Set NB_CALLS to the number of generated builtin
   calls.  Set *DESTROY_P to whether LOOP needs to be destroyed.  */

int
loop_distribution::distribute_loop (class loop *loop, vec<gimple *> stmts,
				    control_dependences *cd, int *nb_calls,
				    bool *destroy_p, bool only_patterns_p)
{
  ddrs_table = new hash_table<ddr_hasher> (389);
  struct graph *rdg;
  partition *partition;
  int i, nbp;

  *destroy_p = false;
  *nb_calls = 0;
  loop_nest.create (0);
  if (!find_loop_nest (loop, &loop_nest))
    {
      loop_nest.release ();
      delete ddrs_table;
      return 0;
    }

  datarefs_vec.create (20);
  has_nonaddressable_dataref_p = false;
  rdg = build_rdg (loop, cd);
  if (!rdg)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "Loop %d not distributed: failed to build the RDG.\n",
		 loop->num);

      loop_nest.release ();
      free_data_refs (datarefs_vec);
      delete ddrs_table;
      return 0;
    }

  if (datarefs_vec.length () > MAX_DATAREFS_NUM)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "Loop %d not distributed: too many memory references.\n",
		 loop->num);

      free_rdg (rdg);
      loop_nest.release ();
      free_data_refs (datarefs_vec);
      delete ddrs_table;
      return 0;
    }

  data_reference_p dref;
  for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
    dref->aux = (void *) (uintptr_t) i;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg (dump_file, rdg);

  auto_vec<struct partition *, 3> partitions;
  rdg_build_partitions (rdg, stmts, &partitions);

  auto_vec<ddr_p> alias_ddrs;

  auto_bitmap stmt_in_all_partitions;
  bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
  for (i = 1; partitions.iterate (i, &partition); ++i)
    bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);

  bool any_builtin = false;
  bool reduction_in_all = false;
  FOR_EACH_VEC_ELT (partitions, i, partition)
    {
      reduction_in_all
	|= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
      any_builtin |= partition_builtin_p (partition);
    }

  /* If we are only distributing patterns but did not detect any,
     simply bail out.  */
  if (only_patterns_p
      && !any_builtin)
    {
      nbp = 0;
      goto ldist_done;
    }

  /* If we are only distributing patterns fuse all partitions that
     were not classified as builtins.  This also avoids chopping
     a loop into pieces, separated by builtin calls.  That is, we
     only want no or a single loop body remaining.  */
  struct partition *into;
  if (only_patterns_p)
    {
      for (i = 0; partitions.iterate (i, &into); ++i)
	if (!partition_builtin_p (into))
	  break;
      for (++i; partitions.iterate (i, &partition); ++i)
	if (!partition_builtin_p (partition))
	  {
	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
	    partitions.unordered_remove (i);
	    partition_free (partition);
	    i--;
	  }
    }

  /* Due to limitations in the transform phase we have to fuse all
     reduction partitions into the last partition so the existing
     loop will contain all loop-closed PHI nodes.  */
  for (i = 0; partitions.iterate (i, &into); ++i)
    if (partition_reduction_p (into))
      break;
  for (i = i + 1; partitions.iterate (i, &partition); ++i)
    if (partition_reduction_p (partition))
      {
	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
	partitions.unordered_remove (i);
	partition_free (partition);
	i--;
      }

  /* Apply our simple cost model - fuse partitions with similar
     memory accesses.  */
  for (i = 0; partitions.iterate (i, &into); ++i)
    {
      bool changed = false;
      if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
	continue;
      for (int j = i + 1;
	   partitions.iterate (j, &partition); ++j)
	{
	  if (share_memory_accesses (rdg, into, partition))
	    {
	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
	      partitions.unordered_remove (j);
	      partition_free (partition);
	      j--;
	      changed = true;
	    }
	}
      /* If in step 1 we fused partitions 0 1 2 into 0,2 1 because 0 and 2
	 have similar accesses, and 1 and 2 have similar accesses but 0 and
	 1 do not, then in the next iteration we would fail to consider
	 merging 1 into 0,2.  So try again if we did any merging into 0.  */
      if (changed)
	i--;
    }

  /* Put a non-builtin partition last if we need to preserve a reduction.
     ??? This is a workaround that makes sort_partitions_by_post_order do
     the correct thing while in reality it should sort each component
     separately and then put the component with a reduction or a non-builtin
     last.  */
  if (reduction_in_all
      && partition_builtin_p (partitions.last ()))
    FOR_EACH_VEC_ELT (partitions, i, partition)
      if (!partition_builtin_p (partition))
	{
	  partitions.unordered_remove (i);
	  partitions.quick_push (partition);
	  break;
	}

  /* Build the partition dependence graph and fuse partitions in strongly
     connected components.  */
  if (partitions.length () > 1)
    {
      /* Don't support loop nest distribution under runtime alias checks
	 since it's not likely to enable many vectorization opportunities.
	 Also avoid it if the loop has any data reference which may be
	 non-addressable, since the alias check needs to take and compare
	 addresses of objects.  */
      if (loop->inner || has_nonaddressable_dataref_p)
	merge_dep_scc_partitions (rdg, &partitions, false);
      else
	{
	  merge_dep_scc_partitions (rdg, &partitions, true);
	  if (partitions.length () > 1)
	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
	}
    }

  finalize_partitions (loop, &partitions, &alias_ddrs);

  /* If there is a reduction in all partitions make sure the last one
     is not classified for builtin code generation.  */
  if (reduction_in_all)
    {
      partition = partitions.last ();
      if (only_patterns_p
	  && partition_builtin_p (partition)
	  && !partition_builtin_p (partitions[0]))
	{
	  nbp = 0;
	  goto ldist_done;
	}
      partition->kind = PKIND_NORMAL;
    }

  nbp = partitions.length ();
  if (nbp == 0
      || (nbp == 1 && !partition_builtin_p (partitions[0]))
      || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
    {
      nbp = 0;
      goto ldist_done;
    }

  if (version_for_distribution_p (&partitions, &alias_ddrs))
    version_loop_by_alias_check (&partitions, loop, &alias_ddrs);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file,
	       "distribute loop <%d> into partitions:\n", loop->num);
      dump_rdg_partitions (dump_file, partitions);
    }

  FOR_EACH_VEC_ELT (partitions, i, partition)
    {
      if (partition_builtin_p (partition))
	(*nb_calls)++;
      *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
    }

 ldist_done:
  loop_nest.release ();
  free_data_refs (datarefs_vec);
  for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
       iter != ddrs_table->end (); ++iter)
    {
      free_dependence_relation (*iter);
      *iter = NULL;
    }
  delete ddrs_table;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    partition_free (partition);

  free_rdg (rdg);
  return nbp - *nb_calls;
}

void loop_distribution::bb_top_order_init (void)
{
  int rpo_num;
  int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS);
  edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
  bitmap exit_bbs = BITMAP_ALLOC (NULL);

  bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
  bb_top_order_index_size = last_basic_block_for_fn (cfun);

  entry->flags &= ~EDGE_DFS_BACK;
  bitmap_set_bit (exit_bbs, EXIT_BLOCK);
  rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true,
						   rpo, NULL);
  BITMAP_FREE (exit_bbs);

  for (int i = 0; i < rpo_num; i++)
    bb_top_order_index[rpo[i]] = i;

  free (rpo);
}

void loop_distribution::bb_top_order_destroy ()
{
  free (bb_top_order_index);
  bb_top_order_index = NULL;
  bb_top_order_index_size = 0;
}


/* Given LOOP, this function records seed statements for distribution in
   WORK_LIST.  Return false if there is nothing for distribution.  */

static bool
find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
{
  basic_block *bbs = get_loop_body_in_dom_order (loop);

  /* Initialize the worklist with stmts we seed the partitions with.  */
  for (unsigned i = 0; i < loop->num_nodes; ++i)
    {
      /* In irreducible sub-regions we don't know how to redirect
	 conditions, so fail.  See PR100492.  */
      if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
	{
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "loop %d contains an irreducible region.\n",
		     loop->num);
	  work_list->truncate (0);
	  break;
	}
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;
	  /* Distribute stmts which have defs that are used outside of
	     the loop.  */
	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
	    continue;
	  work_list->safe_push (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  /* Ignore clobbers, they do not have true side effects.  */
	  if (gimple_clobber_p (stmt))
	    continue;

	  /* If there is a stmt with side-effects bail out - we
	     cannot and should not distribute this loop.  */
	  if (gimple_has_side_effects (stmt))
	    {
	      free (bbs);
	      return false;
	    }

	  /* Distribute stmts which have defs that are used outside of
	     the loop.  */
	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	    ;
	  /* Otherwise only distribute stores for now.  */
	  else if (!gimple_vdef (stmt))
	    continue;

	  work_list->safe_push (stmt);
	}
    }
  free (bbs);
  return work_list->length () > 0;
}
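
/* For example (a sketch), in

   |for (i = 0; i < n; ++i)
   |  {
   |    A[i] = B[i] + C;
   |    sum = sum + A[i];
   |  }

   the store to A[i] is a seed statement, and so is the addition feeding
   the loop-closed use of sum; scalar statements without uses outside the
   loop are not seeds themselves and are only pulled into partitions later
   via the RDG.  */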

/* Given innermost LOOP, return the outermost enclosing loop that forms a
   perfect loop nest.  */

static class loop *
prepare_perfect_loop_nest (class loop *loop)
{
  class loop *outer = loop_outer (loop);
  tree niters = number_of_latch_executions (loop);

  /* TODO: We only support the innermost 3-level loop nest distribution
     because of compilation time issues for now.  This should be relaxed
     in the future.  Note we only allow 3-level loop nest distribution
     when parallelizing loops.  */
  while ((loop->inner == NULL
	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
	 && loop_outer (outer)
	 && outer->inner == loop && loop->next == NULL
	 && single_exit (outer)
	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
	 && niters != chrec_dont_know)
    {
      loop = outer;
      outer = loop_outer (loop);
    }

  return loop;
}
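
/* For example (a sketch), starting from the innermost loop of

   |for (i = 0; i < n; ++i)
   |  for (j = 0; j < m; ++j)
   |    A[i][j] = 0;

   this returns the outer i-loop, provided the outer loop has a single
   exit and a computable number of latch executions.  */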


unsigned int
loop_distribution::execute (function *fun)
{
  class loop *loop;
  bool changed = false;
  basic_block bb;
  control_dependences *cd = NULL;
  auto_vec<loop_p> loops_to_be_destroyed;

  if (number_of_loops (fun) <= 1)
    return 0;

  bb_top_order_init ();

  FOR_ALL_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
    }

  /* We can at the moment only distribute non-nested loops, thus restrict
     walking to innermost loops.  */
  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
    {
      /* Don't distribute a loop with multiple exit edges, or a cold loop
	 when not doing pattern detection.  */
      if (!single_exit (loop)
	  || (!flag_tree_loop_distribute_patterns
	      && !optimize_loop_for_speed_p (loop)))
	continue;

      /* Don't distribute the loop if niters is unknown.  */
      tree niters = number_of_latch_executions (loop);
      if (niters == NULL_TREE || niters == chrec_dont_know)
	continue;

      /* Get the perfect loop nest for distribution.  */
      loop = prepare_perfect_loop_nest (loop);
      for (; loop; loop = loop->inner)
	{
	  auto_vec<gimple *> work_list;
	  if (!find_seed_stmts_for_distribution (loop, &work_list))
	    break;

	  const char *str = loop->inner ? " nest" : "";
	  dump_user_location_t loc = find_loop_location (loop);
	  if (!cd)
	    {
	      calculate_dominance_info (CDI_DOMINATORS);
	      calculate_dominance_info (CDI_POST_DOMINATORS);
	      cd = new control_dependences ();
	      free_dominance_info (CDI_POST_DOMINATORS);
	    }

	  bool destroy_p;
	  int nb_generated_loops, nb_generated_calls;
	  nb_generated_loops
	    = distribute_loop (loop, work_list, cd, &nb_generated_calls,
			       &destroy_p, (!optimize_loop_for_speed_p (loop)
					    || !flag_tree_loop_distribution));
	  if (destroy_p)
	    loops_to_be_destroyed.safe_push (loop);

	  if (nb_generated_loops + nb_generated_calls > 0)
	    {
	      changed = true;
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
				 loc, "Loop%s %d distributed: split to %d loops "
				 "and %d library calls.\n", str, loop->num,
				 nb_generated_loops, nb_generated_calls);

	      break;
	    }

	  if (dump_file && (dump_flags & TDF_DETAILS))
	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
	}
    }

  if (cd)
    delete cd;

  if (bb_top_order_index != NULL)
    bb_top_order_destroy ();

  if (changed)
    {
      /* Destroy loop bodies that could not be reused.  Do this late as we
	 otherwise can end up referring to stale data in control
	 dependences.  */
      unsigned i;
      FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
	destroy_loop (loop);

      /* Cached scalar evolutions now may refer to wrong or non-existing
	 loops.  */
      scev_reset_htab ();
      mark_virtual_operands_for_renaming (fun);
      rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
    }

  checking_verify_loop_structure ();

  return changed ? TODO_cleanup_cfg : 0;
}


/* Distribute all loops in the current function.  */

namespace {

const pass_data pass_data_loop_distribution =
{
  GIMPLE_PASS, /* type */
  "ldist", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_distribution : public gimple_opt_pass
{
public:
  pass_loop_distribution (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_distribution, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_tree_loop_distribution
	|| flag_tree_loop_distribute_patterns;
    }

  virtual unsigned int execute (function *);

}; // class pass_loop_distribution

unsigned int
pass_loop_distribution::execute (function *fun)
{
  return loop_distribution ().execute (fun);
}

} // anon namespace

gimple_opt_pass *
make_pass_loop_distribution (gcc::context *ctxt)
{
  return new pass_loop_distribution (ctxt);
}