/* Loop distribution.
   Copyright (C) 2006-2018 Free Software Foundation, Inc.
   Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
   and Sebastian Pop <sebastian.pop@amd.com>.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

/* This pass performs loop distribution: for example, the loop

   |DO I = 2, N
   | A(I) = B(I) + C
   | D(I) = A(I-1)*E
   |ENDDO

   is transformed to

   |DOALL I = 2, N
   | A(I) = B(I) + C
   |ENDDO
   |
   |DOALL I = 2, N
   | D(I) = A(I-1)*E
   |ENDDO

   Loop distribution is the dual of loop fusion.  It separates statements
   of a loop (or loop nest) into multiple loops (or loop nests) with the
   same loop header.  The major goal is to separate statements which may
   be vectorized from those that can't.  This pass implements distribution
   in the following steps:

     1) Seed partitions with specific types of statements.  For now we
        support two types of seed statements: statements defining variables
        used outside of the loop, and statements storing to memory.
     2) Build a reduced dependence graph (RDG) for the loop to be
        distributed.  The vertices (RDG:V) model all statements in the loop
        and the edges (RDG:E) model flow and control dependencies between
        statements.
     3) Apart from the RDG, compute data dependencies between memory
        references.
     4) Starting from a seed statement, build up a partition by adding
        dependent statements according to the RDG's dependence information.
        A partition is classified as parallel if it can be executed in
        parallel, or as sequential if it can't.  A parallel partition is
        further classified as a builtin kind if it can be implemented by
        builtin function calls.
     5) Build a partition dependence graph (PG) based on data dependencies.
        The vertices (PG:V) model all partitions and the edges (PG:E) model
        all data dependencies between every pair of partitions.  In general,
        a data dependence is either known at compilation time or unknown.
        In C family languages, quite a few dependencies are unknown at
        compilation time because of possible aliasing between data
        references.
        We categorize PG's edges into two types: "true" edges, which
        represent data dependencies known at compilation time, and "alias"
        edges for all other data dependencies.
     6) Traverse a subgraph of PG as if all "alias" edges didn't exist.
        Merge the partitions in each strongly connected component (SCC)
        accordingly.  Build a new PG for the merged partitions.
     7) Traverse PG again, this time with both "true" and "alias" edges
        included.  We try to break SCCs by removing some edges.  Because
        SCCs formed by "true" edges were all fused in step 6), we can break
        SCCs by removing some "alias" edges.  It's NP-hard to choose an
        optimal edge set; fortunately, a simple approximation is good
        enough for us given the small problem scale.
     8) Collect all data dependencies of the removed "alias" edges.  Create
        runtime alias checks for the collected data dependencies.
     9) Version the loop under the condition of the runtime alias checks.
        Given that loop distribution generally introduces additional
        overhead, it is only useful if vectorization is achieved in a
        distributed loop.  We version the loop with the internal function
        call IFN_LOOP_DIST_ALIAS.  If no distributed loop can be
        vectorized, we simply remove the distributed loops and recover the
        original one.

   TODO:
     1) We only distribute innermost two-level loop nests now.  We should
        extend it to arbitrary loop nests in the future.
     2) We only fuse partitions in SCCs now.  A better fusion algorithm is
        desired to minimize loop overhead, maximize parallelism and
        maximize data reuse.  */

#include "config.h"
#define INCLUDE_ALGORITHM /* stable_sort */
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "cfghooks.h"
#include "tree-pass.h"
#include "ssa.h"
#include "gimple-pretty-print.h"
#include "fold-const.h"
#include "cfganal.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "stor-layout.h"
#include "tree-cfg.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop.h"
#include "tree-into-ssa.h"
#include "tree-ssa.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "params.h"
#include "tree-vectorizer.h"


#define MAX_DATAREFS_NUM \
  ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))

/* Threshold controlling the number of distributed partitions.  Given that
   it may become unnecessary if a memory stream cost model is invented in
   the future, we define it as a temporary macro rather than a parameter.  */
#define NUM_PARTITION_THRESHOLD (4)
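/* As a sketch of steps 5)-9) above (illustrative only: the function, arrays
   and bounds are hypothetical, and the IFN_LOOP_DIST_ALIAS call is shown as
   a plain condition), a loop like

   |void f (int *p, int *q, int n)
   |{
   |  for (int i = 0; i < n; i++)
   |    {
   |      p[i] = 0;
   |      q[i] = q[i] + 1;
   |    }
   |}

   has a dependence between P[I] and Q[I] that is unknown at compilation
   time.  Distribution versions the loop on a runtime alias check,
   conceptually:

   |if (p + n <= q || q + n <= p)        // IFN_LOOP_DIST_ALIAS
   |  {
   |    memset (p, 0, n * sizeof (int)); // distributed partition
   |    for (int i = 0; i < n; i++)      // distributed partition
   |      q[i] = q[i] + 1;
   |  }
   |else
   |  for (int i = 0; i < n; i++)        // original loop
   |    {
   |      p[i] = 0;
   |      q[i] = q[i] + 1;
   |    }  */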
/* Hashtable helpers.  */

struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
{
  static inline hashval_t hash (const data_dependence_relation *);
  static inline bool equal (const data_dependence_relation *,
			    const data_dependence_relation *);
};

/* Hash function for data dependence.  */

inline hashval_t
ddr_hasher::hash (const data_dependence_relation *ddr)
{
  inchash::hash h;
  h.add_ptr (DDR_A (ddr));
  h.add_ptr (DDR_B (ddr));
  return h.end ();
}

/* Hash table equality function for data dependence.  */

inline bool
ddr_hasher::equal (const data_dependence_relation *ddr1,
		   const data_dependence_relation *ddr2)
{
  return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
}

/* The loop (nest) to be distributed.  */
static vec<loop_p> loop_nest;

/* Vector of data references in the loop to be distributed.  */
static vec<data_reference_p> datarefs_vec;

/* Store index of data reference in aux field.  */
#define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)

/* Hash table for data dependence relation in the loop to be distributed.  */
static hash_table<ddr_hasher> *ddrs_table;

/* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
struct rdg_vertex
{
  /* The statement represented by this vertex.  */
  gimple *stmt;

  /* Vector of data-references in this statement.  */
  vec<data_reference_p> datarefs;

  /* True when the statement contains a write to memory.  */
  bool has_mem_write;

  /* True when the statement contains a read from memory.  */
  bool has_mem_reads;
};

#define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
#define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
#define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
#define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
#define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
#define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
#define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
#define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
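/* For example (an illustrative loop, not taken from a testcase), in

   |for (i = 0; i < n; i++)
   |  {
   |    t = a[i];      // S1
   |    if (t > 0)     // S2
   |      b[i] = t;    // S3
   |  }

   the RDG has flow (RAW) edges S1 -> S2 and S1 -> S3 for the SSA def-use
   of T, and a control edge S2 -> S3 because S3 executes conditionally on
   the predicate of S2.  */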
/* Data dependence type.  */

enum rdg_dep_type
{
  /* Read After Write (RAW).  */
  flow_dd = 'f',

  /* Control dependence (execute conditional on).  */
  control_dd = 'c'
};

/* Dependence information attached to an edge of the RDG.  */

struct rdg_edge
{
  /* Type of the dependence.  */
  enum rdg_dep_type type;
};

#define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type

/* Dump vertex I in RDG to FILE.  */

static void
dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
{
  struct vertex *v = &(rdg->vertices[i]);
  struct graph_edge *e;

  fprintf (file, "(vertex %d: (%s%s) (in:", i,
	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");

  if (v->pred)
    for (e = v->pred; e; e = e->pred_next)
      fprintf (file, " %d", e->src);

  fprintf (file, ") (out:");

  if (v->succ)
    for (e = v->succ; e; e = e->succ_next)
      fprintf (file, " %d", e->dest);

  fprintf (file, ")\n");
  print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
  fprintf (file, ")\n");
}

/* Call dump_rdg_vertex on stderr.  */

DEBUG_FUNCTION void
debug_rdg_vertex (struct graph *rdg, int i)
{
  dump_rdg_vertex (stderr, rdg, i);
}

/* Dump the reduced dependence graph RDG to FILE.  */

static void
dump_rdg (FILE *file, struct graph *rdg)
{
  fprintf (file, "(rdg\n");
  for (int i = 0; i < rdg->n_vertices; i++)
    dump_rdg_vertex (file, rdg, i);
  fprintf (file, ")\n");
}

/* Call dump_rdg on stderr.  */

DEBUG_FUNCTION void
debug_rdg (struct graph *rdg)
{
  dump_rdg (stderr, rdg);
}

static void
dot_rdg_1 (FILE *file, struct graph *rdg)
{
  int i;
  pretty_printer buffer;
  pp_needs_newline (&buffer) = false;
  buffer.buffer->stream = file;

  fprintf (file, "digraph RDG {\n");

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      fprintf (file, "%d [label=\"[%d] ", i, i);
      pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
      pp_flush (&buffer);
      fprintf (file, "\"]\n");

      /* Highlight reads from memory.  */
      if (RDG_MEM_READS_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=green]\n", i);

      /* Highlight stores to memory.  */
      if (RDG_MEM_WRITE_STMT (rdg, i))
	fprintf (file, "%d [style=filled, fillcolor=red]\n", i);

      if (v->succ)
	for (e = v->succ; e; e = e->succ_next)
	  switch (RDGE_TYPE (e))
	    {
	    case flow_dd:
	      /* These are the most common dependences; print them without
		 a label.  */
	      fprintf (file, "%d -> %d \n", i, e->dest);
	      break;

	    case control_dd:
	      fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
	      break;

	    default:
	      gcc_unreachable ();
	    }
    }

  fprintf (file, "}\n\n");
}

/* Display the Reduced Dependence Graph using dotty.  */

DEBUG_FUNCTION void
dot_rdg (struct graph *rdg)
{
  /* When debugging, you may want to enable the following code.  */
#ifdef HAVE_POPEN
  FILE *file = popen ("dot -Tx11", "w");
  if (!file)
    return;
  dot_rdg_1 (file, rdg);
  fflush (file);
  close (fileno (file));
  pclose (file);
#else
  dot_rdg_1 (stderr, rdg);
#endif
}

/* Returns the index of STMT in RDG.  */

static int
rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
{
  int index = gimple_uid (stmt);
  gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
  return index;
}

/* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
   the index of DEF in RDG.  */

static void
create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
{
  use_operand_p imm_use_p;
  imm_use_iterator iterator;

  FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
    {
      struct graph_edge *e;
      int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));

      if (use < 0)
	continue;

      e = add_edge (rdg, idef, use);
      e->data = XNEW (struct rdg_edge);
      RDGE_TYPE (e) = flow_dd;
    }
}

/* Creates an edge for the control dependences of BB to the vertex V.  */
static void
create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
				    int v, control_dependences *cd)
{
  bitmap_iterator bi;
  unsigned edge_n;
  EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
			    0, edge_n, bi)
    {
      basic_block cond_bb = cd->get_edge_src (edge_n);
      gimple *stmt = last_stmt (cond_bb);
      if (stmt && is_ctrl_stmt (stmt))
	{
	  struct graph_edge *e;
	  int c = rdg_vertex_for_stmt (rdg, stmt);
	  if (c < 0)
	    continue;

	  e = add_edge (rdg, c, v);
	  e->data = XNEW (struct rdg_edge);
	  RDGE_TYPE (e) = control_dd;
	}
    }
}

/* Creates the flow dependence edges of the reduced dependence graph RDG.  */

static void
create_rdg_flow_edges (struct graph *rdg)
{
  int i;
  def_operand_p def_p;
  ssa_op_iter iter;

  for (i = 0; i < rdg->n_vertices; i++)
    FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
			      iter, SSA_OP_DEF)
      create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
}

/* Creates the control dependence edges of the reduced dependence graph
   RDG.  */

static void
create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  edge_iterator ei;
	  edge e;
	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
	    if (flow_bb_inside_loop_p (loop, e->src))
	      create_edge_for_control_dependence (rdg, e->src, i, cd);
	}
      else
	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
    }
}

/* Build the vertices of the reduced dependence graph RDG.  Return false
   if that failed.  */

static bool
create_rdg_vertices (struct graph *rdg, vec<gimple *> stmts, loop_p loop)
{
  int i;
  gimple *stmt;

  FOR_EACH_VEC_ELT (stmts, i, stmt)
    {
      struct vertex *v = &(rdg->vertices[i]);

      /* Record statement to vertex mapping.  */
      gimple_set_uid (stmt, i);

      v->data = XNEW (struct rdg_vertex);
      RDGV_STMT (v) = stmt;
      RDGV_DATAREFS (v).create (0);
      RDGV_HAS_MEM_WRITE (v) = false;
      RDGV_HAS_MEM_READS (v) = false;
      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      unsigned drp = datarefs_vec.length ();
      if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
	return false;
      for (unsigned j = drp; j < datarefs_vec.length (); ++j)
	{
	  data_reference_p dr = datarefs_vec[j];
	  if (DR_IS_READ (dr))
	    RDGV_HAS_MEM_READS (v) = true;
	  else
	    RDGV_HAS_MEM_WRITE (v) = true;
	  RDGV_DATAREFS (v).safe_push (dr);
	}
    }
  return true;
}

/* Array mapping basic block's index to its topological order.  */
static int *bb_top_order_index;
/* And size of the array.  */
static int bb_top_order_index_size;

/* If X has a smaller topological sort number than Y, returns -1;
   if greater, returns 1.  */

static int
bb_top_order_cmp (const void *x, const void *y)
{
  basic_block bb1 = *(const basic_block *) x;
  basic_block bb2 = *(const basic_block *) y;

  gcc_assert (bb1->index < bb_top_order_index_size
	      && bb2->index < bb_top_order_index_size);
  gcc_assert (bb1 == bb2
	      || bb_top_order_index[bb1->index]
		 != bb_top_order_index[bb2->index]);

  return (bb_top_order_index[bb1->index] - bb_top_order_index[bb2->index]);
}

/* Initialize STMTS with all the statements of LOOP.  We use topological
   order to discover all statements.  The order is important because
   generate_loops_for_partition uses the same traversal for identifying
   statements in loop copies.  */

static void
stmts_from_loop (struct loop *loop, vec<gimple *> *stmts)
{
  unsigned int i;
  basic_block *bbs = get_loop_body_in_custom_order (loop, bb_top_order_cmp);

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
	  stmts->safe_push (bsi.phi ());

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	   gsi_next (&bsi))
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
	    stmts->safe_push (stmt);
	}
    }

  free (bbs);
}
/* Free the reduced dependence graph RDG.  */

static void
free_rdg (struct graph *rdg)
{
  int i;

  for (i = 0; i < rdg->n_vertices; i++)
    {
      struct vertex *v = &(rdg->vertices[i]);
      struct graph_edge *e;

      for (e = v->succ; e; e = e->succ_next)
	free (e->data);

      if (v->data)
	{
	  gimple_set_uid (RDGV_STMT (v), -1);
	  (RDGV_DATAREFS (v)).release ();
	  free (v->data);
	}
    }

  free_graph (rdg);
}

/* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
   LOOP, and one edge per flow dependence or control dependence from control
   dependence CD.  While visiting each statement, data references are also
   collected and recorded in the global data DATAREFS_VEC.  */

static struct graph *
build_rdg (struct loop *loop, control_dependences *cd)
{
  struct graph *rdg;

  /* Create the RDG vertices from the stmts of the loop nest.  */
  auto_vec<gimple *, 10> stmts;
  stmts_from_loop (loop, &stmts);
  rdg = new_graph (stmts.length ());
  if (!create_rdg_vertices (rdg, stmts, loop))
    {
      free_rdg (rdg);
      return NULL;
    }
  stmts.release ();

  create_rdg_flow_edges (rdg);
  if (cd)
    create_rdg_cd_edges (rdg, cd, loop);

  return rdg;
}


/* Kind of distributed loop.  */
enum partition_kind {
    PKIND_NORMAL,
    /* Partial memset stands for a partition that can be distributed into a
       loop of memset calls, rather than a single memset call.  It's handled
       just like a normal partition, i.e., distributed as a separate loop; no
       memset call is generated.

       Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
       loop nest as deeply as possible.  As a result, parloop achieves better
       parallelization by parallelizing the deeper loop nest.  This hack
       should be unnecessary and removed once distributed memset can be
       understood and analyzed in data reference analysis.  See PR82604 for
       more.  */
    PKIND_PARTIAL_MEMSET,
    PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
};

/* Type of distributed loop.  */
enum partition_type {
    /* The distributed loop can be executed in parallel.  */
    PTYPE_PARALLEL = 0,
    /* The distributed loop has to be executed sequentially.  */
    PTYPE_SEQUENTIAL
};
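/* Illustrative examples of the partition kinds above, assuming 4-byte
   elements and hypothetical arrays A and B (see classify_partition for the
   actual classification logic):

   |for (i = 0; i < n; i++)      // PKIND_MEMSET: becomes
   |  a[i] = 0;                  //   memset (a, 0, n * 4)

   |for (i = 0; i < n; i++)      // PKIND_MEMCPY: becomes
   |  a[i] = b[i];               //   memcpy (a, b, n * 4)

   |for (i = 0; i < n; i++)      // PKIND_PARTIAL_MEMSET: each row's prefix
   |  for (j = 0; j < m/2; j++)  //   is consecutive but the whole nest is
   |    a[i][j] = 0;             //   not, so it stays a distributed loop.  */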
/* Builtin info for loop distribution.  */
struct builtin_info
{
  /* The data references that a kind != PKIND_NORMAL partition is about.  */
  data_reference_p dst_dr;
  data_reference_p src_dr;
  /* Base address and size of memory objects operated by the builtin.  Note
     both dest and source memory objects must have the same size.  */
  tree dst_base;
  tree src_base;
  tree size;
  /* Base and offset part of dst_base after stripping constant offset.  This
     is only used in memset builtin distribution for now.  */
  tree dst_base_base;
  unsigned HOST_WIDE_INT dst_base_offset;
};

/* Partition for loop distribution.  */
struct partition
{
  /* Statements of the partition.  */
  bitmap stmts;
  /* True if the partition defines a variable which is used outside of the
     loop.  */
  bool reduction_p;
  enum partition_kind kind;
  enum partition_type type;
  /* Data references in the partition.  */
  bitmap datarefs;
  /* Information of a builtin partition.  */
  struct builtin_info *builtin;
};


/* Allocate and initialize a partition.  */

static partition *
partition_alloc (void)
{
  partition *partition = XCNEW (struct partition);
  partition->stmts = BITMAP_ALLOC (NULL);
  partition->reduction_p = false;
  partition->kind = PKIND_NORMAL;
  partition->datarefs = BITMAP_ALLOC (NULL);
  return partition;
}

/* Free PARTITION.  */

static void
partition_free (partition *partition)
{
  BITMAP_FREE (partition->stmts);
  BITMAP_FREE (partition->datarefs);
  if (partition->builtin)
    free (partition->builtin);

  free (partition);
}

/* Returns true if the partition can be generated as a builtin.  */

static bool
partition_builtin_p (partition *partition)
{
  return partition->kind > PKIND_PARTIAL_MEMSET;
}

/* Returns true if the partition contains a reduction.  */

static bool
partition_reduction_p (partition *partition)
{
  return partition->reduction_p;
}

/* Partitions are fused for different reasons.  */
enum fuse_type
{
  FUSE_NON_BUILTIN = 0,
  FUSE_REDUCTION = 1,
  FUSE_SHARE_REF = 2,
  FUSE_SAME_SCC = 3,
  FUSE_FINALIZE = 4
};
/* Descriptions of the different fusing reasons.  */
static const char *fuse_message[] = {
  "they are non-builtins",
  "they have reductions",
  "they have shared memory refs",
  "they are in the same dependence scc",
  "there is no point to distribute loop"};

static void
update_type_for_merge (struct graph *, partition *, partition *);

/* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
   graph; if it is non-NULL, we update the type of the result partition.  */

static void
partition_merge_into (struct graph *rdg, partition *dest,
		      partition *partition, enum fuse_type ft)
{
  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
      fprintf (dump_file, "  Part 1: ");
      dump_bitmap (dump_file, dest->stmts);
      fprintf (dump_file, "  Part 2: ");
      dump_bitmap (dump_file, partition->stmts);
    }

  dest->kind = PKIND_NORMAL;
  if (dest->type == PTYPE_PARALLEL)
    dest->type = partition->type;

  bitmap_ior_into (dest->stmts, partition->stmts);
  if (partition_reduction_p (partition))
    dest->reduction_p = true;

  /* Further check if any data dependence prevents us from executing the
     new partition in parallel.  */
  if (dest->type == PTYPE_PARALLEL && rdg != NULL)
    update_type_for_merge (rdg, dest, partition);

  bitmap_ior_into (dest->datarefs, partition->datarefs);
}


/* Returns true when DEF is an SSA_NAME defined in LOOP and used after
   the LOOP.  */

static bool
ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
{
  imm_use_iterator imm_iter;
  use_operand_p use_p;

  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
    {
      if (is_gimple_debug (USE_STMT (use_p)))
	continue;

      basic_block use_bb = gimple_bb (USE_STMT (use_p));
      if (!flow_bb_inside_loop_p (loop, use_bb))
	return true;
    }

  return false;
}
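/* For instance (illustrative only), in

   |s = 0;
   |for (i = 0; i < n; i++)
   |  {
   |    a[i] = 0;     // partition 1
   |    s += b[i];    // partition 2: defines S, used after the loop
   |  }
   |use (s);

   the statement computing S defines a value used outside of the loop, so
   its partition has reduction_p set and must be generated as the last,
   non-copied partition; see the assertion in generate_code_for_partition.  */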
/* Returns true when STMT defines a scalar variable used after the
   loop LOOP.  */

static bool
stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
{
  def_operand_p def_p;
  ssa_op_iter op_iter;

  if (gimple_code (stmt) == GIMPLE_PHI)
    return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);

  FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
    if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
      return true;

  return false;
}

/* Return a copy of LOOP placed before LOOP.  */

static struct loop *
copy_loop_before (struct loop *loop)
{
  struct loop *res;
  edge preheader = loop_preheader_edge (loop);

  initialize_original_copy_tables ();
  res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
  gcc_assert (res != NULL);
  free_original_copy_tables ();
  delete_update_ssa ();

  return res;
}

/* Creates an empty basic block after LOOP.  */

static void
create_bb_after_loop (struct loop *loop)
{
  edge exit = single_exit (loop);

  if (!exit)
    return;

  split_edge (exit);
}
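/* A sketch of the code generation scheme (with hypothetical statements S1
   and S2): for partitions {S1} and {S2} of

   |for (i = 0; i < n; i++)
   |  {
   |    S1;
   |    S2;
   |  }

   the loop is first duplicated before itself for partition {S1} (COPY_P is
   true below), while the original loop is reused for the last partition
   {S2}.  In each copy, generate_loops_for_partition then removes the
   statements not flagged in the partition's bitmap:

   |for (i = 0; i < n; i++)   // copy, keeps S1
   |  S1;
   |for (i = 0; i < n; i++)   // original, keeps S2
   |  S2;  */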
/* Generate code for PARTITION from the code in LOOP.  The loop is
   copied when COPY_P is true.  All the statements not flagged in the
   PARTITION bitmap are removed from the loop or from its copy.  The
   statements are indexed in sequence inside a basic block, and the
   basic blocks of a loop are taken in dom order.  */

static void
generate_loops_for_partition (struct loop *loop, partition *partition,
			      bool copy_p)
{
  unsigned i;
  basic_block *bbs;

  if (copy_p)
    {
      int orig_loop_num = loop->orig_loop_num;
      loop = copy_loop_before (loop);
      gcc_assert (loop != NULL);
      loop->orig_loop_num = orig_loop_num;
      create_preheader (loop, CP_SIMPLE_PREHEADERS);
      create_bb_after_loop (loop);
    }
  else
    {
      /* Origin number is set to the new versioned loop's num.  */
      gcc_assert (loop->orig_loop_num != loop->num);
    }

  /* Remove stmts not in the PARTITION bitmap.  */
  bbs = get_loop_body_in_dom_order (loop);

  if (MAY_HAVE_DEBUG_BIND_STMTS)
    for (i = 0; i < loop->num_nodes; i++)
      {
	basic_block bb = bbs[i];

	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gphi *phi = bsi.phi ();
	    if (!virtual_operand_p (gimple_phi_result (phi))
		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	      reset_debug_uses (phi);
	  }

	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
	     gsi_next (&bsi))
	  {
	    gimple *stmt = gsi_stmt (bsi);
	    if (gimple_code (stmt) != GIMPLE_LABEL
		&& !is_gimple_debug (stmt)
		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	      reset_debug_uses (stmt);
	  }
      }

  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];
      edge inner_exit = NULL;

      if (loop != bb->loop_father)
	inner_exit = single_exit (bb->loop_father);

      for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
	{
	  gphi *phi = bsi.phi ();
	  if (!virtual_operand_p (gimple_phi_result (phi))
	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
	    remove_phi_node (&bsi, true);
	  else
	    gsi_next (&bsi);
	}

      for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
	{
	  gimple *stmt = gsi_stmt (bsi);
	  if (gimple_code (stmt) != GIMPLE_LABEL
	      && !is_gimple_debug (stmt)
	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
	    {
	      /* In distribution of a loop nest, if bb is an inner loop's
		 exit_bb, we choose its exit edge/path in order to avoid
		 generating an infinite loop.  For all other cases, we
		 choose an arbitrary path through the empty CFG part that
		 this unnecessary control stmt controls.  */
	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
		{
		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
		    gimple_cond_make_true (cond_stmt);
		  else
		    gimple_cond_make_false (cond_stmt);
		  update_stmt (stmt);
		}
	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
		{
		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
		  gimple_switch_set_index
		      (switch_stmt,
		       CASE_LOW (gimple_switch_label (switch_stmt, 1)));
		  update_stmt (stmt);
		}
	      else
		{
		  unlink_stmt_vdef (stmt);
		  gsi_remove (&bsi, true);
		  release_defs (stmt);
		  continue;
		}
	    }
	  gsi_next (&bsi);
	}
    }

  free (bbs);
}

/* If VAL's memory representation contains the same value in all bytes,
   return that value, otherwise return -1.
   E.g. for 0x24242424 return 0x24, for IEEE double
   747708026454360457216.0 return 0x44, etc.  */

static int
const_with_all_bytes_same (tree val)
{
  unsigned char buf[64];
  int i, len;

  if (integer_zerop (val)
      || (TREE_CODE (val) == CONSTRUCTOR
	  && !TREE_CLOBBER_P (val)
	  && CONSTRUCTOR_NELTS (val) == 0))
    return 0;

  if (real_zerop (val))
    {
      /* Only return 0 for +0.0, not for -0.0, which doesn't have
	 an all bytes same memory representation.  Don't transform
	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
      switch (TREE_CODE (val))
	{
	case REAL_CST:
	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
	    return 0;
	  break;
	case COMPLEX_CST:
	  if (!const_with_all_bytes_same (TREE_REALPART (val))
	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
	    return 0;
	  break;
	case VECTOR_CST:
	  {
	    unsigned int count = vector_cst_encoded_nelts (val);
	    unsigned int j;
	    for (j = 0; j < count; ++j)
	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
		break;
	    if (j == count)
	      return 0;
	    break;
	  }
	default:
	  break;
	}
    }

  if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
    return -1;

  len = native_encode_expr (val, buf, sizeof (buf));
  if (len == 0)
    return -1;
  for (i = 1; i < len; i++)
    if (buf[i] != buf[0])
      return -1;
  return buf[0];
}
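/* For example (illustrative), this is what lets a store of a multi-byte
   constant be implemented by memset in generate_memset_builtin below:

   |for (i = 0; i < n; i++)   // 0x24242424 has all bytes equal 0x24,
   |  a[i] = 0x24242424;      // so this can become
   |                          //   memset (a, 0x24, n * 4)

   whereas for -0.0 or 0x12345678 the function returns -1 and no memset
   can be generated.  */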
/* Generate a call to memset for PARTITION in LOOP.  */

static void
generate_memset_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  tree mem, fn, nb_bytes;
  tree val;
  struct builtin_info *builtin = partition->builtin;
  gimple *fn_call;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = builtin->size;
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  mem = builtin->dst_base;
  mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);

  /* This exactly matches the pattern recognition in classify_partition.  */
  val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
  /* Handle constants like 0x15151515 and similarly
     floating point constants etc. where all bytes are the same.  */
  int bytev = const_with_all_bytes_same (val);
  if (bytev != -1)
    val = build_int_cst (integer_type_node, bytev);
  else if (TREE_CODE (val) == INTEGER_CST)
    val = fold_convert (integer_type_node, val);
  else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
    {
      tree tem = make_ssa_name (integer_type_node);
      gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
      gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
      val = tem;
    }

  fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
  fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "generated memset");
      if (bytev == 0)
	fprintf (dump_file, " zero\n");
      else
	fprintf (dump_file, "\n");
    }
}
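/* E.g. (illustrative, assuming the partition was classified PKIND_MEMSET),
   for

   |int arr[100][100];
   |for (i = 0; i < 100; i++)
   |  for (j = 0; j < 100; j++)
   |    arr[i][j] = 0;

   BUILTIN->dst_base is &arr and BUILTIN->size is 40000, so the code above
   emits the equivalent of memset (&arr, 0, 40000) in the preheader; the
   loop nest itself is removed afterwards (see destroy_loop below).  */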
/* Generate a call to memcpy for PARTITION in LOOP.  */

static void
generate_memcpy_builtin (struct loop *loop, partition *partition)
{
  gimple_stmt_iterator gsi;
  gimple *fn_call;
  tree dest, src, fn, nb_bytes;
  enum built_in_function kind;
  struct builtin_info *builtin = partition->builtin;

  /* The new statements will be placed before LOOP.  */
  gsi = gsi_last_bb (loop_preheader_edge (loop)->src);

  nb_bytes = builtin->size;
  nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
				       false, GSI_CONTINUE_LINKING);
  dest = builtin->dst_base;
  src = builtin->src_base;
  if (partition->kind == PKIND_MEMCPY
      || ! ptr_derefs_may_alias_p (dest, src))
    kind = BUILT_IN_MEMCPY;
  else
    kind = BUILT_IN_MEMMOVE;

  dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
				   false, GSI_CONTINUE_LINKING);
  src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
				  false, GSI_CONTINUE_LINKING);
  fn = build_fold_addr_expr (builtin_decl_implicit (kind));
  fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
  gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      if (kind == BUILT_IN_MEMCPY)
	fprintf (dump_file, "generated memcpy\n");
      else
	fprintf (dump_file, "generated memmove\n");
    }
}

/* Remove and destroy the loop LOOP.  */

static void
destroy_loop (struct loop *loop)
{
  unsigned nbbs = loop->num_nodes;
  edge exit = single_exit (loop);
  basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
  basic_block *bbs;
  unsigned i;

  bbs = get_loop_body_in_dom_order (loop);

  redirect_edge_pred (exit, src);
  exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
  exit->flags |= EDGE_FALLTHRU;
  cancel_loop_tree (loop);
  rescan_loop_exit (exit, false, true);

  i = nbbs;
  do
    {
      /* We have made sure to not leave any dangling uses of SSA
	 names defined in the loop.  With the exception of virtuals.
	 Make sure we replace all uses of virtual defs that will remain
	 outside of the loop with the bare symbol as delete_basic_block
	 will release them.  */
      --i;
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    mark_virtual_phi_result_for_renaming (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  tree vdef = gimple_vdef (stmt);
	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
	    mark_virtual_operand_for_renaming (vdef);
	}
      delete_basic_block (bbs[i]);
    }
  while (i != 0);

  free (bbs);

  set_immediate_dominator (CDI_DOMINATORS, dest,
			   recompute_dominator (CDI_DOMINATORS, dest));
}
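/* For illustration, the kind selection in generate_memcpy_builtin above is
   conservative: given hypothetical pointers P and Q in

   |for (i = 0; i < n; i++)
   |  p[i] = q[i];

   memcpy is used only if the partition was classified PKIND_MEMCPY or
   ptr_derefs_may_alias_p proves that P and Q cannot reference overlapping
   memory; otherwise memmove is used.  */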
/* Generates code for PARTITION.  Return whether LOOP needs to be
   destroyed.  */

static bool
generate_code_for_partition (struct loop *loop,
			     partition *partition, bool copy_p)
{
  switch (partition->kind)
    {
    case PKIND_NORMAL:
    case PKIND_PARTIAL_MEMSET:
      /* Reductions all have to be in the last partition.  */
      gcc_assert (!partition_reduction_p (partition)
		  || !copy_p);
      generate_loops_for_partition (loop, partition, copy_p);
      return false;

    case PKIND_MEMSET:
      generate_memset_builtin (loop, partition);
      break;

    case PKIND_MEMCPY:
    case PKIND_MEMMOVE:
      generate_memcpy_builtin (loop, partition);
      break;

    default:
      gcc_unreachable ();
    }

  /* Common tail for partitions we turn into a call.  If this was the last
     partition for which we generate code, we have to destroy the loop.  */
  if (!copy_p)
    return true;
  return false;
}

/* Return the data dependence relation for data references A and B.  The two
   data references must be in lexicographic order with respect to the reduced
   dependence graph RDG.  We first try to find the ddr in the global hash
   table; if it doesn't exist there, compute the ddr and cache it.  */

static data_dependence_relation *
get_data_dependence (struct graph *rdg, data_reference_p a, data_reference_p b)
{
  struct data_dependence_relation ent, **slot;
  struct data_dependence_relation *ddr;

  gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
  gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
  ent.a = a;
  ent.b = b;
  slot = ddrs_table->find_slot (&ent, INSERT);
  if (*slot == NULL)
    {
      ddr = initialize_data_dependence_relation (a, b, loop_nest);
      compute_affine_dependence (ddr, loop_nest[0]);
      *slot = ddr;
    }

  return *slot;
}
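/* Illustrative use of the cache (DR1 a write, DR1/DR2 in RDG order):

   |ddr = get_data_dependence (rdg, dr1, dr2);   // computed and cached
   |ddr = get_data_dependence (rdg, dr1, dr2);   // found in ddrs_table

   Requiring lexicographic order keeps (DR1, DR2) and (DR2, DR1) from
   creating two entries for the same pair of references.  */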
/* In the reduced dependence graph RDG for loop distribution, return true if
   the dependence between references DR1 and DR2 leads to a dependence cycle
   and such a dependence cycle can't be resolved by a runtime alias check.  */

static bool
data_dep_in_cycle_p (struct graph *rdg,
		     data_reference_p dr1, data_reference_p dr2)
{
  struct data_dependence_relation *ddr;

  /* Re-shuffle data-refs to be in topological order.  */
  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
    std::swap (dr1, dr2);

  ddr = get_data_dependence (rdg, dr1, dr2);

  /* In case of no data dependence.  */
  if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
    return false;
  /* For unknown data dependence or known data dependence which can't be
     expressed in a classic distance vector, we check if it can be resolved
     by a runtime alias check.  If yes, we still consider the data
     dependence as one that won't introduce a data dependence cycle.  */
  else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
	   || DDR_NUM_DIST_VECTS (ddr) == 0)
    return !runtime_alias_check_p (ddr, NULL, true);
  else if (DDR_NUM_DIST_VECTS (ddr) > 1)
    return true;
  else if (DDR_REVERSED_P (ddr)
	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
    return false;

  return true;
}

/* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
   PARTITION1's type after merging PARTITION2 into PARTITION1.  */

static void
update_type_for_merge (struct graph *rdg,
		       partition *partition1, partition *partition2)
{
  unsigned i, j;
  bitmap_iterator bi, bj;
  data_reference_p dr1, dr2;

  EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
    {
      unsigned start = (partition1 == partition2) ? i + 1 : 0;

      dr1 = datarefs_vec[i];
      EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
	{
	  dr2 = datarefs_vec[j];
	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
	    continue;

	  /* The partition can only be executed sequentially if there is
	     any data dependence cycle.  */
	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
	    {
	      partition1->type = PTYPE_SEQUENTIAL;
	      return;
	    }
	}
    }
}
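/* As an example (illustrative), if PARTITION1 contains the write to A and
   PARTITION2 the read from A in

   |for (i = 1; i < n; i++)
   |  {
   |    a[i] = b[i];        // partition 1
   |    c[i] = a[i - 1];    // partition 2
   |  }

   the write/read pair has distance vector (1), data_dep_in_cycle_p
   returns true, and the merged partition is marked PTYPE_SEQUENTIAL.  */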
/* Returns a partition with all the statements needed for computing
   the vertex V of the RDG, also including the loop exit conditions.  */

static partition *
build_rdg_partition_for_vertex (struct graph *rdg, int v)
{
  partition *partition = partition_alloc ();
  auto_vec<int, 3> nodes;
  unsigned i, j;
  int x;
  data_reference_p dr;

  graphds_dfs (rdg, &v, 1, &nodes, false, NULL);

  FOR_EACH_VEC_ELT (nodes, i, x)
    {
      bitmap_set_bit (partition->stmts, x);

      for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
	{
	  unsigned idx = (unsigned) DR_INDEX (dr);
	  gcc_assert (idx < datarefs_vec.length ());

	  /* The partition can only be executed sequentially if there is
	     any unknown data reference.  */
	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
	      || !DR_INIT (dr) || !DR_STEP (dr))
	    partition->type = PTYPE_SEQUENTIAL;

	  bitmap_set_bit (partition->datarefs, idx);
	}
    }

  if (partition->type == PTYPE_SEQUENTIAL)
    return partition;

  /* Further check if any data dependence prevents us from executing the
     partition in parallel.  */
  update_type_for_merge (rdg, partition, partition);

  return partition;
}

/* Given PARTITION of LOOP and RDG, record single load/store data references
   for a builtin partition in SRC_DR/DST_DR, return false if there are no
   such data references.  */

static bool
find_single_drs (struct loop *loop, struct graph *rdg, partition *partition,
		 data_reference_p *dst_dr, data_reference_p *src_dr)
{
  unsigned i;
  data_reference_p single_ld = NULL, single_st = NULL;
  bitmap_iterator bi;

  EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
    {
      gimple *stmt = RDG_STMT (rdg, i);
      data_reference_p dr;

      if (gimple_code (stmt) == GIMPLE_PHI)
	continue;

      /* Any scalar stmts are ok.  */
      if (!gimple_vuse (stmt))
	continue;

      /* Otherwise just regular loads/stores.  */
      if (!gimple_assign_single_p (stmt))
	return false;

      /* But exactly one store and/or load.  */
      for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
	{
	  tree type = TREE_TYPE (DR_REF (dr));

	  /* The memset, memcpy and memmove library calls are only
	     able to deal with the generic address space.  */
             */
1343*38fd1498Szrj           if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1344*38fd1498Szrj             return false;
1345*38fd1498Szrj
1346*38fd1498Szrj           if (DR_IS_READ (dr))
1347*38fd1498Szrj             {
1348*38fd1498Szrj               if (single_ld != NULL)
1349*38fd1498Szrj                 return false;
1350*38fd1498Szrj               single_ld = dr;
1351*38fd1498Szrj             }
1352*38fd1498Szrj           else
1353*38fd1498Szrj             {
1354*38fd1498Szrj               if (single_st != NULL)
1355*38fd1498Szrj                 return false;
1356*38fd1498Szrj               single_st = dr;
1357*38fd1498Szrj             }
1358*38fd1498Szrj         }
1359*38fd1498Szrj     }
1360*38fd1498Szrj
1361*38fd1498Szrj   if (!single_st)
1362*38fd1498Szrj     return false;
1363*38fd1498Szrj
1364*38fd1498Szrj   /* Bail out if this is a bitfield memory reference.  */
1365*38fd1498Szrj   if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1366*38fd1498Szrj       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1367*38fd1498Szrj     return false;
1368*38fd1498Szrj
1369*38fd1498Szrj   /* The data reference must be executed exactly once per iteration of each
1370*38fd1498Szrj      loop in the loop nest.  We only need to check dominance information
1371*38fd1498Szrj      against the outermost one in a perfect loop nest because a bb can't
1372*38fd1498Szrj      dominate the outermost loop's latch without dominating the inner loop's.  */
1373*38fd1498Szrj   basic_block bb_st = gimple_bb (DR_STMT (single_st));
1374*38fd1498Szrj   if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1375*38fd1498Szrj     return false;
1376*38fd1498Szrj
1377*38fd1498Szrj   if (single_ld)
1378*38fd1498Szrj     {
1379*38fd1498Szrj       gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1380*38fd1498Szrj       /* Direct aggregate copy or via an SSA name temporary.  */
1381*38fd1498Szrj       if (load != store
1382*38fd1498Szrj           && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1383*38fd1498Szrj         return false;
1384*38fd1498Szrj
1385*38fd1498Szrj       /* Bail out if this is a bitfield memory reference.  */
1386*38fd1498Szrj       if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1387*38fd1498Szrj           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1388*38fd1498Szrj         return false;
1389*38fd1498Szrj
1390*38fd1498Szrj       /* The load and store must be in the same loop nest.  */
1391*38fd1498Szrj       basic_block bb_ld = gimple_bb (DR_STMT (single_ld));
1392*38fd1498Szrj       if (bb_st->loop_father != bb_ld->loop_father)
1393*38fd1498Szrj         return false;
1394*38fd1498Szrj
1395*38fd1498Szrj       /* The data reference must be executed exactly once per iteration.
1396*38fd1498Szrj          As with single_st, we only need to check against the outermost
1397*38fd1498Szrj          loop.  */
1398*38fd1498Szrj       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1399*38fd1498Szrj         return false;
1400*38fd1498Szrj
1401*38fd1498Szrj       edge e = single_exit (bb_st->loop_father);
1402*38fd1498Szrj       bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1403*38fd1498Szrj       bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1404*38fd1498Szrj       if (dom_ld != dom_st)
1405*38fd1498Szrj         return false;
1406*38fd1498Szrj     }
1407*38fd1498Szrj
1408*38fd1498Szrj   *src_dr = single_ld;
1409*38fd1498Szrj   *dst_dr = single_st;
1410*38fd1498Szrj   return true;
1411*38fd1498Szrj }
1412*38fd1498Szrj
1413*38fd1498Szrj /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1414*38fd1498Szrj    loops from inner to outer to see if the loop's step equals the access
1415*38fd1498Szrj    size at each level of the nest.  Return 2 if we can prove this for all
1416*38fd1498Szrj    loops in the nest; record the access base and size in BASE and SIZE;
1417*38fd1498Szrj    save the loop's step at each level of the nest in STEPS if it is not null.
   For example:
1418*38fd1498Szrj
1419*38fd1498Szrj      int arr[100][100][100];
1420*38fd1498Szrj      for (i = 0; i < 100; i++)        ;steps[2] = 40000
1421*38fd1498Szrj        for (j = 100; j > 0; j--)      ;steps[1] = -400
1422*38fd1498Szrj          for (k = 0; k < 100; k++)    ;steps[0] = 4
1423*38fd1498Szrj            arr[i][j - 1][k] = 0;      ;base = &arr, size = 4000000
1424*38fd1498Szrj
1425*38fd1498Szrj    Return 1 if we can prove the equality for the innermost loop, but not
1426*38fd1498Szrj    for all loops in the nest.  In this case, no information is recorded.
1427*38fd1498Szrj
1428*38fd1498Szrj    Return 0 if the equality can't be proven at any level.  */
1429*38fd1498Szrj
1430*38fd1498Szrj static int
1431*38fd1498Szrj compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1432*38fd1498Szrj                       tree *size, vec<tree> *steps = NULL)
1433*38fd1498Szrj {
1434*38fd1498Szrj   location_t loc = gimple_location (DR_STMT (dr));
1435*38fd1498Szrj   basic_block bb = gimple_bb (DR_STMT (dr));
1436*38fd1498Szrj   struct loop *loop = bb->loop_father;
1437*38fd1498Szrj   tree ref = DR_REF (dr);
1438*38fd1498Szrj   tree access_base = build_fold_addr_expr (ref);
1439*38fd1498Szrj   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1440*38fd1498Szrj   int res = 0;
1441*38fd1498Szrj
1442*38fd1498Szrj   do {
1443*38fd1498Szrj       tree scev_fn = analyze_scalar_evolution (loop, access_base);
1444*38fd1498Szrj       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1445*38fd1498Szrj         return res;
1446*38fd1498Szrj
1447*38fd1498Szrj       access_base = CHREC_LEFT (scev_fn);
1448*38fd1498Szrj       if (tree_contains_chrecs (access_base, NULL))
1449*38fd1498Szrj         return res;
1450*38fd1498Szrj
1451*38fd1498Szrj       tree scev_step = CHREC_RIGHT (scev_fn);
1452*38fd1498Szrj       /* Only support constant steps.  */
1453*38fd1498Szrj       if (TREE_CODE (scev_step) != INTEGER_CST)
1454*38fd1498Szrj         return res;
1455*38fd1498Szrj
1456*38fd1498Szrj       enum ev_direction access_dir = scev_direction (scev_fn);
1457*38fd1498Szrj       if (access_dir == EV_DIR_UNKNOWN)
1458*38fd1498Szrj         return res;
1459*38fd1498Szrj
1460*38fd1498Szrj       if (steps != NULL)
1461*38fd1498Szrj         steps->safe_push (scev_step);
1462*38fd1498Szrj
1463*38fd1498Szrj       scev_step = fold_convert_loc (loc, sizetype, scev_step);
1464*38fd1498Szrj       /* Compute the absolute value of the scev step.  */
1465*38fd1498Szrj       if (access_dir == EV_DIR_DECREASES)
1466*38fd1498Szrj         scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1467*38fd1498Szrj
1468*38fd1498Szrj       /* At each level, the scev step must equal the access size.  In other
1469*38fd1498Szrj          words, DR must access consecutive memory between loop iterations.  */
1470*38fd1498Szrj       if (!operand_equal_p (scev_step, access_size, 0))
1471*38fd1498Szrj         return res;
1472*38fd1498Szrj
1473*38fd1498Szrj       /* The access stride can be computed for the data reference at least
1474*38fd1498Szrj          for the innermost loop.  */
1475*38fd1498Szrj       res = 1;
1476*38fd1498Szrj
1477*38fd1498Szrj       /* Compute DR's execution times in the loop.  */
1478*38fd1498Szrj       tree niters = number_of_latch_executions (loop);
1479*38fd1498Szrj       niters = fold_convert_loc (loc, sizetype, niters);
1480*38fd1498Szrj       if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1481*38fd1498Szrj         niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1482*38fd1498Szrj
1483*38fd1498Szrj       /* Compute DR's overall access size in the loop.  */
1484*38fd1498Szrj       access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1485*38fd1498Szrj                                      niters, scev_step);
1486*38fd1498Szrj       /* Adjust the base address in case of a negative step.
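         For instance (an illustrative walk-through, not from the original
         sources), for the decreasing access

           int a[100];
           for (i = 99; i >= 0; i--)
             a[i] = 0;      ; scev_step = 4, overall access_size = 400

         access_base starts at &a[99]; the adjustment is 4 - 400 = -396
         bytes, which moves it to &a[0], the lowest accessed address.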
         */
1487*38fd1498Szrj       if (access_dir == EV_DIR_DECREASES)
1488*38fd1498Szrj         {
1489*38fd1498Szrj           tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1490*38fd1498Szrj                                       scev_step, access_size);
1491*38fd1498Szrj           access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1492*38fd1498Szrj         }
1493*38fd1498Szrj     } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1494*38fd1498Szrj
1495*38fd1498Szrj   *base = access_base;
1496*38fd1498Szrj   *size = access_size;
1497*38fd1498Szrj   /* The access stride can be computed at each level of the loop nest.  */
1498*38fd1498Szrj   return 2;
1499*38fd1498Szrj }
1500*38fd1498Szrj
1501*38fd1498Szrj /* Allocate and return a builtin struct.  Record information like DST_DR,
1502*38fd1498Szrj    SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
1503*38fd1498Szrj
1504*38fd1498Szrj static struct builtin_info *
1505*38fd1498Szrj alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1506*38fd1498Szrj                tree dst_base, tree src_base, tree size)
1507*38fd1498Szrj {
1508*38fd1498Szrj   struct builtin_info *builtin = XNEW (struct builtin_info);
1509*38fd1498Szrj   builtin->dst_dr = dst_dr;
1510*38fd1498Szrj   builtin->src_dr = src_dr;
1511*38fd1498Szrj   builtin->dst_base = dst_base;
1512*38fd1498Szrj   builtin->src_base = src_base;
1513*38fd1498Szrj   builtin->size = size;
1514*38fd1498Szrj   return builtin;
1515*38fd1498Szrj }
1516*38fd1498Szrj
1517*38fd1498Szrj /* Given data reference DR in loop nest LOOP, classify whether it forms a
1518*38fd1498Szrj    builtin memset call.  */
1519*38fd1498Szrj
1520*38fd1498Szrj static void
1521*38fd1498Szrj classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1522*38fd1498Szrj {
1523*38fd1498Szrj   gimple *stmt = DR_STMT (dr);
1524*38fd1498Szrj   tree base, size, rhs = gimple_assign_rhs1 (stmt);
1525*38fd1498Szrj
1526*38fd1498Szrj   if (const_with_all_bytes_same (rhs) == -1
1527*38fd1498Szrj       && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1528*38fd1498Szrj           || (TYPE_MODE (TREE_TYPE (rhs))
1529*38fd1498Szrj               != TYPE_MODE (unsigned_char_type_node))))
1530*38fd1498Szrj     return;
1531*38fd1498Szrj
1532*38fd1498Szrj   if (TREE_CODE (rhs) == SSA_NAME
1533*38fd1498Szrj       && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1534*38fd1498Szrj       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1535*38fd1498Szrj     return;
1536*38fd1498Szrj
1537*38fd1498Szrj   int res = compute_access_range (loop, dr, &base, &size);
1538*38fd1498Szrj   if (res == 0)
1539*38fd1498Szrj     return;
1540*38fd1498Szrj   if (res == 1)
1541*38fd1498Szrj     {
1542*38fd1498Szrj       partition->kind = PKIND_PARTIAL_MEMSET;
1543*38fd1498Szrj       return;
1544*38fd1498Szrj     }
1545*38fd1498Szrj
1546*38fd1498Szrj   poly_uint64 base_offset;
1547*38fd1498Szrj   unsigned HOST_WIDE_INT const_base_offset;
1548*38fd1498Szrj   tree base_base = strip_offset (base, &base_offset);
1549*38fd1498Szrj   if (!base_offset.is_constant (&const_base_offset))
1550*38fd1498Szrj     return;
1551*38fd1498Szrj
1552*38fd1498Szrj   struct builtin_info *builtin;
1553*38fd1498Szrj   builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1554*38fd1498Szrj   builtin->dst_base_base = base_base;
1555*38fd1498Szrj   builtin->dst_base_offset = const_base_offset;
1556*38fd1498Szrj   partition->builtin = builtin;
1557*38fd1498Szrj   partition->kind = PKIND_MEMSET;
1558*38fd1498Szrj }
1559*38fd1498Szrj
1560*38fd1498Szrj /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
1561*38fd1498Szrj    classify whether they form a builtin memcpy or memmove call.
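   As an illustration (not from the original sources): with distinct
   arrays

     for (i = 0; i < n; i++)
       a[i] = b[i];              ; no dependence -> memcpy

   whereas a self-copy whose load reads ahead of the store

     for (i = 0; i < n - 3; i++)
       a[i] = a[i + 3];          ; anti-dependence only -> memmove

   can still use memmove.  If instead the load reads values the loop
   itself has stored, e.g. a[i + 3] = a[i], no builtin is generated.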
   */
1562*38fd1498Szrj
1563*38fd1498Szrj static void
1564*38fd1498Szrj classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
1565*38fd1498Szrj                        data_reference_p dst_dr, data_reference_p src_dr)
1566*38fd1498Szrj {
1567*38fd1498Szrj   tree base, size, src_base, src_size;
1568*38fd1498Szrj   auto_vec<tree> dst_steps, src_steps;
1569*38fd1498Szrj
1570*38fd1498Szrj   /* Compute the access range of both the load and the store.  */
1571*38fd1498Szrj   int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1572*38fd1498Szrj   if (res != 2)
1573*38fd1498Szrj     return;
1574*38fd1498Szrj   res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1575*38fd1498Szrj   if (res != 2)
1576*38fd1498Szrj     return;
1577*38fd1498Szrj
1578*38fd1498Szrj   /* They must have the same access size.  */
1579*38fd1498Szrj   if (!operand_equal_p (size, src_size, 0))
1580*38fd1498Szrj     return;
1581*38fd1498Szrj
1582*38fd1498Szrj   /* The load and store in the loop nest must access memory in the same
1583*38fd1498Szrj      way, i.e., they must have the same steps in each loop of the nest.  */
1584*38fd1498Szrj   if (dst_steps.length () != src_steps.length ())
1585*38fd1498Szrj     return;
1586*38fd1498Szrj   for (unsigned i = 0; i < dst_steps.length (); ++i)
1587*38fd1498Szrj     if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1588*38fd1498Szrj       return;
1589*38fd1498Szrj
1590*38fd1498Szrj   /* Now check whether there is a dependence.  */
1591*38fd1498Szrj   ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1592*38fd1498Szrj
1593*38fd1498Szrj   /* Classify as memcpy if there is no dependence between load and store.  */
1594*38fd1498Szrj   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1595*38fd1498Szrj     {
1596*38fd1498Szrj       partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1597*38fd1498Szrj       partition->kind = PKIND_MEMCPY;
1598*38fd1498Szrj       return;
1599*38fd1498Szrj     }
1600*38fd1498Szrj
1601*38fd1498Szrj   /* Can't do memmove in case of an unknown dependence or a dependence
1602*38fd1498Szrj      without a classical distance vector.  */
1603*38fd1498Szrj   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1604*38fd1498Szrj       || DDR_NUM_DIST_VECTS (ddr) == 0)
1605*38fd1498Szrj     return;
1606*38fd1498Szrj
1607*38fd1498Szrj   unsigned i;
1608*38fd1498Szrj   lambda_vector dist_v;
1609*38fd1498Szrj   int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1610*38fd1498Szrj   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1611*38fd1498Szrj     {
1612*38fd1498Szrj       unsigned dep_lev = dependence_level (dist_v, num_lev);
1613*38fd1498Szrj       /* Can't do memmove if the load depends on the store.  */
1614*38fd1498Szrj       if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1615*38fd1498Szrj         return;
1616*38fd1498Szrj     }
1617*38fd1498Szrj
1618*38fd1498Szrj   partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1619*38fd1498Szrj   partition->kind = PKIND_MEMMOVE;
1620*38fd1498Szrj   return;
1621*38fd1498Szrj }
1622*38fd1498Szrj
1623*38fd1498Szrj /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
1624*38fd1498Szrj    For the moment we detect memset, memcpy and memmove patterns.  Bitmap
1625*38fd1498Szrj    STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
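   For example (illustrative only), in

     for (i = 0; i < n; i++)
       {
         sum = sum + a[i];       ; sum is used after the loop
         b[i] = 0;
       }

   the partition around the update of sum is marked as a reduction
   partition, unless that statement belongs to all partitions, and a
   reduction partition is never classified as a builtin.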
   */
1626*38fd1498Szrj
1627*38fd1498Szrj static void
1628*38fd1498Szrj classify_partition (loop_p loop, struct graph *rdg, partition *partition,
1629*38fd1498Szrj                     bitmap stmt_in_all_partitions)
1630*38fd1498Szrj {
1631*38fd1498Szrj   bitmap_iterator bi;
1632*38fd1498Szrj   unsigned i;
1633*38fd1498Szrj   data_reference_p single_ld = NULL, single_st = NULL;
1634*38fd1498Szrj   bool volatiles_p = false, has_reduction = false;
1635*38fd1498Szrj
1636*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1637*38fd1498Szrj     {
1638*38fd1498Szrj       gimple *stmt = RDG_STMT (rdg, i);
1639*38fd1498Szrj
1640*38fd1498Szrj       if (gimple_has_volatile_ops (stmt))
1641*38fd1498Szrj         volatiles_p = true;
1642*38fd1498Szrj
1643*38fd1498Szrj       /* If the stmt is not included by all partitions and it has uses
1644*38fd1498Szrj          outside of the loop, then mark the partition as a reduction.  */
1645*38fd1498Szrj       if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1646*38fd1498Szrj         {
1647*38fd1498Szrj           /* Due to a limitation in the transform phase we have to fuse all
1648*38fd1498Szrj              reduction partitions.  As a result, this could cancel valid
1649*38fd1498Szrj              loop distribution, especially for a loop whose induction
1650*38fd1498Szrj              variable is used outside of it.  To work around this issue, we
1651*38fd1498Szrj              skip marking a partition as a reduction if the reduction stmt
1652*38fd1498Szrj              belongs to all partitions.  In that case, the reduction will be
1653*38fd1498Szrj              computed correctly no matter how partitions are fused/distributed.  */
1654*38fd1498Szrj           if (!bitmap_bit_p (stmt_in_all_partitions, i))
1655*38fd1498Szrj             {
1656*38fd1498Szrj               partition->reduction_p = true;
1657*38fd1498Szrj               return;
1658*38fd1498Szrj             }
1659*38fd1498Szrj           has_reduction = true;
1660*38fd1498Szrj         }
1661*38fd1498Szrj     }
1662*38fd1498Szrj
1663*38fd1498Szrj   /* Perform general partition disqualification for builtins.  */
1664*38fd1498Szrj   if (volatiles_p
1665*38fd1498Szrj       /* Simple workaround to prevent classifying the partition as a builtin
1666*38fd1498Szrj          if it contains any use outside of the loop.  */
1667*38fd1498Szrj       || has_reduction
1668*38fd1498Szrj       || !flag_tree_loop_distribute_patterns)
1669*38fd1498Szrj     return;
1670*38fd1498Szrj
1671*38fd1498Szrj   /* Find single load/store data references for the builtin partition.  */
1672*38fd1498Szrj   if (!find_single_drs (loop, rdg, partition, &single_st, &single_ld))
1673*38fd1498Szrj     return;
1674*38fd1498Szrj
1675*38fd1498Szrj   /* Classify the builtin kind.  */
1676*38fd1498Szrj   if (single_ld == NULL)
1677*38fd1498Szrj     classify_builtin_st (loop, partition, single_st);
1678*38fd1498Szrj   else
1679*38fd1498Szrj     classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1680*38fd1498Szrj }
1681*38fd1498Szrj
1682*38fd1498Szrj /* Returns true when PARTITION1 and PARTITION2 access the same memory
1683*38fd1498Szrj    object in RDG.  */
1684*38fd1498Szrj
1685*38fd1498Szrj static bool
1686*38fd1498Szrj share_memory_accesses (struct graph *rdg,
1687*38fd1498Szrj                        partition *partition1, partition *partition2)
1688*38fd1498Szrj {
1689*38fd1498Szrj   unsigned i, j;
1690*38fd1498Szrj   bitmap_iterator bi, bj;
1691*38fd1498Szrj   data_reference_p dr1, dr2;
1692*38fd1498Szrj
1693*38fd1498Szrj   /* First check whether in the intersection of the two partitions there
1694*38fd1498Szrj      are any loads or stores.  Common loads are the situation that
1695*38fd1498Szrj      happens most often.
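     For instance (illustrative), in

       for (i = 0; i < n; i++)
         {
           x[i] = b[i] + 1;      ; partition 1 loads b[i]
           y[i] = b[i] * 2;      ; partition 2 loads the same b[i]
         }

     both partitions read b[i], so fusing them avoids streaming through
     b twice.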
     */
1696*38fd1498Szrj   EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1697*38fd1498Szrj     if (RDG_MEM_WRITE_STMT (rdg, i)
1698*38fd1498Szrj         || RDG_MEM_READS_STMT (rdg, i))
1699*38fd1498Szrj       return true;
1700*38fd1498Szrj
1701*38fd1498Szrj   /* Then check whether the two partitions access the same memory object.  */
1702*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1703*38fd1498Szrj     {
1704*38fd1498Szrj       dr1 = datarefs_vec[i];
1705*38fd1498Szrj
1706*38fd1498Szrj       if (!DR_BASE_ADDRESS (dr1)
1707*38fd1498Szrj           || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1708*38fd1498Szrj         continue;
1709*38fd1498Szrj
1710*38fd1498Szrj       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1711*38fd1498Szrj         {
1712*38fd1498Szrj           dr2 = datarefs_vec[j];
1713*38fd1498Szrj
1714*38fd1498Szrj           if (!DR_BASE_ADDRESS (dr2)
1715*38fd1498Szrj               || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1716*38fd1498Szrj             continue;
1717*38fd1498Szrj
1718*38fd1498Szrj           if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1719*38fd1498Szrj               && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1720*38fd1498Szrj               && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1721*38fd1498Szrj               && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1722*38fd1498Szrj             return true;
1723*38fd1498Szrj         }
1724*38fd1498Szrj     }
1725*38fd1498Szrj
1726*38fd1498Szrj   return false;
1727*38fd1498Szrj }
1728*38fd1498Szrj
1729*38fd1498Szrj /* For each seed statement in STARTING_STMTS, this function builds a
1730*38fd1498Szrj    partition for it by adding dependent statements according to RDG.
1731*38fd1498Szrj    All partitions are recorded in PARTITIONS.  */
1732*38fd1498Szrj
1733*38fd1498Szrj static void
1734*38fd1498Szrj rdg_build_partitions (struct graph *rdg,
1735*38fd1498Szrj                       vec<gimple *> starting_stmts,
1736*38fd1498Szrj                       vec<partition *> *partitions)
1737*38fd1498Szrj {
1738*38fd1498Szrj   auto_bitmap processed;
1739*38fd1498Szrj   int i;
1740*38fd1498Szrj   gimple *stmt;
1741*38fd1498Szrj
1742*38fd1498Szrj   FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1743*38fd1498Szrj     {
1744*38fd1498Szrj       int v = rdg_vertex_for_stmt (rdg, stmt);
1745*38fd1498Szrj
1746*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1747*38fd1498Szrj         fprintf (dump_file,
1748*38fd1498Szrj                  "ldist asked to generate code for vertex %d\n", v);
1749*38fd1498Szrj
1750*38fd1498Szrj       /* If the vertex is already contained in another partition, so
1751*38fd1498Szrj          is the partition rooted at it.  */
1752*38fd1498Szrj       if (bitmap_bit_p (processed, v))
1753*38fd1498Szrj         continue;
1754*38fd1498Szrj
1755*38fd1498Szrj       partition *partition = build_rdg_partition_for_vertex (rdg, v);
1756*38fd1498Szrj       bitmap_ior_into (processed, partition->stmts);
1757*38fd1498Szrj
1758*38fd1498Szrj       if (dump_file && (dump_flags & TDF_DETAILS))
1759*38fd1498Szrj         {
1760*38fd1498Szrj           fprintf (dump_file, "ldist creates useful %s partition:\n",
1761*38fd1498Szrj                    partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1762*38fd1498Szrj           bitmap_print (dump_file, partition->stmts, " ", "\n");
1763*38fd1498Szrj         }
1764*38fd1498Szrj
1765*38fd1498Szrj       partitions->safe_push (partition);
1766*38fd1498Szrj     }
1767*38fd1498Szrj
1768*38fd1498Szrj   /* All vertices should have been assigned to at least one partition now,
1769*38fd1498Szrj      other than vertices belonging to dead code.  */
1770*38fd1498Szrj }
1771*38fd1498Szrj
1772*38fd1498Szrj /* Dump to FILE the PARTITIONS.
   */
1773*38fd1498Szrj
1774*38fd1498Szrj static void
1775*38fd1498Szrj dump_rdg_partitions (FILE *file, vec<partition *> partitions)
1776*38fd1498Szrj {
1777*38fd1498Szrj   int i;
1778*38fd1498Szrj   partition *partition;
1779*38fd1498Szrj
1780*38fd1498Szrj   FOR_EACH_VEC_ELT (partitions, i, partition)
1781*38fd1498Szrj     debug_bitmap_file (file, partition->stmts);
1782*38fd1498Szrj }
1783*38fd1498Szrj
1784*38fd1498Szrj /* Debug PARTITIONS.  */
1785*38fd1498Szrj extern void debug_rdg_partitions (vec<partition *> );
1786*38fd1498Szrj
1787*38fd1498Szrj DEBUG_FUNCTION void
1788*38fd1498Szrj debug_rdg_partitions (vec<partition *> partitions)
1789*38fd1498Szrj {
1790*38fd1498Szrj   dump_rdg_partitions (stderr, partitions);
1791*38fd1498Szrj }
1792*38fd1498Szrj
1793*38fd1498Szrj /* Returns the number of read and write operations in the RDG.  */
1794*38fd1498Szrj
1795*38fd1498Szrj static int
1796*38fd1498Szrj number_of_rw_in_rdg (struct graph *rdg)
1797*38fd1498Szrj {
1798*38fd1498Szrj   int i, res = 0;
1799*38fd1498Szrj
1800*38fd1498Szrj   for (i = 0; i < rdg->n_vertices; i++)
1801*38fd1498Szrj     {
1802*38fd1498Szrj       if (RDG_MEM_WRITE_STMT (rdg, i))
1803*38fd1498Szrj         ++res;
1804*38fd1498Szrj
1805*38fd1498Szrj       if (RDG_MEM_READS_STMT (rdg, i))
1806*38fd1498Szrj         ++res;
1807*38fd1498Szrj     }
1808*38fd1498Szrj
1809*38fd1498Szrj   return res;
1810*38fd1498Szrj }
1811*38fd1498Szrj
1812*38fd1498Szrj /* Returns the number of read and write operations in a PARTITION of
1813*38fd1498Szrj    the RDG.  */
1814*38fd1498Szrj
1815*38fd1498Szrj static int
1816*38fd1498Szrj number_of_rw_in_partition (struct graph *rdg, partition *partition)
1817*38fd1498Szrj {
1818*38fd1498Szrj   int res = 0;
1819*38fd1498Szrj   unsigned i;
1820*38fd1498Szrj   bitmap_iterator ii;
1821*38fd1498Szrj
1822*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
1823*38fd1498Szrj     {
1824*38fd1498Szrj       if (RDG_MEM_WRITE_STMT (rdg, i))
1825*38fd1498Szrj         ++res;
1826*38fd1498Szrj
1827*38fd1498Szrj       if (RDG_MEM_READS_STMT (rdg, i))
1828*38fd1498Szrj         ++res;
1829*38fd1498Szrj     }
1830*38fd1498Szrj
1831*38fd1498Szrj   return res;
1832*38fd1498Szrj }
1833*38fd1498Szrj
1834*38fd1498Szrj /* Returns true when one of the PARTITIONS contains all the read or
1835*38fd1498Szrj    write operations of RDG.  */
1836*38fd1498Szrj
1837*38fd1498Szrj static bool
1838*38fd1498Szrj partition_contains_all_rw (struct graph *rdg,
1839*38fd1498Szrj                            vec<partition *> partitions)
1840*38fd1498Szrj {
1841*38fd1498Szrj   int i;
1842*38fd1498Szrj   partition *partition;
1843*38fd1498Szrj   int nrw = number_of_rw_in_rdg (rdg);
1844*38fd1498Szrj
1845*38fd1498Szrj   FOR_EACH_VEC_ELT (partitions, i, partition)
1846*38fd1498Szrj     if (nrw == number_of_rw_in_partition (rdg, partition))
1847*38fd1498Szrj       return true;
1848*38fd1498Szrj
1849*38fd1498Szrj   return false;
1850*38fd1498Szrj }
1851*38fd1498Szrj
1852*38fd1498Szrj /* Compute the partition dependence created by the data references in DRS1
1853*38fd1498Szrj    and DRS2, modify and return DIR according to that.  If ALIAS_DDRS is
1854*38fd1498Szrj    not NULL, we record dependences introduced by possible aliasing between
1855*38fd1498Szrj    two data references in ALIAS_DDRS; otherwise, we simply ignore such
1856*38fd1498Szrj    dependences as if they didn't exist at all.
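   For instance (illustrative), with

     x[i] = y[i] + c;            ; partition 1 writes x[i]
     z[i] = x[i - 1];            ; partition 2 reads x[i - 1]

   the flow dependence points from partition 1 to partition 2, so DIR
   becomes 1; if each partition also accessed locations the other one
   writes, DIR would become 2 and the partitions would have to be merged.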
   */
1857*38fd1498Szrj
1858*38fd1498Szrj static int
1859*38fd1498Szrj pg_add_dependence_edges (struct graph *rdg, int dir,
1860*38fd1498Szrj                          bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
1861*38fd1498Szrj {
1862*38fd1498Szrj   unsigned i, j;
1863*38fd1498Szrj   bitmap_iterator bi, bj;
1864*38fd1498Szrj   data_reference_p dr1, dr2, saved_dr1;
1865*38fd1498Szrj
1866*38fd1498Szrj   /* The dependence direction: 0 is no dependence, -1 is back,
1867*38fd1498Szrj      1 is forth, 2 is both (we can stop then, merging will occur).  */
1868*38fd1498Szrj   EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
1869*38fd1498Szrj     {
1870*38fd1498Szrj       dr1 = datarefs_vec[i];
1871*38fd1498Szrj
1872*38fd1498Szrj       EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
1873*38fd1498Szrj         {
1874*38fd1498Szrj           int res, this_dir = 1;
1875*38fd1498Szrj           ddr_p ddr;
1876*38fd1498Szrj
1877*38fd1498Szrj           dr2 = datarefs_vec[j];
1878*38fd1498Szrj
1879*38fd1498Szrj           /* Skip all <read, read> data dependences.  */
1880*38fd1498Szrj           if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1881*38fd1498Szrj             continue;
1882*38fd1498Szrj
1883*38fd1498Szrj           saved_dr1 = dr1;
1884*38fd1498Szrj           /* Re-shuffle data-refs to be in topological order.  */
1885*38fd1498Szrj           if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1886*38fd1498Szrj               > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1887*38fd1498Szrj             {
1888*38fd1498Szrj               std::swap (dr1, dr2);
1889*38fd1498Szrj               this_dir = -this_dir;
1890*38fd1498Szrj             }
1891*38fd1498Szrj           ddr = get_data_dependence (rdg, dr1, dr2);
1892*38fd1498Szrj           if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1893*38fd1498Szrj             {
1894*38fd1498Szrj               this_dir = 0;
1895*38fd1498Szrj               res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
1896*38fd1498Szrj                                            DR_BASE_ADDRESS (dr2));
1897*38fd1498Szrj               /* Be conservative.  If the data references are not well
1898*38fd1498Szrj                  analyzed, or the two data references have the same base
1899*38fd1498Szrj                  address and offset, add a dependence and consider them as
1900*38fd1498Szrj                  aliasing each other.  In other words, the dependence can
1901*38fd1498Szrj                  not be resolved by a runtime alias check.  */
1902*38fd1498Szrj               if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
1903*38fd1498Szrj                   || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
1904*38fd1498Szrj                   || !DR_INIT (dr1) || !DR_INIT (dr2)
1905*38fd1498Szrj                   || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
1906*38fd1498Szrj                   || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
1907*38fd1498Szrj                   || res == 0)
1908*38fd1498Szrj                 this_dir = 2;
1909*38fd1498Szrj               /* The data dependence could be resolved by a runtime alias
1910*38fd1498Szrj                  check, record it in ALIAS_DDRS.  */
1911*38fd1498Szrj               else if (alias_ddrs != NULL)
1912*38fd1498Szrj                 alias_ddrs->safe_push (ddr);
1913*38fd1498Szrj               /* Or simply ignore it.  */
1914*38fd1498Szrj             }
1915*38fd1498Szrj           else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
1916*38fd1498Szrj             {
1917*38fd1498Szrj               if (DDR_REVERSED_P (ddr))
1918*38fd1498Szrj                 this_dir = -this_dir;
1919*38fd1498Szrj
1920*38fd1498Szrj               /* Known dependences can still be unordered throughout the
1921*38fd1498Szrj                  iteration space, see gcc.dg/tree-ssa/ldist-16.c.  */
1922*38fd1498Szrj               if (DDR_NUM_DIST_VECTS (ddr) != 1)
1923*38fd1498Szrj                 this_dir = 2;
1924*38fd1498Szrj               /* If the overlap is exact preserve stmt order.  */
1925*38fd1498Szrj               else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1926*38fd1498Szrj                 ;
1927*38fd1498Szrj               /* Otherwise, as the distance vector is lexicographically
1928*38fd1498Szrj                  positive, swap the dependence direction.
                 */
1929*38fd1498Szrj               else
1930*38fd1498Szrj                 this_dir = -this_dir;
1931*38fd1498Szrj             }
1932*38fd1498Szrj           else
1933*38fd1498Szrj             this_dir = 0;
1934*38fd1498Szrj           if (this_dir == 2)
1935*38fd1498Szrj             return 2;
1936*38fd1498Szrj           else if (dir == 0)
1937*38fd1498Szrj             dir = this_dir;
1938*38fd1498Szrj           else if (this_dir != 0 && dir != this_dir)
1939*38fd1498Szrj             return 2;
1940*38fd1498Szrj           /* Shuffle "back" dr1.  */
1941*38fd1498Szrj           dr1 = saved_dr1;
1942*38fd1498Szrj         }
1943*38fd1498Szrj     }
1944*38fd1498Szrj   return dir;
1945*38fd1498Szrj }
1946*38fd1498Szrj
1947*38fd1498Szrj /* Compare the postorder number of the partition graph vertices V1 and V2.  */
1948*38fd1498Szrj
1949*38fd1498Szrj static int
1950*38fd1498Szrj pgcmp (const void *v1_, const void *v2_)
1951*38fd1498Szrj {
1952*38fd1498Szrj   const vertex *v1 = (const vertex *)v1_;
1953*38fd1498Szrj   const vertex *v2 = (const vertex *)v2_;
1954*38fd1498Szrj   return v2->post - v1->post;
1955*38fd1498Szrj }
1956*38fd1498Szrj
1957*38fd1498Szrj /* Data attached to vertices of the partition dependence graph.  */
1958*38fd1498Szrj struct pg_vdata
1959*38fd1498Szrj {
1960*38fd1498Szrj   /* ID of the corresponding partition.  */
1961*38fd1498Szrj   int id;
1962*38fd1498Szrj   /* The partition.  */
1963*38fd1498Szrj   struct partition *partition;
1964*38fd1498Szrj };
1965*38fd1498Szrj
1966*38fd1498Szrj /* Data attached to edges of the partition dependence graph.  */
1967*38fd1498Szrj struct pg_edata
1968*38fd1498Szrj {
1969*38fd1498Szrj   /* If the dependence edge can be resolved by a runtime alias check,
1970*38fd1498Szrj      this vector contains the data dependence relations for the runtime
1971*38fd1498Szrj      alias check.  On the other hand, if the dependence edge is introduced
1972*38fd1498Szrj      because of a compilation time known data dependence, this vector
1973*38fd1498Szrj      contains nothing.  */
1974*38fd1498Szrj   vec<ddr_p> alias_ddrs;
1975*38fd1498Szrj };
1976*38fd1498Szrj
1977*38fd1498Szrj /* Callback data for traversing edges in the graph.  */
1978*38fd1498Szrj struct pg_edge_callback_data
1979*38fd1498Szrj {
1980*38fd1498Szrj   /* Bitmap of strongly connected components that should be merged.  */
1981*38fd1498Szrj   bitmap sccs_to_merge;
1982*38fd1498Szrj   /* Array containing component information for all vertices.  */
1983*38fd1498Szrj   int *vertices_component;
1984*38fd1498Szrj   /* Vector to record all data dependence relations which are needed
1985*38fd1498Szrj      to break strongly connected components by runtime alias checks.  */
1986*38fd1498Szrj   vec<ddr_p> *alias_ddrs;
1987*38fd1498Szrj };
1988*38fd1498Szrj
1989*38fd1498Szrj /* Initialize the vertices' data for partition dependence graph PG with
1990*38fd1498Szrj    PARTITIONS.  */
1991*38fd1498Szrj
1992*38fd1498Szrj static void
1993*38fd1498Szrj init_partition_graph_vertices (struct graph *pg,
1994*38fd1498Szrj                                vec<struct partition *> *partitions)
1995*38fd1498Szrj {
1996*38fd1498Szrj   int i;
1997*38fd1498Szrj   partition *partition;
1998*38fd1498Szrj   struct pg_vdata *data;
1999*38fd1498Szrj
2000*38fd1498Szrj   for (i = 0; partitions->iterate (i, &partition); ++i)
2001*38fd1498Szrj     {
2002*38fd1498Szrj       data = new pg_vdata;
2003*38fd1498Szrj       pg->vertices[i].data = data;
2004*38fd1498Szrj       data->id = i;
2005*38fd1498Szrj       data->partition = partition;
2006*38fd1498Szrj     }
2007*38fd1498Szrj }
2008*38fd1498Szrj
2009*38fd1498Szrj /* Add edge <I, J> to partition dependence graph PG.  Attach a vector of data
2010*38fd1498Szrj    dependence relations to the EDGE if DDRS isn't NULL.
   */
2011*38fd1498Szrj
2012*38fd1498Szrj static void
2013*38fd1498Szrj add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2014*38fd1498Szrj {
2015*38fd1498Szrj   struct graph_edge *e = add_edge (pg, i, j);
2016*38fd1498Szrj
2017*38fd1498Szrj   /* If the edge is attached with data dependence relations, it means this
2018*38fd1498Szrj      dependence edge can be resolved by runtime alias checks.  */
2019*38fd1498Szrj   if (ddrs != NULL)
2020*38fd1498Szrj     {
2021*38fd1498Szrj       struct pg_edata *data = new pg_edata;
2022*38fd1498Szrj
2023*38fd1498Szrj       gcc_assert (ddrs->length () > 0);
2024*38fd1498Szrj       e->data = data;
2025*38fd1498Szrj       data->alias_ddrs = vNULL;
2026*38fd1498Szrj       data->alias_ddrs.safe_splice (*ddrs);
2027*38fd1498Szrj     }
2028*38fd1498Szrj }
2029*38fd1498Szrj
2030*38fd1498Szrj /* Callback function for the graph traversal algorithm.  It returns true
2031*38fd1498Szrj    if edge E should be skipped when traversing the graph.  */
2032*38fd1498Szrj
2033*38fd1498Szrj static bool
2034*38fd1498Szrj pg_skip_alias_edge (struct graph_edge *e)
2035*38fd1498Szrj {
2036*38fd1498Szrj   struct pg_edata *data = (struct pg_edata *)e->data;
2037*38fd1498Szrj   return (data != NULL && data->alias_ddrs.length () > 0);
2038*38fd1498Szrj }
2039*38fd1498Szrj
2040*38fd1498Szrj /* Callback function freeing the data attached to edge E of the graph.  */
2041*38fd1498Szrj
2042*38fd1498Szrj static void
2043*38fd1498Szrj free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2044*38fd1498Szrj {
2045*38fd1498Szrj   if (e->data != NULL)
2046*38fd1498Szrj     {
2047*38fd1498Szrj       struct pg_edata *data = (struct pg_edata *)e->data;
2048*38fd1498Szrj       data->alias_ddrs.release ();
2049*38fd1498Szrj       delete data;
2050*38fd1498Szrj     }
2051*38fd1498Szrj }
2052*38fd1498Szrj
2053*38fd1498Szrj /* Free the data attached to the vertices of partition dependence graph PG.  */
2054*38fd1498Szrj
2055*38fd1498Szrj static void
2056*38fd1498Szrj free_partition_graph_vdata (struct graph *pg)
2057*38fd1498Szrj {
2058*38fd1498Szrj   int i;
2059*38fd1498Szrj   struct pg_vdata *data;
2060*38fd1498Szrj
2061*38fd1498Szrj   for (i = 0; i < pg->n_vertices; ++i)
2062*38fd1498Szrj     {
2063*38fd1498Szrj       data = (struct pg_vdata *)pg->vertices[i].data;
2064*38fd1498Szrj       delete data;
2065*38fd1498Szrj     }
2066*38fd1498Szrj }
2067*38fd1498Szrj
2068*38fd1498Szrj /* Build and return the partition dependence graph for PARTITIONS.  RDG is
2069*38fd1498Szrj    the reduced dependence graph for the loop to be distributed.  If
2070*38fd1498Szrj    IGNORE_ALIAS_P is true, data dependences caused by possible aliasing
2071*38fd1498Szrj    between references are ignored, as if they didn't exist at all;
2072*38fd1498Szrj    otherwise all dependences are considered.  */
2073*38fd1498Szrj
2074*38fd1498Szrj static struct graph *
2075*38fd1498Szrj build_partition_graph (struct graph *rdg,
2076*38fd1498Szrj                        vec<struct partition *> *partitions,
2077*38fd1498Szrj                        bool ignore_alias_p)
2078*38fd1498Szrj {
2079*38fd1498Szrj   int i, j;
2080*38fd1498Szrj   struct partition *partition1, *partition2;
2081*38fd1498Szrj   graph *pg = new_graph (partitions->length ());
2082*38fd1498Szrj   auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2083*38fd1498Szrj
2084*38fd1498Szrj   alias_ddrs_p = ignore_alias_p ?
    NULL : &alias_ddrs;
2085*38fd1498Szrj
2086*38fd1498Szrj   init_partition_graph_vertices (pg, partitions);
2087*38fd1498Szrj
2088*38fd1498Szrj   for (i = 0; partitions->iterate (i, &partition1); ++i)
2089*38fd1498Szrj     {
2090*38fd1498Szrj       for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2091*38fd1498Szrj         {
2092*38fd1498Szrj           /* The dependence direction: 0 is no dependence, -1 is back,
2093*38fd1498Szrj              1 is forth, 2 is both (we can stop then, merging will occur).  */
2094*38fd1498Szrj           int dir = 0;
2095*38fd1498Szrj
2096*38fd1498Szrj           /* If the first partition has a reduction, add a back edge; if the
2097*38fd1498Szrj              second partition has a reduction, add a forth edge.  This makes
2098*38fd1498Szrj              sure that the reduction partition will be sorted as the last one.  */
2099*38fd1498Szrj           if (partition_reduction_p (partition1))
2100*38fd1498Szrj             dir = -1;
2101*38fd1498Szrj           else if (partition_reduction_p (partition2))
2102*38fd1498Szrj             dir = 1;
2103*38fd1498Szrj
2104*38fd1498Szrj           /* Clean up the temporary vector.  */
2105*38fd1498Szrj           alias_ddrs.truncate (0);
2106*38fd1498Szrj
2107*38fd1498Szrj           dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2108*38fd1498Szrj                                          partition2->datarefs, alias_ddrs_p);
2109*38fd1498Szrj
2110*38fd1498Szrj           /* Add an edge to the partition graph if there exists a dependence.
2111*38fd1498Szrj              There are two types of edges.  One type of edge is caused by a
2112*38fd1498Szrj              compilation time known dependence; this type cannot be resolved
2113*38fd1498Szrj              by a runtime alias check.  The other type can be resolved by a
2114*38fd1498Szrj              runtime alias check.  */
2115*38fd1498Szrj           if (dir == 1 || dir == 2
2116*38fd1498Szrj               || alias_ddrs.length () > 0)
2117*38fd1498Szrj             {
2118*38fd1498Szrj               /* Attach the data dependence relations to the edge that can be
2119*38fd1498Szrj                  resolved by a runtime alias check.  */
2120*38fd1498Szrj               bool alias_edge_p = (dir != 1 && dir != 2);
2121*38fd1498Szrj               add_partition_graph_edge (pg, i, j,
2122*38fd1498Szrj                                         (alias_edge_p) ? &alias_ddrs : NULL);
2123*38fd1498Szrj             }
2124*38fd1498Szrj           if (dir == -1 || dir == 2
2125*38fd1498Szrj               || alias_ddrs.length () > 0)
2126*38fd1498Szrj             {
2127*38fd1498Szrj               /* Attach the data dependence relations to the edge that can be
2128*38fd1498Szrj                  resolved by a runtime alias check.  */
2129*38fd1498Szrj               bool alias_edge_p = (dir != -1 && dir != 2);
2130*38fd1498Szrj               add_partition_graph_edge (pg, j, i,
2131*38fd1498Szrj                                         (alias_edge_p) ? &alias_ddrs : NULL);
2132*38fd1498Szrj             }
2133*38fd1498Szrj         }
2134*38fd1498Szrj     }
2135*38fd1498Szrj   return pg;
2136*38fd1498Szrj }
2137*38fd1498Szrj
2138*38fd1498Szrj /* Sort the partitions in PG in descending post order and store them in
2139*38fd1498Szrj    PARTITIONS.  */
2140*38fd1498Szrj
2141*38fd1498Szrj static void
2142*38fd1498Szrj sort_partitions_by_post_order (struct graph *pg,
2143*38fd1498Szrj                                vec<struct partition *> *partitions)
2144*38fd1498Szrj {
2145*38fd1498Szrj   int i;
2146*38fd1498Szrj   struct pg_vdata *data;
2147*38fd1498Szrj
2148*38fd1498Szrj   /* Now order the remaining nodes in descending postorder.
   */
2149*38fd1498Szrj   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2150*38fd1498Szrj   partitions->truncate (0);
2151*38fd1498Szrj   for (i = 0; i < pg->n_vertices; ++i)
2152*38fd1498Szrj     {
2153*38fd1498Szrj       data = (struct pg_vdata *)pg->vertices[i].data;
2154*38fd1498Szrj       if (data->partition)
2155*38fd1498Szrj         partitions->safe_push (data->partition);
2156*38fd1498Szrj     }
2157*38fd1498Szrj }
2158*38fd1498Szrj
2159*38fd1498Szrj /* Given reduced dependence graph RDG, merge strongly connected components
2160*38fd1498Szrj    of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependences caused by
2161*38fd1498Szrj    possible aliasing between references are ignored, as if they didn't
2162*38fd1498Szrj    exist at all; otherwise all dependences are considered.  */
2163*38fd1498Szrj
2164*38fd1498Szrj static void
2165*38fd1498Szrj merge_dep_scc_partitions (struct graph *rdg,
2166*38fd1498Szrj                           vec<struct partition *> *partitions,
2167*38fd1498Szrj                           bool ignore_alias_p)
2168*38fd1498Szrj {
2169*38fd1498Szrj   struct partition *partition1, *partition2;
2170*38fd1498Szrj   struct pg_vdata *data;
2171*38fd1498Szrj   graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2172*38fd1498Szrj   int i, j, num_sccs = graphds_scc (pg, NULL);
2173*38fd1498Szrj
2174*38fd1498Szrj   /* A strongly connected component means a dependence cycle, so we cannot
2175*38fd1498Szrj      distribute it.  Fuse it together.  */
2176*38fd1498Szrj   if ((unsigned) num_sccs < partitions->length ())
2177*38fd1498Szrj     {
2178*38fd1498Szrj       for (i = 0; i < num_sccs; ++i)
2179*38fd1498Szrj         {
2180*38fd1498Szrj           for (j = 0; partitions->iterate (j, &partition1); ++j)
2181*38fd1498Szrj             if (pg->vertices[j].component == i)
2182*38fd1498Szrj               break;
2183*38fd1498Szrj           for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2184*38fd1498Szrj             if (pg->vertices[j].component == i)
2185*38fd1498Szrj               {
2186*38fd1498Szrj                 partition_merge_into (NULL, partition1,
2187*38fd1498Szrj                                       partition2, FUSE_SAME_SCC);
2188*38fd1498Szrj                 partition1->type = PTYPE_SEQUENTIAL;
2189*38fd1498Szrj                 (*partitions)[j] = NULL;
2190*38fd1498Szrj                 partition_free (partition2);
2191*38fd1498Szrj                 data = (struct pg_vdata *)pg->vertices[j].data;
2192*38fd1498Szrj                 data->partition = NULL;
2193*38fd1498Szrj               }
2194*38fd1498Szrj         }
2195*38fd1498Szrj     }
2196*38fd1498Szrj
2197*38fd1498Szrj   sort_partitions_by_post_order (pg, partitions);
2198*38fd1498Szrj   gcc_assert (partitions->length () == (unsigned)num_sccs);
2199*38fd1498Szrj   free_partition_graph_vdata (pg);
2200*38fd1498Szrj   free_graph (pg);
2201*38fd1498Szrj }
2202*38fd1498Szrj
2203*38fd1498Szrj /* Callback function for traversing edge E in graph G.  DATA is private
2204*38fd1498Szrj    callback data.  */
2205*38fd1498Szrj
2206*38fd1498Szrj static void
2207*38fd1498Szrj pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2208*38fd1498Szrj {
2209*38fd1498Szrj   int i, j, component;
2210*38fd1498Szrj   struct pg_edge_callback_data *cbdata;
2211*38fd1498Szrj   struct pg_edata *edata = (struct pg_edata *) e->data;
2212*38fd1498Szrj
2213*38fd1498Szrj   /* If the edge doesn't have attached data dependences, it represents
2214*38fd1498Szrj      compilation time known dependences.  This type of dependence cannot
2215*38fd1498Szrj      be resolved by runtime alias checks.
   */
2216*38fd1498Szrj   if (edata == NULL || edata->alias_ddrs.length () == 0)
2217*38fd1498Szrj     return;
2218*38fd1498Szrj
2219*38fd1498Szrj   cbdata = (struct pg_edge_callback_data *) data;
2220*38fd1498Szrj   i = e->src;
2221*38fd1498Szrj   j = e->dest;
2222*38fd1498Szrj   component = cbdata->vertices_component[i];
2223*38fd1498Szrj   /* The vertices are topologically sorted according to compilation time
2224*38fd1498Szrj      known dependences, so we can break strongly connected components
2225*38fd1498Szrj      by removing edges of the opposite direction, i.e., edges pointing
2226*38fd1498Szrj      from the vertex with the smaller post number to the vertex with the
2227*38fd1498Szrj      bigger post number.  */
2228*38fd1498Szrj   if (g->vertices[i].post < g->vertices[j].post
2229*38fd1498Szrj       /* We only need to remove edges connecting vertices in the same
2230*38fd1498Szrj          strongly connected component to break it.  */
2231*38fd1498Szrj       && component == cbdata->vertices_component[j]
2232*38fd1498Szrj       /* Check if we want to break the strongly connected component or not.  */
2233*38fd1498Szrj       && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2234*38fd1498Szrj     cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2235*38fd1498Szrj }
2236*38fd1498Szrj
2237*38fd1498Szrj /* This is the main function breaking strongly connected components in
2238*38fd1498Szrj    PARTITIONS given reduced dependence graph RDG.  Store the data dependence
2239*38fd1498Szrj    relations for runtime alias checks in ALIAS_DDRS.  */
2240*38fd1498Szrj
2241*38fd1498Szrj static void
2242*38fd1498Szrj break_alias_scc_partitions (struct graph *rdg,
2243*38fd1498Szrj                             vec<struct partition *> *partitions,
2244*38fd1498Szrj                             vec<ddr_p> *alias_ddrs)
2245*38fd1498Szrj {
2246*38fd1498Szrj   int i, j, k, num_sccs, num_sccs_no_alias;
2247*38fd1498Szrj   /* Build the partition dependence graph.  */
2248*38fd1498Szrj   graph *pg = build_partition_graph (rdg, partitions, false);
2249*38fd1498Szrj
2250*38fd1498Szrj   alias_ddrs->truncate (0);
2251*38fd1498Szrj   /* Find strongly connected components in the graph, with all dependence
2252*38fd1498Szrj      edges considered.  */
2253*38fd1498Szrj   num_sccs = graphds_scc (pg, NULL);
2254*38fd1498Szrj   /* All SCCs now can be broken by runtime alias checks because SCCs caused
2255*38fd1498Szrj      by compilation time known dependences were merged before this function.  */
2256*38fd1498Szrj   if ((unsigned) num_sccs < partitions->length ())
2257*38fd1498Szrj     {
2258*38fd1498Szrj       struct pg_edge_callback_data cbdata;
2259*38fd1498Szrj       auto_bitmap sccs_to_merge;
2260*38fd1498Szrj       auto_vec<enum partition_type> scc_types;
2261*38fd1498Szrj       struct partition *partition, *first;
2262*38fd1498Szrj
2263*38fd1498Szrj       /* If all partitions in an SCC have the same type, we can simply merge
2264*38fd1498Szrj          the SCC.  This loop finds out such SCCs and records them in the bitmap.  */
2265*38fd1498Szrj       bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2266*38fd1498Szrj       for (i = 0; i < num_sccs; ++i)
2267*38fd1498Szrj         {
2268*38fd1498Szrj           for (j = 0; partitions->iterate (j, &first); ++j)
2269*38fd1498Szrj             if (pg->vertices[j].component == i)
2270*38fd1498Szrj               break;
2271*38fd1498Szrj           for (++j; partitions->iterate (j, &partition); ++j)
2272*38fd1498Szrj             {
2273*38fd1498Szrj               if (pg->vertices[j].component != i)
2274*38fd1498Szrj                 continue;
2275*38fd1498Szrj
2276*38fd1498Szrj               /* Note we merge partitions of parallel type on purpose, though
2277*38fd1498Szrj                  the result partition is sequential.  The reason is that the
2278*38fd1498Szrj                  vectorizer can do a more accurate runtime alias check in this case.
                 Also
2279*38fd1498Szrj                  it results in more conservative distribution.  */
2280*38fd1498Szrj               if (first->type != partition->type)
2281*38fd1498Szrj                 {
2282*38fd1498Szrj                   bitmap_clear_bit (sccs_to_merge, i);
2283*38fd1498Szrj                   break;
2284*38fd1498Szrj                 }
2285*38fd1498Szrj             }
2286*38fd1498Szrj         }
2287*38fd1498Szrj
2288*38fd1498Szrj       /* Initialize the callback data for traversing.  */
2289*38fd1498Szrj       cbdata.sccs_to_merge = sccs_to_merge;
2290*38fd1498Szrj       cbdata.alias_ddrs = alias_ddrs;
2291*38fd1498Szrj       cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2292*38fd1498Szrj       /* Record the component information which will be corrupted by the
2293*38fd1498Szrj          next graph SCC finding call.  */
2294*38fd1498Szrj       for (i = 0; i < pg->n_vertices; ++i)
2295*38fd1498Szrj         cbdata.vertices_component[i] = pg->vertices[i].component;
2296*38fd1498Szrj
2297*38fd1498Szrj       /* Collect the data dependences for runtime alias checks to break SCCs.  */
2298*38fd1498Szrj       if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2299*38fd1498Szrj         {
2300*38fd1498Szrj           /* Run the SCC finding algorithm again, with alias dependence edges
2301*38fd1498Szrj              skipped.  This is to topologically sort the partitions according
2302*38fd1498Szrj              to compilation time known dependences.  Note the topological order
2303*38fd1498Szrj              is stored in the form of pg's post order number.  */
2304*38fd1498Szrj           num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2305*38fd1498Szrj           gcc_assert (partitions->length () == (unsigned) num_sccs_no_alias);
2306*38fd1498Szrj           /* With the topological order, we can construct two subgraphs L and R.
2307*38fd1498Szrj              L contains edge <x, y> where x < y in terms of post order, while
2308*38fd1498Szrj              R contains edge <x, y> where x > y.  Edges for compilation time
2309*38fd1498Szrj              known dependences all fall in R, so we break SCCs by removing all
2310*38fd1498Szrj              (alias) edges in subgraph L.  */
2311*38fd1498Szrj           for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2312*38fd1498Szrj         }
2313*38fd1498Szrj
2314*38fd1498Szrj       /* For each SCC that doesn't need to be broken, merge it.  */
2315*38fd1498Szrj       for (i = 0; i < num_sccs; ++i)
2316*38fd1498Szrj         {
2317*38fd1498Szrj           if (!bitmap_bit_p (sccs_to_merge, i))
2318*38fd1498Szrj             continue;
2319*38fd1498Szrj
2320*38fd1498Szrj           for (j = 0; partitions->iterate (j, &first); ++j)
2321*38fd1498Szrj             if (cbdata.vertices_component[j] == i)
2322*38fd1498Szrj               break;
2323*38fd1498Szrj           for (k = j + 1; partitions->iterate (k, &partition); ++k)
2324*38fd1498Szrj             {
2325*38fd1498Szrj               struct pg_vdata *data;
2326*38fd1498Szrj
2327*38fd1498Szrj               if (cbdata.vertices_component[k] != i)
2328*38fd1498Szrj                 continue;
2329*38fd1498Szrj
2330*38fd1498Szrj               /* Update the postorder number so that the merged reduction
2331*38fd1498Szrj                  partition is sorted after other partitions.  */
2332*38fd1498Szrj               if (!partition_reduction_p (first)
2333*38fd1498Szrj                   && partition_reduction_p (partition))
2334*38fd1498Szrj                 {
2335*38fd1498Szrj                   gcc_assert (pg->vertices[k].post < pg->vertices[j].post);
2336*38fd1498Szrj                   pg->vertices[j].post = pg->vertices[k].post;
2337*38fd1498Szrj                 }
2338*38fd1498Szrj               partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2339*38fd1498Szrj               (*partitions)[k] = NULL;
2340*38fd1498Szrj               partition_free (partition);
2341*38fd1498Szrj               data = (struct pg_vdata *)pg->vertices[k].data;
2342*38fd1498Szrj               gcc_assert (data->id == k);
2343*38fd1498Szrj               data->partition = NULL;
2344*38fd1498Szrj               /* The result partition of a merged SCC must be sequential.
               */
2345*38fd1498Szrj               first->type = PTYPE_SEQUENTIAL;
2346*38fd1498Szrj             }
2347*38fd1498Szrj         }
2348*38fd1498Szrj     }
2349*38fd1498Szrj
2350*38fd1498Szrj   sort_partitions_by_post_order (pg, partitions);
2351*38fd1498Szrj   free_partition_graph_vdata (pg);
2352*38fd1498Szrj   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2353*38fd1498Szrj   free_graph (pg);
2354*38fd1498Szrj
2355*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2356*38fd1498Szrj     {
2357*38fd1498Szrj       fprintf (dump_file, "Possible alias data dependence to break:\n");
2358*38fd1498Szrj       dump_data_dependence_relations (dump_file, *alias_ddrs);
2359*38fd1498Szrj     }
2360*38fd1498Szrj }
2361*38fd1498Szrj
2362*38fd1498Szrj /* Compute and return an expression whose value is the segment length which
2363*38fd1498Szrj    will be accessed by DR in NITERS iterations.  */
2364*38fd1498Szrj
2365*38fd1498Szrj static tree
2366*38fd1498Szrj data_ref_segment_size (struct data_reference *dr, tree niters)
2367*38fd1498Szrj {
2368*38fd1498Szrj   niters = size_binop (MINUS_EXPR,
2369*38fd1498Szrj                        fold_convert (sizetype, niters),
2370*38fd1498Szrj                        size_one_node);
2371*38fd1498Szrj   return size_binop (MULT_EXPR,
2372*38fd1498Szrj                      fold_convert (sizetype, DR_STEP (dr)),
2373*38fd1498Szrj                      fold_convert (sizetype, niters));
2374*38fd1498Szrj }
2375*38fd1498Szrj
2376*38fd1498Szrj /* Return true if LOOP's latch is dominated by the statement for data
2377*38fd1498Szrj    reference DR.  */
2378*38fd1498Szrj
2379*38fd1498Szrj static inline bool
2380*38fd1498Szrj latch_dominated_by_data_ref (struct loop *loop, data_reference *dr)
2381*38fd1498Szrj {
2382*38fd1498Szrj   return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2383*38fd1498Szrj                          gimple_bb (DR_STMT (dr)));
2384*38fd1498Szrj }
2385*38fd1498Szrj
2386*38fd1498Szrj /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2387*38fd1498Szrj    data dependence relations ALIAS_DDRS.  */
2388*38fd1498Szrj
2389*38fd1498Szrj static void
2390*38fd1498Szrj compute_alias_check_pairs (struct loop *loop, vec<ddr_p> *alias_ddrs,
2391*38fd1498Szrj                            vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2392*38fd1498Szrj {
2393*38fd1498Szrj   unsigned int i;
2394*38fd1498Szrj   unsigned HOST_WIDE_INT factor = 1;
2395*38fd1498Szrj   tree niters_plus_one, niters = number_of_latch_executions (loop);
2396*38fd1498Szrj
2397*38fd1498Szrj   gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2398*38fd1498Szrj   niters = fold_convert (sizetype, niters);
2399*38fd1498Szrj   niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2400*38fd1498Szrj
2401*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2402*38fd1498Szrj     fprintf (dump_file, "Creating alias check pairs:\n");
2403*38fd1498Szrj
2404*38fd1498Szrj   /* Iterate all data dependence relations and compute alias check pairs.
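     As a worked illustration (not from the original sources): for a data
     reference with DR_STEP 4 whose statement dominates the latch, in a
     loop running 100 iterations, niters_plus_one is used and the segment
     length is 4 * (100 - 1) = 396 bytes; the 4-byte access size of the
     final iteration is passed separately in the pair.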
     */
2405*38fd1498Szrj   for (i = 0; i < alias_ddrs->length (); i++)
2406*38fd1498Szrj     {
2407*38fd1498Szrj       ddr_p ddr = (*alias_ddrs)[i];
2408*38fd1498Szrj       struct data_reference *dr_a = DDR_A (ddr);
2409*38fd1498Szrj       struct data_reference *dr_b = DDR_B (ddr);
2410*38fd1498Szrj       tree seg_length_a, seg_length_b;
2411*38fd1498Szrj       int comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_a),
2412*38fd1498Szrj                                             DR_BASE_ADDRESS (dr_b));
2413*38fd1498Szrj
2414*38fd1498Szrj       if (comp_res == 0)
2415*38fd1498Szrj         comp_res = data_ref_compare_tree (DR_OFFSET (dr_a), DR_OFFSET (dr_b));
2416*38fd1498Szrj       gcc_assert (comp_res != 0);
2417*38fd1498Szrj
2418*38fd1498Szrj       if (latch_dominated_by_data_ref (loop, dr_a))
2419*38fd1498Szrj         seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2420*38fd1498Szrj       else
2421*38fd1498Szrj         seg_length_a = data_ref_segment_size (dr_a, niters);
2422*38fd1498Szrj
2423*38fd1498Szrj       if (latch_dominated_by_data_ref (loop, dr_b))
2424*38fd1498Szrj         seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2425*38fd1498Szrj       else
2426*38fd1498Szrj         seg_length_b = data_ref_segment_size (dr_b, niters);
2427*38fd1498Szrj
2428*38fd1498Szrj       unsigned HOST_WIDE_INT access_size_a
2429*38fd1498Szrj         = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2430*38fd1498Szrj       unsigned HOST_WIDE_INT access_size_b
2431*38fd1498Szrj         = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2432*38fd1498Szrj       unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2433*38fd1498Szrj       unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2434*38fd1498Szrj
2435*38fd1498Szrj       dr_with_seg_len_pair_t dr_with_seg_len_pair
2436*38fd1498Szrj         (dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2437*38fd1498Szrj          dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b));
2438*38fd1498Szrj
2439*38fd1498Szrj       /* Canonicalize pairs by sorting the two DR members.  */
2440*38fd1498Szrj       if (comp_res > 0)
2441*38fd1498Szrj         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2442*38fd1498Szrj
2443*38fd1498Szrj       comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2444*38fd1498Szrj     }
2445*38fd1498Szrj
2446*38fd1498Szrj   if (tree_fits_uhwi_p (niters))
2447*38fd1498Szrj     factor = tree_to_uhwi (niters);
2448*38fd1498Szrj
2449*38fd1498Szrj   /* Prune alias check pairs.  */
2450*38fd1498Szrj   prune_runtime_alias_test_list (comp_alias_pairs, factor);
2451*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2452*38fd1498Szrj     fprintf (dump_file,
2453*38fd1498Szrj              "Improved number of alias checks from %d to %d\n",
2454*38fd1498Szrj              alias_ddrs->length (), comp_alias_pairs->length ());
2455*38fd1498Szrj }
2456*38fd1498Szrj
2457*38fd1498Szrj /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2458*38fd1498Szrj    checks and version LOOP under condition of these runtime alias checks.  */
2459*38fd1498Szrj
2460*38fd1498Szrj static void
2461*38fd1498Szrj version_loop_by_alias_check (struct loop *loop, vec<ddr_p> *alias_ddrs)
2462*38fd1498Szrj {
2463*38fd1498Szrj   profile_probability prob;
2464*38fd1498Szrj   basic_block cond_bb;
2465*38fd1498Szrj   struct loop *nloop;
2466*38fd1498Szrj   tree lhs, arg0, cond_expr = NULL_TREE;
2467*38fd1498Szrj   gimple_seq cond_stmts = NULL;
2468*38fd1498Szrj   gimple *call_stmt = NULL;
2469*38fd1498Szrj   auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2470*38fd1498Szrj
2471*38fd1498Szrj   /* Generate code for runtime alias checks if necessary.
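     The versioned result has roughly the following shape (a sketch;
     names are illustrative):

       _1 = .LOOP_DIST_ALIAS (orig_loop_num, <runtime alias condition>);
       if (_1)
         ; distributed loops, candidates for vectorization
       else
         ; copy of the original loop, with dont_vectorize set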
   */
2472*38fd1498Szrj   gcc_assert (alias_ddrs->length () > 0);
2473*38fd1498Szrj
2474*38fd1498Szrj   if (dump_file && (dump_flags & TDF_DETAILS))
2475*38fd1498Szrj     fprintf (dump_file,
2476*38fd1498Szrj              "Version loop <%d> with runtime alias check\n", loop->num);
2477*38fd1498Szrj
2478*38fd1498Szrj   compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2479*38fd1498Szrj   create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2480*38fd1498Szrj   cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2481*38fd1498Szrj                                       is_gimple_val, NULL_TREE);
2482*38fd1498Szrj
2483*38fd1498Szrj   /* Depend on the vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
2484*38fd1498Szrj   if (flag_tree_loop_vectorize)
2485*38fd1498Szrj     {
2486*38fd1498Szrj       /* Generate an internal function call for the loop distribution alias check.  */
2487*38fd1498Szrj       call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2488*38fd1498Szrj                                               2, NULL_TREE, cond_expr);
2489*38fd1498Szrj       lhs = make_ssa_name (boolean_type_node);
2490*38fd1498Szrj       gimple_call_set_lhs (call_stmt, lhs);
2491*38fd1498Szrj     }
2492*38fd1498Szrj   else
2493*38fd1498Szrj     lhs = cond_expr;
2494*38fd1498Szrj
2495*38fd1498Szrj   prob = profile_probability::guessed_always ().apply_scale (9, 10);
2496*38fd1498Szrj   initialize_original_copy_tables ();
2497*38fd1498Szrj   nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2498*38fd1498Szrj                         prob, prob.invert (), true);
2499*38fd1498Szrj   free_original_copy_tables ();
2500*38fd1498Szrj   /* Record the original loop number in the newly generated loops.  In case
2501*38fd1498Szrj      of distribution, the original loop will be distributed and the new loop
2502*38fd1498Szrj      is kept.  */
2503*38fd1498Szrj   loop->orig_loop_num = nloop->num;
2504*38fd1498Szrj   nloop->orig_loop_num = nloop->num;
2505*38fd1498Szrj   nloop->dont_vectorize = true;
2506*38fd1498Szrj   nloop->force_vectorize = false;
2507*38fd1498Szrj
2508*38fd1498Szrj   if (call_stmt)
2509*38fd1498Szrj     {
2510*38fd1498Szrj       /* Record the new loop's num in IFN_LOOP_DIST_ALIAS because the
2511*38fd1498Szrj          original loop could be destroyed.  */
2512*38fd1498Szrj       arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2513*38fd1498Szrj       gimple_call_set_arg (call_stmt, 0, arg0);
2514*38fd1498Szrj       gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2515*38fd1498Szrj     }
2516*38fd1498Szrj
2517*38fd1498Szrj   if (cond_stmts)
2518*38fd1498Szrj     {
2519*38fd1498Szrj       gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2520*38fd1498Szrj       gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2521*38fd1498Szrj     }
2522*38fd1498Szrj   update_ssa (TODO_update_ssa);
2523*38fd1498Szrj }
2524*38fd1498Szrj
2525*38fd1498Szrj /* Return true if loop versioning is needed to distribute PARTITIONS.
2526*38fd1498Szrj    ALIAS_DDRS are the data dependence relations for the runtime alias check.  */
2527*38fd1498Szrj
2528*38fd1498Szrj static inline bool
2529*38fd1498Szrj version_for_distribution_p (vec<struct partition *> *partitions,
2530*38fd1498Szrj                             vec<ddr_p> *alias_ddrs)
2531*38fd1498Szrj {
2532*38fd1498Szrj   /* No need to version the loop if we have only one partition.  */
2533*38fd1498Szrj   if (partitions->length () == 1)
2534*38fd1498Szrj     return false;
2535*38fd1498Szrj
2536*38fd1498Szrj   /* Need to version the loop if a runtime alias check is necessary.  */
2537*38fd1498Szrj   return (alias_ddrs->length () > 0);
2538*38fd1498Szrj }
2539*38fd1498Szrj
2540*38fd1498Szrj /* Compare the base offsets of builtin mem* partitions P1 and P2.
   */
2541*38fd1498Szrj
2542*38fd1498Szrj static bool
2543*38fd1498Szrj offset_cmp (struct partition *p1, struct partition *p2)
2544*38fd1498Szrj {
2545*38fd1498Szrj   gcc_assert (p1 != NULL && p1->builtin != NULL);
2546*38fd1498Szrj   gcc_assert (p2 != NULL && p2->builtin != NULL);
2547*38fd1498Szrj   return p1->builtin->dst_base_offset < p2->builtin->dst_base_offset;
2548*38fd1498Szrj }
2549*38fd1498Szrj
2550*38fd1498Szrj /* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
2551*38fd1498Szrj    case optimization transforming the code below:
2552*38fd1498Szrj
2553*38fd1498Szrj      __builtin_memset (&obj, 0, 100);
2554*38fd1498Szrj      _1 = &obj + 100;
2555*38fd1498Szrj      __builtin_memset (_1, 0, 200);
2556*38fd1498Szrj      _2 = &obj + 300;
2557*38fd1498Szrj      __builtin_memset (_2, 0, 100);
2558*38fd1498Szrj
2559*38fd1498Szrj    into:
2560*38fd1498Szrj
2561*38fd1498Szrj      __builtin_memset (&obj, 0, 400);
2562*38fd1498Szrj
2563*38fd1498Szrj    Note we don't have dependence information between different partitions
2564*38fd1498Szrj    at this point; as a result, we can't handle nonadjacent memset builtin
2565*38fd1498Szrj    partitions since the dependence might be broken.  */
2566*38fd1498Szrj
2567*38fd1498Szrj static void
2568*38fd1498Szrj fuse_memset_builtins (vec<struct partition *> *partitions)
2569*38fd1498Szrj {
2570*38fd1498Szrj   unsigned i, j;
2571*38fd1498Szrj   struct partition *part1, *part2;
2572*38fd1498Szrj   tree rhs1, rhs2;
2573*38fd1498Szrj
2574*38fd1498Szrj   for (i = 0; partitions->iterate (i, &part1);)
2575*38fd1498Szrj     {
2576*38fd1498Szrj       if (part1->kind != PKIND_MEMSET)
2577*38fd1498Szrj         {
2578*38fd1498Szrj           i++;
2579*38fd1498Szrj           continue;
2580*38fd1498Szrj         }
2581*38fd1498Szrj
2582*38fd1498Szrj       /* Find the sub-array of memset builtins of the same base.  The index
2583*38fd1498Szrj          range of the sub-array is [i, j) with "j > i".  */
2584*38fd1498Szrj       for (j = i + 1; partitions->iterate (j, &part2); ++j)
2585*38fd1498Szrj         {
2586*38fd1498Szrj           if (part2->kind != PKIND_MEMSET
2587*38fd1498Szrj               || !operand_equal_p (part1->builtin->dst_base_base,
2588*38fd1498Szrj                                    part2->builtin->dst_base_base, 0))
2589*38fd1498Szrj             break;
2590*38fd1498Szrj
2591*38fd1498Szrj           /* Memset calls setting different values can't be merged.  */
2592*38fd1498Szrj           rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2593*38fd1498Szrj           rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2594*38fd1498Szrj           if (!operand_equal_p (rhs1, rhs2, 0))
2595*38fd1498Szrj             break;
2596*38fd1498Szrj         }
2597*38fd1498Szrj
2598*38fd1498Szrj       /* A stable sort is required in order to avoid breaking dependences.  */
2599*38fd1498Szrj       std::stable_sort (&(*partitions)[i],
2600*38fd1498Szrj                         &(*partitions)[i] + j - i, offset_cmp);
2601*38fd1498Szrj       /* Continue with the next partition.  */
2602*38fd1498Szrj       i = j;
2603*38fd1498Szrj     }
2604*38fd1498Szrj
2605*38fd1498Szrj   /* Merge all consecutive memset builtin partitions.  */
2606*38fd1498Szrj   for (i = 0; i < partitions->length () - 1;)
2607*38fd1498Szrj     {
2608*38fd1498Szrj       part1 = (*partitions)[i];
2609*38fd1498Szrj       if (part1->kind != PKIND_MEMSET)
2610*38fd1498Szrj         {
2611*38fd1498Szrj           i++;
2612*38fd1498Szrj           continue;
2613*38fd1498Szrj         }
2614*38fd1498Szrj
2615*38fd1498Szrj       part2 = (*partitions)[i + 1];
2616*38fd1498Szrj       /* Only merge memset partitions of the same base and with constant
2617*38fd1498Szrj          access sizes.
  /* Merge all consecutive memset builtin partitions.  */
  for (i = 0; i < partitions->length () - 1;)
    {
      part1 = (*partitions)[i];
      if (part1->kind != PKIND_MEMSET)
	{
	  i++;
	  continue;
	}

      part2 = (*partitions)[i + 1];
      /* Only merge memset partitions of the same base and with constant
	 access sizes.  */
      if (part2->kind != PKIND_MEMSET
	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
	  || !operand_equal_p (part1->builtin->dst_base_base,
			       part2->builtin->dst_base_base, 0))
	{
	  i++;
	  continue;
	}
      rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
      rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
      int bytev1 = const_with_all_bytes_same (rhs1);
      int bytev2 = const_with_all_bytes_same (rhs2);
      /* Only merge memset partitions of the same value.  */
      if (bytev1 != bytev2 || bytev1 == -1)
	{
	  i++;
	  continue;
	}
      wide_int end1 = wi::add (part1->builtin->dst_base_offset,
			       wi::to_wide (part1->builtin->size));
      /* Only merge adjacent memset partitions.  */
      if (wi::ne_p (end1, part2->builtin->dst_base_offset))
	{
	  i++;
	  continue;
	}
      /* Merge partitions[i] and partitions[i+1].  */
      part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
					  part1->builtin->size,
					  part2->builtin->size);
      partition_free (part2);
      partitions->ordered_remove (i + 1);
    }
}
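/* Tracing the example from the comment above fuse_memset_builtins: the
   partition at offset 0 with size 100 has END1 == 100, which matches the
   next partition's offset, so the two fuse into one of size 300; that
   fused partition then has END1 == 300, matching the last partition's
   offset, and the final size becomes 400.  Had the middle memset been
   missing, END1 == 100 would not equal 300 and nothing would fuse.  */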
/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
   ALIAS_DDRS contains ddrs which need runtime alias check.  */

static void
finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
		     vec<ddr_p> *alias_ddrs)
{
  unsigned i;
  struct partition *partition, *a;

  if (partitions->length () == 1
      || alias_ddrs->length () > 0)
    return;

  unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
  bool same_type_p = true;
  enum partition_type type = ((*partitions)[0])->type;
  for (i = 0; partitions->iterate (i, &partition); ++i)
    {
      same_type_p &= (type == partition->type);
      if (partition_builtin_p (partition))
	{
	  num_builtin++;
	  continue;
	}
      num_normal++;
      if (partition->kind == PKIND_PARTIAL_MEMSET)
	num_partial_memset++;
    }

  /* Don't distribute the current loop into too many loops given we don't
     have a memory stream cost model.  Be even more conservative in case
     of loop nest distribution.  */
  if ((same_type_p && num_builtin == 0
       && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
      || (loop->inner != NULL
	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
      || (loop->inner == NULL
	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
    {
      a = (*partitions)[0];
      for (i = 1; partitions->iterate (i, &partition); ++i)
	{
	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
	  partition_free (partition);
	}
      partitions->truncate (1);
    }

  /* Fuse memset builtins if possible.  */
  if (partitions->length () > 1)
    fuse_memset_builtins (partitions);
}
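/* A hypothetical example of the heuristic above: for a single-level loop
   (LOOP->inner == NULL) split into five partitions of which two are
   builtins, we have I == 5 >= NUM_PARTITION_THRESHOLD and
   NUM_NORMAL == 3 > NUM_BUILTIN == 2, so everything is fused back into
   a single partition and the loop is left undistributed.  */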
/* Distributes the code from LOOP in such a way that producer statements
   are placed before consumer statements.  Tries to separate only the
   statements from STMTS into separate loops.  Returns the number of
   distributed loops.  Set NB_CALLS to the number of generated builtin
   calls.  Set *DESTROY_P to whether LOOP needs to be destroyed.  */

static int
distribute_loop (struct loop *loop, vec<gimple *> stmts,
		 control_dependences *cd, int *nb_calls, bool *destroy_p)
{
  ddrs_table = new hash_table<ddr_hasher> (389);
  struct graph *rdg;
  partition *partition;
  bool any_builtin;
  int i, nbp;

  *destroy_p = false;
  *nb_calls = 0;
  loop_nest.create (0);
  if (!find_loop_nest (loop, &loop_nest))
    {
      loop_nest.release ();
      delete ddrs_table;
      return 0;
    }

  datarefs_vec.create (20);
  rdg = build_rdg (loop, cd);
  if (!rdg)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "Loop %d not distributed: failed to build the RDG.\n",
		 loop->num);

      loop_nest.release ();
      free_data_refs (datarefs_vec);
      delete ddrs_table;
      return 0;
    }

  if (datarefs_vec.length () > MAX_DATAREFS_NUM)
    {
      if (dump_file && (dump_flags & TDF_DETAILS))
	fprintf (dump_file,
		 "Loop %d not distributed: too many memory references.\n",
		 loop->num);

      free_rdg (rdg);
      loop_nest.release ();
      free_data_refs (datarefs_vec);
      delete ddrs_table;
      return 0;
    }

  data_reference_p dref;
  for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
    dref->aux = (void *) (uintptr_t) i;

  if (dump_file && (dump_flags & TDF_DETAILS))
    dump_rdg (dump_file, rdg);

  auto_vec<struct partition *, 3> partitions;
  rdg_build_partitions (rdg, stmts, &partitions);

  auto_vec<ddr_p> alias_ddrs;

  auto_bitmap stmt_in_all_partitions;
  bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
  for (i = 1; partitions.iterate (i, &partition); ++i)
    bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);

  any_builtin = false;
  FOR_EACH_VEC_ELT (partitions, i, partition)
    {
      classify_partition (loop, rdg, partition, stmt_in_all_partitions);
      any_builtin |= partition_builtin_p (partition);
    }

  /* If we are only distributing patterns but did not detect any,
     simply bail out.  */
  if (!flag_tree_loop_distribution
      && !any_builtin)
    {
      nbp = 0;
      goto ldist_done;
    }

  /* If we are only distributing patterns, fuse all partitions that
     were not classified as builtins.  This also avoids chopping
     a loop into pieces, separated by builtin calls.  That is, we
     only want no or a single loop body remaining.  */
  struct partition *into;
  if (!flag_tree_loop_distribution)
    {
      for (i = 0; partitions.iterate (i, &into); ++i)
	if (!partition_builtin_p (into))
	  break;
      for (++i; partitions.iterate (i, &partition); ++i)
	if (!partition_builtin_p (partition))
	  {
	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
	    partitions.unordered_remove (i);
	    partition_free (partition);
	    i--;
	  }
    }

  /* Due to limitations in the transform phase we have to fuse all
     reduction partitions into the last partition so the existing
     loop will contain all loop-closed PHI nodes.  */
  for (i = 0; partitions.iterate (i, &into); ++i)
    if (partition_reduction_p (into))
      break;
  for (i = i + 1; partitions.iterate (i, &partition); ++i)
    if (partition_reduction_p (partition))
      {
	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
	partitions.unordered_remove (i);
	partition_free (partition);
	i--;
      }
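  /* For example (hypothetical), in a loop containing both

       a[i] = b[i] + 1;   <- store partition
       sum += c[i];       <- reduction partition

     the reduction partition must end up in the partition that keeps the
     original loop body, so that the loop-closed PHI node for SUM after
     the loop still refers to a loop that exists after distribution.  */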
  /* Apply our simple cost model - fuse partitions with similar
     memory accesses.  */
  for (i = 0; partitions.iterate (i, &into); ++i)
    {
      bool changed = false;
      if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
	continue;
      for (int j = i + 1;
	   partitions.iterate (j, &partition); ++j)
	{
	  if (share_memory_accesses (rdg, into, partition))
	    {
	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
	      partitions.unordered_remove (j);
	      partition_free (partition);
	      j--;
	      changed = true;
	    }
	}
      /* Suppose partitions 0 1 2 were fused in the first pass to 0,2 1
	 because 0 and 2 have similar accesses.  If 1 and 2 also have
	 similar accesses but 0 and 1 do not, the next iteration would
	 fail to consider merging 1 into 0,2.  So try again if we did
	 any merging into 0.  */
      if (changed)
	i--;
    }

  /* Build the partition dependence graph and fuse partitions in
     strongly connected components.  */
  if (partitions.length () > 1)
    {
      /* Don't support loop nest distribution under runtime alias check
	 since it's not likely to enable many vectorization opportunities.  */
      if (loop->inner)
	merge_dep_scc_partitions (rdg, &partitions, false);
      else
	{
	  merge_dep_scc_partitions (rdg, &partitions, true);
	  if (partitions.length () > 1)
	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
	}
    }

  finalize_partitions (loop, &partitions, &alias_ddrs);

  nbp = partitions.length ();
  if (nbp == 0
      || (nbp == 1 && !partition_builtin_p (partitions[0]))
      || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
    {
      nbp = 0;
      goto ldist_done;
    }

  if (version_for_distribution_p (&partitions, &alias_ddrs))
    version_loop_by_alias_check (loop, &alias_ddrs);

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file,
	       "distribute loop <%d> into partitions:\n", loop->num);
      dump_rdg_partitions (dump_file, partitions);
    }

  FOR_EACH_VEC_ELT (partitions, i, partition)
    {
      if (partition_builtin_p (partition))
	(*nb_calls)++;
      *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
    }

 ldist_done:
  loop_nest.release ();
  free_data_refs (datarefs_vec);
  for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
       iter != ddrs_table->end (); ++iter)
    {
      free_dependence_relation (*iter);
      *iter = NULL;
    }
  delete ddrs_table;

  FOR_EACH_VEC_ELT (partitions, i, partition)
    partition_free (partition);

  free_rdg (rdg);
  return nbp - *nb_calls;
}

/* Distribute all loops in the current function.  */

namespace {

const pass_data pass_data_loop_distribution =
{
  GIMPLE_PASS, /* type */
  "ldist", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_distribution : public gimple_opt_pass
{
public:
  pass_loop_distribution (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_distribution, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *)
    {
      return flag_tree_loop_distribution
	|| flag_tree_loop_distribute_patterns;
    }

  virtual unsigned int execute (function *);

}; // class pass_loop_distribution


/* Given LOOP, this function records seed statements for distribution in
   WORK_LIST.  Return false if there is nothing for distribution.  */

static bool
find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
{
  basic_block *bbs = get_loop_body_in_dom_order (loop);

  /* Initialize the worklist with stmts we seed the partitions with.  */
  for (unsigned i = 0; i < loop->num_nodes; ++i)
    {
      for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;
	  /* Distribute stmts which have defs that are used outside of
	     the loop.  */
	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
	    continue;
	  work_list->safe_push (phi);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);

	  /* If there is a stmt with side-effects bail out - we
	     cannot and should not distribute this loop.  */
	  if (gimple_has_side_effects (stmt))
	    {
	      free (bbs);
	      return false;
	    }

	  /* Distribute stmts which have defs that are used outside of
	     the loop.  */
	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
	    ;
	  /* Otherwise only distribute stores for now.  */
	  else if (!gimple_vdef (stmt))
	    continue;

	  work_list->safe_push (stmt);
	}
    }
  free (bbs);
  return work_list->length () > 0;
}
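/* As a hypothetical example of the seeding above, in

     for (i = 0; i < n; ++i)
       {
	 a[i] = b[i] + 1;   <- store: has a virtual definition
	 last = b[i];       <- scalar used after the loop
       }
     use (last);

   both statements are pushed onto WORK_LIST, while statements whose
   results are only consumed inside the loop are pulled into partitions
   later through the reduced dependence graph.  */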
/* Given innermost LOOP, return the outermost enclosing loop that forms a
   perfect loop nest.  */

static struct loop *
prepare_perfect_loop_nest (struct loop *loop)
{
  struct loop *outer = loop_outer (loop);
  tree niters = number_of_latch_executions (loop);

  /* TODO: We only support the innermost 3-level loop nest distribution
     because of compilation time issues for now.  This should be relaxed
     in the future.  Note we only allow 3-level loop nest distribution
     when parallelizing loops.  */
  while ((loop->inner == NULL
	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
	 && loop_outer (outer)
	 && outer->inner == loop && loop->next == NULL
	 && single_exit (outer)
	 && optimize_loop_for_speed_p (outer)
	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
	 && niters != chrec_dont_know)
    {
      loop = outer;
      outer = loop_outer (loop);
    }

  return loop;
}
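/* For instance (hypothetical), starting from the innermost loop of

     for (i = 0; i < n; ++i)      <- single exit, computable trip count
       for (j = 0; j < m; ++j)    <- innermost LOOP passed in
	 a[i][j] = 0.0;

   the walk above returns the outer loop: the nest is perfect (the outer
   loop contains nothing but the inner one) and the outer iteration count
   is computable, so distribution can consider the whole nest rather than
   just the innermost loop.  */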
unsigned int
pass_loop_distribution::execute (function *fun)
{
  struct loop *loop;
  bool changed = false;
  basic_block bb;
  control_dependences *cd = NULL;
  auto_vec<loop_p> loops_to_be_destroyed;

  if (number_of_loops (fun) <= 1)
    return 0;

  /* Compute topological order for basic blocks.  Topological order is
     needed because data dependence is computed for data references in
     lexicographical order.  */
  if (bb_top_order_index == NULL)
    {
      int rpo_num;
      int *rpo = XNEWVEC (int, last_basic_block_for_fn (cfun));

      bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
      bb_top_order_index_size = last_basic_block_for_fn (cfun);
      rpo_num = pre_and_rev_post_order_compute_fn (cfun, NULL, rpo, true);
      for (int i = 0; i < rpo_num; i++)
	bb_top_order_index[rpo[i]] = i;

      free (rpo);
    }

  FOR_ALL_BB_FN (bb, fun)
    {
      gimple_stmt_iterator gsi;
      for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
	gimple_set_uid (gsi_stmt (gsi), -1);
    }

  /* We can at the moment only distribute non-nested loops, thus restrict
     walking to innermost loops.  */
  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
    {
      /* Don't distribute loops with multiple exit edges, or cold loops.  */
      if (!single_exit (loop)
	  || !optimize_loop_for_speed_p (loop))
	continue;

      /* Don't distribute loop if niters is unknown.  */
      tree niters = number_of_latch_executions (loop);
      if (niters == NULL_TREE || niters == chrec_dont_know)
	continue;

      /* Get the perfect loop nest for distribution.  */
      loop = prepare_perfect_loop_nest (loop);
      for (; loop; loop = loop->inner)
	{
	  auto_vec<gimple *> work_list;
	  if (!find_seed_stmts_for_distribution (loop, &work_list))
	    break;

	  const char *str = loop->inner ? " nest" : "";
" nest" : ""; 3098*38fd1498Szrj location_t loc = find_loop_location (loop); 3099*38fd1498Szrj if (!cd) 3100*38fd1498Szrj { 3101*38fd1498Szrj calculate_dominance_info (CDI_DOMINATORS); 3102*38fd1498Szrj calculate_dominance_info (CDI_POST_DOMINATORS); 3103*38fd1498Szrj cd = new control_dependences (); 3104*38fd1498Szrj free_dominance_info (CDI_POST_DOMINATORS); 3105*38fd1498Szrj } 3106*38fd1498Szrj 3107*38fd1498Szrj bool destroy_p; 3108*38fd1498Szrj int nb_generated_loops, nb_generated_calls; 3109*38fd1498Szrj nb_generated_loops = distribute_loop (loop, work_list, cd, 3110*38fd1498Szrj &nb_generated_calls, 3111*38fd1498Szrj &destroy_p); 3112*38fd1498Szrj if (destroy_p) 3113*38fd1498Szrj loops_to_be_destroyed.safe_push (loop); 3114*38fd1498Szrj 3115*38fd1498Szrj if (nb_generated_loops + nb_generated_calls > 0) 3116*38fd1498Szrj { 3117*38fd1498Szrj changed = true; 3118*38fd1498Szrj dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, 3119*38fd1498Szrj loc, "Loop%s %d distributed: split to %d loops " 3120*38fd1498Szrj "and %d library calls.\n", str, loop->num, 3121*38fd1498Szrj nb_generated_loops, nb_generated_calls); 3122*38fd1498Szrj 3123*38fd1498Szrj break; 3124*38fd1498Szrj } 3125*38fd1498Szrj 3126*38fd1498Szrj if (dump_file && (dump_flags & TDF_DETAILS)) 3127*38fd1498Szrj fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num); 3128*38fd1498Szrj } 3129*38fd1498Szrj } 3130*38fd1498Szrj 3131*38fd1498Szrj if (cd) 3132*38fd1498Szrj delete cd; 3133*38fd1498Szrj 3134*38fd1498Szrj if (bb_top_order_index != NULL) 3135*38fd1498Szrj { 3136*38fd1498Szrj free (bb_top_order_index); 3137*38fd1498Szrj bb_top_order_index = NULL; 3138*38fd1498Szrj bb_top_order_index_size = 0; 3139*38fd1498Szrj } 3140*38fd1498Szrj 3141*38fd1498Szrj if (changed) 3142*38fd1498Szrj { 3143*38fd1498Szrj /* Destroy loop bodies that could not be reused. Do this late as we 3144*38fd1498Szrj otherwise can end up refering to stale data in control dependences. */ 3145*38fd1498Szrj unsigned i; 3146*38fd1498Szrj FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop) 3147*38fd1498Szrj destroy_loop (loop); 3148*38fd1498Szrj 3149*38fd1498Szrj /* Cached scalar evolutions now may refer to wrong or non-existing 3150*38fd1498Szrj loops. */ 3151*38fd1498Szrj scev_reset_htab (); 3152*38fd1498Szrj mark_virtual_operands_for_renaming (fun); 3153*38fd1498Szrj rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa); 3154*38fd1498Szrj } 3155*38fd1498Szrj 3156*38fd1498Szrj checking_verify_loop_structure (); 3157*38fd1498Szrj 3158*38fd1498Szrj return changed ? TODO_cleanup_cfg : 0; 3159*38fd1498Szrj } 3160*38fd1498Szrj 3161*38fd1498Szrj } // anon namespace 3162*38fd1498Szrj 3163*38fd1498Szrj gimple_opt_pass * 3164*38fd1498Szrj make_pass_loop_distribution (gcc::context *ctxt) 3165*38fd1498Szrj { 3166*38fd1498Szrj return new pass_loop_distribution (ctxt); 3167*38fd1498Szrj } 3168