xref: /netbsd-src/external/gpl3/gcc/dist/gcc/tree-loop-distribution.cc (revision 0a3071956a3a9fdebdbf7f338cf2d439b45fc728)
1 /* Loop distribution.
2    Copyright (C) 2006-2022 Free Software Foundation, Inc.
3    Contributed by Georges-Andre Silber <Georges-Andre.Silber@ensmp.fr>
4    and Sebastian Pop <sebastian.pop@amd.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT
14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 /* This pass performs loop distribution: for example, the loop
23 
24    |DO I = 2, N
25    |    A(I) = B(I) + C
26    |    D(I) = A(I-1)*E
27    |ENDDO
28 
29    is transformed to
30 
31    |DOALL I = 2, N
32    |   A(I) = B(I) + C
33    |ENDDO
34    |
35    |DOALL I = 2, N
36    |   D(I) = A(I-1)*E
37    |ENDDO
38 
39    Loop distribution is the dual of loop fusion.  It separates statements
40    of a loop (or loop nest) into multiple loops (or loop nests) with the
41    same loop header.  The major goal is to separate statements which may
42    be vectorized from those that can't.  This pass implements distribution
43    in the following steps:
44 
45      1) Seed partitions with specific types of statements.  For now we
46 	support two types of seed statements: statements defining a variable
47 	used outside of the loop, and statements storing to memory.
48      2) Build reduced dependence graph (RDG) for loop to be distributed.
49 	The vertices (RDG:V) model all statements in the loop and the edges
50 	(RDG:E) model flow and control dependencies between statements.
51      3) Apart from RDG, compute data dependencies between memory references.
52      4) Starting from a seed statement, build up the partition by adding
53 	dependent statements according to RDG's dependence information.  A
54 	partition is classified as parallel type if it can be executed in
55 	parallel, or as sequential type if it can't.  A parallel type
56 	partition is further classified into different builtin kinds if it
57 	can be implemented as builtin function calls.
58      5) Build partition dependence graph (PG) based on data dependencies.
59 	The vertices (PG:V) model all partitions and the edges (PG:E) model
60 	all data dependencies between every pair of partitions.  In general,
61 	a data dependence is either known or unknown at compilation time.  In
62 	C family languages, there are quite a few compilation time unknown
63 	dependencies because of possible aliasing between data references.
64 	We categorize PG's edges into two types: "true" edges that represent
65 	compilation time known data dependencies; "alias" edges for all other
66 	data dependencies.
67      6) Traverse the subgraph of PG as if all "alias" edges didn't exist.
68 	Merge the partitions in each strongly connected component (SCC).
69 	Build new PG for merged partitions.
70      7) Traverse PG again and this time with both "true" and "alias" edges
71 	included.  We try to break SCCs by removing some edges.  Because
72 	SCCs by "true" edges are all fused in step 6), we can break SCCs
73 	by removing some "alias" edges.  It's NP-hard to choose an optimal
74 	edge set; fortunately, a simple approximation is good enough for us
75 	given the small problem size.
76      8) Collect all data dependencies of the removed "alias" edges.  Create
77 	runtime alias checks for collected data dependencies.
78      9) Version the loop under the condition of runtime alias checks.  Since
79 	loop distribution generally introduces additional overhead, it is
80 	only useful if vectorization is achieved in a distributed loop.  We
81 	version the loop with the internal function call IFN_LOOP_DIST_ALIAS.
82 	If no distributed loop can be vectorized, we simply remove the
83 	distributed loops and revert to the original one.
84 
85    TODO:
86      1) We only distribute innermost two-level loop nests now.  We should
87 	extend this to arbitrary loop nests in the future.
88      2) We only fuse partitions in SCCs now.  A better fusion algorithm is
89 	desired to minimize loop overhead, maximize parallelism and maximize
90 	data reuse.  */
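/* As a purely illustrative sketch (not part of the pass itself, and with
   hypothetical array names), a C loop such as

   |for (i = 0; i < n; i++)
   |  {
   |    a[i] = 0;
   |    c[i] = b[i] + 1;
   |  }

   may, assuming a, b and c do not alias (or that a runtime alias check can
   be versioned in), be distributed into a PKIND_MEMSET partition for the
   "a[i] = 0" stores, emitted as a single memset call, and a PKIND_NORMAL
   partition for the remaining statements.  */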
91 
92 #include "config.h"
93 #include "system.h"
94 #include "coretypes.h"
95 #include "backend.h"
96 #include "tree.h"
97 #include "gimple.h"
98 #include "cfghooks.h"
99 #include "tree-pass.h"
100 #include "ssa.h"
101 #include "gimple-pretty-print.h"
102 #include "fold-const.h"
103 #include "cfganal.h"
104 #include "gimple-iterator.h"
105 #include "gimplify-me.h"
106 #include "stor-layout.h"
107 #include "tree-cfg.h"
108 #include "tree-ssa-loop-manip.h"
109 #include "tree-ssa-loop-ivopts.h"
110 #include "tree-ssa-loop.h"
111 #include "tree-into-ssa.h"
112 #include "tree-ssa.h"
113 #include "cfgloop.h"
114 #include "tree-scalar-evolution.h"
115 #include "tree-vectorizer.h"
116 #include "tree-eh.h"
117 #include "gimple-fold.h"
118 #include "tree-affine.h"
119 #include "intl.h"
120 #include "rtl.h"
121 #include "memmodel.h"
122 #include "optabs.h"
123 
124 
125 #define MAX_DATAREFS_NUM \
126 	((unsigned) param_loop_max_datarefs_for_datadeps)
127 
128 /* Threshold controlling the number of distributed partitions.  Given that it
129    may become unnecessary if a memory stream cost model is invented in the
130    future, we define it as a temporary macro rather than a parameter.  */
131 #define NUM_PARTITION_THRESHOLD (4)
132 
133 /* Hashtable helpers.  */
134 
135 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
136 {
137   static inline hashval_t hash (const data_dependence_relation *);
138   static inline bool equal (const data_dependence_relation *,
139 			    const data_dependence_relation *);
140 };
141 
142 /* Hash function for data dependence.  */
143 
144 inline hashval_t
145 ddr_hasher::hash (const data_dependence_relation *ddr)
146 {
147   inchash::hash h;
148   h.add_ptr (DDR_A (ddr));
149   h.add_ptr (DDR_B (ddr));
150   return h.end ();
151 }
152 
153 /* Hash table equality function for data dependence.  */
154 
155 inline bool
156 ddr_hasher::equal (const data_dependence_relation *ddr1,
157 		   const data_dependence_relation *ddr2)
158 {
159   return (DDR_A (ddr1) == DDR_A (ddr2) && DDR_B (ddr1) == DDR_B (ddr2));
160 }
161 
162 
163 
164 #define DR_INDEX(dr)      ((uintptr_t) (dr)->aux)
165 
166 /* A Reduced Dependence Graph (RDG) vertex representing a statement.  */
167 struct rdg_vertex
168 {
169   /* The statement represented by this vertex.  */
170   gimple *stmt;
171 
172   /* Vector of data-references in this statement.  */
173   vec<data_reference_p> datarefs;
174 
175   /* True when the statement contains a write to memory.  */
176   bool has_mem_write;
177 
178   /* True when the statement contains a read from memory.  */
179   bool has_mem_reads;
180 };
181 
182 #define RDGV_STMT(V)     ((struct rdg_vertex *) ((V)->data))->stmt
183 #define RDGV_DATAREFS(V) ((struct rdg_vertex *) ((V)->data))->datarefs
184 #define RDGV_HAS_MEM_WRITE(V) ((struct rdg_vertex *) ((V)->data))->has_mem_write
185 #define RDGV_HAS_MEM_READS(V) ((struct rdg_vertex *) ((V)->data))->has_mem_reads
186 #define RDG_STMT(RDG, I) RDGV_STMT (&(RDG->vertices[I]))
187 #define RDG_DATAREFS(RDG, I) RDGV_DATAREFS (&(RDG->vertices[I]))
188 #define RDG_MEM_WRITE_STMT(RDG, I) RDGV_HAS_MEM_WRITE (&(RDG->vertices[I]))
189 #define RDG_MEM_READS_STMT(RDG, I) RDGV_HAS_MEM_READS (&(RDG->vertices[I]))
190 
191 /* Data dependence type.  */
192 
193 enum rdg_dep_type
194 {
195   /* Read After Write (RAW).  */
196   flow_dd = 'f',
197 
198   /* Control dependence (execute conditional on).  */
199   control_dd = 'c'
200 };
201 
202 /* Dependence information attached to an edge of the RDG.  */
203 
204 struct rdg_edge
205 {
206   /* Type of the dependence.  */
207   enum rdg_dep_type type;
208 };
209 
210 #define RDGE_TYPE(E)        ((struct rdg_edge *) ((E)->data))->type
211 
212 /* Kind of distributed loop.  */
213 enum partition_kind {
214     PKIND_NORMAL,
215     /* Partial memset stands for a partition that can be distributed into a
216        loop of memset calls, rather than a single memset call.  It's handled
217        just like a normal partition, i.e., distributed as a separate loop; no
218        memset call is generated.
219 
220        Note: This is a hackish fix trying to distribute a ZERO-ing stmt in a
221        loop nest as deep as possible.  As a result, parloop achieves better
222        parallelization by parallelizing a deeper loop nest.  This hack should
223        become unnecessary and be removed once distributed memset can be
224        understood and analyzed in data reference analysis.  See PR82604.  */
225     PKIND_PARTIAL_MEMSET,
226     PKIND_MEMSET, PKIND_MEMCPY, PKIND_MEMMOVE
227 };
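/* An illustrative (hypothetical) example of PKIND_PARTIAL_MEMSET: zeroing
   part of each row of a larger two-dimensional array, e.g.

   |for (i = 0; i < n; i++)
   |  for (j = 0; j < m; j++)     <- m smaller than the row length of a
   |    a[i][j] = 0.0;

   where consecutive accesses can only be proven for the innermost loop, so
   the stores cannot be summarized as one memset over the whole nest and the
   partition is kept as a (deep) loop instead.  */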
228 
229 /* Type of distributed loop.  */
230 enum partition_type {
231     /* The distributed loop can be executed in parallel.  */
232     PTYPE_PARALLEL = 0,
233     /* The distributed loop has to be executed sequentially.  */
234     PTYPE_SEQUENTIAL
235 };
236 
237 /* Builtin info for loop distribution.  */
238 struct builtin_info
239 {
240   /* Data references that a kind != PKIND_NORMAL partition is about.  */
241   data_reference_p dst_dr;
242   data_reference_p src_dr;
243   /* Base address and size of memory objects operated by the builtin.  Note
244      both dest and source memory objects must have the same size.  */
245   tree dst_base;
246   tree src_base;
247   tree size;
248   /* Base and offset part of dst_base after stripping constant offset.  This
249      is only used in memset builtin distribution for now.  */
250   tree dst_base_base;
251   unsigned HOST_WIDE_INT dst_base_offset;
252 };
253 
254 /* Partition for loop distribution.  */
255 struct partition
256 {
257   /* Statements of the partition.  */
258   bitmap stmts;
259   /* True if the partition defines variable which is used outside of loop.  */
260   bool reduction_p;
261   location_t loc;
262   enum partition_kind kind;
263   enum partition_type type;
264   /* Data references in the partition.  */
265   bitmap datarefs;
266   /* Information about the builtin partition.  */
267   struct builtin_info *builtin;
268 };
269 
270 /* Partitions are fused because of different reasons.  */
271 enum fuse_type
272 {
273   FUSE_NON_BUILTIN = 0,
274   FUSE_REDUCTION = 1,
275   FUSE_SHARE_REF = 2,
276   FUSE_SAME_SCC = 3,
277   FUSE_FINALIZE = 4
278 };
279 
280 /* Descriptions of the different fusing reasons.  */
281 static const char *fuse_message[] = {
282   "they are non-builtins",
283   "they have reductions",
284   "they have shared memory refs",
285   "they are in the same dependence scc",
286   "there is no point to distribute loop"};
287 
288 
289 /* Dump vertex I in RDG to FILE.  */
290 
291 static void
292 dump_rdg_vertex (FILE *file, struct graph *rdg, int i)
293 {
294   struct vertex *v = &(rdg->vertices[i]);
295   struct graph_edge *e;
296 
297   fprintf (file, "(vertex %d: (%s%s) (in:", i,
298 	   RDG_MEM_WRITE_STMT (rdg, i) ? "w" : "",
299 	   RDG_MEM_READS_STMT (rdg, i) ? "r" : "");
300 
301   if (v->pred)
302     for (e = v->pred; e; e = e->pred_next)
303       fprintf (file, " %d", e->src);
304 
305   fprintf (file, ") (out:");
306 
307   if (v->succ)
308     for (e = v->succ; e; e = e->succ_next)
309       fprintf (file, " %d", e->dest);
310 
311   fprintf (file, ")\n");
312   print_gimple_stmt (file, RDGV_STMT (v), 0, TDF_VOPS|TDF_MEMSYMS);
313   fprintf (file, ")\n");
314 }
315 
316 /* Call dump_rdg_vertex on stderr.  */
317 
318 DEBUG_FUNCTION void
319 debug_rdg_vertex (struct graph *rdg, int i)
320 {
321   dump_rdg_vertex (stderr, rdg, i);
322 }
323 
324 /* Dump the reduced dependence graph RDG to FILE.  */
325 
326 static void
327 dump_rdg (FILE *file, struct graph *rdg)
328 {
329   fprintf (file, "(rdg\n");
330   for (int i = 0; i < rdg->n_vertices; i++)
331     dump_rdg_vertex (file, rdg, i);
332   fprintf (file, ")\n");
333 }
334 
335 /* Call dump_rdg on stderr.  */
336 
337 DEBUG_FUNCTION void
338 debug_rdg (struct graph *rdg)
339 {
340   dump_rdg (stderr, rdg);
341 }
342 
343 static void
344 dot_rdg_1 (FILE *file, struct graph *rdg)
345 {
346   int i;
347   pretty_printer buffer;
348   pp_needs_newline (&buffer) = false;
349   buffer.buffer->stream = file;
350 
351   fprintf (file, "digraph RDG {\n");
352 
353   for (i = 0; i < rdg->n_vertices; i++)
354     {
355       struct vertex *v = &(rdg->vertices[i]);
356       struct graph_edge *e;
357 
358       fprintf (file, "%d [label=\"[%d] ", i, i);
359       pp_gimple_stmt_1 (&buffer, RDGV_STMT (v), 0, TDF_SLIM);
360       pp_flush (&buffer);
361       fprintf (file, "\"]\n");
362 
363       /* Highlight reads from memory.  */
364       if (RDG_MEM_READS_STMT (rdg, i))
365        fprintf (file, "%d [style=filled, fillcolor=green]\n", i);
366 
367       /* Highlight stores to memory.  */
368       if (RDG_MEM_WRITE_STMT (rdg, i))
369        fprintf (file, "%d [style=filled, fillcolor=red]\n", i);
370 
371       if (v->succ)
372        for (e = v->succ; e; e = e->succ_next)
373          switch (RDGE_TYPE (e))
374            {
375            case flow_dd:
376              /* These are the most common dependences; print them without a label.  */
377              fprintf (file, "%d -> %d \n", i, e->dest);
378              break;
379 
380 	   case control_dd:
381              fprintf (file, "%d -> %d [label=control] \n", i, e->dest);
382              break;
383 
384            default:
385              gcc_unreachable ();
386            }
387     }
388 
389   fprintf (file, "}\n\n");
390 }
391 
392 /* Display the Reduced Dependence Graph using dotty.  */
393 
394 DEBUG_FUNCTION void
395 dot_rdg (struct graph *rdg)
396 {
397   /* When debugging, you may want to enable the following code.  */
398 #ifdef HAVE_POPEN
399   FILE *file = popen ("dot -Tx11", "w");
400   if (!file)
401     return;
402   dot_rdg_1 (file, rdg);
403   fflush (file);
404   close (fileno (file));
405   pclose (file);
406 #else
407   dot_rdg_1 (stderr, rdg);
408 #endif
409 }
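/* Usage note (illustrative, not invoked by the compiler itself): the
   DEBUG_FUNCTION helpers above are intended to be called manually from a
   debugger while stopped inside this pass, e.g. something like
   "call debug_rdg (rdg)" or "call dot_rdg (rdg)" from gdb.  */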
410 
411 /* Returns the index of STMT in RDG.  */
412 
413 static int
414 rdg_vertex_for_stmt (struct graph *rdg ATTRIBUTE_UNUSED, gimple *stmt)
415 {
416   int index = gimple_uid (stmt);
417   gcc_checking_assert (index == -1 || RDG_STMT (rdg, index) == stmt);
418   return index;
419 }
420 
421 /* Creates dependence edges in RDG for all the uses of DEF.  IDEF is
422    the index of DEF in RDG.  */
423 
424 static void
425 create_rdg_edges_for_scalar (struct graph *rdg, tree def, int idef)
426 {
427   use_operand_p imm_use_p;
428   imm_use_iterator iterator;
429 
430   FOR_EACH_IMM_USE_FAST (imm_use_p, iterator, def)
431     {
432       struct graph_edge *e;
433       int use = rdg_vertex_for_stmt (rdg, USE_STMT (imm_use_p));
434 
435       if (use < 0)
436 	continue;
437 
438       e = add_edge (rdg, idef, use);
439       e->data = XNEW (struct rdg_edge);
440       RDGE_TYPE (e) = flow_dd;
441     }
442 }
443 
444 /* Creates an edge for the control dependences of BB to the vertex V.  */
445 
446 static void
447 create_edge_for_control_dependence (struct graph *rdg, basic_block bb,
448 				    int v, control_dependences *cd)
449 {
450   bitmap_iterator bi;
451   unsigned edge_n;
452   EXECUTE_IF_SET_IN_BITMAP (cd->get_edges_dependent_on (bb->index),
453 			    0, edge_n, bi)
454     {
455       basic_block cond_bb = cd->get_edge_src (edge_n);
456       gimple *stmt = last_stmt (cond_bb);
457       if (stmt && is_ctrl_stmt (stmt))
458 	{
459 	  struct graph_edge *e;
460 	  int c = rdg_vertex_for_stmt (rdg, stmt);
461 	  if (c < 0)
462 	    continue;
463 
464 	  e = add_edge (rdg, c, v);
465 	  e->data = XNEW (struct rdg_edge);
466 	  RDGE_TYPE (e) = control_dd;
467 	}
468     }
469 }
470 
471 /* Creates the flow dependence edges of the reduced dependence graph RDG.  */
472 
473 static void
474 create_rdg_flow_edges (struct graph *rdg)
475 {
476   int i;
477   def_operand_p def_p;
478   ssa_op_iter iter;
479 
480   for (i = 0; i < rdg->n_vertices; i++)
481     FOR_EACH_PHI_OR_STMT_DEF (def_p, RDG_STMT (rdg, i),
482 			      iter, SSA_OP_DEF)
483       create_rdg_edges_for_scalar (rdg, DEF_FROM_PTR (def_p), i);
484 }
485 
486 /* Creates the control dependence edges of the reduced dependence graph RDG.  */
487 
488 static void
489 create_rdg_cd_edges (struct graph *rdg, control_dependences *cd, loop_p loop)
490 {
491   int i;
492 
493   for (i = 0; i < rdg->n_vertices; i++)
494     {
495       gimple *stmt = RDG_STMT (rdg, i);
496       if (gimple_code (stmt) == GIMPLE_PHI)
497 	{
498 	  edge_iterator ei;
499 	  edge e;
500 	  FOR_EACH_EDGE (e, ei, gimple_bb (stmt)->preds)
501 	    if (flow_bb_inside_loop_p (loop, e->src))
502 	      create_edge_for_control_dependence (rdg, e->src, i, cd);
503 	}
504       else
505 	create_edge_for_control_dependence (rdg, gimple_bb (stmt), i, cd);
506     }
507 }
508 
509 
510 class loop_distribution
511 {
512   private:
513   /* The loop (nest) to be distributed.  */
514   vec<loop_p> loop_nest;
515 
516   /* Vector of data references in the loop to be distributed.  */
517   vec<data_reference_p> datarefs_vec;
518 
519   /* Whether there is a nonaddressable data reference in the above vector.  */
520   bool has_nonaddressable_dataref_p;
521 
522   /* Store index of data reference in aux field.  */
523 
524   /* Hash table for data dependence relation in the loop to be distributed.  */
525   hash_table<ddr_hasher> *ddrs_table;
526 
527   /* Array mapping basic block's index to its topological order.  */
528   int *bb_top_order_index;
529   /* And size of the array.  */
530   int bb_top_order_index_size;
531 
532   /* Build the vertices of the reduced dependence graph RDG.  Return false
533      if that failed.  */
534   bool create_rdg_vertices (struct graph *rdg, const vec<gimple *> &stmts,
535 			    loop_p loop);
536 
537   /* Initialize STMTS with all the statements of LOOP.  We use topological
538      order to discover all statements.  The order is important because
539      generate_loops_for_partition is using the same traversal for identifying
540      statements in loop copies.  */
541   void stmts_from_loop (class loop *loop, vec<gimple *> *stmts);
542 
543 
544   /* Build the Reduced Dependence Graph (RDG) with one vertex per statement of
545      LOOP, and one edge per flow dependence or control dependence from control
546      dependence CD.  During visiting each statement, data references are also
547      collected and recorded in global data DATAREFS_VEC.  */
548   struct graph * build_rdg (class loop *loop, control_dependences *cd);
549 
550 /* Merge PARTITION into the partition DEST.  RDG is the reduced dependence
551    graph, and we update the type of the result partition if RDG is non-NULL.  */
552   void partition_merge_into (struct graph *rdg,
553 			     partition *dest, partition *partition,
554 			     enum fuse_type ft);
555 
556 
557   /* Return the data dependence relation for data references A and B.  The
558      two data references must be in lexicographic order with respect to the
559      reduced dependence graph RDG.  We first try to find the ddr in the global
560      ddr hash table; if it doesn't exist, compute the ddr and cache it.  */
561   data_dependence_relation * get_data_dependence (struct graph *rdg,
562 						  data_reference_p a,
563 						  data_reference_p b);
564 
565 
566   /* In reduced dependence graph RDG for loop distribution, return true if
567      dependence between references DR1 and DR2 leads to a dependence cycle
568      and such dependence cycle can't be resolved by runtime alias check.  */
569   bool data_dep_in_cycle_p (struct graph *rdg, data_reference_p dr1,
570 			    data_reference_p dr2);
571 
572 
573   /* Given reduced dependence graph RDG, PARTITION1 and PARTITION2, update
574      PARTITION1's type after merging PARTITION2 into PARTITION1.  */
575   void update_type_for_merge (struct graph *rdg,
576 			      partition *partition1, partition *partition2);
577 
578 
579   /* Returns a partition with all the statements needed for computing
580      the vertex V of the RDG, also including the loop exit conditions.  */
581   partition *build_rdg_partition_for_vertex (struct graph *rdg, int v);
582 
583   /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG,
584      classify whether they form a builtin memcpy or memmove call.  */
585   void classify_builtin_ldst (loop_p loop, struct graph *rdg, partition *partition,
586 			      data_reference_p dst_dr, data_reference_p src_dr);
587 
588   /* Classifies the builtin kind we can generate for PARTITION of RDG and LOOP.
589      For the moment we detect memset, memcpy and memmove patterns.  Bitmap
590      STMT_IN_ALL_PARTITIONS contains statements belonging to all partitions.
591      Returns true if there is a reduction in all partitions and we
592      possibly did not mark PARTITION as having one for this reason.  */
593 
594   bool
595   classify_partition (loop_p loop,
596 		      struct graph *rdg, partition *partition,
597 		      bitmap stmt_in_all_partitions);
598 
599 
600   /* Returns true when PARTITION1 and PARTITION2 access the same memory
601      object in RDG.  */
602   bool share_memory_accesses (struct graph *rdg,
603 			      partition *partition1, partition *partition2);
604 
605   /* For each seed statement in STARTING_STMTS, this function builds a
606      partition for it by adding dependent statements according to RDG.
607      All partitions are recorded in PARTITIONS.  */
608   void rdg_build_partitions (struct graph *rdg,
609 			     vec<gimple *> starting_stmts,
610 			     vec<partition *> *partitions);
611 
612   /* Compute the partition dependence created by the data references in DRS1
613      and DRS2, and modify and return DIR according to that.  If ALIAS_DDRS is
614      not NULL, we record dependences introduced by possible aliasing between
615      two data references in ALIAS_DDRS; otherwise, we simply ignore such
616      dependences as if they don't exist at all.  */
617   int pg_add_dependence_edges (struct graph *rdg, int dir, bitmap drs1,
618 			       bitmap drs2, vec<ddr_p> *alias_ddrs);
619 
620 
621   /* Build and return the partition dependence graph for PARTITIONS.  RDG is
622      the reduced dependence graph for the loop to be distributed.  If
623      IGNORE_ALIAS_P is true, data dependence caused by possible aliasing
624      between references is ignored, as if it doesn't exist at all; otherwise
625      all dependences are considered.  */
626   struct graph *build_partition_graph (struct graph *rdg,
627 				       vec<struct partition *> *partitions,
628 				       bool ignore_alias_p);
629 
630   /* Given the reduced dependence graph RDG, merge the strongly connected
631      components of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence
632      caused by possible aliasing between references is ignored, as if it
633      doesn't exist at all; otherwise all dependences are considered.  */
634   void merge_dep_scc_partitions (struct graph *rdg, vec<struct partition *>
635 				 *partitions, bool ignore_alias_p);
636 
637 /* This is the main function breaking strongly connected components in
638    PARTITIONS given the reduced dependence graph RDG.  Store data dependence
639    relations for runtime alias checks in ALIAS_DDRS.  */
640   void break_alias_scc_partitions (struct graph *rdg, vec<struct partition *>
641 				   *partitions, vec<ddr_p> *alias_ddrs);
642 
643 
644   /* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
645      ALIAS_DDRS contains ddrs which need runtime alias check.  */
646   void finalize_partitions (class loop *loop, vec<struct partition *>
647 			    *partitions, vec<ddr_p> *alias_ddrs);
648 
649   /* Distributes the code from LOOP in such a way that producer statements
650      are placed before consumer statements.  Tries to separate only the
651      statements from STMTS into separate loops.  Returns the number of
652      distributed loops.  Set NB_CALLS to number of generated builtin calls.
653      Set *DESTROY_P to whether LOOP needs to be destroyed.  */
654   int distribute_loop (class loop *loop, const vec<gimple *> &stmts,
655 		       control_dependences *cd, int *nb_calls, bool *destroy_p,
656 		       bool only_patterns_p);
657 
658   /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
659      replace them accordingly.  */
660   bool transform_reduction_loop (loop_p loop);
661 
662   /* Compute topological order for basic blocks.  Topological order is
663      needed because data dependence is computed for data references in
664      lexicographical order.  */
665   void bb_top_order_init (void);
666 
667   void bb_top_order_destroy (void);
668 
669   public:
670 
671   /* Getter for bb_top_order.  */
672 
673   inline int get_bb_top_order_index_size (void)
674     {
675       return bb_top_order_index_size;
676     }
677 
678   inline int get_bb_top_order_index (int i)
679     {
680       return bb_top_order_index[i];
681     }
682 
683   unsigned int execute (function *fun);
684 };
685 
686 
687 /* If X has a smaller topological sort number than Y, returns -1;
688    if greater, returns 1.  */
689 static int
690 bb_top_order_cmp_r (const void *x, const void *y, void *loop)
691 {
692   loop_distribution *_loop =
693     (loop_distribution *) loop;
694 
695   basic_block bb1 = *(const basic_block *) x;
696   basic_block bb2 = *(const basic_block *) y;
697 
698   int bb_top_order_index_size = _loop->get_bb_top_order_index_size ();
699 
700   gcc_assert (bb1->index < bb_top_order_index_size
701 	      && bb2->index < bb_top_order_index_size);
702   gcc_assert (bb1 == bb2
703 	      || _loop->get_bb_top_order_index(bb1->index)
704 		 != _loop->get_bb_top_order_index(bb2->index));
705 
706   return (_loop->get_bb_top_order_index(bb1->index) -
707 	  _loop->get_bb_top_order_index(bb2->index));
708 }
709 
710 bool
711 loop_distribution::create_rdg_vertices (struct graph *rdg,
712 					const vec<gimple *> &stmts,
713 					loop_p loop)
714 {
715   int i;
716   gimple *stmt;
717 
718   FOR_EACH_VEC_ELT (stmts, i, stmt)
719     {
720       struct vertex *v = &(rdg->vertices[i]);
721 
722       /* Record statement to vertex mapping.  */
723       gimple_set_uid (stmt, i);
724 
725       v->data = XNEW (struct rdg_vertex);
726       RDGV_STMT (v) = stmt;
727       RDGV_DATAREFS (v).create (0);
728       RDGV_HAS_MEM_WRITE (v) = false;
729       RDGV_HAS_MEM_READS (v) = false;
730       if (gimple_code (stmt) == GIMPLE_PHI)
731 	continue;
732 
733       unsigned drp = datarefs_vec.length ();
734       if (!find_data_references_in_stmt (loop, stmt, &datarefs_vec))
735 	return false;
736       for (unsigned j = drp; j < datarefs_vec.length (); ++j)
737 	{
738 	  data_reference_p dr = datarefs_vec[j];
739 	  if (DR_IS_READ (dr))
740 	    RDGV_HAS_MEM_READS (v) = true;
741 	  else
742 	    RDGV_HAS_MEM_WRITE (v) = true;
743 	  RDGV_DATAREFS (v).safe_push (dr);
744 	  has_nonaddressable_dataref_p |= may_be_nonaddressable_p (dr->ref);
745 	}
746     }
747   return true;
748 }
749 
750 void
751 loop_distribution::stmts_from_loop (class loop *loop, vec<gimple *> *stmts)
752 {
753   unsigned int i;
754   basic_block *bbs = get_loop_body_in_custom_order (loop, this, bb_top_order_cmp_r);
755 
756   for (i = 0; i < loop->num_nodes; i++)
757     {
758       basic_block bb = bbs[i];
759 
760       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
761 	   gsi_next (&bsi))
762 	if (!virtual_operand_p (gimple_phi_result (bsi.phi ())))
763 	  stmts->safe_push (bsi.phi ());
764 
765       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
766 	   gsi_next (&bsi))
767 	{
768 	  gimple *stmt = gsi_stmt (bsi);
769 	  if (gimple_code (stmt) != GIMPLE_LABEL && !is_gimple_debug (stmt))
770 	    stmts->safe_push (stmt);
771 	}
772     }
773 
774   free (bbs);
775 }
776 
777 /* Free the reduced dependence graph RDG.  */
778 
779 static void
780 free_rdg (struct graph *rdg)
781 {
782   int i;
783 
784   for (i = 0; i < rdg->n_vertices; i++)
785     {
786       struct vertex *v = &(rdg->vertices[i]);
787       struct graph_edge *e;
788 
789       for (e = v->succ; e; e = e->succ_next)
790 	free (e->data);
791 
792       if (v->data)
793 	{
794 	  gimple_set_uid (RDGV_STMT (v), -1);
795 	  (RDGV_DATAREFS (v)).release ();
796 	  free (v->data);
797 	}
798     }
799 
800   free_graph (rdg);
801 }
802 
803 struct graph *
804 loop_distribution::build_rdg (class loop *loop, control_dependences *cd)
805 {
806   struct graph *rdg;
807 
808   /* Create the RDG vertices from the stmts of the loop nest.  */
809   auto_vec<gimple *, 10> stmts;
810   stmts_from_loop (loop, &stmts);
811   rdg = new_graph (stmts.length ());
812   if (!create_rdg_vertices (rdg, stmts, loop))
813     {
814       free_rdg (rdg);
815       return NULL;
816     }
817   stmts.release ();
818 
819   create_rdg_flow_edges (rdg);
820   if (cd)
821     create_rdg_cd_edges (rdg, cd, loop);
822 
823   return rdg;
824 }
825 
826 
827 /* Allocate and initialize a partition.  */
828 
829 static partition *
830 partition_alloc (void)
831 {
832   partition *partition = XCNEW (struct partition);
833   partition->stmts = BITMAP_ALLOC (NULL);
834   partition->reduction_p = false;
835   partition->loc = UNKNOWN_LOCATION;
836   partition->kind = PKIND_NORMAL;
837   partition->type = PTYPE_PARALLEL;
838   partition->datarefs = BITMAP_ALLOC (NULL);
839   return partition;
840 }
841 
842 /* Free PARTITION.  */
843 
844 static void
845 partition_free (partition *partition)
846 {
847   BITMAP_FREE (partition->stmts);
848   BITMAP_FREE (partition->datarefs);
849   if (partition->builtin)
850     free (partition->builtin);
851 
852   free (partition);
853 }
854 
855 /* Returns true if the partition can be generated as a builtin.  */
856 
857 static bool
858 partition_builtin_p (partition *partition)
859 {
860   return partition->kind > PKIND_PARTIAL_MEMSET;
861 }
862 
863 /* Returns true if the partition contains a reduction.  */
864 
865 static bool
866 partition_reduction_p (partition *partition)
867 {
868   return partition->reduction_p;
869 }
870 
871 void
872 loop_distribution::partition_merge_into (struct graph *rdg,
873 		      partition *dest, partition *partition, enum fuse_type ft)
874 {
875   if (dump_file && (dump_flags & TDF_DETAILS))
876     {
877       fprintf (dump_file, "Fuse partitions because %s:\n", fuse_message[ft]);
878       fprintf (dump_file, "  Part 1: ");
879       dump_bitmap (dump_file, dest->stmts);
880       fprintf (dump_file, "  Part 2: ");
881       dump_bitmap (dump_file, partition->stmts);
882     }
883 
884   dest->kind = PKIND_NORMAL;
885   if (dest->type == PTYPE_PARALLEL)
886     dest->type = partition->type;
887 
888   bitmap_ior_into (dest->stmts, partition->stmts);
889   if (partition_reduction_p (partition))
890     dest->reduction_p = true;
891 
892   /* Further check if any data dependence prevents us from executing the
893      new partition in parallel.  */
894   if (dest->type == PTYPE_PARALLEL && rdg != NULL)
895     update_type_for_merge (rdg, dest, partition);
896 
897   bitmap_ior_into (dest->datarefs, partition->datarefs);
898 }
899 
900 
901 /* Returns true when DEF is an SSA_NAME defined in LOOP and used after
902    the LOOP.  */
903 
904 static bool
905 ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
906 {
907   imm_use_iterator imm_iter;
908   use_operand_p use_p;
909 
910   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
911     {
912       if (is_gimple_debug (USE_STMT (use_p)))
913 	continue;
914 
915       basic_block use_bb = gimple_bb (USE_STMT (use_p));
916       if (!flow_bb_inside_loop_p (loop, use_bb))
917 	return true;
918     }
919 
920   return false;
921 }
922 
923 /* Returns true when STMT defines a scalar variable used after the
924    loop LOOP.  */
925 
926 static bool
927 stmt_has_scalar_dependences_outside_loop (loop_p loop, gimple *stmt)
928 {
929   def_operand_p def_p;
930   ssa_op_iter op_iter;
931 
932   if (gimple_code (stmt) == GIMPLE_PHI)
933     return ssa_name_has_uses_outside_loop_p (gimple_phi_result (stmt), loop);
934 
935   FOR_EACH_SSA_DEF_OPERAND (def_p, stmt, op_iter, SSA_OP_DEF)
936     if (ssa_name_has_uses_outside_loop_p (DEF_FROM_PTR (def_p), loop))
937       return true;
938 
939   return false;
940 }
941 
942 /* Return a copy of LOOP placed before LOOP.  */
943 
944 static class loop *
945 copy_loop_before (class loop *loop)
946 {
947   class loop *res;
948   edge preheader = loop_preheader_edge (loop);
949 
950   initialize_original_copy_tables ();
951   res = slpeel_tree_duplicate_loop_to_edge_cfg (loop, NULL, preheader);
952   gcc_assert (res != NULL);
953   free_original_copy_tables ();
954   delete_update_ssa ();
955 
956   return res;
957 }
958 
959 /* Creates an empty basic block after LOOP.  */
960 
961 static void
962 create_bb_after_loop (class loop *loop)
963 {
964   edge exit = single_exit (loop);
965 
966   if (!exit)
967     return;
968 
969   split_edge (exit);
970 }
971 
972 /* Generate code for PARTITION from the code in LOOP.  The loop is
973    copied when COPY_P is true.  All the statements not flagged in the
974    PARTITION bitmap are removed from the loop or from its copy.  The
975    statements are indexed in sequence inside a basic block, and the
976    basic blocks of a loop are taken in dom order.  */
977 
978 static void
979 generate_loops_for_partition (class loop *loop, partition *partition,
980 			      bool copy_p)
981 {
982   unsigned i;
983   basic_block *bbs;
984 
985   if (copy_p)
986     {
987       int orig_loop_num = loop->orig_loop_num;
988       loop = copy_loop_before (loop);
989       gcc_assert (loop != NULL);
990       loop->orig_loop_num = orig_loop_num;
991       create_preheader (loop, CP_SIMPLE_PREHEADERS);
992       create_bb_after_loop (loop);
993     }
994   else
995     {
996       /* Origin number is set to the new versioned loop's num.  */
997       gcc_assert (loop->orig_loop_num != loop->num);
998     }
999 
1000   /* Remove stmts not in the PARTITION bitmap.  */
1001   bbs = get_loop_body_in_dom_order (loop);
1002 
1003   if (MAY_HAVE_DEBUG_BIND_STMTS)
1004     for (i = 0; i < loop->num_nodes; i++)
1005       {
1006 	basic_block bb = bbs[i];
1007 
1008 	for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
1009 	     gsi_next (&bsi))
1010 	  {
1011 	    gphi *phi = bsi.phi ();
1012 	    if (!virtual_operand_p (gimple_phi_result (phi))
1013 		&& !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1014 	      reset_debug_uses (phi);
1015 	  }
1016 
1017 	for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
1018 	  {
1019 	    gimple *stmt = gsi_stmt (bsi);
1020 	    if (gimple_code (stmt) != GIMPLE_LABEL
1021 		&& !is_gimple_debug (stmt)
1022 		&& !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1023 	      reset_debug_uses (stmt);
1024 	  }
1025       }
1026 
1027   for (i = 0; i < loop->num_nodes; i++)
1028     {
1029       basic_block bb = bbs[i];
1030       edge inner_exit = NULL;
1031 
1032       if (loop != bb->loop_father)
1033 	inner_exit = single_exit (bb->loop_father);
1034 
1035       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);)
1036 	{
1037 	  gphi *phi = bsi.phi ();
1038 	  if (!virtual_operand_p (gimple_phi_result (phi))
1039 	      && !bitmap_bit_p (partition->stmts, gimple_uid (phi)))
1040 	    remove_phi_node (&bsi, true);
1041 	  else
1042 	    gsi_next (&bsi);
1043 	}
1044 
1045       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);)
1046 	{
1047 	  gimple *stmt = gsi_stmt (bsi);
1048 	  if (gimple_code (stmt) != GIMPLE_LABEL
1049 	      && !is_gimple_debug (stmt)
1050 	      && !bitmap_bit_p (partition->stmts, gimple_uid (stmt)))
1051 	    {
1052 	      /* In distribution of a loop nest, if bb is the inner loop's
1053 		 exit_bb, we choose its exit edge/path in order to avoid
1054 		 generating an infinite loop.  For all other cases, we choose
1055 		 an arbitrary path through the empty CFG part that this
1056 		 unnecessary control stmt controls.  */
1057 	      if (gcond *cond_stmt = dyn_cast <gcond *> (stmt))
1058 		{
1059 		  if (inner_exit && inner_exit->flags & EDGE_TRUE_VALUE)
1060 		    gimple_cond_make_true (cond_stmt);
1061 		  else
1062 		    gimple_cond_make_false (cond_stmt);
1063 		  update_stmt (stmt);
1064 		}
1065 	      else if (gimple_code (stmt) == GIMPLE_SWITCH)
1066 		{
1067 		  gswitch *switch_stmt = as_a <gswitch *> (stmt);
1068 		  gimple_switch_set_index
1069 		      (switch_stmt, CASE_LOW (gimple_switch_label (switch_stmt, 1)));
1070 		  update_stmt (stmt);
1071 		}
1072 	      else
1073 		{
1074 		  unlink_stmt_vdef (stmt);
1075 		  gsi_remove (&bsi, true);
1076 		  release_defs (stmt);
1077 		  continue;
1078 		}
1079 	    }
1080 	  gsi_next (&bsi);
1081 	}
1082     }
1083 
1084   free (bbs);
1085 }
1086 
1087 /* If VAL's memory representation contains the same value in all bytes,
1088    return that value, otherwise return -1.
1089    E.g. for 0x24242424 return 0x24, for IEEE double
1090    747708026454360457216.0 return 0x44, etc.  */
1091 
1092 static int
1093 const_with_all_bytes_same (tree val)
1094 {
1095   unsigned char buf[64];
1096   int i, len;
1097 
1098   if (integer_zerop (val)
1099       || (TREE_CODE (val) == CONSTRUCTOR
1100           && !TREE_CLOBBER_P (val)
1101           && CONSTRUCTOR_NELTS (val) == 0))
1102     return 0;
1103 
1104   if (real_zerop (val))
1105     {
1106       /* Only return 0 for +0.0, not for -0.0, which doesn't have
1107 	 an all bytes same memory representation.  Don't transform
1108 	 -0.0 stores into +0.0 even for !HONOR_SIGNED_ZEROS.  */
1109       switch (TREE_CODE (val))
1110 	{
1111 	case REAL_CST:
1112 	  if (!real_isneg (TREE_REAL_CST_PTR (val)))
1113 	    return 0;
1114 	  break;
1115 	case COMPLEX_CST:
1116 	  if (!const_with_all_bytes_same (TREE_REALPART (val))
1117 	      && !const_with_all_bytes_same (TREE_IMAGPART (val)))
1118 	    return 0;
1119 	  break;
1120 	case VECTOR_CST:
1121 	  {
1122 	    unsigned int count = vector_cst_encoded_nelts (val);
1123 	    unsigned int j;
1124 	    for (j = 0; j < count; ++j)
1125 	      if (const_with_all_bytes_same (VECTOR_CST_ENCODED_ELT (val, j)))
1126 		break;
1127 	    if (j == count)
1128 	      return 0;
1129 	    break;
1130 	  }
1131 	default:
1132 	  break;
1133 	}
1134     }
1135 
1136   if (CHAR_BIT != 8 || BITS_PER_UNIT != 8)
1137     return -1;
1138 
1139   len = native_encode_expr (val, buf, sizeof (buf));
1140   if (len == 0)
1141     return -1;
1142   for (i = 1; i < len; i++)
1143     if (buf[i] != buf[0])
1144       return -1;
1145   return buf[0];
1146 }
1147 
1148 /* Generate a call to memset for PARTITION in LOOP.  */
1149 
1150 static void
1151 generate_memset_builtin (class loop *loop, partition *partition)
1152 {
1153   gimple_stmt_iterator gsi;
1154   tree mem, fn, nb_bytes;
1155   tree val;
1156   struct builtin_info *builtin = partition->builtin;
1157   gimple *fn_call;
1158 
1159   /* The new statements will be placed before LOOP.  */
1160   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1161 
1162   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1163   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1164 				       false, GSI_CONTINUE_LINKING);
1165   mem = rewrite_to_non_trapping_overflow (builtin->dst_base);
1166   mem = force_gimple_operand_gsi (&gsi, mem, true, NULL_TREE,
1167 				  false, GSI_CONTINUE_LINKING);
1168 
1169   /* This exactly matches the pattern recognition in classify_partition.  */
1170   val = gimple_assign_rhs1 (DR_STMT (builtin->dst_dr));
1171   /* Handle constants like 0x15151515 and similarly
1172      floating point constants etc. where all bytes are the same.  */
1173   int bytev = const_with_all_bytes_same (val);
1174   if (bytev != -1)
1175     val = build_int_cst (integer_type_node, bytev);
1176   else if (TREE_CODE (val) == INTEGER_CST)
1177     val = fold_convert (integer_type_node, val);
1178   else if (!useless_type_conversion_p (integer_type_node, TREE_TYPE (val)))
1179     {
1180       tree tem = make_ssa_name (integer_type_node);
1181       gimple *cstmt = gimple_build_assign (tem, NOP_EXPR, val);
1182       gsi_insert_after (&gsi, cstmt, GSI_CONTINUE_LINKING);
1183       val = tem;
1184     }
1185 
1186   fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_MEMSET));
1187   fn_call = gimple_build_call (fn, 3, mem, val, nb_bytes);
1188   gimple_set_location (fn_call, partition->loc);
1189   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1190   fold_stmt (&gsi);
1191 
1192   if (dump_file && (dump_flags & TDF_DETAILS))
1193     {
1194       fprintf (dump_file, "generated memset");
1195       if (bytev == 0)
1196 	fprintf (dump_file, " zero\n");
1197       else
1198 	fprintf (dump_file, "\n");
1199     }
1200 }
1201 
1202 /* Generate a call to memcpy for PARTITION in LOOP.  */
1203 
1204 static void
1205 generate_memcpy_builtin (class loop *loop, partition *partition)
1206 {
1207   gimple_stmt_iterator gsi;
1208   gimple *fn_call;
1209   tree dest, src, fn, nb_bytes;
1210   enum built_in_function kind;
1211   struct builtin_info *builtin = partition->builtin;
1212 
1213   /* The new statements will be placed before LOOP.  */
1214   gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1215 
1216   nb_bytes = rewrite_to_non_trapping_overflow (builtin->size);
1217   nb_bytes = force_gimple_operand_gsi (&gsi, nb_bytes, true, NULL_TREE,
1218 				       false, GSI_CONTINUE_LINKING);
1219   dest = rewrite_to_non_trapping_overflow (builtin->dst_base);
1220   src = rewrite_to_non_trapping_overflow (builtin->src_base);
1221   if (partition->kind == PKIND_MEMCPY
1222       || ! ptr_derefs_may_alias_p (dest, src))
1223     kind = BUILT_IN_MEMCPY;
1224   else
1225     kind = BUILT_IN_MEMMOVE;
1226   /* Try harder if we're copying a constant size.  */
1227   if (kind == BUILT_IN_MEMMOVE && poly_int_tree_p (nb_bytes))
1228     {
1229       aff_tree asrc, adest;
1230       tree_to_aff_combination (src, ptr_type_node, &asrc);
1231       tree_to_aff_combination (dest, ptr_type_node, &adest);
1232       aff_combination_scale (&adest, -1);
1233       aff_combination_add (&asrc, &adest);
1234       if (aff_comb_cannot_overlap_p (&asrc, wi::to_poly_widest (nb_bytes),
1235 				     wi::to_poly_widest (nb_bytes)))
1236 	kind = BUILT_IN_MEMCPY;
1237     }
1238 
1239   dest = force_gimple_operand_gsi (&gsi, dest, true, NULL_TREE,
1240 				   false, GSI_CONTINUE_LINKING);
1241   src = force_gimple_operand_gsi (&gsi, src, true, NULL_TREE,
1242 				  false, GSI_CONTINUE_LINKING);
1243   fn = build_fold_addr_expr (builtin_decl_implicit (kind));
1244   fn_call = gimple_build_call (fn, 3, dest, src, nb_bytes);
1245   gimple_set_location (fn_call, partition->loc);
1246   gsi_insert_after (&gsi, fn_call, GSI_CONTINUE_LINKING);
1247   fold_stmt (&gsi);
1248 
1249   if (dump_file && (dump_flags & TDF_DETAILS))
1250     {
1251       if (kind == BUILT_IN_MEMCPY)
1252 	fprintf (dump_file, "generated memcpy\n");
1253       else
1254 	fprintf (dump_file, "generated memmove\n");
1255     }
1256 }
1257 
1258 /* Remove and destroy the loop LOOP.  */
1259 
1260 static void
1261 destroy_loop (class loop *loop)
1262 {
1263   unsigned nbbs = loop->num_nodes;
1264   edge exit = single_exit (loop);
1265   basic_block src = loop_preheader_edge (loop)->src, dest = exit->dest;
1266   basic_block *bbs;
1267   unsigned i;
1268 
1269   bbs = get_loop_body_in_dom_order (loop);
1270 
1271   gimple_stmt_iterator dst_gsi = gsi_after_labels (exit->dest);
1272   bool safe_p = single_pred_p (exit->dest);
1273   for (unsigned i = 0; i < nbbs; ++i)
1274     {
1275       /* We have made sure not to leave any dangling uses of SSA
1276          names defined in the loop, with the exception of virtuals.
1277 	 Make sure we replace all uses of virtual defs that will remain
1278 	 outside of the loop with the bare symbol as delete_basic_block
1279 	 will release them.  */
1280       for (gphi_iterator gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi);
1281 	   gsi_next (&gsi))
1282 	{
1283 	  gphi *phi = gsi.phi ();
1284 	  if (virtual_operand_p (gimple_phi_result (phi)))
1285 	    mark_virtual_phi_result_for_renaming (phi);
1286 	}
1287       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi);)
1288 	{
1289 	  gimple *stmt = gsi_stmt (gsi);
1290 	  tree vdef = gimple_vdef (stmt);
1291 	  if (vdef && TREE_CODE (vdef) == SSA_NAME)
1292 	    mark_virtual_operand_for_renaming (vdef);
1293 	  /* Also move and eventually reset debug stmts.  We can leave
1294 	     constant values in place in case the stmt dominates the exit.
1295 	     ???  Non-constant values from the last iteration can be
1296 	     replaced with final values if we can compute them.  */
1297 	  if (gimple_debug_bind_p (stmt))
1298 	    {
1299 	      tree val = gimple_debug_bind_get_value (stmt);
1300 	      gsi_move_before (&gsi, &dst_gsi);
1301 	      if (val
1302 		  && (!safe_p
1303 		      || !is_gimple_min_invariant (val)
1304 		      || !dominated_by_p (CDI_DOMINATORS, exit->src, bbs[i])))
1305 		{
1306 		  gimple_debug_bind_reset_value (stmt);
1307 		  update_stmt (stmt);
1308 		}
1309 	    }
1310 	  else
1311 	    gsi_next (&gsi);
1312 	}
1313     }
1314 
1315   redirect_edge_pred (exit, src);
1316   exit->flags &= ~(EDGE_TRUE_VALUE|EDGE_FALSE_VALUE);
1317   exit->flags |= EDGE_FALLTHRU;
1318   cancel_loop_tree (loop);
1319   rescan_loop_exit (exit, false, true);
1320 
1321   i = nbbs;
1322   do
1323     {
1324       --i;
1325       delete_basic_block (bbs[i]);
1326     }
1327   while (i != 0);
1328 
1329   free (bbs);
1330 
1331   set_immediate_dominator (CDI_DOMINATORS, dest,
1332 			   recompute_dominator (CDI_DOMINATORS, dest));
1333 }
1334 
1335 /* Generates code for PARTITION.  Return whether LOOP needs to be destroyed.  */
1336 
1337 static bool
1338 generate_code_for_partition (class loop *loop,
1339 			     partition *partition, bool copy_p)
1340 {
1341   switch (partition->kind)
1342     {
1343     case PKIND_NORMAL:
1344     case PKIND_PARTIAL_MEMSET:
1345       /* Reductions all have to be in the last partition.  */
1346       gcc_assert (!partition_reduction_p (partition)
1347 		  || !copy_p);
1348       generate_loops_for_partition (loop, partition, copy_p);
1349       return false;
1350 
1351     case PKIND_MEMSET:
1352       generate_memset_builtin (loop, partition);
1353       break;
1354 
1355     case PKIND_MEMCPY:
1356     case PKIND_MEMMOVE:
1357       generate_memcpy_builtin (loop, partition);
1358       break;
1359 
1360     default:
1361       gcc_unreachable ();
1362     }
1363 
1364   /* Common tail for partitions we turn into a call.  If this was the last
1365      partition for which we generate code, we have to destroy the loop.  */
1366   if (!copy_p)
1367     return true;
1368   return false;
1369 }
1370 
1371 data_dependence_relation *
1372 loop_distribution::get_data_dependence (struct graph *rdg, data_reference_p a,
1373 					data_reference_p b)
1374 {
1375   struct data_dependence_relation ent, **slot;
1376   struct data_dependence_relation *ddr;
1377 
1378   gcc_assert (DR_IS_WRITE (a) || DR_IS_WRITE (b));
1379   gcc_assert (rdg_vertex_for_stmt (rdg, DR_STMT (a))
1380 	      <= rdg_vertex_for_stmt (rdg, DR_STMT (b)));
1381   ent.a = a;
1382   ent.b = b;
1383   slot = ddrs_table->find_slot (&ent, INSERT);
1384   if (*slot == NULL)
1385     {
1386       ddr = initialize_data_dependence_relation (a, b, loop_nest);
1387       compute_affine_dependence (ddr, loop_nest[0]);
1388       *slot = ddr;
1389     }
1390 
1391   return *slot;
1392 }
1393 
1394 bool
1395 loop_distribution::data_dep_in_cycle_p (struct graph *rdg,
1396 					data_reference_p dr1,
1397 					data_reference_p dr2)
1398 {
1399   struct data_dependence_relation *ddr;
1400 
1401   /* Re-shuffle data-refs to be in topological order.  */
1402   if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
1403       > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
1404     std::swap (dr1, dr2);
1405 
1406   ddr = get_data_dependence (rdg, dr1, dr2);
1407 
1408   /* In case of no data dependence.  */
1409   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1410     return false;
1411   /* For an unknown data dependence, or a known data dependence which can't
1412      be expressed in a classic distance vector, we check if it can be
1413      resolved by a runtime alias check.  If yes, we still consider the data
1414      dependence as not introducing a data dependence cycle.  */
1415   else if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1416 	   || DDR_NUM_DIST_VECTS (ddr) == 0)
1417     return !runtime_alias_check_p (ddr, NULL, true);
1418   else if (DDR_NUM_DIST_VECTS (ddr) > 1)
1419     return true;
1420   else if (DDR_REVERSED_P (ddr)
1421 	   || lambda_vector_zerop (DDR_DIST_VECT (ddr, 0), 1))
1422     return false;
1423 
1424   return true;
1425 }
1426 
1427 void
1428 loop_distribution::update_type_for_merge (struct graph *rdg,
1429 					   partition *partition1,
1430 					   partition *partition2)
1431 {
1432   unsigned i, j;
1433   bitmap_iterator bi, bj;
1434   data_reference_p dr1, dr2;
1435 
1436   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1437     {
1438       unsigned start = (partition1 == partition2) ? i + 1 : 0;
1439 
1440       dr1 = datarefs_vec[i];
1441       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, start, j, bj)
1442 	{
1443 	  dr2 = datarefs_vec[j];
1444 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
1445 	    continue;
1446 
1447 	  /* The partition must be executed sequentially if there is any
1448 	     data dependence cycle.  */
1449 	  if (data_dep_in_cycle_p (rdg, dr1, dr2))
1450 	    {
1451 	      partition1->type = PTYPE_SEQUENTIAL;
1452 	      return;
1453 	    }
1454 	}
1455     }
1456 }
1457 
1458 partition *
1459 loop_distribution::build_rdg_partition_for_vertex (struct graph *rdg, int v)
1460 {
1461   partition *partition = partition_alloc ();
1462   auto_vec<int, 3> nodes;
1463   unsigned i, j;
1464   int x;
1465   data_reference_p dr;
1466 
1467   graphds_dfs (rdg, &v, 1, &nodes, false, NULL);
1468 
1469   FOR_EACH_VEC_ELT (nodes, i, x)
1470     {
1471       bitmap_set_bit (partition->stmts, x);
1472 
1473       for (j = 0; RDG_DATAREFS (rdg, x).iterate (j, &dr); ++j)
1474 	{
1475 	  unsigned idx = (unsigned) DR_INDEX (dr);
1476 	  gcc_assert (idx < datarefs_vec.length ());
1477 
1478 	  /* The partition must be executed sequentially if there is any
1479 	     unknown data reference.  */
1480 	  if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr)
1481 	      || !DR_INIT (dr) || !DR_STEP (dr))
1482 	    partition->type = PTYPE_SEQUENTIAL;
1483 
1484 	  bitmap_set_bit (partition->datarefs, idx);
1485 	}
1486     }
1487 
1488   if (partition->type == PTYPE_SEQUENTIAL)
1489     return partition;
1490 
1491   /* Further check if any data dependence prevents us from executing the
1492      partition in parallel.  */
1493   update_type_for_merge (rdg, partition, partition);
1494 
1495   return partition;
1496 }
1497 
1498 /* Given PARTITION of LOOP and RDG, record the single load/store data
1499    references for a builtin partition in SRC_DR/DST_DR; return false if
1500    there are no such data references.  */
1501 
1502 static bool
1503 find_single_drs (class loop *loop, struct graph *rdg, const bitmap &partition_stmts,
1504 		 data_reference_p *dst_dr, data_reference_p *src_dr)
1505 {
1506   unsigned i;
1507   data_reference_p single_ld = NULL, single_st = NULL;
1508   bitmap_iterator bi;
1509 
1510   EXECUTE_IF_SET_IN_BITMAP (partition_stmts, 0, i, bi)
1511     {
1512       gimple *stmt = RDG_STMT (rdg, i);
1513       data_reference_p dr;
1514 
1515       if (gimple_code (stmt) == GIMPLE_PHI)
1516 	continue;
1517 
1518       /* Any scalar stmts are ok.  */
1519       if (!gimple_vuse (stmt))
1520 	continue;
1521 
1522       /* Otherwise just regular loads/stores.  */
1523       if (!gimple_assign_single_p (stmt))
1524 	return false;
1525 
1526       /* But exactly one store and/or load.  */
1527       for (unsigned j = 0; RDG_DATAREFS (rdg, i).iterate (j, &dr); ++j)
1528 	{
1529 	  tree type = TREE_TYPE (DR_REF (dr));
1530 
1531 	  /* The memset, memcpy and memmove library calls are only
1532 	     able to deal with generic address space.  */
1533 	  if (!ADDR_SPACE_GENERIC_P (TYPE_ADDR_SPACE (type)))
1534 	    return false;
1535 
1536 	  if (DR_IS_READ (dr))
1537 	    {
1538 	      if (single_ld != NULL)
1539 		return false;
1540 	      single_ld = dr;
1541 	    }
1542 	  else
1543 	    {
1544 	      if (single_st != NULL)
1545 		return false;
1546 	      single_st = dr;
1547 	    }
1548 	}
1549     }
1550 
1551   if (!single_ld && !single_st)
1552     return false;
1553 
1554   basic_block bb_ld = NULL;
1555   basic_block bb_st = NULL;
1556 
1557   if (single_ld)
1558     {
1559       /* Bail out if this is a bitfield memory reference.  */
1560       if (TREE_CODE (DR_REF (single_ld)) == COMPONENT_REF
1561 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_ld), 1)))
1562 	return false;
1563 
1564       /* Data reference must be executed exactly once per iteration of each
1565 	 loop in the loop nest.  We only need to check dominance information
1566 	 against the outermost one in a perfect loop nest because a bb can't
1567 	 dominate outermost loop's latch without dominating inner loop's.  */
1568       bb_ld = gimple_bb (DR_STMT (single_ld));
1569       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_ld))
1570 	return false;
1571     }
1572 
1573   if (single_st)
1574     {
1575       /* Bail out if this is a bitfield memory reference.  */
1576       if (TREE_CODE (DR_REF (single_st)) == COMPONENT_REF
1577 	  && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (single_st), 1)))
1578 	return false;
1579 
1580       /* Data reference must be executed exactly once per iteration.
1581 	 Same as single_ld, we only need to check against the outermost
1582 	 loop.  */
1583       bb_st = gimple_bb (DR_STMT (single_st));
1584       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb_st))
1585 	return false;
1586     }
1587 
1588   if (single_ld && single_st)
1589     {
1590       /* Load and store must be in the same loop nest.  */
1591       if (bb_st->loop_father != bb_ld->loop_father)
1592 	return false;
1593 
1594       edge e = single_exit (bb_st->loop_father);
1595       bool dom_ld = dominated_by_p (CDI_DOMINATORS, e->src, bb_ld);
1596       bool dom_st = dominated_by_p (CDI_DOMINATORS, e->src, bb_st);
1597       if (dom_ld != dom_st)
1598 	return false;
1599     }
1600 
1601   *src_dr = single_ld;
1602   *dst_dr = single_st;
1603   return true;
1604 }
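
/* Illustration only (hand-written example, not from the original sources):
   for a loop body with exactly one load and one store per iteration, e.g.

     |for (i = 0; i < n; i++)
     |  A[i] = B[i];

   find_single_drs records B[i] in SRC_DR and A[i] in DST_DR, while a body
   like "A[i] = B[i] + C[i]" contains two loads and makes it return false.  */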
1605 
1606 /* Given data reference DR in LOOP_NEST, this function checks the enclosing
1607    loops from inner to outer to see if the loop's step equals the access
1608    size at each level of the loop nest.  Return 2 if we can prove this at
1609    all loop levels; record access base and size in BASE and SIZE; save the
1610    loop's step at each level in STEPS if it is not null.  For example:
1611 
1612      int arr[100][100][100];
1613      for (i = 0; i < 100; i++)       ;steps[2] = 40000
1614        for (j = 100; j > 0; j--)     ;steps[1] = -400
1615 	 for (k = 0; k < 100; k++)   ;steps[0] = 4
1616 	   arr[i][j - 1][k] = 0;     ;base = &arr, size = 4000000
1617 
1618    Return 1 if we can prove the equality at the innermost loop, but not at
1619    all loop levels.  In this case, no information is recorded.
1620 
1621    Return 0 if the equality cannot be proven at any loop level.  */
1622 
1623 static int
1624 compute_access_range (loop_p loop_nest, data_reference_p dr, tree *base,
1625 		      tree *size, vec<tree> *steps = NULL)
1626 {
1627   location_t loc = gimple_location (DR_STMT (dr));
1628   basic_block bb = gimple_bb (DR_STMT (dr));
1629   class loop *loop = bb->loop_father;
1630   tree ref = DR_REF (dr);
1631   tree access_base = build_fold_addr_expr (ref);
1632   tree access_size = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1633   int res = 0;
1634 
1635   do {
1636       tree scev_fn = analyze_scalar_evolution (loop, access_base);
1637       if (TREE_CODE (scev_fn) != POLYNOMIAL_CHREC)
1638 	return res;
1639 
1640       access_base = CHREC_LEFT (scev_fn);
1641       if (tree_contains_chrecs (access_base, NULL))
1642 	return res;
1643 
1644       tree scev_step = CHREC_RIGHT (scev_fn);
1645       /* Only support constant steps.  */
1646       if (TREE_CODE (scev_step) != INTEGER_CST)
1647 	return res;
1648 
1649       enum ev_direction access_dir = scev_direction (scev_fn);
1650       if (access_dir == EV_DIR_UNKNOWN)
1651 	return res;
1652 
1653       if (steps != NULL)
1654 	steps->safe_push (scev_step);
1655 
1656       scev_step = fold_convert_loc (loc, sizetype, scev_step);
1657       /* Compute absolute value of scev step.  */
1658       if (access_dir == EV_DIR_DECREASES)
1659 	scev_step = fold_build1_loc (loc, NEGATE_EXPR, sizetype, scev_step);
1660 
1661       /* At each loop level, the scev step must equal the access size.  In
1662 	 other words, DR must access consecutive memory between loop iterations.  */
1663       if (!operand_equal_p (scev_step, access_size, 0))
1664 	return res;
1665 
1666       /* The access stride can be computed for the data reference at least
1667 	 for the innermost loop.  */
1668       res = 1;
1669 
1670       /* Compute DR's execution times in loop.  */
1671       tree niters = number_of_latch_executions (loop);
1672       niters = fold_convert_loc (loc, sizetype, niters);
1673       if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src, bb))
1674 	niters = size_binop_loc (loc, PLUS_EXPR, niters, size_one_node);
1675 
1676       /* Compute DR's overall access size in loop.  */
1677       access_size = fold_build2_loc (loc, MULT_EXPR, sizetype,
1678 				     niters, scev_step);
1679       /* Adjust base address in case of negative step.  */
1680       if (access_dir == EV_DIR_DECREASES)
1681 	{
1682 	  tree adj = fold_build2_loc (loc, MINUS_EXPR, sizetype,
1683 				      scev_step, access_size);
1684 	  access_base = fold_build_pointer_plus_loc (loc, access_base, adj);
1685 	}
1686   } while (loop != loop_nest && (loop = loop_outer (loop)) != NULL);
1687 
1688   *base = access_base;
1689   *size = access_size;
1690   /* Access stride can be computed for the data reference at each loop level.  */
1691   return 2;
1692 }
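
/* Illustration only (hand-written example, not from the original sources):
   in the nest below the innermost step equals the access size, but the
   outer loop advances by a whole row (800 bytes) while each inner loop
   covers only 400 bytes, so compute_access_range returns 1 rather than 2:

     int arr[100][200];
     for (i = 0; i < 100; i++)
       for (j = 0; j < 100; j++)
	 arr[i][j] = 0;  */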
1693 
1694 /* Allocate and return builtin struct.  Record information like DST_DR,
1695    SRC_DR, DST_BASE, SRC_BASE and SIZE in the allocated struct.  */
1696 
1697 static struct builtin_info *
1698 alloc_builtin (data_reference_p dst_dr, data_reference_p src_dr,
1699 	       tree dst_base, tree src_base, tree size)
1700 {
1701   struct builtin_info *builtin = XNEW (struct builtin_info);
1702   builtin->dst_dr = dst_dr;
1703   builtin->src_dr = src_dr;
1704   builtin->dst_base = dst_base;
1705   builtin->src_base = src_base;
1706   builtin->size = size;
1707   return builtin;
1708 }
1709 
1710 /* Given data reference DR in loop nest LOOP, classify whether it forms a
1711    builtin memset call.  */
1712 
1713 static void
1714 classify_builtin_st (loop_p loop, partition *partition, data_reference_p dr)
1715 {
1716   gimple *stmt = DR_STMT (dr);
1717   tree base, size, rhs = gimple_assign_rhs1 (stmt);
1718 
1719   if (const_with_all_bytes_same (rhs) == -1
1720       && (!INTEGRAL_TYPE_P (TREE_TYPE (rhs))
1721 	  || (TYPE_MODE (TREE_TYPE (rhs))
1722 	      != TYPE_MODE (unsigned_char_type_node))))
1723     return;
1724 
1725   if (TREE_CODE (rhs) == SSA_NAME
1726       && !SSA_NAME_IS_DEFAULT_DEF (rhs)
1727       && flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (rhs))))
1728     return;
1729 
1730   int res = compute_access_range (loop, dr, &base, &size);
1731   if (res == 0)
1732     return;
1733   if (res == 1)
1734     {
1735       partition->kind = PKIND_PARTIAL_MEMSET;
1736       return;
1737     }
1738 
1739   poly_uint64 base_offset;
1740   unsigned HOST_WIDE_INT const_base_offset;
1741   tree base_base = strip_offset (base, &base_offset);
1742   if (!base_offset.is_constant (&const_base_offset))
1743     return;
1744 
1745   struct builtin_info *builtin;
1746   builtin = alloc_builtin (dr, NULL, base, NULL_TREE, size);
1747   builtin->dst_base_base = base_base;
1748   builtin->dst_base_offset = const_base_offset;
1749   partition->builtin = builtin;
1750   partition->kind = PKIND_MEMSET;
1751 }
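
/* Illustration only (hand-written example, not from the original sources):
   a single-store partition such as

     |for (i = 0; i < n; i++)
     |  A[i] = 0;

   with A accessed consecutively at every loop level is classified as
   PKIND_MEMSET and later emitted as a __builtin_memset of the computed
   base and size; if only the innermost level is consecutive, it is merely
   marked PKIND_PARTIAL_MEMSET for the cost model.  */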
1752 
1753 /* Given data references DST_DR and SRC_DR in loop nest LOOP and RDG, classify
1754    whether they form a builtin memcpy or memmove call.  */
1755 
1756 void
1757 loop_distribution::classify_builtin_ldst (loop_p loop, struct graph *rdg,
1758 					  partition *partition,
1759 					  data_reference_p dst_dr,
1760 					  data_reference_p src_dr)
1761 {
1762   tree base, size, src_base, src_size;
1763   auto_vec<tree> dst_steps, src_steps;
1764 
1765   /* Compute access range of both load and store.  */
1766   int res = compute_access_range (loop, dst_dr, &base, &size, &dst_steps);
1767   if (res != 2)
1768     return;
1769   res = compute_access_range (loop, src_dr, &src_base, &src_size, &src_steps);
1770   if (res != 2)
1771     return;
1772 
1773   /* They must have the same access size.  */
1774   if (!operand_equal_p (size, src_size, 0))
1775     return;
1776 
1777   /* They must have the same storage order.  */
1778   if (reverse_storage_order_for_component_p (DR_REF (dst_dr))
1779       != reverse_storage_order_for_component_p (DR_REF (src_dr)))
1780     return;
1781 
1782   /* Load and store in the loop nest must access memory in the same way,
1783      i.e., they must have the same steps in each loop of the nest.  */
1784   if (dst_steps.length () != src_steps.length ())
1785     return;
1786   for (unsigned i = 0; i < dst_steps.length (); ++i)
1787     if (!operand_equal_p (dst_steps[i], src_steps[i], 0))
1788       return;
1789 
1790   /* Now check whether there is a dependence.  */
1791   ddr_p ddr = get_data_dependence (rdg, src_dr, dst_dr);
1792 
1793   /* Classify as memmove if no dependence between load and store.  */
1794   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1795     {
1796       partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1797       partition->kind = PKIND_MEMMOVE;
1798       return;
1799     }
1800 
1801   /* Can't do memmove in case of unknown dependence or dependence without
1802      classical distance vector.  */
1803   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know
1804       || DDR_NUM_DIST_VECTS (ddr) == 0)
1805     return;
1806 
1807   unsigned i;
1808   lambda_vector dist_v;
1809   int num_lev = (DDR_LOOP_NEST (ddr)).length ();
1810   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1811     {
1812       unsigned dep_lev = dependence_level (dist_v, num_lev);
1813       /* Can't do memmove if load depends on store.  */
1814       if (dep_lev > 0 && dist_v[dep_lev - 1] > 0 && !DDR_REVERSED_P (ddr))
1815 	return;
1816     }
1817 
1818   partition->builtin = alloc_builtin (dst_dr, src_dr, base, src_base, size);
1819   partition->kind = PKIND_MEMMOVE;
1820   return;
1821 }
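
/* Illustration only (hand-written examples, not from the original sources):
   with distinct arrays

     |for (i = 0; i < n; i++)
     |  A[i] = B[i];

   the dependence between load and store is known to be empty and the
   partition is classified as PKIND_MEMMOVE, whereas

     |for (i = 1; i < n; i++)
     |  A[i] = A[i - 1];

   propagates A[0] through the whole array, something a single memmove
   cannot reproduce, so classification is rejected.  */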
1822 
1823 bool
1824 loop_distribution::classify_partition (loop_p loop,
1825 				       struct graph *rdg, partition *partition,
1826 				       bitmap stmt_in_all_partitions)
1827 {
1828   bitmap_iterator bi;
1829   unsigned i;
1830   data_reference_p single_ld = NULL, single_st = NULL;
1831   bool volatiles_p = false, has_reduction = false;
1832 
1833   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, bi)
1834     {
1835       gimple *stmt = RDG_STMT (rdg, i);
1836 
1837       if (gimple_has_volatile_ops (stmt))
1838 	volatiles_p = true;
1839 
1840       /* If the stmt is not included in all partitions and has uses outside
1841 	 of the loop, then mark the partition as a reduction.  */
1842       if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
1843 	{
1844 	  /* Due to a limitation in the transform phase we have to fuse all
1845 	     reduction partitions.  As a result, this could cancel valid
1846 	     loop distribution, especially for loops whose induction variable
1847 	     is used outside of the loop.  To work around this issue, we skip
1848 	     marking the partition as a reduction if the reduction stmt belongs
1849 	     to all partitions.  In that case, the reduction will be computed
1850 	     correctly no matter how partitions are fused/distributed.  */
1851 	  if (!bitmap_bit_p (stmt_in_all_partitions, i))
1852 	    partition->reduction_p = true;
1853 	  else
1854 	    has_reduction = true;
1855 	}
1856     }
1857 
1858   /* Simple workaround to prevent classifying the partition as a builtin
1859      if it contains any use outside of the loop.  For the case where all
1860      partitions have the reduction, this workaround is delayed to only
1861      affect the last partition.  */
1862   if (partition->reduction_p)
1863      return has_reduction;
1864 
1865   /* Perform general partition disqualification for builtins.  */
1866   if (volatiles_p
1867       || !flag_tree_loop_distribute_patterns)
1868     return has_reduction;
1869 
1870   /* Find single load/store data references for builtin partition.  */
1871   if (!find_single_drs (loop, rdg, partition->stmts, &single_st, &single_ld)
1872       || !single_st)
1873     return has_reduction;
1874 
1875   if (single_ld && single_st)
1876     {
1877       gimple *store = DR_STMT (single_st), *load = DR_STMT (single_ld);
1878       /* Direct aggregate copy or via an SSA name temporary.  */
1879       if (load != store
1880 	  && gimple_assign_lhs (load) != gimple_assign_rhs1 (store))
1881 	return has_reduction;
1882     }
1883 
1884   partition->loc = gimple_location (DR_STMT (single_st));
1885 
1886   /* Classify the builtin kind.  */
1887   if (single_ld == NULL)
1888     classify_builtin_st (loop, partition, single_st);
1889   else
1890     classify_builtin_ldst (loop, rdg, partition, single_st, single_ld);
1891   return has_reduction;
1892 }
1893 
1894 bool
1895 loop_distribution::share_memory_accesses (struct graph *rdg,
1896 		       partition *partition1, partition *partition2)
1897 {
1898   unsigned i, j;
1899   bitmap_iterator bi, bj;
1900   data_reference_p dr1, dr2;
1901 
1902   /* First check whether the intersection of the two partitions contains
1903      any loads or stores.  Common loads are the situation that happens
1904      most often.  */
1905   EXECUTE_IF_AND_IN_BITMAP (partition1->stmts, partition2->stmts, 0, i, bi)
1906     if (RDG_MEM_WRITE_STMT (rdg, i)
1907 	|| RDG_MEM_READS_STMT (rdg, i))
1908       return true;
1909 
1910   /* Then check whether the two partitions access the same memory object.  */
1911   EXECUTE_IF_SET_IN_BITMAP (partition1->datarefs, 0, i, bi)
1912     {
1913       dr1 = datarefs_vec[i];
1914 
1915       if (!DR_BASE_ADDRESS (dr1)
1916 	  || !DR_OFFSET (dr1) || !DR_INIT (dr1) || !DR_STEP (dr1))
1917 	continue;
1918 
1919       EXECUTE_IF_SET_IN_BITMAP (partition2->datarefs, 0, j, bj)
1920 	{
1921 	  dr2 = datarefs_vec[j];
1922 
1923 	  if (!DR_BASE_ADDRESS (dr2)
1924 	      || !DR_OFFSET (dr2) || !DR_INIT (dr2) || !DR_STEP (dr2))
1925 	    continue;
1926 
1927 	  if (operand_equal_p (DR_BASE_ADDRESS (dr1), DR_BASE_ADDRESS (dr2), 0)
1928 	      && operand_equal_p (DR_OFFSET (dr1), DR_OFFSET (dr2), 0)
1929 	      && operand_equal_p (DR_INIT (dr1), DR_INIT (dr2), 0)
1930 	      && operand_equal_p (DR_STEP (dr1), DR_STEP (dr2), 0))
1931 	    return true;
1932 	}
1933     }
1934 
1935   return false;
1936 }
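
/* Illustration only (hand-written example, not from the original sources):
   for the loop

     |for (i = 0; i < n; i++)
     |  {
     |    A[i] = B[i];
     |    C[i] = B[i] + 1;
     |  }

   the two seed partitions both access the memory object B[i], so
   share_memory_accesses returns true and the simple cost model fuses
   them rather than streaming over B twice.  */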
1937 
1938 /* For each seed statement in STARTING_STMTS, this function builds a
1939    partition for it by adding dependent statements according to RDG.
1940    All partitions are recorded in PARTITIONS.  */
1941 
1942 void
1943 loop_distribution::rdg_build_partitions (struct graph *rdg,
1944 					 vec<gimple *> starting_stmts,
1945 					 vec<partition *> *partitions)
1946 {
1947   auto_bitmap processed;
1948   int i;
1949   gimple *stmt;
1950 
1951   FOR_EACH_VEC_ELT (starting_stmts, i, stmt)
1952     {
1953       int v = rdg_vertex_for_stmt (rdg, stmt);
1954 
1955       if (dump_file && (dump_flags & TDF_DETAILS))
1956 	fprintf (dump_file,
1957 		 "ldist asked to generate code for vertex %d\n", v);
1958 
1959       /* If the vertex is already contained in another partition, so
1960          is the partition rooted at it.  */
1961       if (bitmap_bit_p (processed, v))
1962 	continue;
1963 
1964       partition *partition = build_rdg_partition_for_vertex (rdg, v);
1965       bitmap_ior_into (processed, partition->stmts);
1966 
1967       if (dump_file && (dump_flags & TDF_DETAILS))
1968 	{
1969 	  fprintf (dump_file, "ldist creates useful %s partition:\n",
1970 		   partition->type == PTYPE_PARALLEL ? "parallel" : "sequent");
1971 	  bitmap_print (dump_file, partition->stmts, "  ", "\n");
1972 	}
1973 
1974       partitions->safe_push (partition);
1975     }
1976 
1977   /* All vertices should have been assigned to at least one partition now,
1978      other than vertices belonging to dead code.  */
1979 }
1980 
1981 /* Dump to FILE the PARTITIONS.  */
1982 
1983 static void
1984 dump_rdg_partitions (FILE *file, const vec<partition *> &partitions)
1985 {
1986   int i;
1987   partition *partition;
1988 
1989   FOR_EACH_VEC_ELT (partitions, i, partition)
1990     debug_bitmap_file (file, partition->stmts);
1991 }
1992 
1993 /* Debug PARTITIONS.  */
1994 extern void debug_rdg_partitions (const vec<partition *> &);
1995 
1996 DEBUG_FUNCTION void
1997 debug_rdg_partitions (const vec<partition *> &partitions)
1998 {
1999   dump_rdg_partitions (stderr, partitions);
2000 }
2001 
2002 /* Returns the number of read and write operations in the RDG.  */
2003 
2004 static int
2005 number_of_rw_in_rdg (struct graph *rdg)
2006 {
2007   int i, res = 0;
2008 
2009   for (i = 0; i < rdg->n_vertices; i++)
2010     {
2011       if (RDG_MEM_WRITE_STMT (rdg, i))
2012 	++res;
2013 
2014       if (RDG_MEM_READS_STMT (rdg, i))
2015 	++res;
2016     }
2017 
2018   return res;
2019 }
2020 
2021 /* Returns the number of read and write operations in a PARTITION of
2022    the RDG.  */
2023 
2024 static int
2025 number_of_rw_in_partition (struct graph *rdg, partition *partition)
2026 {
2027   int res = 0;
2028   unsigned i;
2029   bitmap_iterator ii;
2030 
2031   EXECUTE_IF_SET_IN_BITMAP (partition->stmts, 0, i, ii)
2032     {
2033       if (RDG_MEM_WRITE_STMT (rdg, i))
2034 	++res;
2035 
2036       if (RDG_MEM_READS_STMT (rdg, i))
2037 	++res;
2038     }
2039 
2040   return res;
2041 }
2042 
2043 /* Returns true when one of the PARTITIONS contains all the read or
2044    write operations of RDG.  */
2045 
2046 static bool
2047 partition_contains_all_rw (struct graph *rdg,
2048 			   const vec<partition *> &partitions)
2049 {
2050   int i;
2051   partition *partition;
2052   int nrw = number_of_rw_in_rdg (rdg);
2053 
2054   FOR_EACH_VEC_ELT (partitions, i, partition)
2055     if (nrw == number_of_rw_in_partition (rdg, partition))
2056       return true;
2057 
2058   return false;
2059 }
2060 
2061 int
2062 loop_distribution::pg_add_dependence_edges (struct graph *rdg, int dir,
2063 			 bitmap drs1, bitmap drs2, vec<ddr_p> *alias_ddrs)
2064 {
2065   unsigned i, j;
2066   bitmap_iterator bi, bj;
2067   data_reference_p dr1, dr2, saved_dr1;
2068 
2069   /* dependence direction - 0 is no dependence, -1 is back,
2070      1 is forth, 2 is both (we can stop then, merging will occur).  */
2071   EXECUTE_IF_SET_IN_BITMAP (drs1, 0, i, bi)
2072     {
2073       dr1 = datarefs_vec[i];
2074 
2075       EXECUTE_IF_SET_IN_BITMAP (drs2, 0, j, bj)
2076 	{
2077 	  int res, this_dir = 1;
2078 	  ddr_p ddr;
2079 
2080 	  dr2 = datarefs_vec[j];
2081 
2082 	  /* Skip all <read, read> data dependence.  */
2083 	  if (DR_IS_READ (dr1) && DR_IS_READ (dr2))
2084 	    continue;
2085 
2086 	  saved_dr1 = dr1;
2087 	  /* Re-shuffle data-refs to be in topological order.  */
2088 	  if (rdg_vertex_for_stmt (rdg, DR_STMT (dr1))
2089 	      > rdg_vertex_for_stmt (rdg, DR_STMT (dr2)))
2090 	    {
2091 	      std::swap (dr1, dr2);
2092 	      this_dir = -this_dir;
2093 	    }
2094 	  ddr = get_data_dependence (rdg, dr1, dr2);
2095 	  if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2096 	    {
2097 	      this_dir = 0;
2098 	      res = data_ref_compare_tree (DR_BASE_ADDRESS (dr1),
2099 					   DR_BASE_ADDRESS (dr2));
2100 	      /* Be conservative.  If the data references are not well analyzed,
2101 		 or the two data references have the same base address and
2102 		 offset, add a dependence and consider them as aliasing each
2103 		 other.  In other words, the dependence cannot be resolved by
2104 		 a runtime alias check.  */
2105 	      if (!DR_BASE_ADDRESS (dr1) || !DR_BASE_ADDRESS (dr2)
2106 		  || !DR_OFFSET (dr1) || !DR_OFFSET (dr2)
2107 		  || !DR_INIT (dr1) || !DR_INIT (dr2)
2108 		  || !DR_STEP (dr1) || !tree_fits_uhwi_p (DR_STEP (dr1))
2109 		  || !DR_STEP (dr2) || !tree_fits_uhwi_p (DR_STEP (dr2))
2110 		  || res == 0)
2111 		this_dir = 2;
2112 	      /* Data dependence could be resolved by runtime alias check,
2113 		 record it in ALIAS_DDRS.  */
2114 	      else if (alias_ddrs != NULL)
2115 		alias_ddrs->safe_push (ddr);
2116 	      /* Or simply ignore it.  */
2117 	    }
2118 	  else if (DDR_ARE_DEPENDENT (ddr) == NULL_TREE)
2119 	    {
2120 	      /* Known dependences can still be unordered throughout the
2121 		 iteration space, see gcc.dg/tree-ssa/ldist-16.c and
2122 		 gcc.dg/tree-ssa/pr94969.c.  */
2123 	      if (DDR_NUM_DIST_VECTS (ddr) != 1)
2124 		this_dir = 2;
2125 	      /* If the overlap is exact preserve stmt order.  */
2126 	      else if (lambda_vector_zerop (DDR_DIST_VECT (ddr, 0),
2127 					    DDR_NB_LOOPS (ddr)))
2128 		;
2129 	      /* Else, as the distance vector is lexicographically positive,
2130 		 swap the dependence direction.  */
2131 	      else
2132 		{
2133 		  if (DDR_REVERSED_P (ddr))
2134 		    this_dir = -this_dir;
2135 		  this_dir = -this_dir;
2136 
2137 		  /* When the dependence distance of the innermost common
2138 		     loop of the DRs is zero we have a conflict.  */
2139 		  auto l1 = gimple_bb (DR_STMT (dr1))->loop_father;
2140 		  auto l2 = gimple_bb (DR_STMT (dr2))->loop_father;
2141 		  int idx = index_in_loop_nest (find_common_loop (l1, l2)->num,
2142 						DDR_LOOP_NEST (ddr));
2143 		  if (DDR_DIST_VECT (ddr, 0)[idx] == 0)
2144 		    this_dir = 2;
2145 		}
2146 	    }
2147 	  else
2148 	    this_dir = 0;
2149 	  if (this_dir == 2)
2150 	    return 2;
2151 	  else if (dir == 0)
2152 	    dir = this_dir;
2153 	  else if (this_dir != 0 && dir != this_dir)
2154 	    return 2;
2155 	  /* Shuffle "back" dr1.  */
2156 	  dr1 = saved_dr1;
2157 	}
2158     }
2159   return dir;
2160 }
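
/* Illustration only (hand-written summary, not from the original sources)
   of the value returned above for two partitions P1 and P2:

     0	 no dependence between P1 and P2;
     1	 P1 must be executed before P2 ("forth" edge);
    -1	 P2 must be executed before P1 ("back" edge);
     2	 both directions, or an unresolvable dependence; the caller stops
	 early and the partitions will be fused.  */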
2161 
2162 /* Compare postorder number of the partition graph vertices V1 and V2.  */
2163 
2164 static int
2165 pgcmp (const void *v1_, const void *v2_)
2166 {
2167   const vertex *v1 = (const vertex *)v1_;
2168   const vertex *v2 = (const vertex *)v2_;
2169   return v2->post - v1->post;
2170 }
2171 
2172 /* Data attached to vertices of partition dependence graph.  */
2173 struct pg_vdata
2174 {
2175   /* ID of the corresponding partition.  */
2176   int id;
2177   /* The partition.  */
2178   struct partition *partition;
2179 };
2180 
2181 /* Data attached to edges of partition dependence graph.  */
2182 struct pg_edata
2183 {
2184   /* If the dependence edge can be resolved by runtime alias check,
2185      this vector contains data dependence relations for runtime alias
2186      check.  On the other hand, if the dependence edge is introduced
2187      because of compilation time known data dependence, this vector
2188      contains nothing.  */
2189   vec<ddr_p> alias_ddrs;
2190 };
2191 
2192 /* Callback data for traversing edges in graph.  */
2193 struct pg_edge_callback_data
2194 {
2195   /* Bitmap of strongly connected components that should be merged.  */
2196   bitmap sccs_to_merge;
2197   /* Array containing component information for all vertices.  */
2198   int *vertices_component;
2199   /* Vector to record all data dependence relations which are needed
2200      to break strongly connected components by runtime alias checks.  */
2201   vec<ddr_p> *alias_ddrs;
2202 };
2203 
2204 /* Initialize vertex data for partition dependence graph PG with
2205    PARTITIONS.  */
2206 
2207 static void
2208 init_partition_graph_vertices (struct graph *pg,
2209 			       vec<struct partition *> *partitions)
2210 {
2211   int i;
2212   partition *partition;
2213   struct pg_vdata *data;
2214 
2215   for (i = 0; partitions->iterate (i, &partition); ++i)
2216     {
2217       data = new pg_vdata;
2218       pg->vertices[i].data = data;
2219       data->id = i;
2220       data->partition = partition;
2221     }
2222 }
2223 
2224 /* Add edge <I, J> to partition dependence graph PG.  Attach vector of data
2225    dependence relations to the EDGE if DDRS isn't NULL.  */
2226 
2227 static void
2228 add_partition_graph_edge (struct graph *pg, int i, int j, vec<ddr_p> *ddrs)
2229 {
2230   struct graph_edge *e = add_edge (pg, i, j);
2231 
2232   /* If the edge is attached with data dependence relations, it means this
2233      dependence edge can be resolved by runtime alias checks.  */
2234   if (ddrs != NULL)
2235     {
2236       struct pg_edata *data = new pg_edata;
2237 
2238       gcc_assert (ddrs->length () > 0);
2239       e->data = data;
2240       data->alias_ddrs = vNULL;
2241       data->alias_ddrs.safe_splice (*ddrs);
2242     }
2243 }
2244 
2245 /* Callback function for the graph traversal algorithm.  It returns true
2246    if edge E should be skipped when traversing the graph.  */
2247 
2248 static bool
2249 pg_skip_alias_edge (struct graph_edge *e)
2250 {
2251   struct pg_edata *data = (struct pg_edata *)e->data;
2252   return (data != NULL && data->alias_ddrs.length () > 0);
2253 }
2254 
2255 /* Callback function freeing data attached to edge E of graph.  */
2256 
2257 static void
2258 free_partition_graph_edata_cb (struct graph *, struct graph_edge *e, void *)
2259 {
2260   if (e->data != NULL)
2261     {
2262       struct pg_edata *data = (struct pg_edata *)e->data;
2263       data->alias_ddrs.release ();
2264       delete data;
2265     }
2266 }
2267 
2268 /* Free data attached to the vertices of partition dependence graph PG.  */
2269 
2270 static void
2271 free_partition_graph_vdata (struct graph *pg)
2272 {
2273   int i;
2274   struct pg_vdata *data;
2275 
2276   for (i = 0; i < pg->n_vertices; ++i)
2277     {
2278       data = (struct pg_vdata *)pg->vertices[i].data;
2279       delete data;
2280     }
2281 }
2282 
2283 /* Build and return partition dependence graph for PARTITIONS.  RDG is
2284    reduced dependence graph for the loop to be distributed.  If IGNORE_ALIAS_P
2285    is true, data dependence caused by possible alias between references
2286    is ignored, as if it doesn't exist at all; otherwise all dependences
2287    are considered.  */
2288 
2289 struct graph *
2290 loop_distribution::build_partition_graph (struct graph *rdg,
2291 					  vec<struct partition *> *partitions,
2292 					  bool ignore_alias_p)
2293 {
2294   int i, j;
2295   struct partition *partition1, *partition2;
2296   graph *pg = new_graph (partitions->length ());
2297   auto_vec<ddr_p> alias_ddrs, *alias_ddrs_p;
2298 
2299   alias_ddrs_p = ignore_alias_p ? NULL : &alias_ddrs;
2300 
2301   init_partition_graph_vertices (pg, partitions);
2302 
2303   for (i = 0; partitions->iterate (i, &partition1); ++i)
2304     {
2305       for (j = i + 1; partitions->iterate (j, &partition2); ++j)
2306 	{
2307 	  /* dependence direction - 0 is no dependence, -1 is back,
2308 	     1 is forth, 2 is both (we can stop then, merging will occur).  */
2309 	  int dir = 0;
2310 
2311 	  /* If the first partition has a reduction, add a back edge; if the
2312 	     second partition has a reduction, add a forth edge.  This makes
2313 	     sure that the reduction partition will be sorted as the last one.  */
2314 	  if (partition_reduction_p (partition1))
2315 	    dir = -1;
2316 	  else if (partition_reduction_p (partition2))
2317 	    dir = 1;
2318 
2319 	  /* Cleanup the temporary vector.  */
2320 	  alias_ddrs.truncate (0);
2321 
2322 	  dir = pg_add_dependence_edges (rdg, dir, partition1->datarefs,
2323 					 partition2->datarefs, alias_ddrs_p);
2324 
2325 	  /* Add an edge to the partition graph if there exists a dependence.
2326 	     There are two types of edges.  One type is caused by a compilation
2327 	     time known dependence; this type cannot be resolved by a runtime
2328 	     alias check.  The other type can be resolved by a runtime alias
2329 	     check.  */
2330 	  if (dir == 1 || dir == 2
2331 	      || alias_ddrs.length () > 0)
2332 	    {
2333 	      /* Attach data dependence relations to edge that can be resolved
2334 		 by runtime alias check.  */
2335 	      bool alias_edge_p = (dir != 1 && dir != 2);
2336 	      add_partition_graph_edge (pg, i, j,
2337 					(alias_edge_p) ? &alias_ddrs : NULL);
2338 	    }
2339 	  if (dir == -1 || dir == 2
2340 	      || alias_ddrs.length () > 0)
2341 	    {
2342 	      /* Attach data dependence relations to edge that can be resolved
2343 		 by runtime alias check.  */
2344 	      bool alias_edge_p = (dir != -1 && dir != 2);
2345 	      add_partition_graph_edge (pg, j, i,
2346 					(alias_edge_p) ? &alias_ddrs : NULL);
2347 	    }
2348 	}
2349     }
2350   return pg;
2351 }
2352 
2353 /* Sort partitions in PG in descending post order and store them in
2354    PARTITIONS.  */
2355 
2356 static void
2357 sort_partitions_by_post_order (struct graph *pg,
2358 			       vec<struct partition *> *partitions)
2359 {
2360   int i;
2361   struct pg_vdata *data;
2362 
2363   /* Now order the remaining nodes in descending postorder.  */
2364   qsort (pg->vertices, pg->n_vertices, sizeof (vertex), pgcmp);
2365   partitions->truncate (0);
2366   for (i = 0; i < pg->n_vertices; ++i)
2367     {
2368       data = (struct pg_vdata *)pg->vertices[i].data;
2369       if (data->partition)
2370 	partitions->safe_push (data->partition);
2371     }
2372 }
2373 
2374 void
2375 loop_distribution::merge_dep_scc_partitions (struct graph *rdg,
2376 					     vec<struct partition *> *partitions,
2377 					     bool ignore_alias_p)
2378 {
2379   struct partition *partition1, *partition2;
2380   struct pg_vdata *data;
2381   graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
2382   int i, j, num_sccs = graphds_scc (pg, NULL);
2383 
2384   /* A strongly connected component means a dependence cycle, which we cannot
2385      distribute.  So fuse the partitions in it together.  */
2386   if ((unsigned) num_sccs < partitions->length ())
2387     {
2388       for (i = 0; i < num_sccs; ++i)
2389 	{
2390 	  for (j = 0; partitions->iterate (j, &partition1); ++j)
2391 	    if (pg->vertices[j].component == i)
2392 	      break;
2393 	  for (j = j + 1; partitions->iterate (j, &partition2); ++j)
2394 	    if (pg->vertices[j].component == i)
2395 	      {
2396 		partition_merge_into (NULL, partition1,
2397 				      partition2, FUSE_SAME_SCC);
2398 		partition1->type = PTYPE_SEQUENTIAL;
2399 		(*partitions)[j] = NULL;
2400 		partition_free (partition2);
2401 		data = (struct pg_vdata *)pg->vertices[j].data;
2402 		data->partition = NULL;
2403 	      }
2404 	}
2405     }
2406 
2407   sort_partitions_by_post_order (pg, partitions);
2408   gcc_assert (partitions->length () == (unsigned)num_sccs);
2409   free_partition_graph_vdata (pg);
2410   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2411   free_graph (pg);
2412 }
2413 
2414 /* Callback function for traversing edge E in graph G.  DATA is private
2415    callback data.  */
2416 
2417 static void
2418 pg_collect_alias_ddrs (struct graph *g, struct graph_edge *e, void *data)
2419 {
2420   int i, j, component;
2421   struct pg_edge_callback_data *cbdata;
2422   struct pg_edata *edata = (struct pg_edata *) e->data;
2423 
2424   /* If the edge doesn't have attached data dependences, it represents
2425      compilation time known dependences.  This type of dependence cannot
2426      be resolved by a runtime alias check.  */
2427   if (edata == NULL || edata->alias_ddrs.length () == 0)
2428     return;
2429 
2430   cbdata = (struct pg_edge_callback_data *) data;
2431   i = e->src;
2432   j = e->dest;
2433   component = cbdata->vertices_component[i];
2434   /* Vertices are topologically sorted according to compilation time
2435      known dependences, so we can break strongly connected components
2436      by removing edges of the opposite direction, i.e., edges pointing
2437      from a vertex with a smaller post number to a vertex with a bigger
2438      post number.  */
2439   if (g->vertices[i].post < g->vertices[j].post
2440       /* We only need to remove edges connecting vertices in the same
2441 	 strongly connected component to break it.  */
2442       && component == cbdata->vertices_component[j]
2443       /* Check if we want to break the strong connected component or not.  */
2444       && !bitmap_bit_p (cbdata->sccs_to_merge, component))
2445     cbdata->alias_ddrs->safe_splice (edata->alias_ddrs);
2446 }
2447 
2448 /* Callback function for traversing edge E.  DATA is private
2449    callback data.  */
2450 
2451 static void
2452 pg_unmark_merged_alias_ddrs (struct graph *, struct graph_edge *e, void *data)
2453 {
2454   int i, j, component;
2455   struct pg_edge_callback_data *cbdata;
2456   struct pg_edata *edata = (struct pg_edata *) e->data;
2457 
2458   if (edata == NULL || edata->alias_ddrs.length () == 0)
2459     return;
2460 
2461   cbdata = (struct pg_edge_callback_data *) data;
2462   i = e->src;
2463   j = e->dest;
2464   component = cbdata->vertices_component[i];
2465   /* Make sure to not skip vertices inside SCCs we are going to merge.  */
2466   if (component == cbdata->vertices_component[j]
2467       && bitmap_bit_p (cbdata->sccs_to_merge, component))
2468     {
2469       edata->alias_ddrs.release ();
2470       delete edata;
2471       e->data = NULL;
2472     }
2473 }
2474 
2475 /* This is the main function breaking strongly connected components in
2476    PARTITIONS, given reduced dependence graph RDG.  Store data dependence
2477    relations for runtime alias checks in ALIAS_DDRS.  */
2478 void
2479 loop_distribution::break_alias_scc_partitions (struct graph *rdg,
2480 					       vec<struct partition *> *partitions,
2481 					       vec<ddr_p> *alias_ddrs)
2482 {
2483   int i, j, k, num_sccs, num_sccs_no_alias = 0;
2484   /* Build partition dependence graph.  */
2485   graph *pg = build_partition_graph (rdg, partitions, false);
2486 
2487   alias_ddrs->truncate (0);
2488   /* Find strongly connected components in the graph, with all dependence edges
2489      considered.  */
2490   num_sccs = graphds_scc (pg, NULL);
2491   /* All SCCs now can be broken by runtime alias checks because SCCs caused by
2492      compilation time known dependences are merged before this function.  */
2493   if ((unsigned) num_sccs < partitions->length ())
2494     {
2495       struct pg_edge_callback_data cbdata;
2496       auto_bitmap sccs_to_merge;
2497       auto_vec<enum partition_type> scc_types;
2498       struct partition *partition, *first;
2499 
2500       /* If all partitions in a SCC have the same type, we can simply merge the
2501 	 SCC.  This loop finds out such SCCs and records them in the bitmap.  */
2502       bitmap_set_range (sccs_to_merge, 0, (unsigned) num_sccs);
2503       for (i = 0; i < num_sccs; ++i)
2504 	{
2505 	  for (j = 0; partitions->iterate (j, &first); ++j)
2506 	    if (pg->vertices[j].component == i)
2507 	      break;
2508 
2509 	  bool same_type = true, all_builtins = partition_builtin_p (first);
2510 	  for (++j; partitions->iterate (j, &partition); ++j)
2511 	    {
2512 	      if (pg->vertices[j].component != i)
2513 		continue;
2514 
2515 	      if (first->type != partition->type)
2516 		{
2517 		  same_type = false;
2518 		  break;
2519 		}
2520 	      all_builtins &= partition_builtin_p (partition);
2521 	    }
2522 	  /* Merge the SCC if all partitions in it have the same type, even
2523 	     though the result partition is sequential, because the vectorizer
2524 	     can do a better runtime alias check.  One exception is when all
2525 	     partitions in the SCC are builtins.  */
2526 	  if (!same_type || all_builtins)
2527 	    bitmap_clear_bit (sccs_to_merge, i);
2528 	}
2529 
2530       /* Initialize callback data for traversing.  */
2531       cbdata.sccs_to_merge = sccs_to_merge;
2532       cbdata.alias_ddrs = alias_ddrs;
2533       cbdata.vertices_component = XNEWVEC (int, pg->n_vertices);
2534       /* Record the component information which will be corrupted by the
2535 	 next graph SCC finding call.  */
2536       for (i = 0; i < pg->n_vertices; ++i)
2537 	cbdata.vertices_component[i] = pg->vertices[i].component;
2538 
2539       /* Collect data dependences for runtime alias checks to break SCCs.  */
2540       if (bitmap_count_bits (sccs_to_merge) != (unsigned) num_sccs)
2541 	{
2542 	  /* For SCCs we want to merge, clear all alias_ddrs for edges
2543 	     inside the component.  */
2544 	  for_each_edge (pg, pg_unmark_merged_alias_ddrs, &cbdata);
2545 
2546 	  /* Run SCC finding algorithm again, with alias dependence edges
2547 	     skipped.  This is to topologically sort partitions according to
2548 	     compilation time known dependence.  Note the topological order
2549 	     is stored in the form of pg's post order number.  */
2550 	  num_sccs_no_alias = graphds_scc (pg, NULL, pg_skip_alias_edge);
2551 	  /* We cannot assert partitions->length () == num_sccs_no_alias
2552 	     since we are not ignoring alias edges in cycles we are
2553 	     going to merge.  That's required to compute correct postorder.  */
2554 	  /* With topological order, we can construct two subgraphs L and R.
2555 	     L contains edge <x, y> where x < y in terms of post order, while
2556 	     R contains edge <x, y> where x > y.  Edges for compilation time
2557 	     known dependence all fall in R, so we break SCCs by removing all
2558 	     (alias) edges in subgraph L.  */
2559 	  for_each_edge (pg, pg_collect_alias_ddrs, &cbdata);
2560 	}
2561 
2562       /* For each SCC that doesn't need to be broken, merge it.  */
2563       for (i = 0; i < num_sccs; ++i)
2564 	{
2565 	  if (!bitmap_bit_p (sccs_to_merge, i))
2566 	    continue;
2567 
2568 	  for (j = 0; partitions->iterate (j, &first); ++j)
2569 	    if (cbdata.vertices_component[j] == i)
2570 	      break;
2571 	  for (k = j + 1; partitions->iterate (k, &partition); ++k)
2572 	    {
2573 	      struct pg_vdata *data;
2574 
2575 	      if (cbdata.vertices_component[k] != i)
2576 		continue;
2577 
2578 	      partition_merge_into (NULL, first, partition, FUSE_SAME_SCC);
2579 	      (*partitions)[k] = NULL;
2580 	      partition_free (partition);
2581 	      data = (struct pg_vdata *)pg->vertices[k].data;
2582 	      gcc_assert (data->id == k);
2583 	      data->partition = NULL;
2584 	      /* The result partition of merged SCC must be sequential.  */
2585 	      first->type = PTYPE_SEQUENTIAL;
2586 	    }
2587 	}
2588       /* If the reduction partition's SCC is broken by runtime alias checks,
2589 	 we force a negative post order on it, making sure it will be
2590 	 scheduled last.  */
2591       if (num_sccs_no_alias > 0)
2592 	{
2593 	  j = -1;
2594 	  for (i = 0; i < pg->n_vertices; ++i)
2595 	    {
2596 	      struct pg_vdata *data = (struct pg_vdata *)pg->vertices[i].data;
2597 	      if (data->partition && partition_reduction_p (data->partition))
2598 		{
2599 		  gcc_assert (j == -1);
2600 		  j = i;
2601 		}
2602 	    }
2603 	  if (j >= 0)
2604 	    pg->vertices[j].post = -1;
2605 	}
2606 
2607       free (cbdata.vertices_component);
2608     }
2609 
2610   sort_partitions_by_post_order (pg, partitions);
2611   free_partition_graph_vdata (pg);
2612   for_each_edge (pg, free_partition_graph_edata_cb, NULL);
2613   free_graph (pg);
2614 
2615   if (dump_file && (dump_flags & TDF_DETAILS))
2616     {
2617       fprintf (dump_file, "Possible alias data dependence to break:\n");
2618       dump_data_dependence_relations (dump_file, *alias_ddrs);
2619     }
2620 }
2621 
2622 /* Compute and return an expression whose value is the segment length which
2623    will be accessed by DR in NITERS iterations.  */
2624 
2625 static tree
2626 data_ref_segment_size (struct data_reference *dr, tree niters)
2627 {
2628   niters = size_binop (MINUS_EXPR,
2629 		       fold_convert (sizetype, niters),
2630 		       size_one_node);
2631   return size_binop (MULT_EXPR,
2632 		     fold_convert (sizetype, DR_STEP (dr)),
2633 		     fold_convert (sizetype, niters));
2634 }
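
/* Illustration only (hand-written example, not from the original sources):
   for a data reference with DR_STEP of 4 bytes in a loop executing
   NITERS = 100 iterations, the segment length is (100 - 1) * 4 = 396
   bytes; the size of the last access itself is passed separately to the
   runtime alias check machinery as the access size.  */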
2635 
2636 /* Return true if LOOP's latch is dominated by statement for data reference
2637    DR.  */
2638 
2639 static inline bool
2640 latch_dominated_by_data_ref (class loop *loop, data_reference *dr)
2641 {
2642   return dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
2643 			 gimple_bb (DR_STMT (dr)));
2644 }
2645 
2646 /* Compute alias check pairs and store them in COMP_ALIAS_PAIRS for LOOP's
2647    data dependence relations ALIAS_DDRS.  */
2648 
2649 static void
2650 compute_alias_check_pairs (class loop *loop, vec<ddr_p> *alias_ddrs,
2651 			   vec<dr_with_seg_len_pair_t> *comp_alias_pairs)
2652 {
2653   unsigned int i;
2654   unsigned HOST_WIDE_INT factor = 1;
2655   tree niters_plus_one, niters = number_of_latch_executions (loop);
2656 
2657   gcc_assert (niters != NULL_TREE && niters != chrec_dont_know);
2658   niters = fold_convert (sizetype, niters);
2659   niters_plus_one = size_binop (PLUS_EXPR, niters, size_one_node);
2660 
2661   if (dump_file && (dump_flags & TDF_DETAILS))
2662     fprintf (dump_file, "Creating alias check pairs:\n");
2663 
2664   /* Iterate all data dependence relations and compute alias check pairs.  */
2665   for (i = 0; i < alias_ddrs->length (); i++)
2666     {
2667       ddr_p ddr = (*alias_ddrs)[i];
2668       struct data_reference *dr_a = DDR_A (ddr);
2669       struct data_reference *dr_b = DDR_B (ddr);
2670       tree seg_length_a, seg_length_b;
2671 
2672       if (latch_dominated_by_data_ref (loop, dr_a))
2673 	seg_length_a = data_ref_segment_size (dr_a, niters_plus_one);
2674       else
2675 	seg_length_a = data_ref_segment_size (dr_a, niters);
2676 
2677       if (latch_dominated_by_data_ref (loop, dr_b))
2678 	seg_length_b = data_ref_segment_size (dr_b, niters_plus_one);
2679       else
2680 	seg_length_b = data_ref_segment_size (dr_b, niters);
2681 
2682       unsigned HOST_WIDE_INT access_size_a
2683 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_a))));
2684       unsigned HOST_WIDE_INT access_size_b
2685 	= tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr_b))));
2686       unsigned int align_a = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_a)));
2687       unsigned int align_b = TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_b)));
2688 
2689       dr_with_seg_len_pair_t dr_with_seg_len_pair
2690 	(dr_with_seg_len (dr_a, seg_length_a, access_size_a, align_a),
2691 	 dr_with_seg_len (dr_b, seg_length_b, access_size_b, align_b),
2692 	 /* ??? Would WELL_ORDERED be safe?  */
2693 	 dr_with_seg_len_pair_t::REORDERED);
2694 
2695       comp_alias_pairs->safe_push (dr_with_seg_len_pair);
2696     }
2697 
2698   if (tree_fits_uhwi_p (niters))
2699     factor = tree_to_uhwi (niters);
2700 
2701   /* Prune alias check pairs.  */
2702   prune_runtime_alias_test_list (comp_alias_pairs, factor);
2703   if (dump_file && (dump_flags & TDF_DETAILS))
2704     fprintf (dump_file,
2705 	     "Improved number of alias checks from %d to %d\n",
2706 	     alias_ddrs->length (), comp_alias_pairs->length ());
2707 }
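
/* Illustration only (hand-written sketch, not from the original sources):
   for references A and B with segment lengths SEG_A and SEG_B, the
   condition later built by create_runtime_alias_checks is essentially a
   no-overlap test of the form

     &A + SEG_A <= &B  ||  &B + SEG_B <= &A

   (modulo access size and alignment adjustments); the distributed loops
   are only entered when it holds.  */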
2708 
2709 /* Given data dependence relations in ALIAS_DDRS, generate runtime alias
2710    checks and version LOOP under condition of these runtime alias checks.  */
2711 
2712 static void
2713 version_loop_by_alias_check (vec<struct partition *> *partitions,
2714 			     class loop *loop, vec<ddr_p> *alias_ddrs)
2715 {
2716   profile_probability prob;
2717   basic_block cond_bb;
2718   class loop *nloop;
2719   tree lhs, arg0, cond_expr = NULL_TREE;
2720   gimple_seq cond_stmts = NULL;
2721   gimple *call_stmt = NULL;
2722   auto_vec<dr_with_seg_len_pair_t> comp_alias_pairs;
2723 
2724   /* Generate code for runtime alias checks if necessary.  */
2725   gcc_assert (alias_ddrs->length () > 0);
2726 
2727   if (dump_file && (dump_flags & TDF_DETAILS))
2728     fprintf (dump_file,
2729 	     "Version loop <%d> with runtime alias check\n", loop->num);
2730 
2731   compute_alias_check_pairs (loop, alias_ddrs, &comp_alias_pairs);
2732   create_runtime_alias_checks (loop, &comp_alias_pairs, &cond_expr);
2733   cond_expr = force_gimple_operand_1 (cond_expr, &cond_stmts,
2734 				      is_gimple_val, NULL_TREE);
2735 
2736   /* Depend on vectorizer to fold IFN_LOOP_DIST_ALIAS.  */
2737   bool cancelable_p = flag_tree_loop_vectorize;
2738   if (cancelable_p)
2739     {
2740       unsigned i = 0;
2741       struct partition *partition;
2742       for (; partitions->iterate (i, &partition); ++i)
2743 	if (!partition_builtin_p (partition))
2744 	  break;
2745 
2746      /* If all partitions are builtins, distributing the loop would be profitable
2747 	we don't want to cancel the runtime alias checks.  */
2748       if (i == partitions->length ())
2749 	cancelable_p = false;
2750     }
2751 
2752   /* Generate internal function call for loop distribution alias check if the
2753      runtime alias check should be cancelable.  */
2754   if (cancelable_p)
2755     {
2756       call_stmt = gimple_build_call_internal (IFN_LOOP_DIST_ALIAS,
2757 					      2, NULL_TREE, cond_expr);
2758       lhs = make_ssa_name (boolean_type_node);
2759       gimple_call_set_lhs (call_stmt, lhs);
2760     }
2761   else
2762     lhs = cond_expr;
2763 
2764   prob = profile_probability::guessed_always ().apply_scale (9, 10);
2765   initialize_original_copy_tables ();
2766   nloop = loop_version (loop, lhs, &cond_bb, prob, prob.invert (),
2767 			prob, prob.invert (), true);
2768   free_original_copy_tables ();
2769   /* Record the original loop number in newly generated loops.  In case of
2770      distribution, the original loop will be distributed and the new loop
2771      is kept.  */
2772   loop->orig_loop_num = nloop->num;
2773   nloop->orig_loop_num = nloop->num;
2774   nloop->dont_vectorize = true;
2775   nloop->force_vectorize = false;
2776 
2777   if (call_stmt)
2778     {
2779       /* Record new loop's num in IFN_LOOP_DIST_ALIAS because the original
2780 	 loop could be destroyed.  */
2781       arg0 = build_int_cst (integer_type_node, loop->orig_loop_num);
2782       gimple_call_set_arg (call_stmt, 0, arg0);
2783       gimple_seq_add_stmt_without_update (&cond_stmts, call_stmt);
2784     }
2785 
2786   if (cond_stmts)
2787     {
2788       gimple_stmt_iterator cond_gsi = gsi_last_bb (cond_bb);
2789       gsi_insert_seq_before (&cond_gsi, cond_stmts, GSI_SAME_STMT);
2790     }
2791   update_ssa (TODO_update_ssa);
2792 }
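
/* Illustration only (hand-written sketch, not from the original sources)
   of the control flow produced above when the check is cancelable:

     _cond = <runtime alias test>;
     _g = .LOOP_DIST_ALIAS (<new loop number>, _cond);
     if (_g)
       <original loop, to be distributed>
     else
       <versioned copy, dont_vectorize set>

   The vectorizer is expected to fold the IFN_LOOP_DIST_ALIAS call to a
   constant once it knows whether the distributed loops were vectorized.  */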
2793 
2794 /* Return true if loop versioning is needed to distribute PARTITIONS.
2795    ALIAS_DDRS are data dependence relations for the runtime alias check.  */
2796 
2797 static inline bool
2798 version_for_distribution_p (vec<struct partition *> *partitions,
2799 			    vec<ddr_p> *alias_ddrs)
2800 {
2801   /* No need to version loop if we have only one partition.  */
2802   if (partitions->length () == 1)
2803     return false;
2804 
2805   /* Need to version loop if runtime alias check is necessary.  */
2806   return (alias_ddrs->length () > 0);
2807 }
2808 
2809 /* Compare base offset of builtin mem* partitions P1 and P2.  */
2810 
2811 static int
2812 offset_cmp (const void *vp1, const void *vp2)
2813 {
2814   struct partition *p1 = *(struct partition *const *) vp1;
2815   struct partition *p2 = *(struct partition *const *) vp2;
2816   unsigned HOST_WIDE_INT o1 = p1->builtin->dst_base_offset;
2817   unsigned HOST_WIDE_INT o2 = p2->builtin->dst_base_offset;
2818   return (o2 < o1) - (o1 < o2);
2819 }
2820 
2821 /* Fuse adjacent memset builtin PARTITIONS if possible.  This is a special
2822    case optimization transforming below code:
2823 
2824      __builtin_memset (&obj, 0, 100);
2825      _1 = &obj + 100;
2826      __builtin_memset (_1, 0, 200);
2827      _2 = &obj + 300;
2828      __builtin_memset (_2, 0, 100);
2829 
2830    into:
2831 
2832      __builtin_memset (&obj, 0, 400);
2833 
2834    Note we don't have dependence information between different partitions
2835    at this point; as a result, we can't handle non-adjacent memset builtin
2836    partitions since a dependence might be broken.  */
2837 
2838 static void
2839 fuse_memset_builtins (vec<struct partition *> *partitions)
2840 {
2841   unsigned i, j;
2842   struct partition *part1, *part2;
2843   tree rhs1, rhs2;
2844 
2845   for (i = 0; partitions->iterate (i, &part1);)
2846     {
2847       if (part1->kind != PKIND_MEMSET)
2848 	{
2849 	  i++;
2850 	  continue;
2851 	}
2852 
2853       /* Find sub-array of memset builtins of the same base.  Index range
2854 	 of the sub-array is [i, j) with "j > i".  */
2855       for (j = i + 1; partitions->iterate (j, &part2); ++j)
2856 	{
2857 	  if (part2->kind != PKIND_MEMSET
2858 	      || !operand_equal_p (part1->builtin->dst_base_base,
2859 				   part2->builtin->dst_base_base, 0))
2860 	    break;
2861 
2862 	  /* Memset calls setting different values can't be merged.  */
2863 	  rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2864 	  rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2865 	  if (!operand_equal_p (rhs1, rhs2, 0))
2866 	    break;
2867 	}
2868 
2869       /* Stable sort is required in order to avoid breaking dependence.  */
2870       gcc_stablesort (&(*partitions)[i], j - i, sizeof (*partitions)[i],
2871 		      offset_cmp);
2872       /* Continue with next partition.  */
2873       i = j;
2874     }
2875 
2876   /* Merge all consecutive memset builtin partitions.  */
2877   for (i = 0; i < partitions->length () - 1;)
2878     {
2879       part1 = (*partitions)[i];
2880       if (part1->kind != PKIND_MEMSET)
2881 	{
2882 	  i++;
2883 	  continue;
2884 	}
2885 
2886       part2 = (*partitions)[i + 1];
2887       /* Only merge memset partitions of the same base and with constant
2888 	 access sizes.  */
2889       if (part2->kind != PKIND_MEMSET
2890 	  || TREE_CODE (part1->builtin->size) != INTEGER_CST
2891 	  || TREE_CODE (part2->builtin->size) != INTEGER_CST
2892 	  || !operand_equal_p (part1->builtin->dst_base_base,
2893 			       part2->builtin->dst_base_base, 0))
2894 	{
2895 	  i++;
2896 	  continue;
2897 	}
2898       rhs1 = gimple_assign_rhs1 (DR_STMT (part1->builtin->dst_dr));
2899       rhs2 = gimple_assign_rhs1 (DR_STMT (part2->builtin->dst_dr));
2900       int bytev1 = const_with_all_bytes_same (rhs1);
2901       int bytev2 = const_with_all_bytes_same (rhs2);
2902       /* Only merge memset partitions of the same value.  */
2903       if (bytev1 != bytev2 || bytev1 == -1)
2904 	{
2905 	  i++;
2906 	  continue;
2907 	}
2908       wide_int end1 = wi::add (part1->builtin->dst_base_offset,
2909 			       wi::to_wide (part1->builtin->size));
2910       /* Only merge adjacent memset partitions.  */
2911       if (wi::ne_p (end1, part2->builtin->dst_base_offset))
2912 	{
2913 	  i++;
2914 	  continue;
2915 	}
2916       /* Merge partitions[i] and partitions[i+1].  */
2917       part1->builtin->size = fold_build2 (PLUS_EXPR, sizetype,
2918 					  part1->builtin->size,
2919 					  part2->builtin->size);
2920       partition_free (part2);
2921       partitions->ordered_remove (i + 1);
2922     }
2923 }
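
/* Illustration only (hand-written example, not from the original sources):
   merging stops when the byte values differ or when the ranges are not
   adjacent, e.g.

     __builtin_memset (&obj, 0, 100);
     __builtin_memset (&obj + 100, 1, 200);	;different value, kept
     __builtin_memset (&obj + 400, 0, 100);	;gap at offset 300, kept  */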
2924 
2925 void
2926 loop_distribution::finalize_partitions (class loop *loop,
2927 					vec<struct partition *> *partitions,
2928 					vec<ddr_p> *alias_ddrs)
2929 {
2930   unsigned i;
2931   struct partition *partition, *a;
2932 
2933   if (partitions->length () == 1
2934       || alias_ddrs->length () > 0)
2935     return;
2936 
2937   unsigned num_builtin = 0, num_normal = 0, num_partial_memset = 0;
2938   bool same_type_p = true;
2939   enum partition_type type = ((*partitions)[0])->type;
2940   for (i = 0; partitions->iterate (i, &partition); ++i)
2941     {
2942       same_type_p &= (type == partition->type);
2943       if (partition_builtin_p (partition))
2944 	{
2945 	  num_builtin++;
2946 	  continue;
2947 	}
2948       num_normal++;
2949       if (partition->kind == PKIND_PARTIAL_MEMSET)
2950 	num_partial_memset++;
2951     }
2952 
2953   /* Don't distribute the current loop into too many loops given that we
2954      don't have a memory stream cost model.  Be even more conservative in
2955      case of loop nest distribution.  */
2956   if ((same_type_p && num_builtin == 0
2957        && (loop->inner == NULL || num_normal != 2 || num_partial_memset != 1))
2958       || (loop->inner != NULL
2959 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
2960       || (loop->inner == NULL
2961 	  && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
2962     {
2963       a = (*partitions)[0];
2964       for (i = 1; partitions->iterate (i, &partition); ++i)
2965 	{
2966 	  partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
2967 	  partition_free (partition);
2968 	}
2969       partitions->truncate (1);
2970     }
2971 
2972   /* Fuse memset builtins if possible.  */
2973   if (partitions->length () > 1)
2974     fuse_memset_builtins (partitions);
2975 }
2976 
2977 /* Distributes the code from LOOP in such a way that producer statements
2978    are placed before consumer statements.  Tries to separate only the
2979    statements from STMTS into separate loops.  Returns the number of
2980    distributed loops.  Set NB_CALLS to the number of generated builtin calls.
2981    Set *DESTROY_P to whether LOOP needs to be destroyed.  */
2982 
2983 int
2984 loop_distribution::distribute_loop (class loop *loop,
2985 		 const vec<gimple *> &stmts,
2986 		 control_dependences *cd, int *nb_calls, bool *destroy_p,
2987 		 bool only_patterns_p)
2988 {
2989   ddrs_table = new hash_table<ddr_hasher> (389);
2990   struct graph *rdg;
2991   partition *partition;
2992   int i, nbp;
2993 
2994   *destroy_p = false;
2995   *nb_calls = 0;
2996   loop_nest.create (0);
2997   if (!find_loop_nest (loop, &loop_nest))
2998     {
2999       loop_nest.release ();
3000       delete ddrs_table;
3001       return 0;
3002     }
3003 
3004   datarefs_vec.create (20);
3005   has_nonaddressable_dataref_p = false;
3006   rdg = build_rdg (loop, cd);
3007   if (!rdg)
3008     {
3009       if (dump_file && (dump_flags & TDF_DETAILS))
3010 	fprintf (dump_file,
3011 		 "Loop %d not distributed: failed to build the RDG.\n",
3012 		 loop->num);
3013 
3014       loop_nest.release ();
3015       free_data_refs (datarefs_vec);
3016       delete ddrs_table;
3017       return 0;
3018     }
3019 
3020   if (datarefs_vec.length () > MAX_DATAREFS_NUM)
3021     {
3022       if (dump_file && (dump_flags & TDF_DETAILS))
3023 	fprintf (dump_file,
3024 		 "Loop %d not distributed: too many memory references.\n",
3025 		 loop->num);
3026 
3027       free_rdg (rdg);
3028       loop_nest.release ();
3029       free_data_refs (datarefs_vec);
3030       delete ddrs_table;
3031       return 0;
3032     }
3033 
3034   data_reference_p dref;
3035   for (i = 0; datarefs_vec.iterate (i, &dref); ++i)
3036     dref->aux = (void *) (uintptr_t) i;
3037 
3038   if (dump_file && (dump_flags & TDF_DETAILS))
3039     dump_rdg (dump_file, rdg);
3040 
3041   auto_vec<struct partition *, 3> partitions;
3042   rdg_build_partitions (rdg, stmts, &partitions);
3043 
3044   auto_vec<ddr_p> alias_ddrs;
3045 
3046   auto_bitmap stmt_in_all_partitions;
3047   bitmap_copy (stmt_in_all_partitions, partitions[0]->stmts);
3048   for (i = 1; partitions.iterate (i, &partition); ++i)
3049     bitmap_and_into (stmt_in_all_partitions, partitions[i]->stmts);
3050 
3051   bool any_builtin = false;
3052   bool reduction_in_all = false;
3053   FOR_EACH_VEC_ELT (partitions, i, partition)
3054     {
3055       reduction_in_all
3056 	|= classify_partition (loop, rdg, partition, stmt_in_all_partitions);
3057       any_builtin |= partition_builtin_p (partition);
3058     }
3059 
3060   /* If we are only distributing patterns but did not detect any,
3061      simply bail out.  */
3062   if (only_patterns_p
3063       && !any_builtin)
3064     {
3065       nbp = 0;
3066       goto ldist_done;
3067     }
3068 
3069   /* If we are only distributing patterns fuse all partitions that
3070      were not classified as builtins.  This also avoids chopping
3071      a loop into pieces, separated by builtin calls.  That is, we
3072      only want no or a single loop body remaining.  */
3073   struct partition *into;
3074   if (only_patterns_p)
3075     {
3076       for (i = 0; partitions.iterate (i, &into); ++i)
3077 	if (!partition_builtin_p (into))
3078 	  break;
3079       for (++i; partitions.iterate (i, &partition); ++i)
3080 	if (!partition_builtin_p (partition))
3081 	  {
3082 	    partition_merge_into (NULL, into, partition, FUSE_NON_BUILTIN);
3083 	    partitions.unordered_remove (i);
3084 	    partition_free (partition);
3085 	    i--;
3086 	  }
3087     }
3088 
3089   /* Due to limitations in the transform phase we have to fuse all
3090      reduction partitions into the last partition so the existing
3091      loop will contain all loop-closed PHI nodes.  */
3092   for (i = 0; partitions.iterate (i, &into); ++i)
3093     if (partition_reduction_p (into))
3094       break;
3095   for (i = i + 1; partitions.iterate (i, &partition); ++i)
3096     if (partition_reduction_p (partition))
3097       {
3098 	partition_merge_into (rdg, into, partition, FUSE_REDUCTION);
3099 	partitions.unordered_remove (i);
3100 	partition_free (partition);
3101 	i--;
3102       }
3103 
3104   /* Apply our simple cost model - fuse partitions with similar
3105      memory accesses.  */
3106   for (i = 0; partitions.iterate (i, &into); ++i)
3107     {
3108       bool changed = false;
3109       if (partition_builtin_p (into) || into->kind == PKIND_PARTIAL_MEMSET)
3110 	continue;
3111       for (int j = i + 1;
3112 	   partitions.iterate (j, &partition); ++j)
3113 	{
3114 	  if (share_memory_accesses (rdg, into, partition))
3115 	    {
3116 	      partition_merge_into (rdg, into, partition, FUSE_SHARE_REF);
3117 	      partitions.unordered_remove (j);
3118 	      partition_free (partition);
3119 	      j--;
3120 	      changed = true;
3121 	    }
3122 	}
3123       /* If in this step partitions 0 1 2 became 0,2 1 because 0 and 2
3124          have similar accesses, while 1 and 2 also have similar accesses
3125 	 but 0 and 1 do not, the next iteration would fail to consider
3126 	 merging 1 into 0,2.  So try again if we did any merging into 0.  */
3127       if (changed)
3128 	i--;
3129     }
3130 
3131   /* Put a non-builtin partition last if we need to preserve a reduction.
3132      ???  This is a workaround that makes sort_partitions_by_post_order do
3133      the correct thing while in reality it should sort each component
3134      separately and then put the component with a reduction or a non-builtin
3135      last.  */
3136   if (reduction_in_all
3137       && partition_builtin_p (partitions.last()))
3138     FOR_EACH_VEC_ELT (partitions, i, partition)
3139       if (!partition_builtin_p (partition))
3140 	{
3141 	  partitions.unordered_remove (i);
3142 	  partitions.quick_push (partition);
3143 	  break;
3144 	}
3145 
3146   /* Build the partition dependency graph and fuse partitions in each
3147      strongly connected component.  */
3148   if (partitions.length () > 1)
3149     {
3150       /* Don't support loop nest distribution under runtime alias checks
3151 	 since they are unlikely to enable many vectorization opportunities.
3152 	 Likewise if the loop has a data reference that may not be addressable,
3153 	 since the alias check needs to take and compare the object's address.  */
3154       if (loop->inner || has_nonaddressable_dataref_p)
3155 	merge_dep_scc_partitions (rdg, &partitions, false);
3156       else
3157 	{
3158 	  merge_dep_scc_partitions (rdg, &partitions, true);
3159 	  if (partitions.length () > 1)
3160 	    break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
3161 	}
3162     }
3163 
3164   finalize_partitions (loop, &partitions, &alias_ddrs);
3165 
3166   /* If there is a reduction in all partitions make sure the last one
3167      is not classified for builtin code generation.  */
3168   if (reduction_in_all)
3169     {
3170       partition = partitions.last ();
3171       if (only_patterns_p
3172 	  && partition_builtin_p (partition)
3173 	  && !partition_builtin_p (partitions[0]))
3174 	{
3175 	  nbp = 0;
3176 	  goto ldist_done;
3177 	}
3178       partition->kind = PKIND_NORMAL;
3179     }
3180 
3181   nbp = partitions.length ();
3182   if (nbp == 0
3183       || (nbp == 1 && !partition_builtin_p (partitions[0]))
3184       || (nbp > 1 && partition_contains_all_rw (rdg, partitions)))
3185     {
3186       nbp = 0;
3187       goto ldist_done;
3188     }
3189 
3190   if (version_for_distribution_p (&partitions, &alias_ddrs))
3191     version_loop_by_alias_check (&partitions, loop, &alias_ddrs);
3192 
3193   if (dump_file && (dump_flags & TDF_DETAILS))
3194     {
3195       fprintf (dump_file,
3196 	       "distribute loop <%d> into partitions:\n", loop->num);
3197       dump_rdg_partitions (dump_file, partitions);
3198     }
3199 
3200   FOR_EACH_VEC_ELT (partitions, i, partition)
3201     {
3202       if (partition_builtin_p (partition))
3203 	(*nb_calls)++;
3204       *destroy_p |= generate_code_for_partition (loop, partition, i < nbp - 1);
3205     }
3206 
3207  ldist_done:
3208   loop_nest.release ();
3209   free_data_refs (datarefs_vec);
3210   for (hash_table<ddr_hasher>::iterator iter = ddrs_table->begin ();
3211        iter != ddrs_table->end (); ++iter)
3212     {
3213       free_dependence_relation (*iter);
3214       *iter = NULL;
3215     }
3216   delete ddrs_table;
3217 
3218   FOR_EACH_VEC_ELT (partitions, i, partition)
3219     partition_free (partition);
3220 
3221   free_rdg (rdg);
3222   return nbp - *nb_calls;
3223 }
3224 
3225 
3226 void loop_distribution::bb_top_order_init (void)
3227 {
3228   int rpo_num;
3229   int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (cfun) - NUM_FIXED_BLOCKS);
3230   edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (cfun));
3231   bitmap exit_bbs = BITMAP_ALLOC (NULL);
3232 
3233   bb_top_order_index = XNEWVEC (int, last_basic_block_for_fn (cfun));
3234   bb_top_order_index_size = last_basic_block_for_fn (cfun);
3235 
3236   entry->flags &= ~EDGE_DFS_BACK;
3237   bitmap_set_bit (exit_bbs, EXIT_BLOCK);
3238   rpo_num = rev_post_order_and_mark_dfs_back_seme (cfun, entry, exit_bbs, true,
3239 						   rpo, NULL);
3240   BITMAP_FREE (exit_bbs);
3241 
3242   for (int i = 0; i < rpo_num; i++)
3243     bb_top_order_index[rpo[i]] = i;
3244 
3245   free (rpo);
3246 }
3247 
3248 void loop_distribution::bb_top_order_destroy ()
3249 {
3250   free (bb_top_order_index);
3251   bb_top_order_index = NULL;
3252   bb_top_order_index_size = 0;
3253 }
3254 
3255 
3256 /* Given LOOP, this function records seed statements for distribution in
3257    WORK_LIST.  Return false if there is nothing for distribution.  */
3258 
3259 static bool
3260 find_seed_stmts_for_distribution (class loop *loop, vec<gimple *> *work_list)
3261 {
3262   basic_block *bbs = get_loop_body_in_dom_order (loop);
3263 
3264   /* Initialize the worklist with stmts we seed the partitions with.  */
3265   for (unsigned i = 0; i < loop->num_nodes; ++i)
3266     {
3267       /* In irreducible sub-regions we don't know how to redirect
3268 	 conditions, so fail.  See PR100492.  */
3269       if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
3270 	{
3271 	  if (dump_file && (dump_flags & TDF_DETAILS))
3272 	    fprintf (dump_file, "loop %d contains an irreducible region.\n",
3273 		     loop->num);
3274 	  work_list->truncate (0);
3275 	  break;
3276 	}
3277       for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
3278 	   !gsi_end_p (gsi); gsi_next (&gsi))
3279 	{
3280 	  gphi *phi = gsi.phi ();
3281 	  if (virtual_operand_p (gimple_phi_result (phi)))
3282 	    continue;
3283 	  /* Distribute stmts which have defs that are used outside of
3284 	     the loop.  */
3285 	  if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
3286 	    continue;
3287 	  work_list->safe_push (phi);
3288 	}
3289       for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
3290 	   !gsi_end_p (gsi); gsi_next (&gsi))
3291 	{
3292 	  gimple *stmt = gsi_stmt (gsi);
3293 
3294 	  /* Ignore clobbers, they do not have true side effects.  */
3295 	  if (gimple_clobber_p (stmt))
3296 	    continue;
3297 
3298 	  /* If there is a stmt with side-effects bail out - we
3299 	     cannot and should not distribute this loop.  */
3300 	  if (gimple_has_side_effects (stmt))
3301 	    {
3302 	      free (bbs);
3303 	      return false;
3304 	    }
3305 
3306 	  /* Distribute stmts which have defs that are used outside of
3307 	     the loop.  */
3308 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3309 	    ;
3310 	  /* Otherwise only distribute stores for now.  */
3311 	  else if (!gimple_vdef (stmt))
3312 	    continue;
3313 
3314 	  work_list->safe_push (stmt);
3315 	}
3316     }
3317   bool res = work_list->length () > 0;
3318   if (res && !can_copy_bbs_p (bbs, loop->num_nodes))
3319     {
3320       if (dump_file && (dump_flags & TDF_DETAILS))
3321 	fprintf (dump_file, "cannot copy loop %d.\n", loop->num);
3322       res = false;
3323     }
3324   free (bbs);
3325   return res;
3326 }
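
/* As an illustration of the seeding rules above (a hypothetical example,
   not taken from the sources): in

     int foo (int *a, int *b, int n)
     {
       int sum = 0;
       for (int i = 0; i < n; ++i)
	 {
	   a[i] = b[i] + 1;	   <- stores to memory: seed statement
	   sum += b[i];		   <- def used after the loop: seed statement
	 }
       return sum;
     }

   both the store to a[i] and the statement producing the final value of SUM
   end up in WORK_LIST, while the address computations feeding them are only
   pulled into partitions later through the RDG dependences.  */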
3327 
3328 /* A helper function for generate_{rawmemchr,strlen}_builtin functions in order
3329    to place new statements SEQ before LOOP and replace the old reduction
3330    variable with the new one.  */
3331 
3332 static void
3333 generate_reduction_builtin_1 (loop_p loop, gimple_seq &seq,
3334 			      tree reduction_var_old, tree reduction_var_new,
3335 			      const char *info, machine_mode load_mode)
3336 {
3337   gcc_assert (flag_tree_loop_distribute_patterns);
3338 
3339   /* Place new statements before LOOP.  */
3340   gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
3341   gsi_insert_seq_after (&gsi, seq, GSI_CONTINUE_LINKING);
3342 
3343   /* Replace old reduction variable with new one.  */
3344   imm_use_iterator iter;
3345   gimple *stmt;
3346   use_operand_p use_p;
3347   FOR_EACH_IMM_USE_STMT (stmt, iter, reduction_var_old)
3348     {
3349       FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
3350 	SET_USE (use_p, reduction_var_new);
3351 
3352       update_stmt (stmt);
3353     }
3354 
3355   if (dump_file && (dump_flags & TDF_DETAILS))
3356     fprintf (dump_file, info, GET_MODE_NAME (load_mode));
3357 }
3358 
3359 /* Generate a call to rawmemchr and place it before LOOP.  REDUCTION_VAR is
3360    replaced with a fresh SSA name representing the result of the call.  */
3361 
3362 static void
3363 generate_rawmemchr_builtin (loop_p loop, tree reduction_var,
3364 			    data_reference_p store_dr, tree base, tree pattern,
3365 			    location_t loc)
3366 {
3367   gimple_seq seq = NULL;
3368 
3369   tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3370   gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, mem, pattern);
3371   tree reduction_var_new = copy_ssa_name (reduction_var);
3372   gimple_call_set_lhs (fn_call, reduction_var_new);
3373   gimple_set_location (fn_call, loc);
3374   gimple_seq_add_stmt (&seq, fn_call);
3375 
3376   if (store_dr)
3377     {
3378       gassign *g = gimple_build_assign (DR_REF (store_dr), reduction_var_new);
3379       gimple_seq_add_stmt (&seq, g);
3380     }
3381 
3382   generate_reduction_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3383 				"generated rawmemchr%s\n",
3384 				TYPE_MODE (TREE_TYPE (TREE_TYPE (base))));
3385 }
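
/* A minimal sketch of the sequence built above (SSA names are illustrative,
   not actual compiler output), for a rawmemchr loop over a char buffer with
   pattern 42 and no store:

     mem_1 = &buf[0];			   gimplified BASE
     res_2 = .RAWMEMCHR (mem_1, 42);	   IFN_RAWMEMCHR, lhs is the fresh SSA copy

   The sequence is inserted at the end of the loop preheader, after which
   every use of the old reduction variable is rewritten to res_2 by
   generate_reduction_builtin_1.  */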
3386 
3387 /* Helper function for generate_strlen_builtin(,_using_rawmemchr)  */
3388 
3389 static void
3390 generate_strlen_builtin_1 (loop_p loop, gimple_seq &seq,
3391 			   tree reduction_var_old, tree reduction_var_new,
3392 			   machine_mode mode, tree start_len)
3393 {
3394   /* REDUCTION_VAR_NEW has either size type or ptrdiff type and must be
3395      converted if the types of the old and new reduction variables are not compatible.  */
3396   reduction_var_new = gimple_convert (&seq, TREE_TYPE (reduction_var_old),
3397 				      reduction_var_new);
3398 
3399   /* Loops of the form `for (i=42; s[i]; ++i);` have an additional start
3400      length.  */
3401   if (!integer_zerop (start_len))
3402     {
3403       tree lhs = make_ssa_name (TREE_TYPE (reduction_var_new));
3404       gimple *g = gimple_build_assign (lhs, PLUS_EXPR, reduction_var_new,
3405 				       start_len);
3406       gimple_seq_add_stmt (&seq, g);
3407       reduction_var_new = lhs;
3408     }
3409 
3410   generate_reduction_builtin_1 (loop, seq, reduction_var_old, reduction_var_new,
3411 				"generated strlen%s\n", mode);
3412 }
3413 
3414 /* Generate a call to strlen and place it before LOOP.  REDUCTION_VAR is
3415    replaced with a fresh SSA name representing the result of the call.  */
3416 
3417 static void
3418 generate_strlen_builtin (loop_p loop, tree reduction_var, tree base,
3419 			 tree start_len, location_t loc)
3420 {
3421   gimple_seq seq = NULL;
3422 
3423   tree reduction_var_new = make_ssa_name (size_type_node);
3424 
3425   tree mem = force_gimple_operand (base, &seq, true, NULL_TREE);
3426   tree fn = build_fold_addr_expr (builtin_decl_implicit (BUILT_IN_STRLEN));
3427   gimple *fn_call = gimple_build_call (fn, 1, mem);
3428   gimple_call_set_lhs (fn_call, reduction_var_new);
3429   gimple_set_location (fn_call, loc);
3430   gimple_seq_add_stmt (&seq, fn_call);
3431 
3432   generate_strlen_builtin_1 (loop, seq, reduction_var, reduction_var_new,
3433 			     QImode, start_len);
3434 }
3435 
3436 /* Generate code that mimics the behaviour of strlen, but this time over
3437    an array of elements with a mode different from QI.  REDUCTION_VAR is replaced
3438    with a fresh SSA name representing the result, i.e., the length.  */
3439 
3440 static void
3441 generate_strlen_builtin_using_rawmemchr (loop_p loop, tree reduction_var,
3442 					 tree base, tree load_type,
3443 					 tree start_len, location_t loc)
3444 {
3445   gimple_seq seq = NULL;
3446 
3447   tree start = force_gimple_operand (base, &seq, true, NULL_TREE);
3448   tree zero = build_zero_cst (load_type);
3449   gimple *fn_call = gimple_build_call_internal (IFN_RAWMEMCHR, 2, start, zero);
3450   tree end = make_ssa_name (TREE_TYPE (base));
3451   gimple_call_set_lhs (fn_call, end);
3452   gimple_set_location (fn_call, loc);
3453   gimple_seq_add_stmt (&seq, fn_call);
3454 
3455   /* Determine the number of elements between START and END by
3456      evaluating (END - START) / sizeof (*START).  */
3457   tree diff = make_ssa_name (ptrdiff_type_node);
3458   gimple *diff_stmt = gimple_build_assign (diff, POINTER_DIFF_EXPR, end, start);
3459   gimple_seq_add_stmt (&seq, diff_stmt);
3460   /* Let SIZE be the size of each character.  */
3461   tree size = gimple_convert (&seq, ptrdiff_type_node,
3462 			      TYPE_SIZE_UNIT (load_type));
3463   tree count = make_ssa_name (ptrdiff_type_node);
3464   gimple *count_stmt = gimple_build_assign (count, TRUNC_DIV_EXPR, diff, size);
3465   gimple_seq_add_stmt (&seq, count_stmt);
3466 
3467   generate_strlen_builtin_1 (loop, seq, reduction_var, count,
3468 			     TYPE_MODE (load_type),
3469 			     start_len);
3470 }
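
/* For illustration (a hypothetical example over 2-byte elements, so
   TYPE_SIZE_UNIT (load_type) == 2): a loop of the form

     for (i = 0; s[i] != 0; ++i);

   over a uint16_t array S is replaced by code along the lines of

     start = &s[0];
     end = .RAWMEMCHR (start, 0);	   find the first zero element
     diff = end - start;		   POINTER_DIFF_EXPR, in bytes
     count = diff / 2;			   divide by the element size
     i = (int) count;			   conversion done in generate_strlen_builtin_1

   with any non-zero start length added by generate_strlen_builtin_1.  */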
3471 
3472 /* Return true if we can count at least as many characters by taking pointer
3473    difference as we can count via reduction_var without an overflow.  Thus
3474    compute 2^n < (2^(m-1) / s) where n = TYPE_PRECISION (reduction_var_type),
3475    m = TYPE_PRECISION (ptrdiff_type_node), and s = size of each character.  */
3476 static bool
3477 reduction_var_overflows_first (tree reduction_var_type, tree load_type)
3478 {
3479   widest_int n2 = wi::lshift (1, TYPE_PRECISION (reduction_var_type));
3480   widest_int m2 = wi::lshift (1, TYPE_PRECISION (ptrdiff_type_node) - 1);
3481   widest_int s = wi::to_widest (TYPE_SIZE_UNIT (load_type));
3482   return wi::ltu_p (n2, wi::udiv_trunc (m2, s));
3483 }
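
/* A worked instance of the check above (assumed type widths): with a 32-bit
   unsigned reduction variable (n = 32), a 64-bit ptrdiff_type_node (m = 64)
   and 2-byte characters (s = 2) it evaluates

     2^32 < 2^63 / 2 = 2^62

   which holds, so the pointer difference can represent at least as many
   characters as the reduction variable, and the rawmemchr-based strlen
   cannot overflow before the original loop counter would.  With a 64-bit
   reduction variable (n = 64) the test 2^64 < 2^62 fails, so this criterion
   alone does not justify the transform.  */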
3484 
3485 static gimple *
3486 determine_reduction_stmt_1 (const loop_p loop, const basic_block *bbs)
3487 {
3488   gimple *reduction_stmt = NULL;
3489 
3490   for (unsigned i = 0, ninsns = 0; i < loop->num_nodes; ++i)
3491     {
3492       basic_block bb = bbs[i];
3493 
3494       for (gphi_iterator bsi = gsi_start_phis (bb); !gsi_end_p (bsi);
3495 	   gsi_next_nondebug (&bsi))
3496 	{
3497 	  gphi *phi = bsi.phi ();
3498 	  if (virtual_operand_p (gimple_phi_result (phi)))
3499 	    continue;
3500 	  if (stmt_has_scalar_dependences_outside_loop (loop, phi))
3501 	    {
3502 	      if (reduction_stmt)
3503 		return NULL;
3504 	      reduction_stmt = phi;
3505 	    }
3506 	}
3507 
3508       for (gimple_stmt_iterator bsi = gsi_start_bb (bb); !gsi_end_p (bsi);
3509 	   gsi_next_nondebug (&bsi), ++ninsns)
3510 	{
3511 	  /* Bail out early for loops which are unlikely to match.  */
3512 	  if (ninsns > 16)
3513 	    return NULL;
3514 	  gimple *stmt = gsi_stmt (bsi);
3515 	  if (gimple_clobber_p (stmt))
3516 	    continue;
3517 	  if (gimple_code (stmt) == GIMPLE_LABEL)
3518 	    continue;
3519 	  if (gimple_has_volatile_ops (stmt))
3520 	    return NULL;
3521 	  if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
3522 	    {
3523 	      if (reduction_stmt)
3524 		return NULL;
3525 	      reduction_stmt = stmt;
3526 	    }
3527 	}
3528     }
3529 
3530   return reduction_stmt;
3531 }
3532 
3533 /* If LOOP has a single non-volatile reduction statement, then return a pointer
3534    to it.  Otherwise return NULL.  */
3535 static gimple *
3536 determine_reduction_stmt (const loop_p loop)
3537 {
3538   basic_block *bbs = get_loop_body (loop);
3539   gimple *reduction_stmt = determine_reduction_stmt_1 (loop, bbs);
3540   XDELETEVEC (bbs);
3541   return reduction_stmt;
3542 }
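
/* For example (an assumed case): in

     int i;
     for (i = 0; s[i]; ++i);

   the statement producing the final value of I is the only one whose result
   is used after the loop, so it is returned as the reduction statement.
   Loops with more than one such live-out scalar, with volatile accesses, or
   with roughly more than 16 statements (the ninsns check above) yield NULL
   instead.  */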
3543 
3544 /* Transform loops which mimic the effects of builtins rawmemchr or strlen and
3545    replace them accordingly.  For example, a loop of the form
3546 
3547      for (; *p != 42; ++p);
3548 
3549    is replaced by
3550 
3551      p = rawmemchr<MODE> (p, 42);
3552 
3553    under the assumption that rawmemchr is available for a particular MODE.
3554    Another example is
3555 
3556      int i;
3557      for (i = 42; s[i]; ++i);
3558 
3559    which is replaced by
3560 
3561      i = (int)strlen (&s[42]) + 42;
3562 
3563    for some character array S.  In case array S is not of type character array
3564    we end up with
3565 
3566      i = (int)(rawmemchr<MODE> (&s[42], 0) - &s[42]) + 42;
3567 
3568    assuming that rawmemchr is available for a particular MODE.  */
3569 
3570 bool
3571 loop_distribution::transform_reduction_loop (loop_p loop)
3572 {
3573   gimple *reduction_stmt;
3574   data_reference_p load_dr = NULL, store_dr = NULL;
3575 
3576   edge e = single_exit (loop);
3577   gcond *cond = safe_dyn_cast <gcond *> (last_stmt (e->src));
3578   if (!cond)
3579     return false;
3580   /* Ensure loop condition is an (in)equality test and loop is exited either if
3581      the inequality test fails or the equality test succeeds.  */
3582   if (!(e->flags & EDGE_FALSE_VALUE && gimple_cond_code (cond) == NE_EXPR)
3583       && !(e->flags & EDGE_TRUE_VALUE && gimple_cond_code (cond) == EQ_EXPR))
3584     return false;
3585   /* A limitation of the current implementation is that we only support
3586      constant patterns in (in)equality tests.  */
3587   tree pattern = gimple_cond_rhs (cond);
3588   if (TREE_CODE (pattern) != INTEGER_CST)
3589     return false;
3590 
3591   reduction_stmt = determine_reduction_stmt (loop);
3592 
3593   /* A limitation of the current implementation is that we require a reduction
3594      statement.  Therefore, loops without a reduction statement as in the
3595      following are not recognized:
3596      int *p;
3597      void foo (void) { for (; *p; ++p); } */
3598   if (reduction_stmt == NULL)
3599     return false;
3600 
3601   /* Reduction variables are guaranteed to be SSA names.  */
3602   tree reduction_var;
3603   switch (gimple_code (reduction_stmt))
3604     {
3605     case GIMPLE_ASSIGN:
3606     case GIMPLE_PHI:
3607       reduction_var = gimple_get_lhs (reduction_stmt);
3608       break;
3609     default:
3610       /* Bail out e.g. for GIMPLE_CALL.  */
3611       return false;
3612     }
3613 
3614   struct graph *rdg = build_rdg (loop, NULL);
3615   if (rdg == NULL)
3616     {
3617       if (dump_file && (dump_flags & TDF_DETAILS))
3618 	fprintf (dump_file,
3619 		 "Loop %d not transformed: failed to build the RDG.\n",
3620 		 loop->num);
3621 
3622       return false;
3623     }
3624   auto_bitmap partition_stmts;
3625   bitmap_set_range (partition_stmts, 0, rdg->n_vertices);
3626   find_single_drs (loop, rdg, partition_stmts, &store_dr, &load_dr);
3627   free_rdg (rdg);
3628 
3629   /* Bail out if there is no single load.  */
3630   if (load_dr == NULL)
3631     return false;
3632 
3633   /* Reaching this point we have a loop with a single reduction variable,
3634      a single load, and an optional single store.  */
3635 
3636   tree load_ref = DR_REF (load_dr);
3637   tree load_type = TREE_TYPE (load_ref);
3638   tree load_access_base = build_fold_addr_expr (load_ref);
3639   tree load_access_size = TYPE_SIZE_UNIT (load_type);
3640   affine_iv load_iv, reduction_iv;
3641 
3642   if (!INTEGRAL_TYPE_P (load_type)
3643       || !type_has_mode_precision_p (load_type))
3644     return false;
3645 
3646   /* We already ensured that the loop condition tests for (in)equality where the
3647      rhs is a constant pattern. Now ensure that the lhs is the result of the
3648      load.  */
3649   if (gimple_cond_lhs (cond) != gimple_assign_lhs (DR_STMT (load_dr)))
3650     return false;
3651 
3652   /* Bail out if no affine induction variable with constant step can be
3653      determined.  */
3654   if (!simple_iv (loop, loop, load_access_base, &load_iv, false))
3655     return false;
3656 
3657   /* Bail out if memory accesses are not consecutive or not growing.  */
3658   if (!operand_equal_p (load_iv.step, load_access_size, 0))
3659     return false;
3660 
3661   if (!simple_iv (loop, loop, reduction_var, &reduction_iv, false))
3662     return false;
3663 
3664   /* Handle rawmemchr like loops.  */
3665   if (operand_equal_p (load_iv.base, reduction_iv.base)
3666       && operand_equal_p (load_iv.step, reduction_iv.step))
3667     {
3668       if (store_dr)
3669 	{
3670 	  /* Ensure that we store to X and load from X+I where I>0.  */
3671 	  if (TREE_CODE (load_iv.base) != POINTER_PLUS_EXPR
3672 	      || !integer_onep (TREE_OPERAND (load_iv.base, 1)))
3673 	    return false;
3674 	  tree ptr_base = TREE_OPERAND (load_iv.base, 0);
3675 	  if (TREE_CODE (ptr_base) != SSA_NAME)
3676 	    return false;
3677 	  gimple *def = SSA_NAME_DEF_STMT (ptr_base);
3678 	  if (!gimple_assign_single_p (def)
3679 	      || gimple_assign_rhs1 (def) != DR_REF (store_dr))
3680 	    return false;
3681 	  /* Ensure that the reduction value is stored.  */
3682 	  if (gimple_assign_rhs1 (DR_STMT (store_dr)) != reduction_var)
3683 	    return false;
3684 	}
3685       /* Bail out if target does not provide rawmemchr for a certain mode.  */
3686       machine_mode mode = TYPE_MODE (load_type);
3687       if (direct_optab_handler (rawmemchr_optab, mode) == CODE_FOR_nothing)
3688 	return false;
3689       location_t loc = gimple_location (DR_STMT (load_dr));
3690       generate_rawmemchr_builtin (loop, reduction_var, store_dr, load_iv.base,
3691 				  pattern, loc);
3692       return true;
3693     }
3694 
3695   /* Handle strlen like loops.  */
3696   if (store_dr == NULL
3697       && integer_zerop (pattern)
3698       && INTEGRAL_TYPE_P (TREE_TYPE (reduction_var))
3699       && TREE_CODE (reduction_iv.base) == INTEGER_CST
3700       && TREE_CODE (reduction_iv.step) == INTEGER_CST
3701       && integer_onep (reduction_iv.step))
3702     {
3703       location_t loc = gimple_location (DR_STMT (load_dr));
3704       tree reduction_var_type = TREE_TYPE (reduction_var);
3705       /* While determining the length of a string an overflow might occur.
3706 	 If an overflow only occurs in the loop implementation and not in the
3707 	 strlen implementation, then either the overflow is undefined or the
3708 	 truncated result of strlen equals the one of the loop.  Otherwise if
3709 	 truncated result of strlen equals that of the loop.  Otherwise if
3710 	 replacing a loop by a call to strlen is sound whenever we ensure that
3711 	 if an overflow occurs in the strlen implementation, then also an
3712 	 overflow occurs in the loop implementation which is undefined.  It
3713 	 seems reasonable to relax this and assume that the strlen
3714 	 implementation cannot overflow in case sizetype is big enough in the
3715 	 sense that an overflow can only happen for string objects which are
3716 	 bigger than half of the address space; at least for 32-bit targets and
3717 	 up.
3718 
3719 	 For strlen which makes use of rawmemchr the maximal length of a string
3720 	 which can be determined without an overflow is PTRDIFF_MAX / S where
3721 	 each character has size S.  Since an overflow for ptrdiff type is
3722 	 undefined we have to make sure that if an overflow occurs, then an
3723 	 overflow occurs in the loop implementation, too, and this is
3724 	 undefined, too.  As before, we relax this and assume that no
3725 	 string object is larger than half of the address space; at least for
3726 	 32-bit targets and up.  */
3727       if (TYPE_MODE (load_type) == TYPE_MODE (char_type_node)
3728 	  && TYPE_PRECISION (load_type) == TYPE_PRECISION (char_type_node)
3729 	  && ((TYPE_PRECISION (sizetype) >= TYPE_PRECISION (ptr_type_node) - 1
3730 	       && TYPE_PRECISION (ptr_type_node) >= 32)
3731 	      || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
3732 		  && TYPE_PRECISION (reduction_var_type) <= TYPE_PRECISION (sizetype)))
3733 	  && builtin_decl_implicit (BUILT_IN_STRLEN))
3734 	generate_strlen_builtin (loop, reduction_var, load_iv.base,
3735 				 reduction_iv.base, loc);
3736       else if (direct_optab_handler (rawmemchr_optab, TYPE_MODE (load_type))
3737 	       != CODE_FOR_nothing
3738 	       && ((TYPE_PRECISION (ptrdiff_type_node) == TYPE_PRECISION (ptr_type_node)
3739 		    && TYPE_PRECISION (ptrdiff_type_node) >= 32)
3740 		   || (TYPE_OVERFLOW_UNDEFINED (reduction_var_type)
3741 		       && reduction_var_overflows_first (reduction_var_type, load_type))))
3742 	generate_strlen_builtin_using_rawmemchr (loop, reduction_var,
3743 						 load_iv.base,
3744 						 load_type,
3745 						 reduction_iv.base, loc);
3746       else
3747 	return false;
3748       return true;
3749     }
3750 
3751   return false;
3752 }
3753 
3754 /* Given innermost LOOP, return the outermost enclosing loop that forms a
3755    perfect loop nest.  */
3756 
3757 static class loop *
3758 prepare_perfect_loop_nest (class loop *loop)
3759 {
3760   class loop *outer = loop_outer (loop);
3761   tree niters = number_of_latch_executions (loop);
3762 
3763   /* TODO: We only support the innermost 3-level loop nest distribution
3764      because of compilation-time issues for now.  This should be relaxed
3765      in the future.  Note we only allow 3-level loop nest distribution
3766      when parallelizing loops.  */
3767   while ((loop->inner == NULL
3768 	  || (loop->inner->inner == NULL && flag_tree_parallelize_loops > 1))
3769 	 && loop_outer (outer)
3770 	 && outer->inner == loop && loop->next == NULL
3771 	 && single_exit (outer)
3772 	 && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
3773 	 && (niters = number_of_latch_executions (outer)) != NULL_TREE
3774 	 && niters != chrec_dont_know)
3775     {
3776       loop = outer;
3777       outer = loop_outer (loop);
3778     }
3779 
3780   return loop;
3781 }
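
/* For illustration (assumed loop shapes): given

     for (i = 0; i < n; ++i)	   single exit, latch execution count known
       for (j = 0; j < m; ++j)
	 a[i][j] = 0;

   the innermost loop is extended to the outer one.  If the outer loop had
   more than one immediate inner loop, multiple exits, or an unknown number
   of latch executions, the innermost LOOP itself would be returned.  */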
3782 
3783 
3784 unsigned int
3785 loop_distribution::execute (function *fun)
3786 {
3787   bool changed = false;
3788   basic_block bb;
3789   control_dependences *cd = NULL;
3790   auto_vec<loop_p> loops_to_be_destroyed;
3791 
3792   if (number_of_loops (fun) <= 1)
3793     return 0;
3794 
3795   bb_top_order_init ();
3796 
3797   FOR_ALL_BB_FN (bb, fun)
3798     {
3799       gimple_stmt_iterator gsi;
3800       for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3801 	gimple_set_uid (gsi_stmt (gsi), -1);
3802       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3803 	gimple_set_uid (gsi_stmt (gsi), -1);
3804     }
3805 
3806   /* We can at the moment only distribute non-nested loops, thus restrict
3807      walking to innermost loops.  */
3808   for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
3809     {
3810       /* Don't distribute loops with multiple exit edges, or cold loops
3811          when not doing pattern detection.  */
3812       if (!single_exit (loop)
3813 	  || (!flag_tree_loop_distribute_patterns
3814 	      && !optimize_loop_for_speed_p (loop)))
3815 	continue;
3816 
3817       /* If niters is unknown don't distribute loop but rather try to transform
3818 	 it to a call to a builtin.  */
3819       tree niters = number_of_latch_executions (loop);
3820       if (niters == NULL_TREE || niters == chrec_dont_know)
3821 	{
3822 	  datarefs_vec.create (20);
3823 	  if (flag_tree_loop_distribute_patterns
3824 	      && transform_reduction_loop (loop))
3825 	    {
3826 	      changed = true;
3827 	      loops_to_be_destroyed.safe_push (loop);
3828 	      if (dump_enabled_p ())
3829 		{
3830 		  dump_user_location_t loc = find_loop_location (loop);
3831 		  dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3832 				   loc, "Loop %d transformed into a builtin.\n",
3833 				   loop->num);
3834 		}
3835 	    }
3836 	  free_data_refs (datarefs_vec);
3837 	  continue;
3838 	}
3839 
3840       /* Get the perfect loop nest for distribution.  */
3841       loop = prepare_perfect_loop_nest (loop);
3842       for (; loop; loop = loop->inner)
3843 	{
3844 	  auto_vec<gimple *> work_list;
3845 	  if (!find_seed_stmts_for_distribution (loop, &work_list))
3846 	    break;
3847 
3848 	  const char *str = loop->inner ? " nest" : "";
3849 	  dump_user_location_t loc = find_loop_location (loop);
3850 	  if (!cd)
3851 	    {
3852 	      calculate_dominance_info (CDI_DOMINATORS);
3853 	      calculate_dominance_info (CDI_POST_DOMINATORS);
3854 	      cd = new control_dependences ();
3855 	      free_dominance_info (CDI_POST_DOMINATORS);
3856 	    }
3857 
3858 	  bool destroy_p;
3859 	  int nb_generated_loops, nb_generated_calls;
3860 	  nb_generated_loops
3861 	    = distribute_loop (loop, work_list, cd, &nb_generated_calls,
3862 			       &destroy_p, (!optimize_loop_for_speed_p (loop)
3863 					    || !flag_tree_loop_distribution));
3864 	  if (destroy_p)
3865 	    loops_to_be_destroyed.safe_push (loop);
3866 
3867 	  if (nb_generated_loops + nb_generated_calls > 0)
3868 	    {
3869 	      changed = true;
3870 	      if (dump_enabled_p ())
3871 		dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
3872 				 loc, "Loop%s %d distributed: split to %d loops "
3873 				 "and %d library calls.\n", str, loop->num,
3874 				 nb_generated_loops, nb_generated_calls);
3875 
3876 	      break;
3877 	    }
3878 
3879 	  if (dump_file && (dump_flags & TDF_DETAILS))
3880 	    fprintf (dump_file, "Loop%s %d not distributed.\n", str, loop->num);
3881 	}
3882     }
3883 
3884   if (cd)
3885     delete cd;
3886 
3887   if (bb_top_order_index != NULL)
3888     bb_top_order_destroy ();
3889 
3890   if (changed)
3891     {
3892       /* Destroy loop bodies that could not be reused.  Do this late as we
3893 	 otherwise can end up referring to stale data in control dependences.  */
3894       unsigned i;
3895       class loop *loop;
3896       FOR_EACH_VEC_ELT (loops_to_be_destroyed, i, loop)
3897 	destroy_loop (loop);
3898 
3899       /* Cached scalar evolutions now may refer to wrong or non-existing
3900 	 loops.  */
3901       scev_reset ();
3902       mark_virtual_operands_for_renaming (fun);
3903       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3904     }
3905 
3906   checking_verify_loop_structure ();
3907 
3908   return changed ? TODO_cleanup_cfg : 0;
3909 }
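
/* As a usage note (an illustrative invocation, not taken from the sources):
   the effect of this pass can be observed with something like

     gcc -O2 -ftree-loop-distribution -ftree-loop-distribute-patterns \
	 -fdump-tree-ldist-details test.c

   where the "ldist" dump file then contains the "Loop %d distributed" and
   "Loop %d transformed into a builtin" messages emitted above.  */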
3910 
3911 
3912 /* Distribute all loops in the current function.  */
3913 
3914 namespace {
3915 
3916 const pass_data pass_data_loop_distribution =
3917 {
3918   GIMPLE_PASS, /* type */
3919   "ldist", /* name */
3920   OPTGROUP_LOOP, /* optinfo_flags */
3921   TV_TREE_LOOP_DISTRIBUTION, /* tv_id */
3922   ( PROP_cfg | PROP_ssa ), /* properties_required */
3923   0, /* properties_provided */
3924   0, /* properties_destroyed */
3925   0, /* todo_flags_start */
3926   0, /* todo_flags_finish */
3927 };
3928 
3929 class pass_loop_distribution : public gimple_opt_pass
3930 {
3931 public:
3932   pass_loop_distribution (gcc::context *ctxt)
3933     : gimple_opt_pass (pass_data_loop_distribution, ctxt)
3934   {}
3935 
3936   /* opt_pass methods: */
3937   virtual bool gate (function *)
3938     {
3939       return flag_tree_loop_distribution
3940 	|| flag_tree_loop_distribute_patterns;
3941     }
3942 
3943   virtual unsigned int execute (function *);
3944 
3945 }; // class pass_loop_distribution
3946 
3947 unsigned int
3948 pass_loop_distribution::execute (function *fun)
3949 {
3950   return loop_distribution ().execute (fun);
3951 }
3952 
3953 } // anon namespace
3954 
3955 gimple_opt_pass *
3956 make_pass_loop_distribution (gcc::context *ctxt)
3957 {
3958   return new pass_loop_distribution (ctxt);
3959 }
3960