1 /* Loop autoparallelization.
2    Copyright (C) 2006-2017 Free Software Foundation, Inc.
3    Contributed by Sebastian Pop <pop@cri.ensmp.fr>
4    Zdenek Dvorak <dvorakz@suse.cz> and Razya Ladelsky <razya@il.ibm.com>.
5 
6 This file is part of GCC.
7 
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12 
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "cgraph.h"
32 #include "gimple-pretty-print.h"
33 #include "fold-const.h"
34 #include "gimplify.h"
35 #include "gimple-iterator.h"
36 #include "gimplify-me.h"
37 #include "gimple-walk.h"
38 #include "stor-layout.h"
39 #include "tree-nested.h"
40 #include "tree-cfg.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "tree-into-ssa.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "langhooks.h"
49 #include "tree-vectorizer.h"
50 #include "tree-hasher.h"
51 #include "tree-parloops.h"
52 #include "omp-general.h"
53 #include "omp-low.h"
54 #include "tree-ssa.h"
55 #include "params.h"
56 #include "params-enum.h"
57 #include "tree-ssa-alias.h"
58 #include "tree-eh.h"
59 #include "gomp-constants.h"
60 #include "tree-dfa.h"
61 
62 /* This pass tries to distribute iterations of loops into several threads.
63    The implementation is straightforward -- for each loop we test whether its
64    iterations are independent, and if that is the case (and some additional
65    conditions regarding profitability and correctness are satisfied), we
66    add GIMPLE_OMP_PARALLEL and GIMPLE_OMP_FOR codes and let the omp expansion
67    machinery do its job.
68 
69    Most of the complexity is in bringing the code into the shape expected
70    by the omp expanders:
71    -- for GIMPLE_OMP_FOR, ensuring that the loop has only one induction
72       variable and that the exit test is at the start of the loop body
73    -- for GIMPLE_OMP_PARALLEL, replacing the references to local addressable
74       variables by accesses through pointers, and breaking up ssa chains
75       by storing the values incoming to the parallelized loop to a structure
76       passed to the new function as an argument (something similar is done
77       in omp gimplification, unfortunately only a small part of the code
78       can be shared).
79 
80    TODO:
81    -- if there are several parallelizable loops in a function, it may be
82       possible to generate the threads just once (using synchronization to
83       ensure that cross-loop dependences are obeyed).
84    -- handling of common reduction patterns for outer loops.
85 
86    More info can also be found at http://gcc.gnu.org/wiki/AutoParInGCC  */
87 /*
88   Reduction handling:
89   currently we use vect_force_simple_reduction() to detect reduction patterns.
90   The code transformation is illustrated below with an example.
91 
92 
93 parloop
94 {
95   int sum=1;
96 
97   for (i = 0; i < N; i++)
98    {
99     x[i] = i + 3;
100     sum+=x[i];
101    }
102 }
103 
104 gimple-like code:
105 header_bb:
106 
107   # sum_29 = PHI <sum_11(5), 1(3)>
108   # i_28 = PHI <i_12(5), 0(3)>
109   D.1795_8 = i_28 + 3;
110   x[i_28] = D.1795_8;
111   sum_11 = D.1795_8 + sum_29;
112   i_12 = i_28 + 1;
113   if (N_6(D) > i_12)
114     goto header_bb;
115 
116 
117 exit_bb:
118 
119   # sum_21 = PHI <sum_11(4)>
120   printf (&"%d"[0], sum_21);
121 
122 
123 after reduction transformation (only relevant parts):
124 
125 parloop
126 {
127 
128 ....
129 
130 
131   # Storing the initial value given by the user.  #
132 
133   .paral_data_store.32.sum.27 = 1;
134 
135   #pragma omp parallel num_threads(4)
136 
137   #pragma omp for schedule(static)
138 
139   # The neutral element corresponding to the particular
140   reduction's operation, e.g. 0 for PLUS_EXPR,
141   1 for MULT_EXPR, etc., replaces the user's initial value.  #
142 
143   # sum.27_29 = PHI <sum.27_11, 0>
144 
145   sum.27_11 = D.1827_8 + sum.27_29;
146 
147   GIMPLE_OMP_CONTINUE
148 
149   # Adding this reduction phi is done at create_phi_for_local_result() #
150   # sum.27_56 = PHI <sum.27_11, 0>
151   GIMPLE_OMP_RETURN
152 
153   # Creating the atomic operation is done at
154   create_call_for_reduction_1()  #
155 
156   #pragma omp atomic_load
157   D.1839_59 = *&.paral_data_load.33_51->reduction.23;
158   D.1840_60 = sum.27_56 + D.1839_59;
159   #pragma omp atomic_store (D.1840_60);
160 
161   GIMPLE_OMP_RETURN
162 
163  # Collecting the result after the join of the threads is done at
164   create_loads_for_reductions().
165   The value computed by the threads is loaded from the
166   shared struct.  #
167 
168 
169   .paral_data_load.33_52 = &.paral_data_store.32;
170   sum_37 =  .paral_data_load.33_52->sum.27;
171   sum_43 = D.1795_41 + sum_37;
172 
173   exit bb:
174   # sum_21 = PHI <sum_43, sum_26>
175   printf (&"%d"[0], sum_21);
176 
177 ...
178 
179 }
180 
181 */
182 
183 /* Minimum number of iterations of a loop that should be executed by each
184    thread.  */
185 #define MIN_PER_THREAD 100
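
/* As a hypothetical worked example (the pass derives the actual thread
   count elsewhere, from the iteration count and the number of available
   threads): with MIN_PER_THREAD at 100, a loop with 250 iterations would
   be run by at most 250 / 100 = 2 threads, and a loop with fewer than
   2 * 100 iterations would not be parallelized at all, since a second
   thread could not be given its minimal share of 100 iterations.  */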
186 
187 /* Element of the hashtable, representing a
188    reduction in the current loop.  */
189 struct reduction_info
190 {
191   gimple *reduc_stmt;		/* The reduction statement.  */
192   gimple *reduc_phi;		/* The phi node defining the reduction.  */
193   enum tree_code reduction_code;/* The code of the reduction operation.  */
194   unsigned reduc_version;	/* SSA_NAME_VERSION of original reduc_phi
195 				   result.  */
196   gphi *keep_res;		/* The PHI_RESULT of this phi is the resulting value
197 				   of the reduction variable when exiting the loop.  */
198   tree initial_value;		/* The initial value of the reduction var before entering the loop.  */
199   tree field;			/* The field of the parloop data structure intended for the reduction.  */
200   tree reduc_addr;		/* The address of the reduction variable for
201 				   openacc reductions.  */
202   tree init;			/* reduction initialization value.  */
203   gphi *new_phi;		/* (helper field) Newly created phi node whose result
204 				   will be passed to the atomic operation.  Represents
205 				   the local result each thread computed for the reduction
206 				   operation.  */
207 };
208 
209 /* Reduction info hashtable helpers.  */
210 
211 struct reduction_hasher : free_ptr_hash <reduction_info>
212 {
213   static inline hashval_t hash (const reduction_info *);
214   static inline bool equal (const reduction_info *, const reduction_info *);
215 };
216 
217 /* Equality and hash functions for hashtab code.  */
218 
219 inline bool
220 reduction_hasher::equal (const reduction_info *a, const reduction_info *b)
221 {
222   return (a->reduc_phi == b->reduc_phi);
223 }
224 
225 inline hashval_t
226 reduction_hasher::hash (const reduction_info *a)
227 {
228   return a->reduc_version;
229 }
230 
231 typedef hash_table<reduction_hasher> reduction_info_table_type;
232 
233 
234 static struct reduction_info *
235 reduction_phi (reduction_info_table_type *reduction_list, gimple *phi)
236 {
237   struct reduction_info tmpred, *red;
238 
239   if (reduction_list->elements () == 0 || phi == NULL)
240     return NULL;
241 
242   if (gimple_uid (phi) == (unsigned int)-1
243       || gimple_uid (phi) == 0)
244     return NULL;
245 
246   tmpred.reduc_phi = phi;
247   tmpred.reduc_version = gimple_uid (phi);
248   red = reduction_list->find (&tmpred);
249   gcc_assert (red == NULL || red->reduc_phi == phi);
250 
251   return red;
252 }
253 
254 /* Element of hashtable of names to copy.  */
255 
256 struct name_to_copy_elt
257 {
258   unsigned version;	/* The version of the name to copy.  */
259   tree new_name;	/* The new name used in the copy.  */
260   tree field;		/* The field of the structure used to pass the
261 			   value.  */
262 };
263 
264 /* Name copies hashtable helpers.  */
265 
266 struct name_to_copy_hasher : free_ptr_hash <name_to_copy_elt>
267 {
268   static inline hashval_t hash (const name_to_copy_elt *);
269   static inline bool equal (const name_to_copy_elt *, const name_to_copy_elt *);
270 };
271 
272 /* Equality and hash functions for hashtab code.  */
273 
274 inline bool
275 name_to_copy_hasher::equal (const name_to_copy_elt *a, const name_to_copy_elt *b)
276 {
277   return a->version == b->version;
278 }
279 
280 inline hashval_t
281 name_to_copy_hasher::hash (const name_to_copy_elt *a)
282 {
283   return (hashval_t) a->version;
284 }
285 
286 typedef hash_table<name_to_copy_hasher> name_to_copy_table_type;
287 
288 /* A transformation matrix, which is a self-contained ROWSIZE x COLSIZE
289    matrix.  Rather than use floats, we simply keep a single DENOMINATOR that
290    represents the denominator for every element in the matrix.  */
291 typedef struct lambda_trans_matrix_s
292 {
293   lambda_matrix matrix;
294   int rowsize;
295   int colsize;
296   int denominator;
297 } *lambda_trans_matrix;
298 #define LTM_MATRIX(T) ((T)->matrix)
299 #define LTM_ROWSIZE(T) ((T)->rowsize)
300 #define LTM_COLSIZE(T) ((T)->colsize)
301 #define LTM_DENOMINATOR(T) ((T)->denominator)
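
/* For instance, with denominator 1, the 2 x 2 unimodular matrix

     | 0 1 |
     | 1 0 |

   describes the interchange of two nested loops, while the 1 x 1 matrix
   | -1 | describes the reversal of a single loop (the transformation whose
   legality loop_parallel_p below tests).  */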
302 
303 /* Allocate a new transformation matrix.  */
304 
305 static lambda_trans_matrix
306 lambda_trans_matrix_new (int colsize, int rowsize,
307 			 struct obstack * lambda_obstack)
308 {
309   lambda_trans_matrix ret;
310 
311   ret = (lambda_trans_matrix)
312     obstack_alloc (lambda_obstack, sizeof (struct lambda_trans_matrix_s));
313   LTM_MATRIX (ret) = lambda_matrix_new (rowsize, colsize, lambda_obstack);
314   LTM_ROWSIZE (ret) = rowsize;
315   LTM_COLSIZE (ret) = colsize;
316   LTM_DENOMINATOR (ret) = 1;
317   return ret;
318 }
319 
320 /* Multiply a vector VEC by a matrix MAT.
321    MAT is an M*N matrix, and VEC is a vector with length N.  The result
322    is stored in DEST which must be a vector of length M.  */
323 
324 static void
325 lambda_matrix_vector_mult (lambda_matrix matrix, int m, int n,
326 			   lambda_vector vec, lambda_vector dest)
327 {
328   int i, j;
329 
330   lambda_vector_clear (dest, m);
331   for (i = 0; i < m; i++)
332     for (j = 0; j < n; j++)
333       dest[i] += matrix[i][j] * vec[j];
334 }
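
/* A small worked instance of the above: for the 2 x 2 matrix
   {{1, 1}, {0, 1}} and VEC = (2, 3), DEST becomes
   (1*2 + 1*3, 0*2 + 1*3) = (5, 3).  */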
335 
336 /* Return true if TRANS is a legal transformation matrix that respects
337    the dependence vectors in DISTS and DIRS.  The conservative answer
338    is false.
339 
340    "Wolfe proves that a unimodular transformation represented by the
341    matrix T is legal when applied to a loop nest with a set of
342    lexicographically non-negative distance vectors RDG if and only if
343    for each vector d in RDG, (T.d >= 0) is lexicographically positive.
344    i.e.: if and only if it transforms the lexicographically positive
345    distance vectors to lexicographically positive vectors.  Note that
346    a unimodular matrix must transform the zero vector (and only it) to
347    the zero vector." S.Muchnick.  */
348 
349 static bool
350 lambda_transform_legal_p (lambda_trans_matrix trans,
351 			  int nb_loops,
352 			  vec<ddr_p> dependence_relations)
353 {
354   unsigned int i, j;
355   lambda_vector distres;
356   struct data_dependence_relation *ddr;
357 
358   gcc_assert (LTM_COLSIZE (trans) == nb_loops
359 	      && LTM_ROWSIZE (trans) == nb_loops);
360 
361   /* When there are no dependences, the transformation is correct.  */
362   if (dependence_relations.length () == 0)
363     return true;
364 
365   ddr = dependence_relations[0];
366   if (ddr == NULL)
367     return true;
368 
369   /* When there is an unknown relation in the dependence_relations, we
370      know that it is not worth looking at this loop nest: give up.  */
371   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
372     return false;
373 
374   distres = lambda_vector_new (nb_loops);
375 
376   /* For each distance vector in the dependence graph.  */
377   FOR_EACH_VEC_ELT (dependence_relations, i, ddr)
378     {
379       /* Don't care about relations for which we know that there is no
380 	 dependence, nor about read-read (aka. input dependences):
381 	 these data accesses can happen in any order.  */
382       if (DDR_ARE_DEPENDENT (ddr) == chrec_known
383 	  || (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr))))
384 	continue;
385 
386       /* Conservatively answer: "this transformation is not valid".  */
387       if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
388 	return false;
389 
390       /* If the dependence could not be captured by a distance vector,
391 	 conservatively answer that the transform is not valid.  */
392       if (DDR_NUM_DIST_VECTS (ddr) == 0)
393 	return false;
394 
395       /* Compute trans.dist_vect.  */
396       for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++)
397 	{
398 	  lambda_matrix_vector_mult (LTM_MATRIX (trans), nb_loops, nb_loops,
399 				     DDR_DIST_VECT (ddr, j), distres);
400 
401 	  if (!lambda_vector_lexico_pos (distres, nb_loops))
402 	    return false;
403 	}
404     }
405   return true;
406 }
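
/* To illustrate the check on a hypothetical input: with TRANS = | -1 |
   (loop reversal, as used by loop_parallel_p), a dependence with distance
   vector (1) -- e.g. the flow dependence of 'a[i+1] = a[i] + 1;' -- is
   mapped to (-1), which is not lexicographically positive, so the function
   returns false.  A dependence with distance vector (0) is mapped to (0),
   which lambda_vector_lexico_pos accepts, so the transform is legal.  */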
407 
408 /* Data dependence analysis.  Returns true if the iterations of LOOP
409    are independent of each other (that is, if we can execute them
410    in parallel).  */
411 
412 static bool
413 loop_parallel_p (struct loop *loop, struct obstack * parloop_obstack)
414 {
415   vec<ddr_p> dependence_relations;
416   vec<data_reference_p> datarefs;
417   lambda_trans_matrix trans;
418   bool ret = false;
419 
420   if (dump_file && (dump_flags & TDF_DETAILS))
421     {
422       fprintf (dump_file, "Considering loop %d\n", loop->num);
423       if (!loop->inner)
424 	fprintf (dump_file, "loop is innermost\n");
425       else
426 	fprintf (dump_file, "loop NOT innermost\n");
427     }
428 
429   /* Check for problems with dependences.  If the loop can be reversed,
430      the iterations are independent.  */
431   auto_vec<loop_p, 3> loop_nest;
432   datarefs.create (10);
433   dependence_relations.create (100);
434   if (! compute_data_dependences_for_loop (loop, true, &loop_nest, &datarefs,
435 					   &dependence_relations))
436     {
437       if (dump_file && (dump_flags & TDF_DETAILS))
438 	fprintf (dump_file, "  FAILED: cannot analyze data dependencies\n");
439       ret = false;
440       goto end;
441     }
442   if (dump_file && (dump_flags & TDF_DETAILS))
443     dump_data_dependence_relations (dump_file, dependence_relations);
444 
445   trans = lambda_trans_matrix_new (1, 1, parloop_obstack);
446   LTM_MATRIX (trans)[0][0] = -1;
447 
448   if (lambda_transform_legal_p (trans, 1, dependence_relations))
449     {
450       ret = true;
451       if (dump_file && (dump_flags & TDF_DETAILS))
452 	fprintf (dump_file, "  SUCCESS: may be parallelized\n");
453     }
454   else if (dump_file && (dump_flags & TDF_DETAILS))
455     fprintf (dump_file,
456 	     "  FAILED: data dependencies exist across iterations\n");
457 
458  end:
459   free_dependence_relations (dependence_relations);
460   free_data_refs (datarefs);
461 
462   return ret;
463 }
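
/* Two hypothetical loops illustrating the outcome of this test:

     for (i = 0; i < n; i++)	     for (i = 1; i < n; i++)
       a[i] = a[i] + 1;		       a[i] = a[i-1] + 1;

   The left loop has no dependence between iterations, survives the
   reversal test above, and is reported as parallelizable; the right loop
   carries a distance-1 flow dependence, fails the test, and is
   rejected.  */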
464 
465 /* Return true when LOOP contains basic blocks marked with the
466    BB_IRREDUCIBLE_LOOP flag.  */
467 
468 static inline bool
469 loop_has_blocks_with_irreducible_flag (struct loop *loop)
470 {
471   unsigned i;
472   basic_block *bbs = get_loop_body_in_dom_order (loop);
473   bool res = true;
474 
475   for (i = 0; i < loop->num_nodes; i++)
476     if (bbs[i]->flags & BB_IRREDUCIBLE_LOOP)
477       goto end;
478 
479   res = false;
480  end:
481   free (bbs);
482   return res;
483 }
484 
485 /* Assigns the address of OBJ in TYPE to an ssa name, and returns this name.
486    The assignment statement is placed on edge ENTRY.  DECL_ADDRESS maps decls
487    to their addresses that can be reused.  The address of OBJ is known to
488    be invariant in the whole function.  Other needed statements are placed
489    right before GSI.  */
490 
491 static tree
492 take_address_of (tree obj, tree type, edge entry,
493 		 int_tree_htab_type *decl_address, gimple_stmt_iterator *gsi)
494 {
495   int uid;
496   tree *var_p, name, addr;
497   gassign *stmt;
498   gimple_seq stmts;
499 
500   /* Since the address of OBJ is invariant, the trees may be shared.
501      Avoid rewriting unrelated parts of the code.  */
502   obj = unshare_expr (obj);
503   for (var_p = &obj;
504        handled_component_p (*var_p);
505        var_p = &TREE_OPERAND (*var_p, 0))
506     continue;
507 
508   /* Canonicalize the access to base on a MEM_REF.  */
509   if (DECL_P (*var_p))
510     *var_p = build_simple_mem_ref (build_fold_addr_expr (*var_p));
511 
512   /* Assign a canonical SSA name to the address of the base decl used
513      in the address and share it for all accesses and addresses based
514      on it.  */
515   uid = DECL_UID (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
516   int_tree_map elt;
517   elt.uid = uid;
518   int_tree_map *slot = decl_address->find_slot (elt, INSERT);
519   if (!slot->to)
520     {
521       if (gsi == NULL)
522 	return NULL;
523       addr = TREE_OPERAND (*var_p, 0);
524       const char *obj_name
525 	= get_name (TREE_OPERAND (TREE_OPERAND (*var_p, 0), 0));
526       if (obj_name)
527 	name = make_temp_ssa_name (TREE_TYPE (addr), NULL, obj_name);
528       else
529 	name = make_ssa_name (TREE_TYPE (addr));
530       stmt = gimple_build_assign (name, addr);
531       gsi_insert_on_edge_immediate (entry, stmt);
532 
533       slot->uid = uid;
534       slot->to = name;
535     }
536   else
537     name = slot->to;
538 
539   /* Express the address in terms of the canonical SSA name.  */
540   TREE_OPERAND (*var_p, 0) = name;
541   if (gsi == NULL)
542     return build_fold_addr_expr_with_type (obj, type);
543 
544   name = force_gimple_operand (build_addr (obj),
545 			       &stmts, true, NULL_TREE);
546   if (!gimple_seq_empty_p (stmts))
547     gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
548 
549   if (!useless_type_conversion_p (type, TREE_TYPE (name)))
550     {
551       name = force_gimple_operand (fold_convert (type, name), &stmts, true,
552 				   NULL_TREE);
553       if (!gimple_seq_empty_p (stmts))
554 	gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
555     }
556 
557   return name;
558 }
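
/* A rough sketch of the effect, with made-up SSA names: for a local
   variable 'a' accessed as 'a.f' inside the region, a single statement

     a.0_1 = &a;

   is inserted on the edge ENTRY, and the access is rewritten as a
   component of the MEM_REF *a.0_1; every later access or address based
   on 'a' reuses the canonical name a.0_1 recorded in DECL_ADDRESS.  */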
559 
560 static tree
561 reduc_stmt_res (gimple *stmt)
562 {
563   return (gimple_code (stmt) == GIMPLE_PHI
564 	  ? gimple_phi_result (stmt)
565 	  : gimple_assign_lhs (stmt));
566 }
567 
568 /* Callback for htab_traverse.  Create the initialization statement
569    for reduction described in SLOT, and place it at the preheader of
570    the loop described in DATA.  */
571 
572 int
573 initialize_reductions (reduction_info **slot, struct loop *loop)
574 {
575   tree init;
576   tree type, arg;
577   edge e;
578 
579   struct reduction_info *const reduc = *slot;
580 
581   /* Create initialization in preheader:
582      reduction_variable = initialization value of reduction.  */
583 
584   /* In the phi node at the header, replace the argument coming
585      from the preheader with the reduction initialization value.  */
586 
587   /* Initialize the reduction.  */
588   type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
589   init = omp_reduction_init_op (gimple_location (reduc->reduc_stmt),
590 				reduc->reduction_code, type);
591   reduc->init = init;
592 
593   /* Replace the argument representing the initialization value
594      with the initialization value for the reduction (neutral
595      element for the particular operation, e.g. 0 for PLUS_EXPR,
596      1 for MULT_EXPR, etc).
597      Keep the old value in a new variable "reduction_initial",
598      that will be taken into consideration after the parallel
599      computation is done.  */
600 
601   e = loop_preheader_edge (loop);
602   arg = PHI_ARG_DEF_FROM_EDGE (reduc->reduc_phi, e);
603   /* Create new variable to hold the initial value.  */
604 
605   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE
606 	   (reduc->reduc_phi, loop_preheader_edge (loop)), init);
607   reduc->initial_value = arg;
608   return 1;
609 }
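
/* Continuing the 'sum' example from the comment at the top of the file:
   for 'sum = 1; for (...) sum += x[i];' the preheader argument 1 of the
   reduction phi is replaced by 0, the neutral element of PLUS_EXPR, so
   reduc->init becomes 0 while reduc->initial_value keeps the user's 1,
   which is stored into the shared structure and folded back in after the
   threads join.  */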
610 
611 struct elv_data
612 {
613   struct walk_stmt_info info;
614   edge entry;
615   int_tree_htab_type *decl_address;
616   gimple_stmt_iterator *gsi;
617   bool changed;
618   bool reset;
619 };
620 
621 /* Eliminates references to local variables in *TP out of the single
622    entry single exit region starting at DTA->ENTRY.
623    DECL_ADDRESS contains addresses of the references that had their
624    address taken already.  If the expression is changed, CHANGED is
625    set to true.  Callback for walk_tree.  */
626 
627 static tree
628 eliminate_local_variables_1 (tree *tp, int *walk_subtrees, void *data)
629 {
630   struct elv_data *const dta = (struct elv_data *) data;
631   tree t = *tp, var, addr, addr_type, type, obj;
632 
633   if (DECL_P (t))
634     {
635       *walk_subtrees = 0;
636 
637       if (!SSA_VAR_P (t) || DECL_EXTERNAL (t))
638 	return NULL_TREE;
639 
640       type = TREE_TYPE (t);
641       addr_type = build_pointer_type (type);
642       addr = take_address_of (t, addr_type, dta->entry, dta->decl_address,
643 			      dta->gsi);
644       if (dta->gsi == NULL && addr == NULL_TREE)
645 	{
646 	  dta->reset = true;
647 	  return NULL_TREE;
648 	}
649 
650       *tp = build_simple_mem_ref (addr);
651 
652       dta->changed = true;
653       return NULL_TREE;
654     }
655 
656   if (TREE_CODE (t) == ADDR_EXPR)
657     {
658       /* ADDR_EXPR may appear in two contexts:
659 	 -- as a gimple operand, when the address taken is a function invariant
660 	 -- as gimple rhs, when the resulting address is not a function
661 	    invariant
662 	 We do not need to do anything special in the latter case (the base of
663 	 the memory reference whose address is taken may be replaced in the
664 	 DECL_P case).  The former case is more complicated, as we need to
665 	 ensure that the new address is still a gimple operand.  Thus, it
666 	 is not sufficient to replace just the base of the memory reference --
667 	 we need to move the whole computation of the address out of the
668 	 loop.  */
669       if (!is_gimple_val (t))
670 	return NULL_TREE;
671 
672       *walk_subtrees = 0;
673       obj = TREE_OPERAND (t, 0);
674       var = get_base_address (obj);
675       if (!var || !SSA_VAR_P (var) || DECL_EXTERNAL (var))
676 	return NULL_TREE;
677 
678       addr_type = TREE_TYPE (t);
679       addr = take_address_of (obj, addr_type, dta->entry, dta->decl_address,
680 			      dta->gsi);
681       if (dta->gsi == NULL && addr == NULL_TREE)
682 	{
683 	  dta->reset = true;
684 	  return NULL_TREE;
685 	}
686       *tp = addr;
687 
688       dta->changed = true;
689       return NULL_TREE;
690     }
691 
692   if (!EXPR_P (t))
693     *walk_subtrees = 0;
694 
695   return NULL_TREE;
696 }
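
/* A sketch of the two rewrites above, with a local variable 'x' and
   made-up SSA names: a direct reference is handled by the DECL_P case,

     x = 1;	 becomes      *x.2_3 = 1;

   while a function-invariant address such as '&x' appearing as a gimple
   operand is precomputed before the region by take_address_of and the
   whole ADDR_EXPR is replaced by the resulting SSA name.  */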
697 
698 /* Moves the references to local variables in STMT at *GSI out of the single
699    entry single exit region starting at ENTRY.  DECL_ADDRESS contains
700    addresses of the references that had their address taken
701    already.  */
702 
703 static void
704 eliminate_local_variables_stmt (edge entry, gimple_stmt_iterator *gsi,
705 				int_tree_htab_type *decl_address)
706 {
707   struct elv_data dta;
708   gimple *stmt = gsi_stmt (*gsi);
709 
710   memset (&dta.info, '\0', sizeof (dta.info));
711   dta.entry = entry;
712   dta.decl_address = decl_address;
713   dta.changed = false;
714   dta.reset = false;
715 
716   if (gimple_debug_bind_p (stmt))
717     {
718       dta.gsi = NULL;
719       walk_tree (gimple_debug_bind_get_value_ptr (stmt),
720 		 eliminate_local_variables_1, &dta.info, NULL);
721       if (dta.reset)
722 	{
723 	  gimple_debug_bind_reset_value (stmt);
724 	  dta.changed = true;
725 	}
726     }
727   else if (gimple_clobber_p (stmt))
728     {
729       unlink_stmt_vdef (stmt);
730       stmt = gimple_build_nop ();
731       gsi_replace (gsi, stmt, false);
732       dta.changed = true;
733     }
734   else
735     {
736       dta.gsi = gsi;
737       walk_gimple_op (stmt, eliminate_local_variables_1, &dta.info);
738     }
739 
740   if (dta.changed)
741     update_stmt (stmt);
742 }
743 
744 /* Eliminates the references to local variables from the single entry
745    single exit region between the ENTRY and EXIT edges.
746 
747    This includes:
748    1) Taking the address of a local variable -- these are moved out of the
749    region (and a temporary variable is created to hold the address if
750    necessary).
751 
752    2) Dereferencing a local variable -- these are replaced with indirect
753    references.  */
754 
755 static void
756 eliminate_local_variables (edge entry, edge exit)
757 {
758   basic_block bb;
759   auto_vec<basic_block, 3> body;
760   unsigned i;
761   gimple_stmt_iterator gsi;
762   bool has_debug_stmt = false;
763   int_tree_htab_type decl_address (10);
764   basic_block entry_bb = entry->src;
765   basic_block exit_bb = exit->dest;
766 
767   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
768 
769   FOR_EACH_VEC_ELT (body, i, bb)
770     if (bb != entry_bb && bb != exit_bb)
771       {
772         for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
773 	  if (is_gimple_debug (gsi_stmt (gsi)))
774 	    {
775 	      if (gimple_debug_bind_p (gsi_stmt (gsi)))
776 	        has_debug_stmt = true;
777 	    }
778 	  else
779 	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
780       }
781 
782   if (has_debug_stmt)
783     FOR_EACH_VEC_ELT (body, i, bb)
784       if (bb != entry_bb && bb != exit_bb)
785 	for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
786 	  if (gimple_debug_bind_p (gsi_stmt (gsi)))
787 	    eliminate_local_variables_stmt (entry, &gsi, &decl_address);
788 }
789 
790 /* Returns true if expression EXPR is not defined between ENTRY and
791    EXIT, i.e. if all its operands are defined outside of the region.  */
792 
793 static bool
794 expr_invariant_in_region_p (edge entry, edge exit, tree expr)
795 {
796   basic_block entry_bb = entry->src;
797   basic_block exit_bb = exit->dest;
798   basic_block def_bb;
799 
800   if (is_gimple_min_invariant (expr))
801     return true;
802 
803   if (TREE_CODE (expr) == SSA_NAME)
804     {
805       def_bb = gimple_bb (SSA_NAME_DEF_STMT (expr));
806       if (def_bb
807 	  && dominated_by_p (CDI_DOMINATORS, def_bb, entry_bb)
808 	  && !dominated_by_p (CDI_DOMINATORS, def_bb, exit_bb))
809 	return false;
810 
811       return true;
812     }
813 
814   return false;
815 }
816 
817 /* If COPY_NAME_P is true, creates and returns a duplicate of NAME.
818    The copies are stored in NAME_COPIES; if NAME was already duplicated,
819    its duplicate stored in NAME_COPIES is returned.
820 
821    Regardless of COPY_NAME_P, the decl used as a base of the ssa name is also
822    duplicated, storing the copies in DECL_COPIES.  */
823 
824 static tree
825 separate_decls_in_region_name (tree name, name_to_copy_table_type *name_copies,
826 			       int_tree_htab_type *decl_copies,
827 			       bool copy_name_p)
828 {
829   tree copy, var, var_copy;
830   unsigned idx, uid, nuid;
831   struct int_tree_map ielt;
832   struct name_to_copy_elt elt, *nelt;
833   name_to_copy_elt **slot;
834   int_tree_map *dslot;
835 
836   if (TREE_CODE (name) != SSA_NAME)
837     return name;
838 
839   idx = SSA_NAME_VERSION (name);
840   elt.version = idx;
841   slot = name_copies->find_slot_with_hash (&elt, idx,
842 					   copy_name_p ? INSERT : NO_INSERT);
843   if (slot && *slot)
844     return (*slot)->new_name;
845 
846   if (copy_name_p)
847     {
848       copy = duplicate_ssa_name (name, NULL);
849       nelt = XNEW (struct name_to_copy_elt);
850       nelt->version = idx;
851       nelt->new_name = copy;
852       nelt->field = NULL_TREE;
853       *slot = nelt;
854     }
855   else
856     {
857       gcc_assert (!slot);
858       copy = name;
859     }
860 
861   var = SSA_NAME_VAR (name);
862   if (!var)
863     return copy;
864 
865   uid = DECL_UID (var);
866   ielt.uid = uid;
867   dslot = decl_copies->find_slot_with_hash (ielt, uid, INSERT);
868   if (!dslot->to)
869     {
870       var_copy = create_tmp_var (TREE_TYPE (var), get_name (var));
871       DECL_GIMPLE_REG_P (var_copy) = DECL_GIMPLE_REG_P (var);
872       dslot->uid = uid;
873       dslot->to = var_copy;
874 
875       /* Ensure that when we meet this decl next time, we won't duplicate
876          it again.  */
877       nuid = DECL_UID (var_copy);
878       ielt.uid = nuid;
879       dslot = decl_copies->find_slot_with_hash (ielt, nuid, INSERT);
880       gcc_assert (!dslot->to);
881       dslot->uid = nuid;
882       dslot->to = var_copy;
883     }
884   else
885     var_copy = dslot->to;
886 
887   replace_ssa_name_symbol (copy, var_copy);
888   return copy;
889 }
890 
891 /* Finds the ssa names used in STMT that are defined outside the
892    region between ENTRY and EXIT and replaces such ssa names with
893    their duplicates.  The duplicates are stored in NAME_COPIES.  Base
894    decls of all ssa names used in STMT (including those defined in
895    LOOP) are replaced with the new temporary variables; the
896    replacement decls are stored in DECL_COPIES.  */
897 
898 static void
899 separate_decls_in_region_stmt (edge entry, edge exit, gimple *stmt,
900 			       name_to_copy_table_type *name_copies,
901 			       int_tree_htab_type *decl_copies)
902 {
903   use_operand_p use;
904   def_operand_p def;
905   ssa_op_iter oi;
906   tree name, copy;
907   bool copy_name_p;
908 
909   FOR_EACH_PHI_OR_STMT_DEF (def, stmt, oi, SSA_OP_DEF)
910   {
911     name = DEF_FROM_PTR (def);
912     gcc_assert (TREE_CODE (name) == SSA_NAME);
913     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
914 					  false);
915     gcc_assert (copy == name);
916   }
917 
918   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
919   {
920     name = USE_FROM_PTR (use);
921     if (TREE_CODE (name) != SSA_NAME)
922       continue;
923 
924     copy_name_p = expr_invariant_in_region_p (entry, exit, name);
925     copy = separate_decls_in_region_name (name, name_copies, decl_copies,
926 					  copy_name_p);
927     SET_USE (use, copy);
928   }
929 }
930 
931 /* Finds the ssa names used in the debug stmt STMT and rewrites them to
932    the duplicates recorded in NAME_COPIES; the variable STMT binds is
933    likewise rewritten to its duplicate from DECL_COPIES.  Returns true
934    if STMT refers to a variable for which no duplicate exists, in which
935    case the caller should remove STMT; if only a used name lacks a
936    duplicate, the bound value is reset instead and false is returned.  */
937 
938 static bool
939 separate_decls_in_region_debug (gimple *stmt,
940 				name_to_copy_table_type *name_copies,
941 				int_tree_htab_type *decl_copies)
942 {
943   use_operand_p use;
944   ssa_op_iter oi;
945   tree var, name;
946   struct int_tree_map ielt;
947   struct name_to_copy_elt elt;
948   name_to_copy_elt **slot;
949   int_tree_map *dslot;
950 
951   if (gimple_debug_bind_p (stmt))
952     var = gimple_debug_bind_get_var (stmt);
953   else if (gimple_debug_source_bind_p (stmt))
954     var = gimple_debug_source_bind_get_var (stmt);
955   else
956     return true;
957   if (TREE_CODE (var) == DEBUG_EXPR_DECL || TREE_CODE (var) == LABEL_DECL)
958     return true;
959   gcc_assert (DECL_P (var) && SSA_VAR_P (var));
960   ielt.uid = DECL_UID (var);
961   dslot = decl_copies->find_slot_with_hash (ielt, ielt.uid, NO_INSERT);
962   if (!dslot)
963     return true;
964   if (gimple_debug_bind_p (stmt))
965     gimple_debug_bind_set_var (stmt, dslot->to);
966   else if (gimple_debug_source_bind_p (stmt))
967     gimple_debug_source_bind_set_var (stmt, dslot->to);
968 
969   FOR_EACH_PHI_OR_STMT_USE (use, stmt, oi, SSA_OP_USE)
970   {
971     name = USE_FROM_PTR (use);
972     if (TREE_CODE (name) != SSA_NAME)
973       continue;
974 
975     elt.version = SSA_NAME_VERSION (name);
976     slot = name_copies->find_slot_with_hash (&elt, elt.version, NO_INSERT);
977     if (!slot)
978       {
979 	gimple_debug_bind_reset_value (stmt);
980 	update_stmt (stmt);
981 	break;
982       }
983 
984     SET_USE (use, (*slot)->new_name);
985   }
986 
987   return false;
988 }
989 
990 /* Callback for htab_traverse.  Adds a field corresponding to the reduction
991    specified in SLOT. The type is passed in DATA.  */
992 
993 int
994 add_field_for_reduction (reduction_info **slot, tree type)
995 {
996 
997   struct reduction_info *const red = *slot;
998   tree var = reduc_stmt_res (red->reduc_stmt);
999   tree field = build_decl (gimple_location (red->reduc_stmt), FIELD_DECL,
1000 			   SSA_NAME_IDENTIFIER (var), TREE_TYPE (var));
1001 
1002   insert_field_into_struct (type, field);
1003 
1004   red->field = field;
1005 
1006   return 1;
1007 }
1008 
1009 /* Callback for htab_traverse.  Adds a field corresponding to a ssa name
1010    described in SLOT. The type is passed in DATA.  */
1011 
1012 int
1013 add_field_for_name (name_to_copy_elt **slot, tree type)
1014 {
1015   struct name_to_copy_elt *const elt = *slot;
1016   tree name = ssa_name (elt->version);
1017   tree field = build_decl (UNKNOWN_LOCATION,
1018 			   FIELD_DECL, SSA_NAME_IDENTIFIER (name),
1019 			   TREE_TYPE (name));
1020 
1021   insert_field_into_struct (type, field);
1022   elt->field = field;
1023 
1024   return 1;
1025 }
1026 
1027 /* Callback for htab_traverse.  A local result is the intermediate
1028    result computed by a single thread, or the initial value in case
1029    no iteration was executed.  This function creates a phi node
1030    reflecting these values.
1031    The phi's result will be stored in the NEW_PHI field of the
1032    reduction's data structure.  */
1033 
1034 int
1035 create_phi_for_local_result (reduction_info **slot, struct loop *loop)
1036 {
1037   struct reduction_info *const reduc = *slot;
1038   edge e;
1039   gphi *new_phi;
1040   basic_block store_bb, continue_bb;
1041   tree local_res;
1042   source_location locus;
1043 
1044   /* STORE_BB is the block where the phi should be stored.  It is
1045      the destination of the loop exit.
1046      (Find the fallthru edge from GIMPLE_OMP_CONTINUE.)  */
1047   continue_bb = single_pred (loop->latch);
1048   store_bb = FALLTHRU_EDGE (continue_bb)->dest;
1049 
1050   /* STORE_BB has two predecessors.  One coming from the loop
1051      (the reduction's result is computed inside the loop), and
1052      another coming from a block preceding the loop, taken when
1053      no iterations are executed (in which case the initial
1054      value should be used).  */
1055   if (EDGE_PRED (store_bb, 0) == FALLTHRU_EDGE (continue_bb))
1056     e = EDGE_PRED (store_bb, 1);
1057   else
1058     e = EDGE_PRED (store_bb, 0);
1059   tree lhs = reduc_stmt_res (reduc->reduc_stmt);
1060   local_res = copy_ssa_name (lhs);
1061   locus = gimple_location (reduc->reduc_stmt);
1062   new_phi = create_phi_node (local_res, store_bb);
1063   add_phi_arg (new_phi, reduc->init, e, locus);
1064   add_phi_arg (new_phi, lhs, FALLTHRU_EDGE (continue_bb), locus);
1065   reduc->new_phi = new_phi;
1066 
1067   return 1;
1068 }
1069 
1070 struct clsn_data
1071 {
1072   tree store;
1073   tree load;
1074 
1075   basic_block store_bb;
1076   basic_block load_bb;
1077 };
1078 
1079 /* Callback for htab_traverse.  Create an atomic instruction for the
1080    reduction described in SLOT.
1081    DATA annotates the place in memory the atomic operation relates to,
1082    and the basic block it needs to be generated in.  */
1083 
1084 int
1085 create_call_for_reduction_1 (reduction_info **slot, struct clsn_data *clsn_data)
1086 {
1087   struct reduction_info *const reduc = *slot;
1088   gimple_stmt_iterator gsi;
1089   tree type = TREE_TYPE (PHI_RESULT (reduc->reduc_phi));
1090   tree load_struct;
1091   basic_block bb;
1092   basic_block new_bb;
1093   edge e;
1094   tree t, addr, ref, x;
1095   tree tmp_load, name;
1096   gimple *load;
1097 
1098   if (reduc->reduc_addr == NULL_TREE)
1099     {
1100       load_struct = build_simple_mem_ref (clsn_data->load);
1101       t = build3 (COMPONENT_REF, type, load_struct, reduc->field, NULL_TREE);
1102 
1103       addr = build_addr (t);
1104     }
1105   else
1106     {
1107       /* Set the address for the atomic store.  */
1108       addr = reduc->reduc_addr;
1109 
1110       /* Remove the non-atomic store '*addr = sum'.  */
1111       tree res = PHI_RESULT (reduc->keep_res);
1112       use_operand_p use_p;
1113       gimple *stmt;
1114       bool single_use_p = single_imm_use (res, &use_p, &stmt);
1115       gcc_assert (single_use_p);
1116       replace_uses_by (gimple_vdef (stmt),
1117 		       gimple_vuse (stmt));
1118       gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
1119       gsi_remove (&gsi, true);
1120     }
1121 
1122   /* Create phi node.  */
1123   bb = clsn_data->load_bb;
1124 
1125   gsi = gsi_last_bb (bb);
1126   e = split_block (bb, gsi_stmt (gsi));
1127   new_bb = e->dest;
1128 
1129   tmp_load = create_tmp_var (TREE_TYPE (TREE_TYPE (addr)));
1130   tmp_load = make_ssa_name (tmp_load);
1131   load = gimple_build_omp_atomic_load (tmp_load, addr);
1132   SSA_NAME_DEF_STMT (tmp_load) = load;
1133   gsi = gsi_start_bb (new_bb);
1134   gsi_insert_after (&gsi, load, GSI_NEW_STMT);
1135 
1136   e = split_block (new_bb, load);
1137   new_bb = e->dest;
1138   gsi = gsi_start_bb (new_bb);
1139   ref = tmp_load;
1140   x = fold_build2 (reduc->reduction_code,
1141 		   TREE_TYPE (PHI_RESULT (reduc->new_phi)), ref,
1142 		   PHI_RESULT (reduc->new_phi));
1143 
1144   name = force_gimple_operand_gsi (&gsi, x, true, NULL_TREE, true,
1145 				   GSI_CONTINUE_LINKING);
1146 
1147   gsi_insert_after (&gsi, gimple_build_omp_atomic_store (name), GSI_NEW_STMT);
1148   return 1;
1149 }
1150 
1151 /* Create the atomic operation at the join point of the threads.
1152    REDUCTION_LIST describes the reductions in LOOP.
1153    LD_ST_DATA describes the shared data structure that the
1154    data is stored in and loaded from.  */
1155 static void
1156 create_call_for_reduction (struct loop *loop,
1157 			   reduction_info_table_type *reduction_list,
1158 			   struct clsn_data *ld_st_data)
1159 {
1160   reduction_list->traverse <struct loop *, create_phi_for_local_result> (loop);
1161   /* Find the fallthru edge from GIMPLE_OMP_CONTINUE.  */
1162   basic_block continue_bb = single_pred (loop->latch);
1163   ld_st_data->load_bb = FALLTHRU_EDGE (continue_bb)->dest;
1164   reduction_list
1165     ->traverse <struct clsn_data *, create_call_for_reduction_1> (ld_st_data);
1166 }
1167 
1168 /* Callback for htab_traverse.  Loads the final reduction value at the
1169    join point of all threads, and inserts it in the right place.  */
1170 
1171 int
1172 create_loads_for_reductions (reduction_info **slot, struct clsn_data *clsn_data)
1173 {
1174   struct reduction_info *const red = *slot;
1175   gimple *stmt;
1176   gimple_stmt_iterator gsi;
1177   tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
1178   tree load_struct;
1179   tree name;
1180   tree x;
1181 
1182   /* If there's no exit phi, the result of the reduction is unused.  */
1183   if (red->keep_res == NULL)
1184     return 1;
1185 
1186   gsi = gsi_after_labels (clsn_data->load_bb);
1187   load_struct = build_simple_mem_ref (clsn_data->load);
1188   load_struct = build3 (COMPONENT_REF, type, load_struct, red->field,
1189 			NULL_TREE);
1190 
1191   x = load_struct;
1192   name = PHI_RESULT (red->keep_res);
1193   stmt = gimple_build_assign (name, x);
1194 
1195   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1196 
1197   for (gsi = gsi_start_phis (gimple_bb (red->keep_res));
1198        !gsi_end_p (gsi); gsi_next (&gsi))
1199     if (gsi_stmt (gsi) == red->keep_res)
1200       {
1201 	remove_phi_node (&gsi, false);
1202 	return 1;
1203       }
1204   gcc_unreachable ();
1205 }
1206 
1207 /* Load the reduction result that was stored in LD_ST_DATA.
1208    REDUCTION_LIST describes the reductions for which the
1209    loads should be generated.  */
1210 static void
1211 create_final_loads_for_reduction (reduction_info_table_type *reduction_list,
1212 				  struct clsn_data *ld_st_data)
1213 {
1214   gimple_stmt_iterator gsi;
1215   tree t;
1216   gimple *stmt;
1217 
1218   gsi = gsi_after_labels (ld_st_data->load_bb);
1219   t = build_fold_addr_expr (ld_st_data->store);
1220   stmt = gimple_build_assign (ld_st_data->load, t);
1221 
1222   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
1223 
1224   reduction_list
1225     ->traverse <struct clsn_data *, create_loads_for_reductions> (ld_st_data);
1226 
1227 }
1228 
1229 /* Callback for htab_traverse.  Store the initial value of the reduction
1230    variable, i.e. the value the user's code assigned before the loop
1231    (kept in INITIAL_VALUE by initialize_reductions), into the reduction
1232    field of the shared structure.  The reduction is specified in SLOT.
1233    The store information is passed in DATA.  */
1234 
1235 int
1236 create_stores_for_reduction (reduction_info **slot, struct clsn_data *clsn_data)
1237 {
1238   struct reduction_info *const red = *slot;
1239   tree t;
1240   gimple *stmt;
1241   gimple_stmt_iterator gsi;
1242   tree type = TREE_TYPE (reduc_stmt_res (red->reduc_stmt));
1243 
1244   gsi = gsi_last_bb (clsn_data->store_bb);
1245   t = build3 (COMPONENT_REF, type, clsn_data->store, red->field, NULL_TREE);
1246   stmt = gimple_build_assign (t, red->initial_value);
1247   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1248 
1249   return 1;
1250 }
1251 
1252 /* Callback for htab_traverse.  Creates loads to a field of LOAD in LOAD_BB and
1253    store to a field of STORE in STORE_BB for the ssa name and its duplicate
1254    specified in SLOT.  */
1255 
1256 int
1257 create_loads_and_stores_for_name (name_to_copy_elt **slot,
1258 				  struct clsn_data *clsn_data)
1259 {
1260   struct name_to_copy_elt *const elt = *slot;
1261   tree t;
1262   gimple *stmt;
1263   gimple_stmt_iterator gsi;
1264   tree type = TREE_TYPE (elt->new_name);
1265   tree load_struct;
1266 
1267   gsi = gsi_last_bb (clsn_data->store_bb);
1268   t = build3 (COMPONENT_REF, type, clsn_data->store, elt->field, NULL_TREE);
1269   stmt = gimple_build_assign (t, ssa_name (elt->version));
1270   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1271 
1272   gsi = gsi_last_bb (clsn_data->load_bb);
1273   load_struct = build_simple_mem_ref (clsn_data->load);
1274   t = build3 (COMPONENT_REF, type, load_struct, elt->field, NULL_TREE);
1275   stmt = gimple_build_assign (elt->new_name, t);
1276   gsi_insert_after (&gsi, stmt, GSI_NEW_STMT);
1277 
1278   return 1;
1279 }
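
/* The generated statements look like this (field and SSA names made up);
   the store is placed before the region, the load at its entry:

     store_bb:  .paral_data_store.i = i_5;
     load_bb:   i_18 = .paral_data_load->i;

   where i_18 plays the role of ELT->new_name, the duplicate that the
   statements inside the region have already been rewritten to use.  */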
1280 
1281 /* Moves all the variables used in LOOP and defined outside of it (including
1282    the initial values of loop phi nodes) to a structure created for this
1283    purpose.  The code
1284 
1285    while (1)
1286      {
1287        use (a);
1288        use (b);
1289      }
1290 
1291    is transformed this way:
1292 
1293    bb0:
1294    old.a = a;
1295    old.b = b;
1296 
1297    bb1:
1298    a' = new->a;
1299    b' = new->b;
1300    while (1)
1301      {
1302        use (a');
1303        use (b');
1304      }
1305 
1306    `old' is stored to *ARG_STRUCT and `new' is stored to *NEW_ARG_STRUCT.  The
1307    pointer `new' is intentionally not initialized (the loop will be split into a
1308    separate function later, and `new' will be initialized from its arguments).
1309    LD_ST_DATA holds information about the shared data structure used to pass
1310    information among the threads.  It is initialized here, and
1311    gen_parallel_loop will pass it to create_call_for_reduction that
1312    needs this information.  REDUCTION_LIST describes the reductions
1313    in LOOP.  */
1314 
1315 static void
1316 separate_decls_in_region (edge entry, edge exit,
1317 			  reduction_info_table_type *reduction_list,
1318 			  tree *arg_struct, tree *new_arg_struct,
1319 			  struct clsn_data *ld_st_data)
1320 
1321 {
1322   basic_block bb1 = split_edge (entry);
1323   basic_block bb0 = single_pred (bb1);
1324   name_to_copy_table_type name_copies (10);
1325   int_tree_htab_type decl_copies (10);
1326   unsigned i;
1327   tree type, type_name, nvar;
1328   gimple_stmt_iterator gsi;
1329   struct clsn_data clsn_data;
1330   auto_vec<basic_block, 3> body;
1331   basic_block bb;
1332   basic_block entry_bb = bb1;
1333   basic_block exit_bb = exit->dest;
1334   bool has_debug_stmt = false;
1335 
1336   entry = single_succ_edge (entry_bb);
1337   gather_blocks_in_sese_region (entry_bb, exit_bb, &body);
1338 
1339   FOR_EACH_VEC_ELT (body, i, bb)
1340     {
1341       if (bb != entry_bb && bb != exit_bb)
1342 	{
1343 	  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1344 	    separate_decls_in_region_stmt (entry, exit, gsi_stmt (gsi),
1345 					   &name_copies, &decl_copies);
1346 
1347 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1348 	    {
1349 	      gimple *stmt = gsi_stmt (gsi);
1350 
1351 	      if (is_gimple_debug (stmt))
1352 		has_debug_stmt = true;
1353 	      else
1354 		separate_decls_in_region_stmt (entry, exit, stmt,
1355 					       &name_copies, &decl_copies);
1356 	    }
1357 	}
1358     }
1359 
1360   /* Now process debug bind stmts.  We must not create decls while
1361      processing debug stmts, so we defer their processing so as to
1362      make sure we will have debug info for as many variables as
1363      possible (all of those that were dealt with in the loop above),
1364      and discard those for which we know there's nothing we can
1365      do.  */
1366   if (has_debug_stmt)
1367     FOR_EACH_VEC_ELT (body, i, bb)
1368       if (bb != entry_bb && bb != exit_bb)
1369 	{
1370 	  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
1371 	    {
1372 	      gimple *stmt = gsi_stmt (gsi);
1373 
1374 	      if (is_gimple_debug (stmt))
1375 		{
1376 		  if (separate_decls_in_region_debug (stmt, &name_copies,
1377 						      &decl_copies))
1378 		    {
1379 		      gsi_remove (&gsi, true);
1380 		      continue;
1381 		    }
1382 		}
1383 
1384 	      gsi_next (&gsi);
1385 	    }
1386 	}
1387 
1388   if (name_copies.elements () == 0 && reduction_list->elements () == 0)
1389     {
1390       /* It may happen that there is nothing to copy (if there are only
1391          loop-carried and external variables in the loop).  */
1392       *arg_struct = NULL;
1393       *new_arg_struct = NULL;
1394     }
1395   else
1396     {
1397       /* Create the type for the structure to store the ssa names to.  */
1398       type = lang_hooks.types.make_type (RECORD_TYPE);
1399       type_name = build_decl (UNKNOWN_LOCATION,
1400 			      TYPE_DECL, create_tmp_var_name (".paral_data"),
1401 			      type);
1402       TYPE_NAME (type) = type_name;
1403 
1404       name_copies.traverse <tree, add_field_for_name> (type);
1405       if (reduction_list && reduction_list->elements () > 0)
1406 	{
1407 	  /* Create the fields for reductions.  */
1408 	  reduction_list->traverse <tree, add_field_for_reduction> (type);
1409 	}
1410       layout_type (type);
1411 
1412       /* Create the loads and stores.  */
1413       *arg_struct = create_tmp_var (type, ".paral_data_store");
1414       nvar = create_tmp_var (build_pointer_type (type), ".paral_data_load");
1415       *new_arg_struct = make_ssa_name (nvar);
1416 
1417       ld_st_data->store = *arg_struct;
1418       ld_st_data->load = *new_arg_struct;
1419       ld_st_data->store_bb = bb0;
1420       ld_st_data->load_bb = bb1;
1421 
1422       name_copies
1423 	.traverse <struct clsn_data *, create_loads_and_stores_for_name>
1424 		  (ld_st_data);
1425 
1426       /* Load the calculation from memory (after the join of the threads).  */
1427 
1428       if (reduction_list && reduction_list->elements () > 0)
1429 	{
1430 	  reduction_list
1431 	    ->traverse <struct clsn_data *, create_stores_for_reduction>
1432 	    (ld_st_data);
1433 	  clsn_data.load = make_ssa_name (nvar);
1434 	  clsn_data.load_bb = exit->dest;
1435 	  clsn_data.store = ld_st_data->store;
1436 	  create_final_loads_for_reduction (reduction_list, &clsn_data);
1437 	}
1438     }
1439 }
1440 
1441 /* Returns true if FNDECL was created to run in parallel.  */
1442 
1443 bool
1444 parallelized_function_p (tree fndecl)
1445 {
1446   cgraph_node *node = cgraph_node::get (fndecl);
1447   gcc_assert (node != NULL);
1448   return node->parallelized_function;
1449 }
1450 
1451 /* Creates and returns an empty function that will receive the body of
1452    a parallelized loop.  */
1453 
1454 static tree
1455 create_loop_fn (location_t loc)
1456 {
1457   char buf[100];
1458   char *tname;
1459   tree decl, type, name, t;
1460   struct function *act_cfun = cfun;
1461   static unsigned loopfn_num;
1462 
1463   loc = LOCATION_LOCUS (loc);
1464   snprintf (buf, 100, "%s.$loopfn", current_function_name ());
1465   ASM_FORMAT_PRIVATE_NAME (tname, buf, loopfn_num++);
1466   clean_symbol_name (tname);
1467   name = get_identifier (tname);
1468   type = build_function_type_list (void_type_node, ptr_type_node, NULL_TREE);
1469 
1470   decl = build_decl (loc, FUNCTION_DECL, name, type);
1471   TREE_STATIC (decl) = 1;
1472   TREE_USED (decl) = 1;
1473   DECL_ARTIFICIAL (decl) = 1;
1474   DECL_IGNORED_P (decl) = 0;
1475   TREE_PUBLIC (decl) = 0;
1476   DECL_UNINLINABLE (decl) = 1;
1477   DECL_EXTERNAL (decl) = 0;
1478   DECL_CONTEXT (decl) = NULL_TREE;
1479   DECL_INITIAL (decl) = make_node (BLOCK);
1480   BLOCK_SUPERCONTEXT (DECL_INITIAL (decl)) = decl;
1481 
1482   t = build_decl (loc, RESULT_DECL, NULL_TREE, void_type_node);
1483   DECL_ARTIFICIAL (t) = 1;
1484   DECL_IGNORED_P (t) = 1;
1485   DECL_RESULT (decl) = t;
1486 
1487   t = build_decl (loc, PARM_DECL, get_identifier (".paral_data_param"),
1488 		  ptr_type_node);
1489   DECL_ARTIFICIAL (t) = 1;
1490   DECL_ARG_TYPE (t) = ptr_type_node;
1491   DECL_CONTEXT (t) = decl;
1492   TREE_USED (t) = 1;
1493   DECL_ARGUMENTS (decl) = t;
1494 
1495   allocate_struct_function (decl, false);
1496 
1497   /* The call to allocate_struct_function clobbers CFUN, so we need to restore
1498      it.  */
1499   set_cfun (act_cfun);
1500 
1501   return decl;
1502 }
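
/* The created declaration corresponds roughly to (with an illustrative
   name; the real one comes from ASM_FORMAT_PRIVATE_NAME):

     static void foo.$loopfn.0 (void *.paral_data_param);

   a private, artificial, uninlinable function that receives the address
   of the shared .paral_data structure and whose body is filled in later
   when the loop is outlined.  */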
1503 
1504 /* Replace uses of NAME by VAL in block BB.  */
1505 
1506 static void
1507 replace_uses_in_bb_by (tree name, tree val, basic_block bb)
1508 {
1509   gimple *use_stmt;
1510   imm_use_iterator imm_iter;
1511 
1512   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, name)
1513     {
1514       if (gimple_bb (use_stmt) != bb)
1515 	continue;
1516 
1517       use_operand_p use_p;
1518       FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1519 	SET_USE (use_p, val);
1520     }
1521 }
1522 
1523 /* Do transformation from:
1524 
1525      <bb preheader>:
1526      ...
1527      goto <bb header>
1528 
1529      <bb header>:
1530      ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1531      sum_a = PHI <sum_init (preheader), sum_b (latch)>
1532      ...
1533      use (ivtmp_a)
1534      ...
1535      sum_b = sum_a + sum_update
1536      ...
1537      if (ivtmp_a < n)
1538        goto <bb latch>;
1539      else
1540        goto <bb exit>;
1541 
1542      <bb latch>:
1543      ivtmp_b = ivtmp_a + 1;
1544      goto <bb header>
1545 
1546      <bb exit>:
1547      sum_z = PHI <sum_b (cond[1]), ...>
1548 
1549      [1] Where <bb cond> is single_pred (bb latch); In the simplest case,
1550 	 that's <bb header>.
1551 
1552    to:
1553 
1554      <bb preheader>:
1555      ...
1556      goto <bb newheader>
1557 
1558      <bb header>:
1559      ivtmp_a = PHI <ivtmp_c (latch)>
1560      sum_a = PHI <sum_c (latch)>
1561      ...
1562      use (ivtmp_a)
1563      ...
1564      sum_b = sum_a + sum_update
1565      ...
1566      goto <bb latch>;
1567 
1568      <bb newheader>:
1569      ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1570      sum_c = PHI <sum_init (preheader), sum_b (latch)>
1571      if (ivtmp_c < n + 1)
1572        goto <bb header>;
1573      else
1574        goto <bb newexit>;
1575 
1576      <bb latch>:
1577      ivtmp_b = ivtmp_a + 1;
1578      goto <bb newheader>
1579 
1580      <bb newexit>:
1581      sum_y = PHI <sum_c (newheader)>
1582 
1583      <bb exit>:
1584      sum_z = PHI <sum_y (newexit), ...>
1585 
1586 
1587    In unified diff format:
1588 
1589       <bb preheader>:
1590       ...
1591 -     goto <bb header>
1592 +     goto <bb newheader>
1593 
1594       <bb header>:
1595 -     ivtmp_a = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1596 -     sum_a = PHI <sum_init (preheader), sum_b (latch)>
1597 +     ivtmp_a = PHI <ivtmp_c (latch)>
1598 +     sum_a = PHI <sum_c (latch)>
1599       ...
1600       use (ivtmp_a)
1601       ...
1602       sum_b = sum_a + sum_update
1603       ...
1604 -     if (ivtmp_a < n)
1605 -       goto <bb latch>;
1606 +     goto <bb latch>;
1607 +
1608 +     <bb newheader>:
1609 +     ivtmp_c = PHI <ivtmp_init (preheader), ivtmp_b (latch)>
1610 +     sum_c = PHI <sum_init (preheader), sum_b (latch)>
1611 +     if (ivtmp_c < n + 1)
1612 +       goto <bb header>;
1613       else
1614 	goto <bb exit>;
1615 
1616       <bb latch>:
1617       ivtmp_b = ivtmp_a + 1;
1618 -     goto <bb header>
1619 +     goto <bb newheader>
1620 
1621 +    <bb newexit>:
1622 +    sum_y = PHI <sum_c (newheader)>
1623 
1624       <bb exit>:
1625 -     sum_z = PHI <sum_b (cond[1]), ...>
1626 +     sum_z = PHI <sum_y (newexit), ...>
1627 
1628    Note: the example does not show any virtual phis, but these are handled more
1629    or less as reductions.
1630 
1631 
1632    Moves the exit condition of LOOP to the beginning of its header.
1633    REDUCTION_LIST describes the reductions in LOOP.  BOUND is the new loop
1634    bound.  */
1635 
1636 static void
1637 transform_to_exit_first_loop_alt (struct loop *loop,
1638 				  reduction_info_table_type *reduction_list,
1639 				  tree bound)
1640 {
1641   basic_block header = loop->header;
1642   basic_block latch = loop->latch;
1643   edge exit = single_dom_exit (loop);
1644   basic_block exit_block = exit->dest;
1645   gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
1646   tree control = gimple_cond_lhs (cond_stmt);
1647   edge e;
1648 
1649   /* Rewriting virtuals into loop-closed ssa normal form makes this
1650      transformation simpler.  It also ensures that the virtuals are in
1651      loop-closed ssa normal form after the transformation, which is required by
1652      create_parallel_loop.  */
1653   rewrite_virtuals_into_loop_closed_ssa (loop);
1654 
1655   /* Create the new_header block.  */
1656   basic_block new_header = split_block_before_cond_jump (exit->src);
1657   edge edge_at_split = single_pred_edge (new_header);
1658 
1659   /* Redirect entry edge to new_header.  */
1660   edge entry = loop_preheader_edge (loop);
1661   e = redirect_edge_and_branch (entry, new_header);
1662   gcc_assert (e == entry);
1663 
1664   /* Redirect post_inc_edge to new_header.  */
1665   edge post_inc_edge = single_succ_edge (latch);
1666   e = redirect_edge_and_branch (post_inc_edge, new_header);
1667   gcc_assert (e == post_inc_edge);
1668 
1669   /* Redirect post_cond_edge to header.  */
1670   edge post_cond_edge = single_pred_edge (latch);
1671   e = redirect_edge_and_branch (post_cond_edge, header);
1672   gcc_assert (e == post_cond_edge);
1673 
1674   /* Redirect edge_at_split to latch.  */
1675   e = redirect_edge_and_branch (edge_at_split, latch);
1676   gcc_assert (e == edge_at_split);
1677 
1678   /* Set the new loop bound.  */
1679   gimple_cond_set_rhs (cond_stmt, bound);
1680   update_stmt (cond_stmt);
1681 
1682   /* Repair the ssa.  */
1683   vec<edge_var_map> *v = redirect_edge_var_map_vector (post_inc_edge);
1684   edge_var_map *vm;
1685   gphi_iterator gsi;
1686   int i;
1687   for (gsi = gsi_start_phis (header), i = 0;
1688        !gsi_end_p (gsi) && v->iterate (i, &vm);
1689        gsi_next (&gsi), i++)
1690     {
1691       gphi *phi = gsi.phi ();
1692       tree res_a = PHI_RESULT (phi);
1693 
1694       /* Create new phi.  */
1695       tree res_c = copy_ssa_name (res_a, phi);
1696       gphi *nphi = create_phi_node (res_c, new_header);
1697 
1698       /* Replace ivtmp_a with ivtmp_c in condition 'if (ivtmp_a < n)'.  */
1699       replace_uses_in_bb_by (res_a, res_c, new_header);
1700 
1701       /* Replace ivtmp/sum_b with ivtmp/sum_c in header phi.  */
1702       add_phi_arg (phi, res_c, post_cond_edge, UNKNOWN_LOCATION);
1703 
1704       /* Replace sum_b with sum_c in exit phi.  */
1705       tree res_b = redirect_edge_var_map_def (vm);
1706       replace_uses_in_bb_by (res_b, res_c, exit_block);
1707 
1708       struct reduction_info *red = reduction_phi (reduction_list, phi);
1709       gcc_assert (virtual_operand_p (res_a)
1710 		  || res_a == control
1711 		  || red != NULL);
1712 
1713       if (red)
1714 	{
1715 	  /* Register the new reduction phi.  */
1716 	  red->reduc_phi = nphi;
1717 	  gimple_set_uid (red->reduc_phi, red->reduc_version);
1718 	}
1719     }
1720   gcc_assert (gsi_end_p (gsi) && !v->iterate (i, &vm));
1721 
1722   /* Set the preheader argument of the new phis to ivtmp/sum_init.  */
1723   flush_pending_stmts (entry);
1724 
1725   /* Set the latch arguments of the new phis to ivtmp/sum_b.  */
1726   flush_pending_stmts (post_inc_edge);
1727 
1729   basic_block new_exit_block = NULL;
1730   if (!single_pred_p (exit->dest))
1731     {
1732       /* Create a new empty exit block, in between the new loop header and the
1733 	 old exit block.  The function separate_decls_in_region needs this block
1734 	 to insert code that is active on loop exit, but not any other path.  */
1735       new_exit_block = split_edge (exit);
1736     }
1737 
1738   /* Insert and register the reduction exit phis.  */
1739   for (gphi_iterator gsi = gsi_start_phis (exit_block);
1740        !gsi_end_p (gsi);
1741        gsi_next (&gsi))
1742     {
1743       gphi *phi = gsi.phi ();
1744       gphi *nphi = NULL;
1745       tree res_z = PHI_RESULT (phi);
1746       tree res_c;
1747 
1748       if (new_exit_block != NULL)
1749 	{
1750 	  /* Now that we have a new exit block, duplicate the phi of the old
1751 	     exit block in the new exit block to preserve loop-closed ssa.  */
1752 	  edge succ_new_exit_block = single_succ_edge (new_exit_block);
1753 	  edge pred_new_exit_block = single_pred_edge (new_exit_block);
1754 	  tree res_y = copy_ssa_name (res_z, phi);
1755 	  nphi = create_phi_node (res_y, new_exit_block);
1756 	  res_c = PHI_ARG_DEF_FROM_EDGE (phi, succ_new_exit_block);
1757 	  add_phi_arg (nphi, res_c, pred_new_exit_block, UNKNOWN_LOCATION);
1758 	  add_phi_arg (phi, res_y, succ_new_exit_block, UNKNOWN_LOCATION);
1759 	}
1760       else
1761 	res_c = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1762 
1763       if (virtual_operand_p (res_z))
1764 	continue;
1765 
1766       gimple *reduc_phi = SSA_NAME_DEF_STMT (res_c);
1767       struct reduction_info *red = reduction_phi (reduction_list, reduc_phi);
1768       if (red != NULL)
1769 	red->keep_res = (nphi != NULL
1770 			 ? nphi
1771 			 : phi);
1772     }
1773 
1774   /* We're going to cancel the loop at the end of gen_parallel_loop, but until
1775      then we're still using some fields, so only bother about fields that are
1776      still used: header and latch.
1777      The loop has a new header bb, so we update it.  The latch bb stays the
1778      same.  */
1779   loop->header = new_header;
1780 
1781   /* Recalculate dominance info.  */
1782   free_dominance_info (CDI_DOMINATORS);
1783   calculate_dominance_info (CDI_DOMINATORS);
1784 
1785   checking_verify_ssa (true, true);
1786 }
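
/* For illustration only (a hedged source-level sketch, not something this
   function literally sees), the transformation above corresponds to
   rewriting

     i = 0;
     for (;;)
       {
         sum += a[i];
         if (!(i < n))
           break;
         i++;
       }

   into

     i = 0;
     while (i < n + 1)
       {
         sum += a[i];
         i++;
       }

   Both forms execute the body for i == 0 .. n; the rewrite is only valid
   if n + 1 does not overflow, which the caller has verified.  */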
1787 
1788 /* Tries to move the exit condition of LOOP to the beginning of its header
1789    without duplicating the loop body.  NIT is the number of iterations of the
1790    loop.  REDUCTION_LIST describes the reductions in LOOP.  Return true if
1791    the transformation is successful.  */
1792 
1793 static bool
1794 try_transform_to_exit_first_loop_alt (struct loop *loop,
1795 				      reduction_info_table_type *reduction_list,
1796 				      tree nit)
1797 {
1798   /* Check whether the latch contains a single statement.  */
1799   if (!gimple_seq_nondebug_singleton_p (bb_seq (loop->latch)))
1800     return false;
1801 
1802   /* Check whether the latch contains no phis.  */
1803   if (phi_nodes (loop->latch) != NULL)
1804     return false;
1805 
1806   /* Check whether the latch contains the loop iv increment.  */
1807   edge back = single_succ_edge (loop->latch);
1808   edge exit = single_dom_exit (loop);
1809   gcond *cond_stmt = as_a <gcond *> (last_stmt (exit->src));
1810   tree control = gimple_cond_lhs (cond_stmt);
1811   gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (control));
1812   tree inc_res = gimple_phi_arg_def (phi, back->dest_idx);
1813   if (gimple_bb (SSA_NAME_DEF_STMT (inc_res)) != loop->latch)
1814     return false;
1815 
1816   /* Check that there is no code between the loop condition and the latch.  */
1817   if (!single_pred_p (loop->latch)
1818       || single_pred (loop->latch) != exit->src)
1819     return false;
1820 
1821   tree alt_bound = NULL_TREE;
1822   tree nit_type = TREE_TYPE (nit);
1823 
1824   /* Figure out whether nit + 1 overflows.  */
1825   if (TREE_CODE (nit) == INTEGER_CST)
1826     {
1827       if (!tree_int_cst_equal (nit, TYPE_MAXVAL (nit_type)))
1828 	{
1829 	  alt_bound = fold_build2_loc (UNKNOWN_LOCATION, PLUS_EXPR, nit_type,
1830 				       nit, build_one_cst (nit_type));
1831 
1832 	  gcc_assert (TREE_CODE (alt_bound) == INTEGER_CST);
1833 	  transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
1834 	  return true;
1835 	}
1836       else
1837 	{
1838 	  /* TODO: Figure out whether we can trigger this, whether it is worth
1839 	     handling optimally, and whether we can handle it optimally.  */
1840 	  return false;
1841 	}
1842     }
1843 
1844   gcc_assert (TREE_CODE (nit) == SSA_NAME);
1845 
1846   /* Variable nit is the loop bound as returned by canonicalize_loop_ivs, for an
1847      iv with base 0 and step 1 that is incremented in the latch, like this:
1848 
1849      <bb header>:
1850      # iv_1 = PHI <0 (preheader), iv_2 (latch)>
1851      ...
1852      if (iv_1 < nit)
1853        goto <bb latch>;
1854      else
1855        goto <bb exit>;
1856 
1857      <bb latch>:
1858      iv_2 = iv_1 + 1;
1859      goto <bb header>;
1860 
1861      The range of iv_1 is [0, nit].  The latch edge is taken for
1862      iv_1 == [0, nit - 1] and the exit edge is taken for iv_1 == nit.  So the
1863      number of latch executions is equal to nit.
1864 
1865      The function max_loop_iterations gives us the maximum number of latch
1866      executions, so it gives us the maximum value of nit.  */
1867   widest_int nit_max;
1868   if (!max_loop_iterations (loop, &nit_max))
1869     return false;
1870 
1871   /* Check if nit + 1 overflows.  */
1872   widest_int type_max = wi::to_widest (TYPE_MAXVAL (nit_type));
1873   if (nit_max >= type_max)
1874     return false;
1875 
1876   gimple *def = SSA_NAME_DEF_STMT (nit);
1877 
1878   /* Try to find nit + 1 in the form of n, given an assignment nit = n - 1.  */
1879   if (def
1880       && is_gimple_assign (def)
1881       && gimple_assign_rhs_code (def) == PLUS_EXPR)
1882     {
1883       tree op1 = gimple_assign_rhs1 (def);
1884       tree op2 = gimple_assign_rhs2 (def);
1885       if (integer_minus_onep (op1))
1886 	alt_bound = op2;
1887       else if (integer_minus_onep (op2))
1888 	alt_bound = op1;
1889     }
1890 
1891   /* If not found, insert nit + 1.  */
1892   if (alt_bound == NULL_TREE)
1893     {
1894       alt_bound = fold_build2 (PLUS_EXPR, nit_type, nit,
1895 			       build_int_cst_type (nit_type, 1));
1896 
1897       gimple_stmt_iterator gsi = gsi_last_bb (loop_preheader_edge (loop)->src);
1898 
1899       alt_bound
1900 	= force_gimple_operand_gsi (&gsi, alt_bound, true, NULL_TREE, false,
1901 				    GSI_CONTINUE_LINKING);
1902     }
1903 
1904   transform_to_exit_first_loop_alt (loop, reduction_list, alt_bound);
1905   return true;
1906 }
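
/* Worked example for the bound search above (hedged; the SSA names are
   made up): if the loop bound NIT was emitted by a previous pass as

     nit_5 = n_4(D) + -1;

   then the PLUS_EXPR pattern match recovers n_4(D) as nit + 1 without
   emitting any code; otherwise a statement computing nit + 1 is forced
   to the end of the loop preheader.  */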
1907 
1908 /* Moves the exit condition of LOOP to the beginning of its header.  NIT is the
1909    number of iterations of the loop.  REDUCTION_LIST describes the reductions in
1910    LOOP.  */
1911 
1912 static void
1913 transform_to_exit_first_loop (struct loop *loop,
1914 			      reduction_info_table_type *reduction_list,
1915 			      tree nit)
1916 {
1917   basic_block *bbs, *nbbs, ex_bb, orig_header;
1918   unsigned n;
1919   bool ok;
1920   edge exit = single_dom_exit (loop), hpred;
1921   tree control, control_name, res, t;
1922   gphi *phi, *nphi;
1923   gassign *stmt;
1924   gcond *cond_stmt, *cond_nit;
1925   tree nit_1;
1926 
1927   split_block_after_labels (loop->header);
1928   orig_header = single_succ (loop->header);
1929   hpred = single_succ_edge (loop->header);
1930 
1931   cond_stmt = as_a <gcond *> (last_stmt (exit->src));
1932   control = gimple_cond_lhs (cond_stmt);
1933   gcc_assert (gimple_cond_rhs (cond_stmt) == nit);
1934 
1935   /* Make sure that we have phi nodes on exit for all loop header phis
1936      (create_parallel_loop requires that).  */
1937   for (gphi_iterator gsi = gsi_start_phis (loop->header);
1938        !gsi_end_p (gsi);
1939        gsi_next (&gsi))
1940     {
1941       phi = gsi.phi ();
1942       res = PHI_RESULT (phi);
1943       t = copy_ssa_name (res, phi);
1944       SET_PHI_RESULT (phi, t);
1945       nphi = create_phi_node (res, orig_header);
1946       add_phi_arg (nphi, t, hpred, UNKNOWN_LOCATION);
1947 
1948       if (res == control)
1949 	{
1950 	  gimple_cond_set_lhs (cond_stmt, t);
1951 	  update_stmt (cond_stmt);
1952 	  control = t;
1953 	}
1954     }
1955 
1956   bbs = get_loop_body_in_dom_order (loop);
1957 
1958   for (n = 0; bbs[n] != exit->src; n++)
1959     continue;
1960   nbbs = XNEWVEC (basic_block, n);
1961   ok = gimple_duplicate_sese_tail (single_succ_edge (loop->header), exit,
1962 				   bbs + 1, n, nbbs);
1963   gcc_assert (ok);
1964   free (bbs);
1965   ex_bb = nbbs[0];
1966   free (nbbs);
1967 
1968   /* Other than reductions, the only gimple reg that should be copied
1969      out of the loop is the control variable.  */
1970   exit = single_dom_exit (loop);
1971   control_name = NULL_TREE;
1972   for (gphi_iterator gsi = gsi_start_phis (ex_bb);
1973        !gsi_end_p (gsi); )
1974     {
1975       phi = gsi.phi ();
1976       res = PHI_RESULT (phi);
1977       if (virtual_operand_p (res))
1978 	{
1979 	  gsi_next (&gsi);
1980 	  continue;
1981 	}
1982 
1983       /* Check if it is a part of reduction.  If it is,
1984          keep the phi at the reduction's keep_res field.  The
1985          PHI_RESULT of this phi is the resulting value of the reduction
1986          variable when exiting the loop.  */
1987 
1988       if (reduction_list->elements () > 0)
1989 	{
1990 	  struct reduction_info *red;
1991 
1992 	  tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
1993 	  red = reduction_phi (reduction_list, SSA_NAME_DEF_STMT (val));
1994 	  if (red)
1995 	    {
1996 	      red->keep_res = phi;
1997 	      gsi_next (&gsi);
1998 	      continue;
1999 	    }
2000 	}
2001       gcc_assert (control_name == NULL_TREE
2002 		  && SSA_NAME_VAR (res) == SSA_NAME_VAR (control));
2003       control_name = res;
2004       remove_phi_node (&gsi, false);
2005     }
2006   gcc_assert (control_name != NULL_TREE);
2007 
2008   /* Initialize the control variable to the number of iterations,
2009      according to the rhs of the exit condition.  */
2010   gimple_stmt_iterator gsi = gsi_after_labels (ex_bb);
2011   cond_nit = as_a <gcond *> (last_stmt (exit->src));
2012   nit_1 = gimple_cond_rhs (cond_nit);
2013   nit_1 = force_gimple_operand_gsi (&gsi,
2014 				  fold_convert (TREE_TYPE (control_name), nit_1),
2015 				  false, NULL_TREE, false, GSI_SAME_STMT);
2016   stmt = gimple_build_assign (control_name, nit_1);
2017   gsi_insert_before (&gsi, stmt, GSI_NEW_STMT);
2018 }
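
/* Source-level sketch of this fallback (hedged, not literal pass output):
   a loop of the shape

     for (;;)
       {
         BODY;
         if (!(i < n))
           break;
         i++;
       }

   becomes an exit-first loop followed by one duplicated copy of the code
   that used to run before the exit test:

     while (i < n)
       {
         BODY;
         i++;
       }
     BODY;

   Both run BODY n + 1 times; unlike the _alt variant above, this one grows
   the code by the duplicated tail.  */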
2019 
2020 /* Create the parallel constructs for LOOP as described in gen_parallel_loop.
2021    LOOP_FN and DATA are the arguments of GIMPLE_OMP_PARALLEL.
2022    NEW_DATA is the variable that should be initialized from the argument
2023    of LOOP_FN.  N_THREADS is the requested number of threads, which can be 0 if
2024    that number is to be determined later.  */
2025 
2026 static void
2027 create_parallel_loop (struct loop *loop, tree loop_fn, tree data,
2028 		      tree new_data, unsigned n_threads, location_t loc,
2029 		      bool oacc_kernels_p)
2030 {
2031   gimple_stmt_iterator gsi;
2032   basic_block for_bb, ex_bb, continue_bb;
2033   tree t, param;
2034   gomp_parallel *omp_par_stmt;
2035   gimple *omp_return_stmt1, *omp_return_stmt2;
2036   gimple *phi;
2037   gcond *cond_stmt;
2038   gomp_for *for_stmt;
2039   gomp_continue *omp_cont_stmt;
2040   tree cvar, cvar_init, initvar, cvar_next, cvar_base, type;
2041   edge exit, nexit, guard, end, e;
2042 
2043   /* Prepare the GIMPLE_OMP_PARALLEL statement.  */
2044   if (oacc_kernels_p)
2045     {
2046       tree clause = build_omp_clause (loc, OMP_CLAUSE_NUM_GANGS);
2047       OMP_CLAUSE_NUM_GANGS_EXPR (clause)
2048 	= build_int_cst (integer_type_node, n_threads);
2049       oacc_set_fn_attrib (cfun->decl, clause, true, NULL);
2050     }
2051   else
2052     {
2053       basic_block bb = loop_preheader_edge (loop)->src;
2054       basic_block paral_bb = single_pred (bb);
2055       gsi = gsi_last_bb (paral_bb);
2056 
2057       gcc_checking_assert (n_threads != 0);
2058       t = build_omp_clause (loc, OMP_CLAUSE_NUM_THREADS);
2059       OMP_CLAUSE_NUM_THREADS_EXPR (t)
2060 	= build_int_cst (integer_type_node, n_threads);
2061       omp_par_stmt = gimple_build_omp_parallel (NULL, t, loop_fn, data);
2062       gimple_set_location (omp_par_stmt, loc);
2063 
2064       gsi_insert_after (&gsi, omp_par_stmt, GSI_NEW_STMT);
2065 
2066       /* Initialize NEW_DATA.  */
2067       if (data)
2068 	{
2069 	  gassign *assign_stmt;
2070 
2071 	  gsi = gsi_after_labels (bb);
2072 
2073 	  param = make_ssa_name (DECL_ARGUMENTS (loop_fn));
2074 	  assign_stmt = gimple_build_assign (param, build_fold_addr_expr (data));
2075 	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2076 
2077 	  assign_stmt = gimple_build_assign (new_data,
2078 					     fold_convert (TREE_TYPE (new_data), param));
2079 	  gsi_insert_before (&gsi, assign_stmt, GSI_SAME_STMT);
2080 	}
2081 
2082       /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_PARALLEL.  */
2083       bb = split_loop_exit_edge (single_dom_exit (loop));
2084       gsi = gsi_last_bb (bb);
2085       omp_return_stmt1 = gimple_build_omp_return (false);
2086       gimple_set_location (omp_return_stmt1, loc);
2087       gsi_insert_after (&gsi, omp_return_stmt1, GSI_NEW_STMT);
2088     }
2089 
2090   /* Extract data for GIMPLE_OMP_FOR.  */
2091   gcc_assert (loop->header == single_dom_exit (loop)->src);
2092   cond_stmt = as_a <gcond *> (last_stmt (loop->header));
2093 
2094   cvar = gimple_cond_lhs (cond_stmt);
2095   cvar_base = SSA_NAME_VAR (cvar);
2096   phi = SSA_NAME_DEF_STMT (cvar);
2097   cvar_init = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
2098   initvar = copy_ssa_name (cvar);
2099   SET_USE (PHI_ARG_DEF_PTR_FROM_EDGE (phi, loop_preheader_edge (loop)),
2100 	   initvar);
2101   cvar_next = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2102 
2103   gsi = gsi_last_nondebug_bb (loop->latch);
2104   gcc_assert (gsi_stmt (gsi) == SSA_NAME_DEF_STMT (cvar_next));
2105   gsi_remove (&gsi, true);
2106 
2107   /* Prepare cfg.  */
2108   for_bb = split_edge (loop_preheader_edge (loop));
2109   ex_bb = split_loop_exit_edge (single_dom_exit (loop));
2110   extract_true_false_edges_from_block (loop->header, &nexit, &exit);
2111   gcc_assert (exit == single_dom_exit (loop));
2112 
2113   guard = make_edge (for_bb, ex_bb, 0);
2114   /* Split the latch edge, so LOOPS_HAVE_SIMPLE_LATCHES is still valid.  */
2115   loop->latch = split_edge (single_succ_edge (loop->latch));
2116   single_pred_edge (loop->latch)->flags = 0;
2117   end = make_edge (single_pred (loop->latch), ex_bb, EDGE_FALLTHRU);
2118   rescan_loop_exit (end, true, false);
2119 
2120   for (gphi_iterator gpi = gsi_start_phis (ex_bb);
2121        !gsi_end_p (gpi); gsi_next (&gpi))
2122     {
2123       source_location locus;
2124       gphi *phi = gpi.phi ();
2125       tree def = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2126       gimple *def_stmt = SSA_NAME_DEF_STMT (def);
2127 
2128       /* If the exit phi is not connected to a header phi in the same loop, this
2129 	 value is not modified in the loop, and we're done with this phi.  */
2130       if (!(gimple_code (def_stmt) == GIMPLE_PHI
2131 	    && gimple_bb (def_stmt) == loop->header))
2132 	{
2133 	  locus = gimple_phi_arg_location_from_edge (phi, exit);
2134 	  add_phi_arg (phi, def, guard, locus);
2135 	  add_phi_arg (phi, def, end, locus);
2136 	  continue;
2137 	}
2138 
2139       gphi *stmt = as_a <gphi *> (def_stmt);
2140       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2141       locus = gimple_phi_arg_location_from_edge (stmt,
2142 						 loop_preheader_edge (loop));
2143       add_phi_arg (phi, def, guard, locus);
2144 
2145       def = PHI_ARG_DEF_FROM_EDGE (stmt, loop_latch_edge (loop));
2146       locus = gimple_phi_arg_location_from_edge (stmt, loop_latch_edge (loop));
2147       add_phi_arg (phi, def, end, locus);
2148     }
2149   e = redirect_edge_and_branch (exit, nexit->dest);
2150   PENDING_STMT (e) = NULL;
2151 
2152   /* Emit GIMPLE_OMP_FOR.  */
2153   if (oacc_kernels_p)
2154     /* In combination with the NUM_GANGS on the parallel.  */
2155     t = build_omp_clause (loc, OMP_CLAUSE_GANG);
2156   else
2157     {
2158       t = build_omp_clause (loc, OMP_CLAUSE_SCHEDULE);
2159       int chunk_size = PARAM_VALUE (PARAM_PARLOOPS_CHUNK_SIZE);
2160       enum PARAM_PARLOOPS_SCHEDULE_KIND schedule_type
2161 	= (enum PARAM_PARLOOPS_SCHEDULE_KIND) PARAM_VALUE (PARAM_PARLOOPS_SCHEDULE);
2162       switch (schedule_type)
2163 	{
2164 	case PARAM_PARLOOPS_SCHEDULE_KIND_static:
2165 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_STATIC;
2166 	  break;
2167 	case PARAM_PARLOOPS_SCHEDULE_KIND_dynamic:
2168 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_DYNAMIC;
2169 	  break;
2170 	case PARAM_PARLOOPS_SCHEDULE_KIND_guided:
2171 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_GUIDED;
2172 	  break;
2173 	case PARAM_PARLOOPS_SCHEDULE_KIND_auto:
2174 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_AUTO;
2175 	  chunk_size = 0;
2176 	  break;
2177 	case PARAM_PARLOOPS_SCHEDULE_KIND_runtime:
2178 	  OMP_CLAUSE_SCHEDULE_KIND (t) = OMP_CLAUSE_SCHEDULE_RUNTIME;
2179 	  chunk_size = 0;
2180 	  break;
2181 	default:
2182 	  gcc_unreachable ();
2183 	}
2184       if (chunk_size != 0)
2185 	OMP_CLAUSE_SCHEDULE_CHUNK_EXPR (t)
2186 	  = build_int_cst (integer_type_node, chunk_size);
2187     }
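
  /* A hedged usage note (the option names belong to the driver, not to
     this function): the schedule kind and chunk size chosen above come
     from --param parloops-schedule=... and --param parloops-chunk-size=...,
     e.g.

       gcc -O2 -ftree-parallelize-loops=4 \
           --param parloops-schedule=dynamic --param parloops-chunk-size=16

     requests schedule(dynamic,16) on the GIMPLE_OMP_FOR built below.  */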
2188 
2189   for_stmt = gimple_build_omp_for (NULL,
2190 				   (oacc_kernels_p
2191 				    ? GF_OMP_FOR_KIND_OACC_LOOP
2192 				    : GF_OMP_FOR_KIND_FOR),
2193 				   t, 1, NULL);
2194 
2195   gimple_cond_set_lhs (cond_stmt, cvar_base);
2196   type = TREE_TYPE (cvar);
2197   gimple_set_location (for_stmt, loc);
2198   gimple_omp_for_set_index (for_stmt, 0, initvar);
2199   gimple_omp_for_set_initial (for_stmt, 0, cvar_init);
2200   gimple_omp_for_set_final (for_stmt, 0, gimple_cond_rhs (cond_stmt));
2201   gimple_omp_for_set_cond (for_stmt, 0, gimple_cond_code (cond_stmt));
2202   gimple_omp_for_set_incr (for_stmt, 0, build2 (PLUS_EXPR, type,
2203 						cvar_base,
2204 						build_int_cst (type, 1)));
2205 
2206   gsi = gsi_last_bb (for_bb);
2207   gsi_insert_after (&gsi, for_stmt, GSI_NEW_STMT);
2208   SSA_NAME_DEF_STMT (initvar) = for_stmt;
2209 
2210   /* Emit GIMPLE_OMP_CONTINUE.  */
2211   continue_bb = single_pred (loop->latch);
2212   gsi = gsi_last_bb (continue_bb);
2213   omp_cont_stmt = gimple_build_omp_continue (cvar_next, cvar);
2214   gimple_set_location (omp_cont_stmt, loc);
2215   gsi_insert_after (&gsi, omp_cont_stmt, GSI_NEW_STMT);
2216   SSA_NAME_DEF_STMT (cvar_next) = omp_cont_stmt;
2217 
2218   /* Emit GIMPLE_OMP_RETURN for GIMPLE_OMP_FOR.  */
2219   gsi = gsi_last_bb (ex_bb);
2220   omp_return_stmt2 = gimple_build_omp_return (true);
2221   gimple_set_location (omp_return_stmt2, loc);
2222   gsi_insert_after (&gsi, omp_return_stmt2, GSI_NEW_STMT);
2223 
2224   /* After the above, dominance info is hosed.  Recompute it.  */
2225   free_dominance_info (CDI_DOMINATORS);
2226   calculate_dominance_info (CDI_DOMINATORS);
2227 }
2228 
2229 /* Generates code to execute the iterations of LOOP in N_THREADS
2230    threads in parallel; N_THREADS can be 0 if the number is to be
2231    determined later.
2232 
2233    NITER describes number of iterations of LOOP.
2234    REDUCTION_LIST describes the reductions existent in the LOOP.  */
2235 
2236 static void
2237 gen_parallel_loop (struct loop *loop,
2238 		   reduction_info_table_type *reduction_list,
2239 		   unsigned n_threads, struct tree_niter_desc *niter,
2240 		   bool oacc_kernels_p)
2241 {
2242   tree many_iterations_cond, type, nit;
2243   tree arg_struct, new_arg_struct;
2244   gimple_seq stmts;
2245   edge entry, exit;
2246   struct clsn_data clsn_data;
2247   unsigned prob;
2248   location_t loc;
2249   gimple *cond_stmt;
2250   unsigned int m_p_thread = 2;
2251 
2252   /* From
2253 
2254      ---------------------------------------------------------------------
2255      loop
2256        {
2257 	 IV = phi (INIT, IV + STEP)
2258 	 BODY1;
2259 	 if (COND)
2260 	   break;
2261 	 BODY2;
2262        }
2263      ---------------------------------------------------------------------
2264 
2265      with # of iterations NITER (possibly with MAY_BE_ZERO assumption),
2266      we generate the following code:
2267 
2268      ---------------------------------------------------------------------
2269 
2270      if (MAY_BE_ZERO
2271          || NITER < MIN_PER_THREAD * N_THREADS)
2272        goto original;
2273 
2274      BODY1;
2275      store all local loop-invariant variables used in the body of the loop to DATA.
2276      GIMPLE_OMP_PARALLEL (OMP_CLAUSE_NUM_THREADS (N_THREADS), LOOPFN, DATA);
2277      load the variables from DATA.
2278      GIMPLE_OMP_FOR (IV = INIT; COND; IV += STEP) (OMP_CLAUSE_SCHEDULE (static))
2279      BODY2;
2280      BODY1;
2281      GIMPLE_OMP_CONTINUE;
2282      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_FOR
2283      GIMPLE_OMP_RETURN         -- GIMPLE_OMP_PARALLEL
2284      goto end;
2285 
2286      original:
2287      loop
2288        {
2289 	 IV = phi (INIT, IV + STEP)
2290 	 BODY1;
2291 	 if (COND)
2292 	   break;
2293 	 BODY2;
2294        }
2295 
2296      end:
2297 
2298    */
2299 
2300   /* Create two versions of the loop -- in the old one, we know that the
2301      number of iterations is large enough, and we will transform it into the
2302      loop that will be split to loop_fn, the new one will be used for the
2303      remaining iterations.  */
2304 
2305   /* We should compute a better number-of-iterations value for outer loops.
2306      That is, if we have
2307 
2308     for (i = 0; i < n; ++i)
2309       for (j = 0; j < m; ++j)
2310         ...
2311 
2312     we should compute nit = n * m, not nit = n.
2313     Also may_be_zero handling would need to be adjusted.  */
2314 
2315   type = TREE_TYPE (niter->niter);
2316   nit = force_gimple_operand (unshare_expr (niter->niter), &stmts, true,
2317 			      NULL_TREE);
2318   if (stmts)
2319     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
2320 
2321   if (!oacc_kernels_p)
2322     {
2323       if (loop->inner)
2324 	m_p_thread = 2;
2325       else
2326 	m_p_thread = MIN_PER_THREAD;
2327 
2328       gcc_checking_assert (n_threads != 0);
2329       many_iterations_cond =
2330 	fold_build2 (GE_EXPR, boolean_type_node,
2331 		     nit, build_int_cst (type, m_p_thread * n_threads));
2332 
2333       many_iterations_cond
2334 	= fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2335 		       invert_truthvalue (unshare_expr (niter->may_be_zero)),
2336 		       many_iterations_cond);
2337       many_iterations_cond
2338 	= force_gimple_operand (many_iterations_cond, &stmts, false, NULL_TREE);
2339       if (stmts)
2340 	gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
2341       if (!is_gimple_condexpr (many_iterations_cond))
2342 	{
2343 	  many_iterations_cond
2344 	    = force_gimple_operand (many_iterations_cond, &stmts,
2345 				    true, NULL_TREE);
2346 	  if (stmts)
2347 	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
2348 					      stmts);
2349 	}
2350 
2351       initialize_original_copy_tables ();
2352 
2353       /* We assume that the loop usually iterates a lot.  */
2354       prob = 4 * REG_BR_PROB_BASE / 5;
2355       loop_version (loop, many_iterations_cond, NULL,
2356 		    prob, REG_BR_PROB_BASE - prob,
2357 		    prob, REG_BR_PROB_BASE - prob, true);
2358       update_ssa (TODO_update_ssa);
2359       free_original_copy_tables ();
2360     }
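
  /* In the non-OpenACC path the loop is now versioned; a hedged sketch of
     the guard built above:

       if (!MAY_BE_ZERO && NITER >= m_p_thread * N_THREADS)
         <loop copy that will be parallelized>
       else
         <original sequential loop>.  */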
2361 
2362   /* Base all the induction variables in LOOP on a single control one.  */
2363   canonicalize_loop_ivs (loop, &nit, true);
2364 
2365   /* Ensure that the exit condition is the first statement in the loop.
2366      The common case is that the latch of the loop is empty (apart from the
2367      increment) and immediately follows the loop exit test.  Attempt to move the
2368      entry of the loop directly before the exit check and increase the number of
2369      iterations of the loop by one.  */
2370   if (try_transform_to_exit_first_loop_alt (loop, reduction_list, nit))
2371     {
2372       if (dump_file
2373 	  && (dump_flags & TDF_DETAILS))
2374 	fprintf (dump_file,
2375 		 "alternative exit-first loop transform succeeded"
2376 		 " for loop %d\n", loop->num);
2377     }
2378   else
2379     {
2380       if (oacc_kernels_p)
2381 	n_threads = 1;
2382 
2383       /* Fall back on the method that handles more cases, but duplicates the
2384 	 loop body: move the exit condition of LOOP to the beginning of its
2385 	 header, and duplicate the part of the last iteration that gets disabled
2386 	 to the exit of the loop.  */
2387       transform_to_exit_first_loop (loop, reduction_list, nit);
2388     }
2389 
2390   /* Generate initializations for reductions.  */
2391   if (reduction_list->elements () > 0)
2392     reduction_list->traverse <struct loop *, initialize_reductions> (loop);
2393 
2394   /* Eliminate the references to local variables from the loop.  */
2395   gcc_assert (single_exit (loop));
2396   entry = loop_preheader_edge (loop);
2397   exit = single_dom_exit (loop);
2398 
2399   /* This rewrites the body in terms of new variables.  This has already
2400      been done for oacc_kernels_p in pass_lower_omp/lower_omp ().  */
2401   if (!oacc_kernels_p)
2402     {
2403       eliminate_local_variables (entry, exit);
2404       /* In the old loop, move all variables non-local to the loop to a
2405 	 structure and back, and create separate decls for the variables used in
2406 	 the loop.  */
2407       separate_decls_in_region (entry, exit, reduction_list, &arg_struct,
2408 				&new_arg_struct, &clsn_data);
2409     }
2410   else
2411     {
2412       arg_struct = NULL_TREE;
2413       new_arg_struct = NULL_TREE;
2414       clsn_data.load = NULL_TREE;
2415       clsn_data.load_bb = exit->dest;
2416       clsn_data.store = NULL_TREE;
2417       clsn_data.store_bb = NULL;
2418     }
2419 
2420   /* Create the parallel constructs.  */
2421   loc = UNKNOWN_LOCATION;
2422   cond_stmt = last_stmt (loop->header);
2423   if (cond_stmt)
2424     loc = gimple_location (cond_stmt);
2425   create_parallel_loop (loop, create_loop_fn (loc), arg_struct, new_arg_struct,
2426 			n_threads, loc, oacc_kernels_p);
2427   if (reduction_list->elements () > 0)
2428     create_call_for_reduction (loop, reduction_list, &clsn_data);
2429 
2430   scev_reset ();
2431 
2432   /* Free loop bound estimations that could contain references to
2433      removed statements.  */
2434   FOR_EACH_LOOP (loop, 0)
2435     free_numbers_of_iterations_estimates_loop (loop);
2436 }
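
/* End-to-end illustration (a hedged sketch; the outlined-function name is
   made up and the real result depends on target and options):

     void
     f (int *a, int *b, int n)
     {
       for (int i = 0; i < n; i++)
         a[i] = b[i] + 1;
     }

   compiled with -O2 -ftree-parallelize-loops=4 has the loop outlined into
   a new function such as f._loopfn.0 that receives the shared variables
   through a struct, while f itself ends up calling the libgomp runtime
   (GOMP_parallel), much as an explicit #pragma omp parallel for would.  */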
2437 
2438 /* Returns true when LOOP contains vector phi nodes.  */
2439 
2440 static bool
2441 loop_has_vector_phi_nodes (struct loop *loop)
2442 {
2443   unsigned i;
2444   basic_block *bbs = get_loop_body_in_dom_order (loop);
2445   gphi_iterator gsi;
2446   bool res = true;
2447 
2448   for (i = 0; i < loop->num_nodes; i++)
2449     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2450       if (TREE_CODE (TREE_TYPE (PHI_RESULT (gsi.phi ()))) == VECTOR_TYPE)
2451 	goto end;
2452 
2453   res = false;
2454  end:
2455   free (bbs);
2456   return res;
2457 }
2458 
2459 /* Create a reduction_info struct, initialize it with REDUC_STMT
2460    and PHI, and insert it into REDUCTION_LIST.  */
2461 
2462 static void
2463 build_new_reduction (reduction_info_table_type *reduction_list,
2464 		     gimple *reduc_stmt, gphi *phi)
2465 {
2466   reduction_info **slot;
2467   struct reduction_info *new_reduction;
2468   enum tree_code reduction_code;
2469 
2470   gcc_assert (reduc_stmt);
2471 
2472   if (dump_file && (dump_flags & TDF_DETAILS))
2473     {
2474       fprintf (dump_file,
2475 	       "Detected reduction.  Reduction stmt is:\n");
2476       print_gimple_stmt (dump_file, reduc_stmt, 0, 0);
2477       fprintf (dump_file, "\n");
2478     }
2479 
2480   if (gimple_code (reduc_stmt) == GIMPLE_PHI)
2481     {
2482       tree op1 = PHI_ARG_DEF (reduc_stmt, 0);
2483       gimple *def1 = SSA_NAME_DEF_STMT (op1);
2484       reduction_code = gimple_assign_rhs_code (def1);
2485     }
2486 
2487   else
2488     reduction_code = gimple_assign_rhs_code (reduc_stmt);
2489 
2490   new_reduction = XCNEW (struct reduction_info);
2491 
2492   new_reduction->reduc_stmt = reduc_stmt;
2493   new_reduction->reduc_phi = phi;
2494   new_reduction->reduc_version = SSA_NAME_VERSION (gimple_phi_result (phi));
2495   new_reduction->reduction_code = reduction_code;
2496   slot = reduction_list->find_slot (new_reduction, INSERT);
2497   *slot = new_reduction;
2498 }
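
/* Example of what gets recorded (a hedged GIMPLE sketch): for a loop
   containing  sum += a[i],  reduc_phi is the loop-header phi

     sum_1 = PHI <sum_init (preheader), sum_2 (latch)>

   reduc_stmt is the in-loop statement

     sum_2 = sum_1 + tmp_3;

   and reduction_code is PLUS_EXPR.  */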
2499 
2500 /* Callback for htab_traverse.  Sets gimple_uid of reduc_phi stmts.  */
2501 
2502 int
2503 set_reduc_phi_uids (reduction_info **slot, void *data ATTRIBUTE_UNUSED)
2504 {
2505   struct reduction_info *const red = *slot;
2506   gimple_set_uid (red->reduc_phi, red->reduc_version);
2507   return 1;
2508 }
2509 
2510 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.  */
2511 
2512 static void
2513 gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list)
2514 {
2515   gphi_iterator gsi;
2516   loop_vec_info simple_loop_info;
2517   auto_vec<gphi *, 4> double_reduc_phis;
2518   auto_vec<gimple *, 4> double_reduc_stmts;
2519 
2520   if (!stmt_vec_info_vec.exists ())
2521     init_stmt_vec_info_vec ();
2522 
2523   simple_loop_info = vect_analyze_loop_form (loop);
2524   if (simple_loop_info == NULL)
2525     goto gather_done;
2526 
2527   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
2528     {
2529       gphi *phi = gsi.phi ();
2530       affine_iv iv;
2531       tree res = PHI_RESULT (phi);
2532       bool double_reduc;
2533 
2534       if (virtual_operand_p (res))
2535 	continue;
2536 
2537       if (simple_iv (loop, loop, res, &iv, true))
2538 	continue;
2539 
2540       gimple *reduc_stmt
2541 	= vect_force_simple_reduction (simple_loop_info, phi, true,
2542 				       &double_reduc, true);
2543       if (!reduc_stmt)
2544 	continue;
2545 
2546       if (double_reduc)
2547 	{
2548 	  if (loop->inner->inner != NULL)
2549 	    continue;
2550 
2551 	  double_reduc_phis.safe_push (phi);
2552 	  double_reduc_stmts.safe_push (reduc_stmt);
2553 	  continue;
2554 	}
2555 
2556       build_new_reduction (reduction_list, reduc_stmt, phi);
2557     }
2558   destroy_loop_vec_info (simple_loop_info, true);
2559 
2560   if (!double_reduc_phis.is_empty ())
2561     {
2562       simple_loop_info = vect_analyze_loop_form (loop->inner);
2563       if (simple_loop_info)
2564 	{
2565 	  gphi *phi;
2566 	  unsigned int i;
2567 
2568 	  FOR_EACH_VEC_ELT (double_reduc_phis, i, phi)
2569 	    {
2570 	      affine_iv iv;
2571 	      tree res = PHI_RESULT (phi);
2572 	      bool double_reduc;
2573 
2574 	      use_operand_p use_p;
2575 	      gimple *inner_stmt;
2576 	      bool single_use_p = single_imm_use (res, &use_p, &inner_stmt);
2577 	      gcc_assert (single_use_p);
2578 	      if (gimple_code (inner_stmt) != GIMPLE_PHI)
2579 		continue;
2580 	      gphi *inner_phi = as_a <gphi *> (inner_stmt);
2581 	      if (simple_iv (loop->inner, loop->inner, PHI_RESULT (inner_phi),
2582 			     &iv, true))
2583 		continue;
2584 
2585 	      gimple *inner_reduc_stmt
2586 		= vect_force_simple_reduction (simple_loop_info, inner_phi,
2587 					       true, &double_reduc, true);
2588 	      gcc_assert (!double_reduc);
2589 	      if (inner_reduc_stmt == NULL)
2590 		continue;
2591 
2592 	      build_new_reduction (reduction_list, double_reduc_stmts[i], phi);
2593 	    }
2594 	  destroy_loop_vec_info (simple_loop_info, true);
2595 	}
2596     }
2597 
2598  gather_done:
2599   /* Release the claim on gimple_uid.  */
2600   free_stmt_vec_info_vec ();
2601 
2602   if (reduction_list->elements () == 0)
2603     return;
2604 
2605   /* As gimple_uid is used by the vectorizer in between vect_analyze_loop_form
2606      and free_stmt_vec_info_vec, we can set gimple_uid of reduc_phi stmts only
2607      now.  */
2608   basic_block bb;
2609   FOR_EACH_BB_FN (bb, cfun)
2610     for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2611       gimple_set_uid (gsi_stmt (gsi), (unsigned int)-1);
2612   reduction_list->traverse <void *, set_reduc_phi_uids> (NULL);
2613 }
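
/* Example of the double-reduction case queued above (hedged):

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         sum += a[i][j];

   Here the outer-loop header phi for sum is fed by an inner-loop exit phi
   rather than by the addition itself, so it is collected in
   double_reduc_phis and registered as a reduction of the outer loop only
   after the inner loop's form has been analyzed.  */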
2614 
2615 /* Try to initialize NITER for the code-generation part.  */
2616 
2617 static bool
2618 try_get_loop_niter (loop_p loop, struct tree_niter_desc *niter)
2619 {
2620   edge exit = single_dom_exit (loop);
2621 
2622   gcc_assert (exit);
2623 
2624   /* We need to know the number of iterations, and there should be no uses
2625      of values defined inside the loop outside of it, unless the values are
2626      invariants of the loop.  */
2627   if (!number_of_iterations_exit (loop, exit, niter, false))
2628     {
2629       if (dump_file && (dump_flags & TDF_DETAILS))
2630 	fprintf (dump_file, "  FAILED: number of iterations not known\n");
2631       return false;
2632     }
2633 
2634   return true;
2635 }
2636 
2637 /* Return the default def of the first function argument.  */
2638 
2639 static tree
2640 get_omp_data_i_param (void)
2641 {
2642   tree decl = DECL_ARGUMENTS (cfun->decl);
2643   gcc_assert (DECL_CHAIN (decl) == NULL_TREE);
2644   return ssa_default_def (cfun, decl);
2645 }
2646 
2647 /* For PHI in loop header of LOOP, look for pattern:
2648 
2649    <bb preheader>
2650    .omp_data_i = &.omp_data_arr;
2651    addr = .omp_data_i->sum;
2652    sum_a = *addr;
2653 
2654    <bb header>:
2655    sum_b = PHI <sum_a (preheader), sum_c (latch)>
2656 
2657    and return addr.  Otherwise, return NULL_TREE.  */
2658 
2659 static tree
2660 find_reduc_addr (struct loop *loop, gphi *phi)
2661 {
2662   edge e = loop_preheader_edge (loop);
2663   tree arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
2664   gimple *stmt = SSA_NAME_DEF_STMT (arg);
2665   if (!gimple_assign_single_p (stmt))
2666     return NULL_TREE;
2667   tree memref = gimple_assign_rhs1 (stmt);
2668   if (TREE_CODE (memref) != MEM_REF)
2669     return NULL_TREE;
2670   tree addr = TREE_OPERAND (memref, 0);
2671 
2672   gimple *stmt2 = SSA_NAME_DEF_STMT (addr);
2673   if (!gimple_assign_single_p (stmt2))
2674     return NULL_TREE;
2675   tree compref = gimple_assign_rhs1 (stmt2);
2676   if (TREE_CODE (compref) != COMPONENT_REF)
2677     return NULL_TREE;
2678   tree addr2 = TREE_OPERAND (compref, 0);
2679   if (TREE_CODE (addr2) != MEM_REF)
2680     return NULL_TREE;
2681   addr2 = TREE_OPERAND (addr2, 0);
2682   if (TREE_CODE (addr2) != SSA_NAME
2683       || addr2 != get_omp_data_i_param ())
2684     return NULL_TREE;
2685 
2686   return addr;
2687 }
2688 
2689 /* Try to initialize REDUCTION_LIST for the code-generation part.
2690    REDUCTION_LIST describes the reductions.  */
2691 
2692 static bool
2693 try_create_reduction_list (loop_p loop,
2694 			   reduction_info_table_type *reduction_list,
2695 			   bool oacc_kernels_p)
2696 {
2697   edge exit = single_dom_exit (loop);
2698   gphi_iterator gsi;
2699 
2700   gcc_assert (exit);
2701 
2702   /* Try to get rid of exit phis.  */
2703   final_value_replacement_loop (loop);
2704 
2705   gather_scalar_reductions (loop, reduction_list);
2706 
2708   for (gsi = gsi_start_phis (exit->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2709     {
2710       gphi *phi = gsi.phi ();
2711       struct reduction_info *red;
2712       imm_use_iterator imm_iter;
2713       use_operand_p use_p;
2714       gimple *reduc_phi;
2715       tree val = PHI_ARG_DEF_FROM_EDGE (phi, exit);
2716 
2717       if (!virtual_operand_p (val))
2718 	{
2719 	  if (dump_file && (dump_flags & TDF_DETAILS))
2720 	    {
2721 	      fprintf (dump_file, "phi is ");
2722 	      print_gimple_stmt (dump_file, phi, 0, 0);
2723 	      fprintf (dump_file, "arg of phi to exit:   value ");
2724 	      print_generic_expr (dump_file, val, 0);
2725 	      fprintf (dump_file, " used outside loop\n");
2726 	      fprintf (dump_file,
2727 		       "  checking if it is part of reduction pattern:\n");
2728 	    }
2729 	  if (reduction_list->elements () == 0)
2730 	    {
2731 	      if (dump_file && (dump_flags & TDF_DETAILS))
2732 		fprintf (dump_file,
2733 			 "  FAILED: it is not a part of reduction.\n");
2734 	      return false;
2735 	    }
2736 	  reduc_phi = NULL;
2737 	  FOR_EACH_IMM_USE_FAST (use_p, imm_iter, val)
2738 	    {
2739 	      if (!gimple_debug_bind_p (USE_STMT (use_p))
2740 		  && flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
2741 		{
2742 		  reduc_phi = USE_STMT (use_p);
2743 		  break;
2744 		}
2745 	    }
2746 	  red = reduction_phi (reduction_list, reduc_phi);
2747 	  if (red == NULL)
2748 	    {
2749 	      if (dump_file && (dump_flags & TDF_DETAILS))
2750 		fprintf (dump_file,
2751 			 "  FAILED: it is not a part of reduction.\n");
2752 	      return false;
2753 	    }
2754 	  if (red->keep_res != NULL)
2755 	    {
2756 	      if (dump_file && (dump_flags & TDF_DETAILS))
2757 		fprintf (dump_file,
2758 			 "  FAILED: reduction has multiple exit phis.\n");
2759 	      return false;
2760 	    }
2761 	  red->keep_res = phi;
2762 	  if (dump_file && (dump_flags & TDF_DETAILS))
2763 	    {
2764 	      fprintf (dump_file, "reduction phi is  ");
2765 	      print_gimple_stmt (dump_file, red->reduc_phi, 0, 0);
2766 	      fprintf (dump_file, "reduction stmt is  ");
2767 	      print_gimple_stmt (dump_file, red->reduc_stmt, 0, 0);
2768 	    }
2769 	}
2770     }
2771 
2772   /* The iterations of the loop may communicate only through bivs whose
2773      iteration space can be distributed efficiently.  */
2774   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
2775     {
2776       gphi *phi = gsi.phi ();
2777       tree def = PHI_RESULT (phi);
2778       affine_iv iv;
2779 
2780       if (!virtual_operand_p (def) && !simple_iv (loop, loop, def, &iv, true))
2781 	{
2782 	  struct reduction_info *red;
2783 
2784 	  red = reduction_phi (reduction_list, phi);
2785 	  if (red == NULL)
2786 	    {
2787 	      if (dump_file && (dump_flags & TDF_DETAILS))
2788 		fprintf (dump_file,
2789 			 "  FAILED: scalar dependency between iterations\n");
2790 	      return false;
2791 	    }
2792 	}
2793     }
2794 
2795   if (oacc_kernels_p)
2796     {
2797       for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi);
2798 	   gsi_next (&gsi))
2799 	{
2800 	  gphi *phi = gsi.phi ();
2801 	  tree def = PHI_RESULT (phi);
2802 	  affine_iv iv;
2803 
2804 	  if (!virtual_operand_p (def)
2805 	      && !simple_iv (loop, loop, def, &iv, true))
2806 	    {
2807 	      tree addr = find_reduc_addr (loop, phi);
2808 	      if (addr == NULL_TREE)
2809 		return false;
2810 	      struct reduction_info *red = reduction_phi (reduction_list, phi);
2811 	      red->reduc_addr = addr;
2812 	    }
2813 	}
2814     }
2815 
2816   return true;
2817 }
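
/* Example of a loop rejected by the biv check above (hedged):

     for (i = 0; i < n; i++)
       {
         a[i] = last;
         last = b[i];
       }

   The header phi for  last  is neither a simple induction variable nor a
   recognized reduction, so the iterations communicate through a scalar
   dependence and the loop cannot be parallelized.  */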
2818 
2819 /* Return true if LOOP contains phis with ADDR_EXPR in args.  */
2820 
2821 static bool
2822 loop_has_phi_with_address_arg (struct loop *loop)
2823 {
2824   basic_block *bbs = get_loop_body (loop);
2825   bool res = false;
2826 
2827   unsigned i, j;
2828   gphi_iterator gsi;
2829   for (i = 0; i < loop->num_nodes; i++)
2830     for (gsi = gsi_start_phis (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
2831       {
2832 	gphi *phi = gsi.phi ();
2833 	for (j = 0; j < gimple_phi_num_args (phi); j++)
2834 	  {
2835 	    tree arg = gimple_phi_arg_def (phi, j);
2836 	    if (TREE_CODE (arg) == ADDR_EXPR)
2837 	      {
2838 		/* This should be handled by eliminate_local_variables, but that
2839 		   function currently ignores phis.  */
2840 		res = true;
2841 		goto end;
2842 	      }
2843 	  }
2844       }
2845  end:
2846   free (bbs);
2847 
2848   return res;
2849 }
2850 
2851 /* Return true if memory ref REF (corresponding to the stmt at GSI in
2852    REGION_BBS[I]) conflicts with the statements in REGION_BBS[I] after GSI,
2853    or with the statements in REGION_BBS[I + n].  REF_IS_STORE indicates
2854    whether REF is a store.  Ignore conflicts with SKIP_STMT.  */
2855 
2856 static bool
2857 ref_conflicts_with_region (gimple_stmt_iterator gsi, ao_ref *ref,
2858 			   bool ref_is_store, vec<basic_block> region_bbs,
2859 			   unsigned int i, gimple *skip_stmt)
2860 {
2861   basic_block bb = region_bbs[i];
2862   gsi_next (&gsi);
2863 
2864   while (true)
2865     {
2866       for (; !gsi_end_p (gsi);
2867 	   gsi_next (&gsi))
2868 	{
2869 	  gimple *stmt = gsi_stmt (gsi);
2870 	  if (stmt == skip_stmt)
2871 	    {
2872 	      if (dump_file)
2873 		{
2874 		  fprintf (dump_file, "skipping reduction store: ");
2875 		  print_gimple_stmt (dump_file, stmt, 0, 0);
2876 		}
2877 	      continue;
2878 	    }
2879 
2880 	  if (!gimple_vdef (stmt)
2881 	      && !gimple_vuse (stmt))
2882 	    continue;
2883 
2884 	  if (gimple_code (stmt) == GIMPLE_RETURN)
2885 	    continue;
2886 
2887 	  if (ref_is_store)
2888 	    {
2889 	      if (ref_maybe_used_by_stmt_p (stmt, ref))
2890 		{
2891 		  if (dump_file)
2892 		    {
2893 		      fprintf (dump_file, "Stmt ");
2894 		      print_gimple_stmt (dump_file, stmt, 0, 0);
2895 		    }
2896 		  return true;
2897 		}
2898 	    }
2899 	  else
2900 	    {
2901 	      if (stmt_may_clobber_ref_p_1 (stmt, ref))
2902 		{
2903 		  if (dump_file)
2904 		    {
2905 		      fprintf (dump_file, "Stmt ");
2906 		      print_gimple_stmt (dump_file, stmt, 0, 0);
2907 		    }
2908 		  return true;
2909 		}
2910 	    }
2911 	}
2912       i++;
2913       if (i == region_bbs.length ())
2914 	break;
2915       bb = region_bbs[i];
2916       gsi = gsi_start_bb (bb);
2917     }
2918 
2919   return false;
2920 }
2921 
2922 /* Return true if the bbs in REGION_BBS but not in IN_LOOP_BBS can be
2923    executed in parallel with the loop contained in REGION_BBS.  Return the
2924    stores of reduction results in REDUCTION_STORES.  */
2925 
2926 static bool
2927 oacc_entry_exit_ok_1 (bitmap in_loop_bbs, vec<basic_block> region_bbs,
2928 		      reduction_info_table_type *reduction_list,
2929 		      bitmap reduction_stores)
2930 {
2931   tree omp_data_i = get_omp_data_i_param ();
2932 
2933   unsigned i;
2934   basic_block bb;
2935   FOR_EACH_VEC_ELT (region_bbs, i, bb)
2936     {
2937       if (bitmap_bit_p (in_loop_bbs, bb->index))
2938 	continue;
2939 
2940       gimple_stmt_iterator gsi;
2941       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
2942 	   gsi_next (&gsi))
2943 	{
2944 	  gimple *stmt = gsi_stmt (gsi);
2945 	  gimple *skip_stmt = NULL;
2946 
2947 	  if (is_gimple_debug (stmt)
2948 	      || gimple_code (stmt) == GIMPLE_COND)
2949 	    continue;
2950 
2951 	  ao_ref ref;
2952 	  bool ref_is_store = false;
2953 	  if (gimple_assign_load_p (stmt))
2954 	    {
2955 	      tree rhs = gimple_assign_rhs1 (stmt);
2956 	      tree base = get_base_address (rhs);
2957 	      if (TREE_CODE (base) == MEM_REF
2958 		  && operand_equal_p (TREE_OPERAND (base, 0), omp_data_i, 0))
2959 		continue;
2960 
2961 	      tree lhs = gimple_assign_lhs (stmt);
2962 	      if (TREE_CODE (lhs) == SSA_NAME
2963 		  && has_single_use (lhs))
2964 		{
2965 		  use_operand_p use_p;
2966 		  gimple *use_stmt;
2967 		  single_imm_use (lhs, &use_p, &use_stmt);
2968 		  if (gimple_code (use_stmt) == GIMPLE_PHI)
2969 		    {
2970 		      struct reduction_info *red;
2971 		      red = reduction_phi (reduction_list, use_stmt);
2972 		      tree val = PHI_RESULT (red->keep_res);
2973 		      if (has_single_use (val))
2974 			{
2975 			  single_imm_use (val, &use_p, &use_stmt);
2976 			  if (gimple_store_p (use_stmt))
2977 			    {
2978 			      unsigned int id
2979 				= SSA_NAME_VERSION (gimple_vdef (use_stmt));
2980 			      bitmap_set_bit (reduction_stores, id);
2981 			      skip_stmt = use_stmt;
2982 			      if (dump_file)
2983 				{
2984 				  fprintf (dump_file, "found reduction load: ");
2985 				  print_gimple_stmt (dump_file, stmt, 0, 0);
2986 				}
2987 			    }
2988 			}
2989 		    }
2990 		}
2991 
2992 	      ao_ref_init (&ref, rhs);
2993 	    }
2994 	  else if (gimple_store_p (stmt))
2995 	    {
2996 	      ao_ref_init (&ref, gimple_assign_lhs (stmt));
2997 	      ref_is_store = true;
2998 	    }
2999 	  else if (gimple_code (stmt) == GIMPLE_OMP_RETURN)
3000 	    continue;
3001 	  else if (!gimple_has_side_effects (stmt)
3002 		   && !gimple_could_trap_p (stmt)
3003 		   && !stmt_could_throw_p (stmt)
3004 		   && !gimple_vdef (stmt)
3005 		   && !gimple_vuse (stmt))
3006 	    continue;
3007 	  else if (gimple_call_internal_p (stmt, IFN_GOACC_DIM_POS))
3008 	    continue;
3009 	  else if (gimple_code (stmt) == GIMPLE_RETURN)
3010 	    continue;
3011 	  else
3012 	    {
3013 	      if (dump_file)
3014 		{
3015 		  fprintf (dump_file, "Unhandled stmt in entry/exit: ");
3016 		  print_gimple_stmt (dump_file, stmt, 0, 0);
3017 		}
3018 	      return false;
3019 	    }
3020 
3021 	  if (ref_conflicts_with_region (gsi, &ref, ref_is_store, region_bbs,
3022 					 i, skip_stmt))
3023 	    {
3024 	      if (dump_file)
3025 		{
3026 		  fprintf (dump_file, "conflicts with entry/exit stmt: ");
3027 		  print_gimple_stmt (dump_file, stmt, 0, 0);
3028 		}
3029 	      return false;
3030 	    }
3031 	}
3032     }
3033 
3034   return true;
3035 }
3036 
3037 /* Find stores inside REGION_BBS and outside IN_LOOP_BBS, and guard them with
3038    gang_pos == 0, except when the stores are listed in REDUCTION_STORES.
3039    Return true if any changes were made.  */
3040 
3041 static bool
3042 oacc_entry_exit_single_gang (bitmap in_loop_bbs, vec<basic_block> region_bbs,
3043 			     bitmap reduction_stores)
3044 {
3045   tree gang_pos = NULL_TREE;
3046   bool changed = false;
3047 
3048   unsigned i;
3049   basic_block bb;
3050   FOR_EACH_VEC_ELT (region_bbs, i, bb)
3051     {
3052       if (bitmap_bit_p (in_loop_bbs, bb->index))
3053 	continue;
3054 
3055       gimple_stmt_iterator gsi;
3056       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);)
3057 	{
3058 	  gimple *stmt = gsi_stmt (gsi);
3059 
3060 	  if (!gimple_store_p (stmt))
3061 	    {
3062 	      /* Update gsi to point to next stmt.  */
3063 	      gsi_next (&gsi);
3064 	      continue;
3065 	    }
3066 
3067 	  if (bitmap_bit_p (reduction_stores,
3068 			    SSA_NAME_VERSION (gimple_vdef (stmt))))
3069 	    {
3070 	      if (dump_file)
3071 		{
3072 		  fprintf (dump_file,
3073 			   "skipped reduction store for single-gang"
3074 			   " neutering: ");
3075 		  print_gimple_stmt (dump_file, stmt, 0, 0);
3076 		}
3077 
3078 	      /* Update gsi to point to next stmt.  */
3079 	      gsi_next (&gsi);
3080 	      continue;
3081 	    }
3082 
3083 	  changed = true;
3084 
3085 	  if (gang_pos == NULL_TREE)
3086 	    {
3087 	      tree arg = build_int_cst (integer_type_node, GOMP_DIM_GANG);
3088 	      gcall *gang_single
3089 		= gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, arg);
3090 	      gang_pos = make_ssa_name (integer_type_node);
3091 	      gimple_call_set_lhs (gang_single, gang_pos);
3092 	      gimple_stmt_iterator start
3093 		= gsi_start_bb (single_succ (ENTRY_BLOCK_PTR_FOR_FN (cfun)));
3094 	      tree vuse = ssa_default_def (cfun, gimple_vop (cfun));
3095 	      gimple_set_vuse (gang_single, vuse);
3096 	      gsi_insert_before (&start, gang_single, GSI_SAME_STMT);
3097 	    }
3098 
3099 	  if (dump_file)
3100 	    {
3101 	      fprintf (dump_file,
3102 		       "found store that needs single-gang neutering: ");
3103 	      print_gimple_stmt (dump_file, stmt, 0, 0);
3104 	    }
3105 
3106 	  {
3107 	    /* Split block before store.  */
3108 	    gimple_stmt_iterator gsi2 = gsi;
3109 	    gsi_prev (&gsi2);
3110 	    edge e;
3111 	    if (gsi_end_p (gsi2))
3112 	      {
3113 		e = split_block_after_labels (bb);
3114 		gsi2 = gsi_last_bb (bb);
3115 	      }
3116 	    else
3117 	      e = split_block (bb, gsi_stmt (gsi2));
3118 	    basic_block bb2 = e->dest;
3119 
3120 	    /* Split block after store.  */
3121 	    gimple_stmt_iterator gsi3 = gsi_start_bb (bb2);
3122 	    edge e2 = split_block (bb2, gsi_stmt (gsi3));
3123 	    basic_block bb3 = e2->dest;
3124 
3125 	    gimple *cond
3126 	      = gimple_build_cond (EQ_EXPR, gang_pos, integer_zero_node,
3127 				   NULL_TREE, NULL_TREE);
3128 	    gsi_insert_after (&gsi2, cond, GSI_NEW_STMT);
3129 
3130 	    edge e3 = make_edge (bb, bb3, EDGE_FALSE_VALUE);
3131 	    e->flags = EDGE_TRUE_VALUE;
3132 
3133 	    tree vdef = gimple_vdef (stmt);
3134 	    tree vuse = gimple_vuse (stmt);
3135 
3136 	    tree phi_res = copy_ssa_name (vdef);
3137 	    gphi *new_phi = create_phi_node (phi_res, bb3);
3138 	    replace_uses_by (vdef, phi_res);
3139 	    add_phi_arg (new_phi, vuse, e3, UNKNOWN_LOCATION);
3140 	    add_phi_arg (new_phi, vdef, e2, UNKNOWN_LOCATION);
3141 
3142 	    /* Update gsi to point to next stmt.  */
3143 	    bb = bb3;
3144 	    gsi = gsi_start_bb (bb);
3145 	  }
3146 	}
3147     }
3148 
3149   return changed;
3150 }
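
/* Shape of the neutering emitted above (a hedged GIMPLE sketch):

     pos_1 = GOACC_DIM_POS (GOMP_DIM_GANG);
     ...
     if (pos_1 == 0)
       goto <bb store>;
     else
       goto <bb join>;

     <bb store>:
     *p_2 = v_3;

     <bb join>:
     # .MEM_5 = PHI <.MEM_4 (guard), .MEM_6 (store)>

   Only gang 0 performs the store, and the virtual operand is merged back
   by a phi for the path that skips it.  */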
3151 
3152 /* Return true if the statements before and after LOOP can be executed in
3153    parallel by all gangs of the function containing the loop.  Guard
3154    conflicting stores outside LOOP so that only a single gang executes them.  */
3155 
3156 static bool
3157 oacc_entry_exit_ok (struct loop *loop,
3158 		    reduction_info_table_type *reduction_list)
3159 {
3160   basic_block *loop_bbs = get_loop_body_in_dom_order (loop);
3161   vec<basic_block> region_bbs
3162     = get_all_dominated_blocks (CDI_DOMINATORS, ENTRY_BLOCK_PTR_FOR_FN (cfun));
3163 
3164   bitmap in_loop_bbs = BITMAP_ALLOC (NULL);
3165   bitmap_clear (in_loop_bbs);
3166   for (unsigned int i = 0; i < loop->num_nodes; i++)
3167     bitmap_set_bit (in_loop_bbs, loop_bbs[i]->index);
3168 
3169   bitmap reduction_stores = BITMAP_ALLOC (NULL);
3170   bool res = oacc_entry_exit_ok_1 (in_loop_bbs, region_bbs, reduction_list,
3171 				   reduction_stores);
3172 
3173   if (res)
3174     {
3175       bool changed = oacc_entry_exit_single_gang (in_loop_bbs, region_bbs,
3176 						  reduction_stores);
3177       if (changed)
3178 	{
3179 	  free_dominance_info (CDI_DOMINATORS);
3180 	  calculate_dominance_info (CDI_DOMINATORS);
3181 	}
3182     }
3183 
3184   region_bbs.release ();
3185   free (loop_bbs);
3186 
3187   BITMAP_FREE (in_loop_bbs);
3188   BITMAP_FREE (reduction_stores);
3189 
3190   return res;
3191 }
3192 
3193 /* Detect parallel loops and generate parallel code using libgomp
3194    primitives.  Returns true if some loop was parallelized, false
3195    otherwise.  */
3196 
3197 static bool
3198 parallelize_loops (bool oacc_kernels_p)
3199 {
3200   unsigned n_threads;
3201   bool changed = false;
3202   struct loop *loop;
3203   struct loop *skip_loop = NULL;
3204   struct tree_niter_desc niter_desc;
3205   struct obstack parloop_obstack;
3206   HOST_WIDE_INT estimated;
3207   source_location loop_loc;
3208 
3209   /* Do not parallelize loops in the functions created by parallelization.  */
3210   if (!oacc_kernels_p
3211       && parallelized_function_p (cfun->decl))
3212     return false;
3213 
3214   /* Do not parallelize loops in offloaded functions.  */
3215   if (!oacc_kernels_p
3216       && oacc_get_fn_attrib (cfun->decl) != NULL)
3217     return false;
3218 
3219   if (cfun->has_nonlocal_label)
3220     return false;
3221 
3222   /* For OpenACC kernels, n_threads will be determined later; otherwise, it's
3223      the argument to -ftree-parallelize-loops.  */
3224   if (oacc_kernels_p)
3225     n_threads = 0;
3226   else
3227     n_threads = flag_tree_parallelize_loops;
3228 
3229   gcc_obstack_init (&parloop_obstack);
3230   reduction_info_table_type reduction_list (10);
3231 
3232   calculate_dominance_info (CDI_DOMINATORS);
3233 
3234   FOR_EACH_LOOP (loop, 0)
3235     {
3236       if (loop == skip_loop)
3237 	{
3238 	  if (!loop->in_oacc_kernels_region
3239 	      && dump_file && (dump_flags & TDF_DETAILS))
3240 	    fprintf (dump_file,
3241 		     "Skipping loop %d as inner loop of parallelized loop\n",
3242 		     loop->num);
3243 
3244 	  skip_loop = loop->inner;
3245 	  continue;
3246 	}
3247       else
3248 	skip_loop = NULL;
3249 
3250       reduction_list.empty ();
3251 
3252       if (oacc_kernels_p)
3253 	{
3254 	  if (!loop->in_oacc_kernels_region)
3255 	    continue;
3256 
3257 	  /* Don't try to parallelize inner loops in an oacc kernels region.  */
3258 	  if (loop->inner)
3259 	    skip_loop = loop->inner;
3260 
3261 	  if (dump_file && (dump_flags & TDF_DETAILS))
3262 	    fprintf (dump_file,
3263 		     "Trying loop %d with header bb %d in oacc kernels"
3264 		     " region\n", loop->num, loop->header->index);
3265 	}
3266 
3267       if (dump_file && (dump_flags & TDF_DETAILS))
3268 	{
3269 	  fprintf (dump_file, "Trying loop %d as candidate\n", loop->num);
3270 	  if (loop->inner)
3271 	    fprintf (dump_file, "loop %d is not innermost\n", loop->num);
3272 	  else
3273 	    fprintf (dump_file, "loop %d is innermost\n", loop->num);
3274 	}
3275 
3276       /* If we use autopar in the graphite pass, we use the dependency
3277 	 checking results it marked on the loops.  */
3278       if (flag_loop_parallelize_all && !loop->can_be_parallel)
3279 	{
3280 	  if (dump_file && (dump_flags & TDF_DETAILS))
3281 	    fprintf (dump_file, "loop is not parallel according to graphite\n");
3282 	  continue;
3283 	}
3284 
3285       if (!single_dom_exit (loop))
3286 	{
3287 	  if (dump_file && (dump_flags & TDF_DETAILS))
3288 	    fprintf (dump_file, "loop is !single_dom_exit\n");
3289 	  continue;
3290 	}
3293 
3294       if (/* And of course, the loop must be parallelizable.  */
3295 	  !can_duplicate_loop_p (loop)
3296 	  || loop_has_blocks_with_irreducible_flag (loop)
3297 	  || (loop_preheader_edge (loop)->src->flags & BB_IRREDUCIBLE_LOOP)
3298 	  /* FIXME: the check for vector phi nodes could be removed.  */
3299 	  || loop_has_vector_phi_nodes (loop))
3300 	continue;
3301 
3302       estimated = estimated_stmt_executions_int (loop);
3303       if (estimated == -1)
3304 	estimated = likely_max_stmt_executions_int (loop);
3305       /* FIXME: Bypass this check as graphite doesn't update the
3306 	 count and frequency correctly now.  */
3307       if (!flag_loop_parallelize_all
3308 	  && !oacc_kernels_p
3309 	  && ((estimated != -1
3310 	       && estimated <= (HOST_WIDE_INT) n_threads * MIN_PER_THREAD)
3311 	      /* Do not bother with loops in cold areas.  */
3312 	      || optimize_loop_nest_for_size_p (loop)))
3313 	continue;
3314 
3315       if (!try_get_loop_niter (loop, &niter_desc))
3316 	continue;
3317 
3318       if (!try_create_reduction_list (loop, &reduction_list, oacc_kernels_p))
3319 	continue;
3320 
3321       if (loop_has_phi_with_address_arg (loop))
3322 	continue;
3323 
3324       if (!flag_loop_parallelize_all
3325 	  && !loop_parallel_p (loop, &parloop_obstack))
3326 	continue;
3327 
3328       if (oacc_kernels_p
3329 	  && !oacc_entry_exit_ok (loop, &reduction_list))
3330 	{
3331 	  if (dump_file)
3332 	    fprintf (dump_file, "entry/exit not ok: FAILED\n");
3333 	  continue;
3334 	}
3335 
3336       changed = true;
3337       skip_loop = loop->inner;
3338 
3339       loop_loc = find_loop_location (loop);
3340       if (loop->inner)
3341 	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
3342 			 "parallelizing outer loop %d\n", loop->num);
3343       else
3344 	dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, loop_loc,
3345 			 "parallelizing inner loop %d\n", loop->num);
3346 
3347       gen_parallel_loop (loop, &reduction_list,
3348 			 n_threads, &niter_desc, oacc_kernels_p);
3349     }
3350 
3351   obstack_free (&parloop_obstack, NULL);
3352 
3353   /* Parallelization will cause new function calls to be inserted through
3354      which local variables will escape.  Reset the points-to solution
3355      for ESCAPED.  */
3356   if (changed)
3357     pt_solution_reset (&cfun->gimple_df->escaped);
3358 
3359   return changed;
3360 }
3361 
3362 /* Parallelization.  */
3363 
3364 namespace {
3365 
3366 const pass_data pass_data_parallelize_loops =
3367 {
3368   GIMPLE_PASS, /* type */
3369   "parloops", /* name */
3370   OPTGROUP_LOOP, /* optinfo_flags */
3371   TV_TREE_PARALLELIZE_LOOPS, /* tv_id */
3372   ( PROP_cfg | PROP_ssa ), /* properties_required */
3373   0, /* properties_provided */
3374   0, /* properties_destroyed */
3375   0, /* todo_flags_start */
3376   0, /* todo_flags_finish */
3377 };
3378 
3379 class pass_parallelize_loops : public gimple_opt_pass
3380 {
3381 public:
3382   pass_parallelize_loops (gcc::context *ctxt)
3383     : gimple_opt_pass (pass_data_parallelize_loops, ctxt),
3384       oacc_kernels_p (false)
3385   {}
3386 
3387   /* opt_pass methods: */
3388   virtual bool gate (function *)
3389   {
3390     if (oacc_kernels_p)
3391       return flag_openacc;
3392     else
3393       return flag_tree_parallelize_loops > 1;
3394   }
3395   virtual unsigned int execute (function *);
3396   opt_pass * clone () { return new pass_parallelize_loops (m_ctxt); }
3397   void set_pass_param (unsigned int n, bool param)
3398     {
3399       gcc_assert (n == 0);
3400       oacc_kernels_p = param;
3401     }
3402 
3403  private:
3404   bool oacc_kernels_p;
3405 }; // class pass_parallelize_loops
3406 
3407 unsigned
3408 pass_parallelize_loops::execute (function *fun)
3409 {
3410   tree nthreads = builtin_decl_explicit (BUILT_IN_OMP_GET_NUM_THREADS);
3411   if (nthreads == NULL_TREE)
3412     return 0;
3413 
3414   bool in_loop_pipeline = scev_initialized_p ();
3415   if (!in_loop_pipeline)
3416     loop_optimizer_init (LOOPS_NORMAL
3417 			 | LOOPS_HAVE_RECORDED_EXITS);
3418 
3419   if (number_of_loops (fun) <= 1)
3420     return 0;
3421 
3422   if (!in_loop_pipeline)
3423     {
3424       rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa);
3425       scev_initialize ();
3426     }
3427 
3428   unsigned int todo = 0;
3429   if (parallelize_loops (oacc_kernels_p))
3430     {
3431       fun->curr_properties &= ~(PROP_gimple_eomp);
3432 
3433       checking_verify_loop_structure ();
3434 
3435       todo |= TODO_update_ssa;
3436     }
3437 
3438   if (!in_loop_pipeline)
3439     {
3440       scev_finalize ();
3441       loop_optimizer_finalize ();
3442     }
3443 
3444   return todo;
3445 }
3446 
3447 } // anon namespace
3448 
3449 gimple_opt_pass *
3450 make_pass_parallelize_loops (gcc::context *ctxt)
3451 {
3452   return new pass_parallelize_loops (ctxt);
3453 }
3454