xref: /netbsd-src/external/gpl3/gcc/dist/gcc/omp-oacc-neuter-broadcast.cc (revision b1e838363e3c6fc78a55519254d99869742dd33c)
1 /* OpenACC worker partitioning via middle end neutering/broadcasting scheme
2 
3    Copyright (C) 2015-2022 Free Software Foundation, Inc.
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published
9    by the Free Software Foundation; either version 3, or (at your
10    option) any later version.
11 
12    GCC is distributed in the hope that it will be useful, but WITHOUT
13    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15    License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "rtl.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "tree-pass.h"
29 #include "ssa.h"
30 #include "cgraph.h"
31 #include "pretty-print.h"
32 #include "fold-const.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimple-walk.h"
36 #include "tree-inline.h"
37 #include "langhooks.h"
38 #include "omp-general.h"
39 #include "omp-low.h"
40 #include "gimple-pretty-print.h"
41 #include "cfghooks.h"
42 #include "insn-config.h"
43 #include "recog.h"
44 #include "internal-fn.h"
45 #include "bitmap.h"
46 #include "tree-nested.h"
47 #include "stor-layout.h"
48 #include "tree-ssa-threadupdate.h"
49 #include "tree-into-ssa.h"
50 #include "splay-tree.h"
51 #include "target.h"
52 #include "cfgloop.h"
53 #include "tree-cfg.h"
54 #include "omp-offload.h"
55 #include "attribs.h"
56 #include "targhooks.h"
57 #include "diagnostic-core.h"
58 
59 /* Loop structure of the function.  The entire function is described as
60    a NULL loop.  */
61 /* Adapted from 'gcc/config/nvptx/nvptx.cc:struct parallel'.  */
62 
63 struct parallel_g
64 {
65   /* Parent parallel.  */
66   parallel_g *parent;
67 
68   /* Next sibling parallel.  */
69   parallel_g *next;
70 
71   /* First child parallel.  */
72   parallel_g *inner;
73 
74   /* Partitioning mask of the parallel.  */
75   unsigned mask;
76 
77   /* Partitioning used within inner parallels. */
78   unsigned inner_mask;
79 
80   /* Location of parallel forked and join.  The forked is the first
81      block in the parallel and the join is the first block after of
82      the partition.  */
83   basic_block forked_block;
84   basic_block join_block;
85 
86   gimple *forked_stmt;
87   gimple *join_stmt;
88 
89   gimple *fork_stmt;
90   gimple *joining_stmt;
91 
92   /* Basic blocks in this parallel, but not in child parallels.  The
93      FORKED and JOINING blocks are in the partition.  The FORK and JOIN
94      blocks are not.  */
95   auto_vec<basic_block> blocks;
96 
97   tree record_type;
98   tree sender_decl;
99   tree receiver_decl;
100 
101 public:
102   parallel_g (parallel_g *parent, unsigned mode);
103   ~parallel_g ();
104 };
105 
106 /* Constructor links the new parallel into it's parent's chain of
107    children.  */
108 
parallel_g(parallel_g * parent_,unsigned mask_)109 parallel_g::parallel_g (parallel_g *parent_, unsigned mask_)
110   :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
111 {
112   forked_block = join_block = 0;
113   forked_stmt = join_stmt = NULL;
114   fork_stmt = joining_stmt = NULL;
115 
116   record_type = NULL_TREE;
117   sender_decl = NULL_TREE;
118   receiver_decl = NULL_TREE;
119 
120   if (parent)
121     {
122       next = parent->inner;
123       parent->inner = this;
124     }
125 }
126 
~parallel_g()127 parallel_g::~parallel_g ()
128 {
129   delete inner;
130   delete next;
131 }
132 
133 static bool
local_var_based_p(tree decl)134 local_var_based_p (tree decl)
135 {
136   switch (TREE_CODE (decl))
137     {
138     case VAR_DECL:
139       return !is_global_var (decl);
140 
141     case COMPONENT_REF:
142     case BIT_FIELD_REF:
143     case ARRAY_REF:
144       return local_var_based_p (TREE_OPERAND (decl, 0));
145 
146     default:
147       return false;
148     }
149 }
150 
151 /* Map of basic blocks to gimple stmts.  */
152 typedef hash_map<basic_block, gimple *> bb_stmt_map_t;
153 
154 /* Calls to OpenACC routines are made by all workers/wavefronts/warps, since
155    the routine likely contains partitioned loops (else will do its own
156    neutering and variable propagation). Return TRUE if a function call CALL
157    should be made in (worker) single mode instead, rather than redundant
158    mode.  */
159 
160 static bool
omp_sese_active_worker_call(gcall * call)161 omp_sese_active_worker_call (gcall *call)
162 {
163 #define GOMP_DIM_SEQ GOMP_DIM_MAX
164   tree fndecl = gimple_call_fndecl (call);
165 
166   if (!fndecl)
167     return true;
168 
169   tree attrs = oacc_get_fn_attrib (fndecl);
170 
171   if (!attrs)
172     return true;
173 
174   int level = oacc_fn_attrib_level (attrs);
175 
176   /* Neither regular functions nor "seq" routines should be run by all threads
177      in worker-single mode.  */
178   return level == -1 || level == GOMP_DIM_SEQ;
179 #undef GOMP_DIM_SEQ
180 }
181 
182 /* Split basic blocks such that each forked and join unspecs are at
183    the start of their basic blocks.  Thus afterwards each block will
184    have a single partitioning mode.  We also do the same for return
185    insns, as they are executed by every thread.  Return the
186    partitioning mode of the function as a whole.  Populate MAP with
187    head and tail blocks.  We also clear the BB visited flag, which is
188    used when finding partitions.  */
189 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_split_blocks'.  */
190 
191 static void
omp_sese_split_blocks(bb_stmt_map_t * map)192 omp_sese_split_blocks (bb_stmt_map_t *map)
193 {
194   auto_vec<gimple *> worklist;
195   basic_block block;
196 
197   /* Locate all the reorg instructions of interest.  */
198   FOR_ALL_BB_FN (block, cfun)
199     {
200       /* Clear visited flag, for use by parallel locator  */
201       block->flags &= ~BB_VISITED;
202 
203       for (gimple_stmt_iterator gsi = gsi_start_bb (block);
204 	   !gsi_end_p (gsi);
205 	   gsi_next (&gsi))
206 	{
207 	  gimple *stmt = gsi_stmt (gsi);
208 
209 	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
210 	    {
211 	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
212 		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
213 
214 	      if (k == IFN_UNIQUE_OACC_JOIN)
215 		worklist.safe_push (stmt);
216 	      else if (k == IFN_UNIQUE_OACC_FORK)
217 		{
218 		  gcc_assert (gsi_one_before_end_p (gsi));
219 		  basic_block forked_block = single_succ (block);
220 		  gimple_stmt_iterator gsi2 = gsi_start_bb (forked_block);
221 
222 		  /* We push a NOP as a placeholder for the "forked" stmt.
223 		     This is then recognized in omp_sese_find_par.  */
224 		  gimple *nop = gimple_build_nop ();
225 		  gsi_insert_before (&gsi2, nop, GSI_SAME_STMT);
226 
227 		  worklist.safe_push (nop);
228 		}
229 	    }
230 	  else if (gimple_code (stmt) == GIMPLE_RETURN
231 		   || gimple_code (stmt) == GIMPLE_COND
232 		   || gimple_code (stmt) == GIMPLE_SWITCH
233 		   || (gimple_code (stmt) == GIMPLE_CALL
234 		       && !gimple_call_internal_p (stmt)
235 		       && !omp_sese_active_worker_call (as_a <gcall *> (stmt))))
236 	    worklist.safe_push (stmt);
237 	  else if (is_gimple_assign (stmt))
238 	    {
239 	      tree lhs = gimple_assign_lhs (stmt);
240 
241 	      /* Force assignments to components/fields/elements of local
242 		 aggregates into fully-partitioned (redundant) mode.  This
243 		 avoids having to broadcast the whole aggregate.  The RHS of
244 		 the assignment will be propagated using the normal
245 		 mechanism.  */
246 
247 	      switch (TREE_CODE (lhs))
248 		{
249 		case COMPONENT_REF:
250 		case BIT_FIELD_REF:
251 		case ARRAY_REF:
252 		  {
253 		    tree aggr = TREE_OPERAND (lhs, 0);
254 
255 		    if (local_var_based_p (aggr))
256 		      worklist.safe_push (stmt);
257 		  }
258 		  break;
259 
260 		default:
261 		  ;
262 		}
263 	    }
264 	}
265     }
266 
267   /* Split blocks on the worklist.  */
268   unsigned ix;
269   gimple *stmt;
270 
271   for (ix = 0; worklist.iterate (ix, &stmt); ix++)
272     {
273       basic_block block = gimple_bb (stmt);
274 
275       if (gimple_code (stmt) == GIMPLE_COND)
276 	{
277 	  gcond *orig_cond = as_a <gcond *> (stmt);
278 	  tree_code code = gimple_expr_code (orig_cond);
279 	  tree pred = make_ssa_name (boolean_type_node);
280 	  gimple *asgn = gimple_build_assign (pred, code,
281 			   gimple_cond_lhs (orig_cond),
282 			   gimple_cond_rhs (orig_cond));
283 	  gcond *new_cond
284 	    = gimple_build_cond (NE_EXPR, pred, boolean_false_node,
285 				 gimple_cond_true_label (orig_cond),
286 				 gimple_cond_false_label (orig_cond));
287 
288 	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
289 	  gsi_insert_before (&gsi, asgn, GSI_SAME_STMT);
290 	  gsi_replace (&gsi, new_cond, true);
291 
292 	  edge e = split_block (block, asgn);
293 	  block = e->dest;
294 	  map->get_or_insert (block) = new_cond;
295 	}
296       else if ((gimple_code (stmt) == GIMPLE_CALL
297 		&& !gimple_call_internal_p (stmt))
298 	       || is_gimple_assign (stmt))
299 	{
300 	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
301 	  gsi_prev (&gsi);
302 
303 	  edge call = split_block (block, gsi_stmt (gsi));
304 
305 	  gimple *call_stmt = gsi_stmt (gsi_start_bb (call->dest));
306 
307 	  edge call_to_ret = split_block (call->dest, call_stmt);
308 
309 	  map->get_or_insert (call_to_ret->src) = call_stmt;
310 	}
311       else
312 	{
313 	  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
314 	  gsi_prev (&gsi);
315 
316 	  if (gsi_end_p (gsi))
317 	    map->get_or_insert (block) = stmt;
318 	  else
319 	    {
320 	      /* Split block before insn. The insn is in the new block.  */
321 	      edge e = split_block (block, gsi_stmt (gsi));
322 
323 	      block = e->dest;
324 	      map->get_or_insert (block) = stmt;
325 	    }
326 	}
327     }
328 }
329 
330 static const char *
mask_name(unsigned mask)331 mask_name (unsigned mask)
332 {
333   switch (mask)
334     {
335     case 0: return "gang redundant";
336     case 1: return "gang partitioned";
337     case 2: return "worker partitioned";
338     case 3: return "gang+worker partitioned";
339     case 4: return "vector partitioned";
340     case 5: return "gang+vector partitioned";
341     case 6: return "worker+vector partitioned";
342     case 7: return "fully partitioned";
343     default: return "<illegal>";
344     }
345 }
346 
347 /* Dump this parallel and all its inner parallels.  */
348 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_dump_pars'.  */
349 
350 static void
omp_sese_dump_pars(parallel_g * par,unsigned depth)351 omp_sese_dump_pars (parallel_g *par, unsigned depth)
352 {
353   fprintf (dump_file, "%u: mask %d (%s) head=%d, tail=%d\n",
354 	   depth, par->mask, mask_name (par->mask),
355 	   par->forked_block ? par->forked_block->index : -1,
356 	   par->join_block ? par->join_block->index : -1);
357 
358   fprintf (dump_file, "    blocks:");
359 
360   basic_block block;
361   for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
362     fprintf (dump_file, " %d", block->index);
363   fprintf (dump_file, "\n");
364   if (par->inner)
365     omp_sese_dump_pars (par->inner, depth + 1);
366 
367   if (par->next)
368     omp_sese_dump_pars (par->next, depth);
369 }
370 
371 /* If BLOCK contains a fork/join marker, process it to create or
372    terminate a loop structure.  Add this block to the current loop,
373    and then walk successor blocks.   */
374 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_find_par'.  */
375 
376 static parallel_g *
omp_sese_find_par(bb_stmt_map_t * map,parallel_g * par,basic_block block)377 omp_sese_find_par (bb_stmt_map_t *map, parallel_g *par, basic_block block)
378 {
379   if (block->flags & BB_VISITED)
380     return par;
381   block->flags |= BB_VISITED;
382 
383   if (gimple **stmtp = map->get (block))
384     {
385       gimple *stmt = *stmtp;
386 
387       if (gimple_code (stmt) == GIMPLE_COND
388 	  || gimple_code (stmt) == GIMPLE_SWITCH
389 	  || gimple_code (stmt) == GIMPLE_RETURN
390 	  || (gimple_code (stmt) == GIMPLE_CALL
391 	      && !gimple_call_internal_p (stmt))
392 	  || is_gimple_assign (stmt))
393 	{
394 	  /* A single block that is forced to be at the maximum partition
395 	     level.  Make a singleton par for it.  */
396 	  par = new parallel_g (par, GOMP_DIM_MASK (GOMP_DIM_GANG)
397 				   | GOMP_DIM_MASK (GOMP_DIM_WORKER)
398 				   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
399 	  par->forked_block = block;
400 	  par->forked_stmt = stmt;
401 	  par->blocks.safe_push (block);
402 	  par = par->parent;
403 	  goto walk_successors;
404 	}
405       else if (gimple_nop_p (stmt))
406 	{
407 	  basic_block pred = single_pred (block);
408 	  gcc_assert (pred);
409 	  gimple_stmt_iterator gsi = gsi_last_bb (pred);
410 	  gimple *final_stmt = gsi_stmt (gsi);
411 
412 	  if (gimple_call_internal_p (final_stmt, IFN_UNIQUE))
413 	    {
414 	      gcall *call = as_a <gcall *> (final_stmt);
415 	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
416 		TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
417 
418 	      if (k == IFN_UNIQUE_OACC_FORK)
419 		{
420 		  HOST_WIDE_INT dim
421 		    = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
422 		  unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
423 
424 		  par = new parallel_g (par, mask);
425 		  par->forked_block = block;
426 		  par->forked_stmt = final_stmt;
427 		  par->fork_stmt = stmt;
428 		}
429 	      else
430 		gcc_unreachable ();
431 	    }
432 	  else
433 	    gcc_unreachable ();
434 	}
435       else if (gimple_call_internal_p (stmt, IFN_UNIQUE))
436 	{
437 	  gcall *call = as_a <gcall *> (stmt);
438 	  enum ifn_unique_kind k = ((enum ifn_unique_kind)
439 	    TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
440 	  if (k == IFN_UNIQUE_OACC_JOIN)
441 	    {
442 	      HOST_WIDE_INT dim = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
443 	      unsigned mask = (dim >= 0) ? GOMP_DIM_MASK (dim) : 0;
444 
445 	      gcc_assert (par->mask == mask);
446 	      par->join_block = block;
447 	      par->join_stmt = stmt;
448 	      par = par->parent;
449 	    }
450 	  else
451 	    gcc_unreachable ();
452 	}
453       else
454 	gcc_unreachable ();
455     }
456 
457   if (par)
458     /* Add this block onto the current loop's list of blocks.  */
459     par->blocks.safe_push (block);
460   else
461     /* This must be the entry block.  Create a NULL parallel.  */
462     par = new parallel_g (0, 0);
463 
464 walk_successors:
465   /* Walk successor blocks.  */
466   edge e;
467   edge_iterator ei;
468 
469   FOR_EACH_EDGE (e, ei, block->succs)
470     omp_sese_find_par (map, par, e->dest);
471 
472   return par;
473 }
474 
475 /* DFS walk the CFG looking for fork & join markers.  Construct
476    loop structures as we go.  MAP is a mapping of basic blocks
477    to head & tail markers, discovered when splitting blocks.  This
478    speeds up the discovery.  We rely on the BB visited flag having
479    been cleared when splitting blocks.  */
480 /* Adapted from 'gcc/config/nvptx/nvptx.cc:nvptx_discover_pars'.  */
481 
482 static parallel_g *
omp_sese_discover_pars(bb_stmt_map_t * map)483 omp_sese_discover_pars (bb_stmt_map_t *map)
484 {
485   basic_block block;
486 
487   /* Mark exit blocks as visited.  */
488   block = EXIT_BLOCK_PTR_FOR_FN (cfun);
489   block->flags |= BB_VISITED;
490 
491   /* And entry block as not.  */
492   block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
493   block->flags &= ~BB_VISITED;
494 
495   parallel_g *par = omp_sese_find_par (map, 0, block);
496 
497   if (dump_file)
498     {
499       fprintf (dump_file, "\nLoops\n");
500       omp_sese_dump_pars (par, 0);
501       fprintf (dump_file, "\n");
502     }
503 
504   return par;
505 }
506 
507 static void
populate_single_mode_bitmaps(parallel_g * par,bitmap worker_single,bitmap vector_single,unsigned outer_mask,int depth)508 populate_single_mode_bitmaps (parallel_g *par, bitmap worker_single,
509 			      bitmap vector_single, unsigned outer_mask,
510 			      int depth)
511 {
512   unsigned mask = outer_mask | par->mask;
513 
514   basic_block block;
515 
516   for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
517     {
518       if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
519 	bitmap_set_bit (worker_single, block->index);
520 
521       if ((mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) == 0)
522 	bitmap_set_bit (vector_single, block->index);
523     }
524 
525   if (par->inner)
526     populate_single_mode_bitmaps (par->inner, worker_single, vector_single,
527 				  mask, depth + 1);
528   if (par->next)
529     populate_single_mode_bitmaps (par->next, worker_single, vector_single,
530 				  outer_mask, depth);
531 }
532 
533 /* A map from SSA names or var decls to record fields.  */
534 
535 typedef hash_map<tree, tree> field_map_t;
536 
537 /* For each propagation record type, this is a map from SSA names or var decls
538    to propagate, to the field in the record type that should be used for
539    transmission and reception.  */
540 
541 typedef hash_map<tree, field_map_t> record_field_map_t;
542 
543 static void
install_var_field(tree var,tree record_type,field_map_t * fields)544 install_var_field (tree var, tree record_type, field_map_t *fields)
545 {
546   tree name;
547   char tmp[20];
548 
549   if (TREE_CODE (var) == SSA_NAME)
550     {
551       name = SSA_NAME_IDENTIFIER (var);
552       if (!name)
553 	{
554 	  sprintf (tmp, "_%u", (unsigned) SSA_NAME_VERSION (var));
555 	  name = get_identifier (tmp);
556 	}
557     }
558   else if (TREE_CODE (var) == VAR_DECL)
559     {
560       name = DECL_NAME (var);
561       if (!name)
562 	{
563 	  sprintf (tmp, "D_%u", (unsigned) DECL_UID (var));
564 	  name = get_identifier (tmp);
565 	}
566     }
567   else
568     gcc_unreachable ();
569 
570   gcc_assert (!fields->get (var));
571 
572   tree type = TREE_TYPE (var);
573 
574   if (POINTER_TYPE_P (type)
575       && TYPE_RESTRICT (type))
576     type = build_qualified_type (type, TYPE_QUALS (type) & ~TYPE_QUAL_RESTRICT);
577 
578   tree field = build_decl (BUILTINS_LOCATION, FIELD_DECL, name, type);
579 
580   if (TREE_CODE (var) == VAR_DECL && type == TREE_TYPE (var))
581     {
582       SET_DECL_ALIGN (field, DECL_ALIGN (var));
583       DECL_USER_ALIGN (field) = DECL_USER_ALIGN (var);
584       TREE_THIS_VOLATILE (field) = TREE_THIS_VOLATILE (var);
585     }
586   else
587     SET_DECL_ALIGN (field, TYPE_ALIGN (type));
588 
589   fields->put (var, field);
590 
591   insert_field_into_struct (record_type, field);
592 }
593 
594 /* Sets of SSA_NAMES or VAR_DECLs to propagate.  */
595 typedef hash_set<tree> propagation_set;
596 
597 static void
find_ssa_names_to_propagate(parallel_g * par,unsigned outer_mask,bitmap worker_single,bitmap vector_single,vec<propagation_set * > * prop_set)598 find_ssa_names_to_propagate (parallel_g *par, unsigned outer_mask,
599 			     bitmap worker_single, bitmap vector_single,
600 			     vec<propagation_set *> *prop_set)
601 {
602   unsigned mask = outer_mask | par->mask;
603 
604   if (par->inner)
605     find_ssa_names_to_propagate (par->inner, mask, worker_single,
606 				 vector_single, prop_set);
607   if (par->next)
608     find_ssa_names_to_propagate (par->next, outer_mask, worker_single,
609 				 vector_single, prop_set);
610 
611   if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
612     {
613       basic_block block;
614       int ix;
615 
616       for (ix = 0; par->blocks.iterate (ix, &block); ix++)
617 	{
618 	  for (gphi_iterator psi = gsi_start_phis (block);
619 	       !gsi_end_p (psi); gsi_next (&psi))
620 	    {
621 	      gphi *phi = psi.phi ();
622 	      use_operand_p use;
623 	      ssa_op_iter iter;
624 
625 	      FOR_EACH_PHI_ARG (use, phi, iter, SSA_OP_USE)
626 		{
627 		  tree var = USE_FROM_PTR (use);
628 
629 		  if (TREE_CODE (var) != SSA_NAME)
630 		    continue;
631 
632 		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
633 
634 		  if (gimple_nop_p (def_stmt))
635 		    continue;
636 
637 		  basic_block def_bb = gimple_bb (def_stmt);
638 
639 		  if (bitmap_bit_p (worker_single, def_bb->index))
640 		    {
641 		      if (!(*prop_set)[def_bb->index])
642 			(*prop_set)[def_bb->index] = new propagation_set;
643 
644 		      propagation_set *ws_prop = (*prop_set)[def_bb->index];
645 
646 		      ws_prop->add (var);
647 		    }
648 		}
649 	    }
650 
651 	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
652 	       !gsi_end_p (gsi); gsi_next (&gsi))
653 	    {
654 	      use_operand_p use;
655 	      ssa_op_iter iter;
656 	      gimple *stmt = gsi_stmt (gsi);
657 
658 	      FOR_EACH_SSA_USE_OPERAND (use, stmt, iter, SSA_OP_USE)
659 		{
660 		  tree var = USE_FROM_PTR (use);
661 
662 		  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
663 
664 		  if (gimple_nop_p (def_stmt))
665 		    continue;
666 
667 		  basic_block def_bb = gimple_bb (def_stmt);
668 
669 		  if (bitmap_bit_p (worker_single, def_bb->index))
670 		    {
671 		      if (!(*prop_set)[def_bb->index])
672 			(*prop_set)[def_bb->index] = new propagation_set;
673 
674 		      propagation_set *ws_prop = (*prop_set)[def_bb->index];
675 
676 		      ws_prop->add (var);
677 		    }
678 		}
679 	    }
680 	}
681     }
682 }
683 
684 /* Callback for walk_gimple_stmt to find RHS VAR_DECLs (uses) in a
685    statement.  */
686 
687 static tree
find_partitioned_var_uses_1(tree * node,int *,void * data)688 find_partitioned_var_uses_1 (tree *node, int *, void *data)
689 {
690   walk_stmt_info *wi = (walk_stmt_info *) data;
691   hash_set<tree> *partitioned_var_uses = (hash_set<tree> *) wi->info;
692 
693   if (!wi->is_lhs && VAR_P (*node))
694     partitioned_var_uses->add (*node);
695 
696   return NULL_TREE;
697 }
698 
699 static void
find_partitioned_var_uses(parallel_g * par,unsigned outer_mask,hash_set<tree> * partitioned_var_uses)700 find_partitioned_var_uses (parallel_g *par, unsigned outer_mask,
701 			   hash_set<tree> *partitioned_var_uses)
702 {
703   unsigned mask = outer_mask | par->mask;
704 
705   if (par->inner)
706     find_partitioned_var_uses (par->inner, mask, partitioned_var_uses);
707   if (par->next)
708     find_partitioned_var_uses (par->next, outer_mask, partitioned_var_uses);
709 
710   if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
711     {
712       basic_block block;
713       int ix;
714 
715       for (ix = 0; par->blocks.iterate (ix, &block); ix++)
716 	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
717 	     !gsi_end_p (gsi); gsi_next (&gsi))
718 	  {
719 	    walk_stmt_info wi;
720 	    memset (&wi, 0, sizeof (wi));
721 	    wi.info = (void *) partitioned_var_uses;
722 	    walk_gimple_stmt (&gsi, NULL, find_partitioned_var_uses_1, &wi);
723 	  }
724     }
725 }
726 
727 /* Gang-private variables (typically placed in a GPU's shared memory) do not
728    need to be processed by the worker-propagation mechanism.  Populate the
729    GANG_PRIVATE_VARS set with any such variables found in the current
730    function.  */
731 
732 static void
find_gang_private_vars(hash_set<tree> * gang_private_vars)733 find_gang_private_vars (hash_set<tree> *gang_private_vars)
734 {
735   basic_block block;
736 
737   FOR_EACH_BB_FN (block, cfun)
738     {
739       for (gimple_stmt_iterator gsi = gsi_start_bb (block);
740 	   !gsi_end_p (gsi);
741 	   gsi_next (&gsi))
742 	{
743 	  gimple *stmt = gsi_stmt (gsi);
744 
745 	  if (gimple_call_internal_p (stmt, IFN_UNIQUE))
746 	    {
747 	      enum ifn_unique_kind k = ((enum ifn_unique_kind)
748 		TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
749 	      if (k == IFN_UNIQUE_OACC_PRIVATE)
750 		{
751 		  HOST_WIDE_INT level
752 		    = TREE_INT_CST_LOW (gimple_call_arg (stmt, 2));
753 		  if (level != GOMP_DIM_GANG)
754 		    continue;
755 		  for (unsigned i = 3; i < gimple_call_num_args (stmt); i++)
756 		    {
757 		      tree arg = gimple_call_arg (stmt, i);
758 		      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
759 		      tree decl = TREE_OPERAND (arg, 0);
760 		      gang_private_vars->add (decl);
761 		    }
762 		}
763 	    }
764 	}
765     }
766 }
767 
768 static void
find_local_vars_to_propagate(parallel_g * par,unsigned outer_mask,hash_set<tree> * partitioned_var_uses,hash_set<tree> * gang_private_vars,bitmap writes_gang_private,vec<propagation_set * > * prop_set)769 find_local_vars_to_propagate (parallel_g *par, unsigned outer_mask,
770 			      hash_set<tree> *partitioned_var_uses,
771 			      hash_set<tree> *gang_private_vars,
772 			      bitmap writes_gang_private,
773 			      vec<propagation_set *> *prop_set)
774 {
775   unsigned mask = outer_mask | par->mask;
776 
777   if (par->inner)
778     find_local_vars_to_propagate (par->inner, mask, partitioned_var_uses,
779 				  gang_private_vars, writes_gang_private,
780 				  prop_set);
781   if (par->next)
782     find_local_vars_to_propagate (par->next, outer_mask, partitioned_var_uses,
783 				  gang_private_vars, writes_gang_private,
784 				  prop_set);
785 
786   if (!(mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)))
787     {
788       basic_block block;
789       int ix;
790 
791       for (ix = 0; par->blocks.iterate (ix, &block); ix++)
792 	{
793 	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
794 	       !gsi_end_p (gsi); gsi_next (&gsi))
795 	    {
796 	      gimple *stmt = gsi_stmt (gsi);
797 	      tree var;
798 	      unsigned i;
799 
800 	      FOR_EACH_LOCAL_DECL (cfun, i, var)
801 		{
802 		  if (!VAR_P (var)
803 		      || is_global_var (var)
804 		      || AGGREGATE_TYPE_P (TREE_TYPE (var))
805 		      || !partitioned_var_uses->contains (var))
806 		    continue;
807 
808 		  if (stmt_may_clobber_ref_p (stmt, var))
809 		    {
810 		      if (dump_file)
811 			{
812 			  fprintf (dump_file, "bb %u: local variable may be "
813 				   "clobbered in %s mode: ", block->index,
814 				   mask_name (mask));
815 			  print_generic_expr (dump_file, var, TDF_SLIM);
816 			  fprintf (dump_file, "\n");
817 			}
818 
819 		      if (gang_private_vars->contains (var))
820 			{
821 			  /* If we write a gang-private variable, we want a
822 			     barrier at the end of the block.  */
823 			  bitmap_set_bit (writes_gang_private, block->index);
824 			  continue;
825 			}
826 
827 		      if (!(*prop_set)[block->index])
828 			(*prop_set)[block->index] = new propagation_set;
829 
830 		      propagation_set *ws_prop
831 			= (*prop_set)[block->index];
832 
833 		      ws_prop->add (var);
834 		    }
835 		}
836 	    }
837 	}
838     }
839 }
840 
841 /* Transform basic blocks FROM, TO (which may be the same block) into:
842    if (GOACC_single_start ())
843      BLOCK;
844    GOACC_barrier ();
845 			      \  |  /
846 			      +----+
847 			      |    |        (new) predicate block
848 			      +----+--
849    \  |  /   \  |  /	        |t    \
850    +----+    +----+	      +----+  |
851    |	|    |    |	===>  |    |  | f   (old) from block
852    +----+    +----+	      +----+  |
853      |       t/  \f	        |    /
854 			      +----+/
855   (split  (split before       |    |        skip block
856   at end)   condition)	      +----+
857 			      t/  \f
858 */
859 
860 static void
worker_single_simple(basic_block from,basic_block to,hash_set<tree> * def_escapes_block)861 worker_single_simple (basic_block from, basic_block to,
862 		      hash_set<tree> *def_escapes_block)
863 {
864   gimple *call, *cond;
865   tree lhs, decl;
866   basic_block skip_block;
867 
868   gimple_stmt_iterator gsi = gsi_last_bb (to);
869   if (EDGE_COUNT (to->succs) > 1)
870     {
871       gcc_assert (gimple_code (gsi_stmt (gsi)) == GIMPLE_COND);
872       gsi_prev (&gsi);
873     }
874   edge e = split_block (to, gsi_stmt (gsi));
875   skip_block = e->dest;
876 
877   gimple_stmt_iterator start = gsi_after_labels (from);
878 
879   decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_START);
880   lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
881   call = gimple_build_call (decl, 0);
882   gimple_call_set_lhs (call, lhs);
883   gsi_insert_before (&start, call, GSI_NEW_STMT);
884   update_stmt (call);
885 
886   cond = gimple_build_cond (EQ_EXPR, lhs,
887 			    fold_convert_loc (UNKNOWN_LOCATION,
888 					      TREE_TYPE (lhs),
889 					      boolean_true_node),
890 			    NULL_TREE, NULL_TREE);
891   gsi_insert_after (&start, cond, GSI_NEW_STMT);
892   update_stmt (cond);
893 
894   edge et = split_block (from, cond);
895   et->flags &= ~EDGE_FALLTHRU;
896   et->flags |= EDGE_TRUE_VALUE;
897   /* Make the active worker the more probable path so we prefer fallthrough
898      (letting the idle workers jump around more).  */
899   et->probability = profile_probability::likely ();
900 
901   edge ef = make_edge (from, skip_block, EDGE_FALSE_VALUE);
902   ef->probability = et->probability.invert ();
903 
904   basic_block neutered = split_edge (ef);
905   gimple_stmt_iterator neut_gsi = gsi_last_bb (neutered);
906 
907   for (gsi = gsi_start_bb (et->dest); !gsi_end_p (gsi); gsi_next (&gsi))
908     {
909       gimple *stmt = gsi_stmt (gsi);
910       ssa_op_iter iter;
911       tree var;
912 
913       FOR_EACH_SSA_TREE_OPERAND (var, stmt, iter, SSA_OP_DEF)
914 	{
915 	  if (def_escapes_block->contains (var))
916 	    {
917 	      gphi *join_phi = create_phi_node (NULL_TREE, skip_block);
918 	      create_new_def_for (var, join_phi,
919 				  gimple_phi_result_ptr (join_phi));
920 	      add_phi_arg (join_phi, var, e, UNKNOWN_LOCATION);
921 
922 	      tree neutered_def = copy_ssa_name (var, NULL);
923 	      /* We really want "don't care" or some value representing
924 		 undefined here, but optimizers will probably get rid of the
925 		 zero-assignments anyway.  */
926 	      gassign *zero = gimple_build_assign (neutered_def,
927 				build_zero_cst (TREE_TYPE (neutered_def)));
928 
929 	      gsi_insert_after (&neut_gsi, zero, GSI_CONTINUE_LINKING);
930 	      update_stmt (zero);
931 
932 	      add_phi_arg (join_phi, neutered_def, single_succ_edge (neutered),
933 			   UNKNOWN_LOCATION);
934 	      update_stmt (join_phi);
935 	    }
936 	}
937     }
938 }
939 
940 static tree
build_receiver_ref(tree var,tree receiver_decl,field_map_t * fields)941 build_receiver_ref (tree var, tree receiver_decl, field_map_t *fields)
942 {
943   tree x = build_simple_mem_ref (receiver_decl);
944   tree field = *fields->get (var);
945   TREE_THIS_NOTRAP (x) = 1;
946   x = omp_build_component_ref (x, field);
947   return x;
948 }
949 
950 static tree
build_sender_ref(tree var,tree sender_decl,field_map_t * fields)951 build_sender_ref (tree var, tree sender_decl, field_map_t *fields)
952 {
953   if (POINTER_TYPE_P (TREE_TYPE (sender_decl)))
954     sender_decl = build_simple_mem_ref (sender_decl);
955   tree field = *fields->get (var);
956   return omp_build_component_ref (sender_decl, field);
957 }
958 
959 static int
sort_by_ssa_version_or_uid(const void * p1,const void * p2)960 sort_by_ssa_version_or_uid (const void *p1, const void *p2)
961 {
962   const tree t1 = *(const tree *)p1;
963   const tree t2 = *(const tree *)p2;
964 
965   if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) == SSA_NAME)
966     return SSA_NAME_VERSION (t1) - SSA_NAME_VERSION (t2);
967   else if (TREE_CODE (t1) == SSA_NAME && TREE_CODE (t2) != SSA_NAME)
968     return -1;
969   else if (TREE_CODE (t1) != SSA_NAME && TREE_CODE (t2) == SSA_NAME)
970     return 1;
971   else
972     return DECL_UID (t1) - DECL_UID (t2);
973 }
974 
975 static int
sort_by_size_then_ssa_version_or_uid(const void * p1,const void * p2)976 sort_by_size_then_ssa_version_or_uid (const void *p1, const void *p2)
977 {
978   const tree t1 = *(const tree *)p1;
979   const tree t2 = *(const tree *)p2;
980   unsigned HOST_WIDE_INT s1 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t1)));
981   unsigned HOST_WIDE_INT s2 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (t2)));
982   if (s1 != s2)
983     return s2 - s1;
984   else
985     return sort_by_ssa_version_or_uid (p1, p2);
986 }
987 
988 static void
worker_single_copy(basic_block from,basic_block to,hash_set<tree> * def_escapes_block,hash_set<tree> * worker_partitioned_uses,tree record_type,record_field_map_t * record_field_map,unsigned HOST_WIDE_INT placement,bool isolate_broadcasts,bool has_gang_private_write)989 worker_single_copy (basic_block from, basic_block to,
990 		    hash_set<tree> *def_escapes_block,
991 		    hash_set<tree> *worker_partitioned_uses,
992 		    tree record_type, record_field_map_t *record_field_map,
993 		    unsigned HOST_WIDE_INT placement,
994 		    bool isolate_broadcasts, bool has_gang_private_write)
995 {
996   /* If we only have virtual defs, we'll have no record type, but we still want
997      to emit single_copy_start and (particularly) single_copy_end to act as
998      a vdef source on the neutered edge representing memory writes on the
999      non-neutered edge.  */
1000   if (!record_type)
1001     record_type = char_type_node;
1002 
1003   tree sender_decl
1004     = targetm.goacc.create_worker_broadcast_record (record_type, true,
1005 						    ".oacc_worker_o",
1006 						    placement);
1007   tree receiver_decl
1008     = targetm.goacc.create_worker_broadcast_record (record_type, false,
1009 						    ".oacc_worker_i",
1010 						    placement);
1011 
1012   gimple_stmt_iterator gsi = gsi_last_bb (to);
1013   if (EDGE_COUNT (to->succs) > 1)
1014     gsi_prev (&gsi);
1015   edge e = split_block (to, gsi_stmt (gsi));
1016   basic_block barrier_block = e->dest;
1017 
1018   gimple_stmt_iterator start = gsi_after_labels (from);
1019 
1020   tree decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_START);
1021 
1022   tree lhs = create_tmp_var (TREE_TYPE (TREE_TYPE (decl)));
1023 
1024   gimple *call
1025     = gimple_build_call (decl, 1,
1026 			 POINTER_TYPE_P (TREE_TYPE (sender_decl))
1027 			 ? sender_decl : build_fold_addr_expr (sender_decl));
1028   gimple_call_set_lhs (call, lhs);
1029   gsi_insert_before (&start, call, GSI_NEW_STMT);
1030   update_stmt (call);
1031 
1032   /* The shared-memory range for this block overflowed.  Add a barrier before
1033      the GOACC_single_copy_start call.  */
1034   if (isolate_broadcasts)
1035     {
1036       decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1037       gimple *acc_bar = gimple_build_call (decl, 0);
1038       gsi_insert_before (&start, acc_bar, GSI_SAME_STMT);
1039     }
1040 
1041   tree conv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1042 
1043   gimple *conv = gimple_build_assign (conv_tmp,
1044 				      fold_convert (TREE_TYPE (receiver_decl),
1045 						    lhs));
1046   update_stmt (conv);
1047   gsi_insert_after (&start, conv, GSI_NEW_STMT);
1048   gimple *asgn = gimple_build_assign (receiver_decl, conv_tmp);
1049   gsi_insert_after (&start, asgn, GSI_NEW_STMT);
1050   update_stmt (asgn);
1051 
1052   tree zero_ptr = build_int_cst (TREE_TYPE (receiver_decl), 0);
1053 
1054   tree recv_tmp = make_ssa_name (TREE_TYPE (receiver_decl));
1055   asgn = gimple_build_assign (recv_tmp, receiver_decl);
1056   gsi_insert_after (&start, asgn, GSI_NEW_STMT);
1057   update_stmt (asgn);
1058 
1059   gimple *cond = gimple_build_cond (EQ_EXPR, recv_tmp, zero_ptr, NULL_TREE,
1060 				    NULL_TREE);
1061   update_stmt (cond);
1062 
1063   gsi_insert_after (&start, cond, GSI_NEW_STMT);
1064 
1065   edge et = split_block (from, cond);
1066   et->flags &= ~EDGE_FALLTHRU;
1067   et->flags |= EDGE_TRUE_VALUE;
1068   /* Make the active worker the more probable path so we prefer fallthrough
1069      (letting the idle workers jump around more).  */
1070   et->probability = profile_probability::likely ();
1071 
1072   basic_block body = et->dest;
1073 
1074   edge ef = make_edge (from, barrier_block, EDGE_FALSE_VALUE);
1075   ef->probability = et->probability.invert ();
1076 
1077   gimple_stmt_iterator bar_gsi = gsi_start_bb (barrier_block);
1078   cond = gimple_build_cond (NE_EXPR, recv_tmp, zero_ptr, NULL_TREE, NULL_TREE);
1079 
1080   if (record_type != char_type_node || has_gang_private_write)
1081     {
1082       decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1083       gimple *acc_bar = gimple_build_call (decl, 0);
1084 
1085       gsi_insert_before (&bar_gsi, acc_bar, GSI_NEW_STMT);
1086       gsi_insert_after (&bar_gsi, cond, GSI_NEW_STMT);
1087     }
1088   else
1089     gsi_insert_before (&bar_gsi, cond, GSI_NEW_STMT);
1090 
1091   edge et2 = split_block (barrier_block, cond);
1092   et2->flags &= ~EDGE_FALLTHRU;
1093   et2->flags |= EDGE_TRUE_VALUE;
1094   et2->probability = profile_probability::unlikely ();
1095 
1096   basic_block exit_block = et2->dest;
1097 
1098   basic_block copyout_block = split_edge (et2);
1099   edge ef2 = make_edge (barrier_block, exit_block, EDGE_FALSE_VALUE);
1100   ef2->probability = et2->probability.invert ();
1101 
1102   gimple_stmt_iterator copyout_gsi = gsi_start_bb (copyout_block);
1103 
1104   edge copyout_to_exit = single_succ_edge (copyout_block);
1105 
1106   gimple_seq sender_seq = NULL;
1107 
1108   /* Make sure we iterate over definitions in a stable order.  */
1109   auto_vec<tree> escape_vec (def_escapes_block->elements ());
1110   for (hash_set<tree>::iterator it = def_escapes_block->begin ();
1111        it != def_escapes_block->end (); ++it)
1112     escape_vec.quick_push (*it);
1113   escape_vec.qsort (sort_by_ssa_version_or_uid);
1114 
1115   for (unsigned i = 0; i < escape_vec.length (); i++)
1116     {
1117       tree var = escape_vec[i];
1118 
1119       if (TREE_CODE (var) == SSA_NAME && SSA_NAME_IS_VIRTUAL_OPERAND (var))
1120 	continue;
1121 
1122       tree barrier_def = 0;
1123 
1124       if (TREE_CODE (var) == SSA_NAME)
1125 	{
1126 	  gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1127 
1128 	  if (gimple_nop_p (def_stmt))
1129 	    continue;
1130 
1131 	  /* The barrier phi takes one result from the actual work of the
1132 	     block we're neutering, and the other result is constant zero of
1133 	     the same type.  */
1134 
1135 	  gphi *barrier_phi = create_phi_node (NULL_TREE, barrier_block);
1136 	  barrier_def = create_new_def_for (var, barrier_phi,
1137 			  gimple_phi_result_ptr (barrier_phi));
1138 
1139 	  add_phi_arg (barrier_phi, var, e, UNKNOWN_LOCATION);
1140 	  add_phi_arg (barrier_phi, build_zero_cst (TREE_TYPE (var)), ef,
1141 		       UNKNOWN_LOCATION);
1142 
1143 	  update_stmt (barrier_phi);
1144 	}
1145       else
1146 	gcc_assert (TREE_CODE (var) == VAR_DECL);
1147 
1148       /* If we had no record type, we will have no fields map.  */
1149       field_map_t *fields = record_field_map->get (record_type);
1150 
1151       if (worker_partitioned_uses->contains (var)
1152 	  && fields
1153 	  && fields->get (var))
1154 	{
1155 	  tree neutered_def = make_ssa_name (TREE_TYPE (var));
1156 
1157 	  /* Receive definition from shared memory block.  */
1158 
1159 	  tree receiver_ref = build_receiver_ref (var, receiver_decl, fields);
1160 	  gassign *recv = gimple_build_assign (neutered_def,
1161 					       receiver_ref);
1162 	  gsi_insert_after (&copyout_gsi, recv, GSI_CONTINUE_LINKING);
1163 	  update_stmt (recv);
1164 
1165 	  if (TREE_CODE (var) == VAR_DECL)
1166 	    {
1167 	      /* If it's a VAR_DECL, we only copied to an SSA temporary.  Copy
1168 		 to the final location now.  */
1169 	      gassign *asgn = gimple_build_assign (var, neutered_def);
1170 	      gsi_insert_after (&copyout_gsi, asgn, GSI_CONTINUE_LINKING);
1171 	      update_stmt (asgn);
1172 	    }
1173 	  else
1174 	    {
1175 	      /* If it's an SSA name, create a new phi at the join node to
1176 		 represent either the output from the active worker (the
1177 		 barrier) or the inactive workers (the copyout block).  */
1178 	      gphi *join_phi = create_phi_node (NULL_TREE, exit_block);
1179 	      create_new_def_for (barrier_def, join_phi,
1180 				  gimple_phi_result_ptr (join_phi));
1181 	      add_phi_arg (join_phi, barrier_def, ef2, UNKNOWN_LOCATION);
1182 	      add_phi_arg (join_phi, neutered_def, copyout_to_exit,
1183 			   UNKNOWN_LOCATION);
1184 	      update_stmt (join_phi);
1185 	    }
1186 
1187 	  /* Send definition to shared memory block.  */
1188 
1189 	  tree sender_ref = build_sender_ref (var, sender_decl, fields);
1190 
1191 	  if (TREE_CODE (var) == SSA_NAME)
1192 	    {
1193 	      gassign *send = gimple_build_assign (sender_ref, var);
1194 	      gimple_seq_add_stmt (&sender_seq, send);
1195 	      update_stmt (send);
1196 	    }
1197 	  else if (TREE_CODE (var) == VAR_DECL)
1198 	    {
1199 	      tree tmp = make_ssa_name (TREE_TYPE (var));
1200 	      gassign *send = gimple_build_assign (tmp, var);
1201 	      gimple_seq_add_stmt (&sender_seq, send);
1202 	      update_stmt (send);
1203 	      send = gimple_build_assign (sender_ref, tmp);
1204 	      gimple_seq_add_stmt (&sender_seq, send);
1205 	      update_stmt (send);
1206 	    }
1207 	  else
1208 	    gcc_unreachable ();
1209 	}
1210     }
1211 
1212   /* The shared-memory range for this block overflowed.  Add a barrier at the
1213      end.  */
1214   if (isolate_broadcasts)
1215     {
1216       gsi = gsi_start_bb (exit_block);
1217       decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1218       gimple *acc_bar = gimple_build_call (decl, 0);
1219       gsi_insert_before (&gsi, acc_bar, GSI_SAME_STMT);
1220     }
1221 
1222   /* It's possible for the ET->DEST block (the work done by the active thread)
1223      to finish with a control-flow insn, e.g. a UNIQUE function call.  Split
1224      the block and add SENDER_SEQ in the latter part to avoid having control
1225      flow in the middle of a BB.  */
1226 
1227   decl = builtin_decl_explicit (BUILT_IN_GOACC_SINGLE_COPY_END);
1228   call = gimple_build_call (decl, 1,
1229 			    POINTER_TYPE_P (TREE_TYPE (sender_decl))
1230 			    ? sender_decl
1231 			    : build_fold_addr_expr (sender_decl));
1232   gimple_seq_add_stmt (&sender_seq, call);
1233 
1234   gsi = gsi_last_bb (body);
1235   gimple *last = gsi_stmt (gsi);
1236   basic_block sender_block = split_block (body, last)->dest;
1237   gsi = gsi_last_bb (sender_block);
1238   gsi_insert_seq_after (&gsi, sender_seq, GSI_CONTINUE_LINKING);
1239 }
1240 
1241 typedef hash_map<basic_block, std::pair<unsigned HOST_WIDE_INT, bool> >
1242   blk_offset_map_t;
1243 
1244 static void
neuter_worker_single(parallel_g * par,unsigned outer_mask,bitmap worker_single,bitmap vector_single,vec<propagation_set * > * prop_set,hash_set<tree> * partitioned_var_uses,record_field_map_t * record_field_map,blk_offset_map_t * blk_offset_map,bitmap writes_gang_private)1245 neuter_worker_single (parallel_g *par, unsigned outer_mask,
1246 		      bitmap worker_single, bitmap vector_single,
1247 		      vec<propagation_set *> *prop_set,
1248 		      hash_set<tree> *partitioned_var_uses,
1249 		      record_field_map_t *record_field_map,
1250 		      blk_offset_map_t *blk_offset_map,
1251 		      bitmap writes_gang_private)
1252 {
1253   unsigned mask = outer_mask | par->mask;
1254 
1255   if ((mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1256     {
1257       basic_block block;
1258 
1259       for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1260 	{
1261 	  bool has_defs = false;
1262 	  hash_set<tree> def_escapes_block;
1263 	  hash_set<tree> worker_partitioned_uses;
1264 	  unsigned j;
1265 	  tree var;
1266 
1267 	  FOR_EACH_SSA_NAME (j, var, cfun)
1268 	    {
1269 	      if (SSA_NAME_IS_VIRTUAL_OPERAND (var))
1270 		{
1271 		  has_defs = true;
1272 		  continue;
1273 		}
1274 
1275 	      gimple *def_stmt = SSA_NAME_DEF_STMT (var);
1276 
1277 	      if (gimple_nop_p (def_stmt))
1278 		continue;
1279 
1280 	      if (gimple_bb (def_stmt)->index != block->index)
1281 		continue;
1282 
1283 	      gimple *use_stmt;
1284 	      imm_use_iterator use_iter;
1285 	      bool uses_outside_block = false;
1286 	      bool worker_partitioned_use = false;
1287 
1288 	      FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, var)
1289 		{
1290 		  int blocknum = gimple_bb (use_stmt)->index;
1291 
1292 		  /* Don't propagate SSA names that are only used in the
1293 		     current block, unless the usage is in a phi node: that
1294 		     means the name left the block, then came back in at the
1295 		     top.  */
1296 		  if (blocknum != block->index
1297 		      || gimple_code (use_stmt) == GIMPLE_PHI)
1298 		    uses_outside_block = true;
1299 		  if (!bitmap_bit_p (worker_single, blocknum))
1300 		    worker_partitioned_use = true;
1301 		}
1302 
1303 	      if (uses_outside_block)
1304 		def_escapes_block.add (var);
1305 
1306 	      if (worker_partitioned_use)
1307 		{
1308 		  worker_partitioned_uses.add (var);
1309 		  has_defs = true;
1310 		}
1311 	    }
1312 
1313 	  propagation_set *ws_prop = (*prop_set)[block->index];
1314 
1315 	  if (ws_prop)
1316 	    {
1317 	      for (propagation_set::iterator it = ws_prop->begin ();
1318 		   it != ws_prop->end ();
1319 		   ++it)
1320 		{
1321 		  tree var = *it;
1322 		  if (TREE_CODE (var) == VAR_DECL)
1323 		    {
1324 		      def_escapes_block.add (var);
1325 		      if (partitioned_var_uses->contains (var))
1326 			{
1327 			  worker_partitioned_uses.add (var);
1328 			  has_defs = true;
1329 			}
1330 		    }
1331 		}
1332 
1333 	      delete ws_prop;
1334 	      (*prop_set)[block->index] = 0;
1335 	    }
1336 
1337 	  bool only_marker_fns = true;
1338 	  bool join_block = false;
1339 
1340 	  for (gimple_stmt_iterator gsi = gsi_start_bb (block);
1341 	       !gsi_end_p (gsi);
1342 	       gsi_next (&gsi))
1343 	    {
1344 	      gimple *stmt = gsi_stmt (gsi);
1345 	      if (gimple_code (stmt) == GIMPLE_CALL
1346 		  && gimple_call_internal_p (stmt, IFN_UNIQUE))
1347 		{
1348 		  enum ifn_unique_kind k = ((enum ifn_unique_kind)
1349 		    TREE_INT_CST_LOW (gimple_call_arg (stmt, 0)));
1350 		  if (k != IFN_UNIQUE_OACC_PRIVATE
1351 		      && k != IFN_UNIQUE_OACC_JOIN
1352 		      && k != IFN_UNIQUE_OACC_FORK
1353 		      && k != IFN_UNIQUE_OACC_HEAD_MARK
1354 		      && k != IFN_UNIQUE_OACC_TAIL_MARK)
1355 		    only_marker_fns = false;
1356 		  else if (k == IFN_UNIQUE_OACC_JOIN)
1357 		    /* The JOIN marker is special in that it *cannot* be
1358 		       predicated for worker zero, because it may be lowered
1359 		       to a barrier instruction and all workers must typically
1360 		       execute that barrier.  We shouldn't be doing any
1361 		       broadcasts from the join block anyway.  */
1362 		    join_block = true;
1363 		}
1364 	      else if (gimple_code (stmt) == GIMPLE_CALL
1365 		       && gimple_call_internal_p (stmt, IFN_GOACC_LOOP))
1366 		/* Empty.  */;
1367 	      else if (gimple_nop_p (stmt))
1368 		/* Empty.  */;
1369 	      else
1370 		only_marker_fns = false;
1371 	    }
1372 
1373 	  /* We can skip predicating this block for worker zero if the only
1374 	     thing it contains is marker functions that will be removed in the
1375 	     oaccdevlow pass anyway.
1376 	     Don't do this if the block has (any) phi nodes, because those
1377 	     might define SSA names that need broadcasting.
1378 	     TODO: We might be able to skip transforming blocks that only
1379 	     contain some other trivial statements too.  */
1380 	  if (only_marker_fns && !phi_nodes (block))
1381 	    continue;
1382 
1383 	  gcc_assert (!join_block);
1384 
1385 	  if (has_defs)
1386 	    {
1387 	      tree record_type = (tree) block->aux;
1388 	      std::pair<unsigned HOST_WIDE_INT, bool> *off_rngalloc
1389 		= blk_offset_map->get (block);
1390 	      gcc_assert (!record_type || off_rngalloc);
1391 	      unsigned HOST_WIDE_INT offset
1392 		= off_rngalloc ? off_rngalloc->first : 0;
1393 	      bool range_allocated
1394 		= off_rngalloc ? off_rngalloc->second : true;
1395 	      bool has_gang_private_write
1396 		= bitmap_bit_p (writes_gang_private, block->index);
1397 	      worker_single_copy (block, block, &def_escapes_block,
1398 				  &worker_partitioned_uses, record_type,
1399 				  record_field_map,
1400 				  offset, !range_allocated,
1401 				  has_gang_private_write);
1402 	    }
1403 	  else
1404 	    worker_single_simple (block, block, &def_escapes_block);
1405 	}
1406     }
1407 
1408   if ((outer_mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) == 0)
1409     {
1410       basic_block block;
1411 
1412       for (unsigned i = 0; par->blocks.iterate (i, &block); i++)
1413 	for (gimple_stmt_iterator gsi = gsi_start_bb (block);
1414 	     !gsi_end_p (gsi);
1415 	     gsi_next (&gsi))
1416 	  {
1417 	    gimple *stmt = gsi_stmt (gsi);
1418 
1419 	    if (gimple_code (stmt) == GIMPLE_CALL
1420 		&& !gimple_call_internal_p (stmt)
1421 		&& !omp_sese_active_worker_call (as_a <gcall *> (stmt)))
1422 	      {
1423 		/* If we have an OpenACC routine call in worker-single mode,
1424 		   place barriers before and afterwards to prevent
1425 		   clobbering re-used shared memory regions (as are used
1426 		   for AMDGCN at present, for example).  */
1427 		tree decl = builtin_decl_explicit (BUILT_IN_GOACC_BARRIER);
1428 		gsi_insert_before (&gsi, gimple_build_call (decl, 0),
1429 				   GSI_SAME_STMT);
1430 		gsi_insert_after (&gsi, gimple_build_call (decl, 0),
1431 				  GSI_NEW_STMT);
1432 	      }
1433 	  }
1434     }
1435 
1436   if (par->inner)
1437     neuter_worker_single (par->inner, mask, worker_single, vector_single,
1438 			  prop_set, partitioned_var_uses, record_field_map,
1439 			  blk_offset_map, writes_gang_private);
1440   if (par->next)
1441     neuter_worker_single (par->next, outer_mask, worker_single, vector_single,
1442 			  prop_set, partitioned_var_uses, record_field_map,
1443 			  blk_offset_map, writes_gang_private);
1444 }
1445 
1446 static void
dfs_broadcast_reachable_1(basic_block bb,sbitmap reachable)1447 dfs_broadcast_reachable_1 (basic_block bb, sbitmap reachable)
1448 {
1449   if (bb->flags & BB_VISITED)
1450     return;
1451 
1452   bb->flags |= BB_VISITED;
1453 
1454   if (bb->succs)
1455     {
1456       edge e;
1457       edge_iterator ei;
1458       FOR_EACH_EDGE (e, ei, bb->succs)
1459 	{
1460 	  basic_block dest = e->dest;
1461 	  if (dest->aux)
1462 	    bitmap_set_bit (reachable, dest->index);
1463 	  else
1464 	    dfs_broadcast_reachable_1 (dest, reachable);
1465 	}
1466     }
1467 }
1468 
1469 typedef std::pair<int, tree> idx_decl_pair_t;
1470 
1471 typedef auto_vec<splay_tree> used_range_vec_t;
1472 
1473 static int
sort_size_descending(const void * a,const void * b)1474 sort_size_descending (const void *a, const void *b)
1475 {
1476   const idx_decl_pair_t *pa = (const idx_decl_pair_t *) a;
1477   const idx_decl_pair_t *pb = (const idx_decl_pair_t *) b;
1478   unsigned HOST_WIDE_INT asize = tree_to_uhwi (TYPE_SIZE_UNIT (pa->second));
1479   unsigned HOST_WIDE_INT bsize = tree_to_uhwi (TYPE_SIZE_UNIT (pb->second));
1480   return bsize - asize;
1481 }
1482 
1483 class addr_range
1484 {
1485 public:
addr_range(unsigned HOST_WIDE_INT addr_lo,unsigned HOST_WIDE_INT addr_hi)1486   addr_range (unsigned HOST_WIDE_INT addr_lo, unsigned HOST_WIDE_INT addr_hi)
1487     : lo (addr_lo), hi (addr_hi)
1488     { }
addr_range(const addr_range & ar)1489   addr_range (const addr_range &ar) : lo (ar.lo), hi (ar.hi)
1490     { }
addr_range()1491   addr_range () : lo (0), hi (0)
1492     { }
1493 
invalid()1494   bool invalid () { return lo == 0 && hi == 0; }
1495 
1496   unsigned HOST_WIDE_INT lo;
1497   unsigned HOST_WIDE_INT hi;
1498 };
1499 
1500 static int
splay_tree_compare_addr_range(splay_tree_key a,splay_tree_key b)1501 splay_tree_compare_addr_range (splay_tree_key a, splay_tree_key b)
1502 {
1503   addr_range *ar = (addr_range *) a;
1504   addr_range *br = (addr_range *) b;
1505   if (ar->lo == br->lo && ar->hi == br->hi)
1506     return 0;
1507   if (ar->hi <= br->lo)
1508     return -1;
1509   else if (ar->lo >= br->hi)
1510     return 1;
1511   return 0;
1512 }
1513 
1514 static void
splay_tree_free_key(splay_tree_key k)1515 splay_tree_free_key (splay_tree_key k)
1516 {
1517   addr_range *ar = (addr_range *) k;
1518   delete ar;
1519 }
1520 
1521 static addr_range
first_fit_range(splay_tree s,unsigned HOST_WIDE_INT size,unsigned HOST_WIDE_INT align,addr_range * bounds)1522 first_fit_range (splay_tree s, unsigned HOST_WIDE_INT size,
1523 		 unsigned HOST_WIDE_INT align, addr_range *bounds)
1524 {
1525   splay_tree_node min = splay_tree_min (s);
1526   if (min)
1527     {
1528       splay_tree_node next;
1529       while ((next = splay_tree_successor (s, min->key)))
1530 	{
1531 	  unsigned HOST_WIDE_INT lo = ((addr_range *) min->key)->hi;
1532 	  unsigned HOST_WIDE_INT hi = ((addr_range *) next->key)->lo;
1533 	  unsigned HOST_WIDE_INT base = (lo + align - 1) & ~(align - 1);
1534 	  if (base + size <= hi)
1535 	    return addr_range (base, base + size);
1536 	  min = next;
1537 	}
1538 
1539       unsigned HOST_WIDE_INT base = ((addr_range *)min->key)->hi;
1540       base = (base + align - 1) & ~(align - 1);
1541       if (base + size <= bounds->hi)
1542 	return addr_range (base, base + size);
1543       else
1544 	return addr_range ();
1545     }
1546   else
1547     {
1548       unsigned HOST_WIDE_INT lo = bounds->lo;
1549       lo = (lo + align - 1) & ~(align - 1);
1550       if (lo + size <= bounds->hi)
1551 	return addr_range (lo, lo + size);
1552       else
1553 	return addr_range ();
1554     }
1555 }
1556 
1557 static int
merge_ranges_1(splay_tree_node n,void * ptr)1558 merge_ranges_1 (splay_tree_node n, void *ptr)
1559 {
1560   splay_tree accum = (splay_tree) ptr;
1561   addr_range ar = *(addr_range *) n->key;
1562 
1563   splay_tree_node old = splay_tree_lookup (accum, n->key);
1564 
1565   /* We might have an overlap.  Create a new range covering the
1566      overlapping parts.  */
1567   if (old)
1568     {
1569       addr_range *old_ar = (addr_range *) old->key;
1570       ar.lo = MIN (old_ar->lo, ar.lo);
1571       ar.hi = MAX (old_ar->hi, ar.hi);
1572       splay_tree_remove (accum, old->key);
1573     }
1574 
1575   addr_range *new_ar = new addr_range (ar);
1576 
1577   splay_tree_insert (accum, (splay_tree_key) new_ar, n->value);
1578 
1579   return 0;
1580 }
1581 
1582 static void
merge_ranges(splay_tree accum,splay_tree sp)1583 merge_ranges (splay_tree accum, splay_tree sp)
1584 {
1585   splay_tree_foreach (sp, merge_ranges_1, (void *) accum);
1586 }
1587 
1588 static void
oacc_do_neutering(unsigned HOST_WIDE_INT bounds_lo,unsigned HOST_WIDE_INT bounds_hi)1589 oacc_do_neutering (unsigned HOST_WIDE_INT bounds_lo,
1590 		   unsigned HOST_WIDE_INT bounds_hi)
1591 {
1592   bb_stmt_map_t bb_stmt_map;
1593   auto_bitmap worker_single, vector_single;
1594 
1595   omp_sese_split_blocks (&bb_stmt_map);
1596 
1597   if (dump_file)
1598     {
1599       fprintf (dump_file, "\n\nAfter splitting:\n\n");
1600       dump_function_to_file (current_function_decl, dump_file, dump_flags);
1601     }
1602 
1603   unsigned mask = 0;
1604 
1605   /* If this is a routine, calculate MASK as if the outer levels are already
1606      partitioned.  */
1607   {
1608     tree attr = oacc_get_fn_attrib (current_function_decl);
1609     tree dims = TREE_VALUE (attr);
1610     unsigned ix;
1611     for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
1612       {
1613 	tree allowed = TREE_PURPOSE (dims);
1614 	if (allowed && integer_zerop (allowed))
1615 	  mask |= GOMP_DIM_MASK (ix);
1616       }
1617   }
1618 
1619   parallel_g *par = omp_sese_discover_pars (&bb_stmt_map);
1620   populate_single_mode_bitmaps (par, worker_single, vector_single, mask, 0);
1621 
1622   basic_block bb;
1623   FOR_ALL_BB_FN (bb, cfun)
1624     bb->aux = NULL;
1625 
1626   vec<propagation_set *> prop_set (vNULL);
1627   prop_set.safe_grow_cleared (last_basic_block_for_fn (cfun), true);
1628 
1629   find_ssa_names_to_propagate (par, mask, worker_single, vector_single,
1630 			       &prop_set);
1631 
1632   hash_set<tree> partitioned_var_uses;
1633   hash_set<tree> gang_private_vars;
1634   auto_bitmap writes_gang_private;
1635 
1636   find_gang_private_vars (&gang_private_vars);
1637   find_partitioned_var_uses (par, mask, &partitioned_var_uses);
1638   find_local_vars_to_propagate (par, mask, &partitioned_var_uses,
1639 				&gang_private_vars, writes_gang_private,
1640 				&prop_set);
1641 
1642   record_field_map_t record_field_map;
1643 
1644   FOR_ALL_BB_FN (bb, cfun)
1645     {
1646       propagation_set *ws_prop = prop_set[bb->index];
1647       if (ws_prop)
1648 	{
1649 	  tree record_type = lang_hooks.types.make_type (RECORD_TYPE);
1650 	  tree name = create_tmp_var_name (".oacc_ws_data_s");
1651 	  name = build_decl (UNKNOWN_LOCATION, TYPE_DECL, name, record_type);
1652 	  DECL_ARTIFICIAL (name) = 1;
1653 	  DECL_NAMELESS (name) = 1;
1654 	  TYPE_NAME (record_type) = name;
1655 	  TYPE_ARTIFICIAL (record_type) = 1;
1656 
1657 	  auto_vec<tree> field_vec (ws_prop->elements ());
1658 	  for (hash_set<tree>::iterator it = ws_prop->begin ();
1659 	       it != ws_prop->end (); ++it)
1660 	    field_vec.quick_push (*it);
1661 
1662 	  field_vec.qsort (sort_by_size_then_ssa_version_or_uid);
1663 
1664 	  bool existed;
1665 	  field_map_t *fields
1666 	    = &record_field_map.get_or_insert (record_type, &existed);
1667 	  gcc_checking_assert (!existed);
1668 
1669 	  /* Insert var fields in reverse order, so the last inserted element
1670 	     is the first in the structure.  */
1671 	  for (int i = field_vec.length () - 1; i >= 0; i--)
1672 	    install_var_field (field_vec[i], record_type, fields);
1673 
1674 	  layout_type (record_type);
1675 
1676 	  bb->aux = (tree) record_type;
1677 	}
1678     }
1679 
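  /* For each block with a broadcast record, collect the set of blocks
     reachable from it: records of blocks that may execute on the same path
     need non-overlapping shared-memory ranges.  */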
1680   sbitmap *reachable
1681     = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
1682 			    last_basic_block_for_fn (cfun));
1683 
1684   bitmap_vector_clear (reachable, last_basic_block_for_fn (cfun));
1685 
1686   auto_vec<std::pair<int, tree> > priority;
1687 
1688   FOR_ALL_BB_FN (bb, cfun)
1689     {
1690       if (bb->aux)
1691 	{
1692 	  tree record_type = (tree) bb->aux;
1693 
1694 	  basic_block bb2;
1695 	  FOR_ALL_BB_FN (bb2, cfun)
1696 	    bb2->flags &= ~BB_VISITED;
1697 
1698 	  priority.safe_push (std::make_pair (bb->index, record_type));
1699 	  dfs_broadcast_reachable_1 (bb, reachable[bb->index]);
1700 	}
1701     }
1702 
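  /* Add the inverse relation as well, so REACHABLE[I] also includes the
     blocks from which block I is reachable.  */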
1703   sbitmap *inverted
1704     = sbitmap_vector_alloc (last_basic_block_for_fn (cfun),
1705 			    last_basic_block_for_fn (cfun));
1706 
1707   bitmap_vector_clear (inverted, last_basic_block_for_fn (cfun));
1708 
1709   for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
1710     {
1711       sbitmap_iterator bi;
1712       unsigned int j;
1713       EXECUTE_IF_SET_IN_BITMAP (reachable[i], 0, j, bi)
1714 	bitmap_set_bit (inverted[j], i);
1715     }
1716 
1717   for (int i = 0; i < last_basic_block_for_fn (cfun); i++)
1718     bitmap_ior (reachable[i], reachable[i], inverted[i]);
1719 
1720   sbitmap_vector_free (inverted);
1721 
1722   used_range_vec_t used_ranges;
1723 
1724   used_ranges.safe_grow_cleared (last_basic_block_for_fn (cfun));
1725 
1726   blk_offset_map_t blk_offset_map;
1727 
1728   addr_range worker_shm_bounds (bounds_lo, bounds_hi);
1729 
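  /* Assign shared-memory offsets, largest record first.  For each block,
     gather the ranges already used by any conflicting block (any block in
     its REACHABLE set) and pick the first aligned gap that fits.  If none
     fits, fall back to the start of the region, diagnosing an overflow if
     even that cannot hold the record.  */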
1730   priority.qsort (sort_size_descending);
1731   for (unsigned int i = 0; i < priority.length (); i++)
1732     {
1733       idx_decl_pair_t p = priority[i];
1734       int blkno = p.first;
1735       tree record_type = p.second;
1736       HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (record_type));
1737       HOST_WIDE_INT align = TYPE_ALIGN_UNIT (record_type);
1738 
1739       splay_tree conflicts = splay_tree_new (splay_tree_compare_addr_range,
1740 					     splay_tree_free_key, NULL);
1741 
1742       if (!used_ranges[blkno])
1743 	used_ranges[blkno] = splay_tree_new (splay_tree_compare_addr_range,
1744 					     splay_tree_free_key, NULL);
1745       else
1746 	merge_ranges (conflicts, used_ranges[blkno]);
1747 
1748       sbitmap_iterator bi;
1749       unsigned int j;
1750       EXECUTE_IF_SET_IN_BITMAP (reachable[blkno], 0, j, bi)
1751 	if (used_ranges[j])
1752 	  merge_ranges (conflicts, used_ranges[j]);
1753 
1754       addr_range ar
1755 	= first_fit_range (conflicts, size, align, &worker_shm_bounds);
1756 
1757       splay_tree_delete (conflicts);
1758 
1759       if (ar.invalid ())
1760 	{
1761 	  unsigned HOST_WIDE_INT base
1762 	    = (bounds_lo + align - 1) & ~(align - 1);
1763 	  if (base + size > bounds_hi)
1764 	    error_at (UNKNOWN_LOCATION, "shared-memory region overflow");
1765 	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
1766 	    = std::make_pair (base, false);
1767 	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
1768 	}
1769       else
1770 	{
1771 	  splay_tree_node old = splay_tree_lookup (used_ranges[blkno],
1772 						   (splay_tree_key) &ar);
1773 	  if (old)
1774 	    {
1775 	      fprintf (stderr, "trying to map [%d..%d] but [%d..%d] is "
1776 		       "already mapped in block %d\n", (int) ar.lo,
1777 		       (int) ar.hi, (int) ((addr_range *) old->key)->lo,
1778 		       (int) ((addr_range *) old->key)->hi, blkno);
1779 	      abort ();
1780 	    }
1781 
1782 	  addr_range *arp = new addr_range (ar);
1783 	  splay_tree_insert (used_ranges[blkno], (splay_tree_key) arp,
1784 			     (splay_tree_value) blkno);
1785 	  std::pair<unsigned HOST_WIDE_INT, bool> base_inrng
1786 	    = std::make_pair (ar.lo, true);
1787 	  blk_offset_map.put (BASIC_BLOCK_FOR_FN (cfun, blkno), base_inrng);
1788 	}
1789     }
1790 
1791   sbitmap_vector_free (reachable);
1792 
1793   neuter_worker_single (par, mask, worker_single, vector_single, &prop_set,
1794 			&partitioned_var_uses, &record_field_map,
1795 			&blk_offset_map, writes_gang_private);
1796 
1797   record_field_map.empty ();
1798 
1799   /* These are supposed to have been 'delete'd by 'neuter_worker_single'.  */
1800   for (auto it : prop_set)
1801     gcc_checking_assert (!it);
1802   prop_set.release ();
1803 
1804   delete par;
1805 
1806   /* This doesn't seem to make a difference.  */
1807   loops_state_clear (LOOP_CLOSED_SSA);
1808 
1809   /* Neutering the worker-single blocks invalidates dominance info.
1810      It may be possible to incrementally update just the affected blocks, but
1811      obliterate everything for now.  */
1812   free_dominance_info (CDI_DOMINATORS);
1813   free_dominance_info (CDI_POST_DOMINATORS);
1814 
1815   if (dump_file)
1816     {
1817       fprintf (dump_file, "\n\nAfter neutering:\n\n");
1818       dump_function_to_file (current_function_decl, dump_file, dump_flags);
1819     }
1820 }
1821 
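/* Pass entry point: scan the function for the shared memory needed by
   reduction variables and gang-private variables, ask the target for the
   usable shared-memory window, and run the neutering/broadcasting transform
   unless only a single worker is used ('num_workers(1)').  */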
1822 static int
1823 execute_omp_oacc_neuter_broadcast ()
1824 {
1825   unsigned HOST_WIDE_INT reduction_size[GOMP_DIM_MAX];
1826   unsigned HOST_WIDE_INT private_size[GOMP_DIM_MAX];
1827 
1828   for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
1829     {
1830       reduction_size[i] = 0;
1831       private_size[i] = 0;
1832     }
1833 
1834   /* Calculate shared memory size required for reduction variables and
1835      gang-private memory for this offloaded function.  */
1836   basic_block bb;
1837   FOR_ALL_BB_FN (bb, cfun)
1838     {
1839       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
1840 	   !gsi_end_p (gsi);
1841 	   gsi_next (&gsi))
1842 	{
1843 	  gimple *stmt = gsi_stmt (gsi);
1844 	  if (!is_gimple_call (stmt))
1845 	    continue;
1846 	  gcall *call = as_a <gcall *> (stmt);
1847 	  if (!gimple_call_internal_p (call))
1848 	    continue;
1849 	  enum internal_fn ifn_code = gimple_call_internal_fn (call);
1850 	  switch (ifn_code)
1851 	    {
1852 	    default: break;
1853 	    case IFN_GOACC_REDUCTION:
1854 	      if (integer_minus_onep (gimple_call_arg (call, 3)))
1855 		continue;
1856 	      else
1857 		{
1858 		  unsigned code = TREE_INT_CST_LOW (gimple_call_arg (call, 0));
1859 		  /* Only count reduction variables once: the choice to pick
1860 		     the setup call is fairly arbitrary.  */
1861 		  if (code == IFN_GOACC_REDUCTION_SETUP)
1862 		    {
1863 		      int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
1864 		      tree var = gimple_call_arg (call, 2);
1865 		      tree offset = gimple_call_arg (call, 5);
1866 		      tree var_type = TREE_TYPE (var);
1867 		      unsigned HOST_WIDE_INT limit
1868 			= (tree_to_uhwi (offset)
1869 			   + tree_to_uhwi (TYPE_SIZE_UNIT (var_type)));
1870 		      reduction_size[level]
1871 			= MAX (reduction_size[level], limit);
1872 		    }
1873 		}
1874 	      break;
1875 	    case IFN_UNIQUE:
1876 	      {
1877 		enum ifn_unique_kind kind
1878 		  = ((enum ifn_unique_kind)
1879 		     TREE_INT_CST_LOW (gimple_call_arg (call, 0)));
1880 
1881 		if (kind == IFN_UNIQUE_OACC_PRIVATE)
1882 		  {
1883 		    HOST_WIDE_INT level
1884 		      = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
1885 		    if (level == -1)
1886 		      break;
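		    /* Sum the sizes of the gang-private variables attached
		       to this marker, aligning the running total to each
		       variable's alignment first.  */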
1887 		    for (unsigned i = 3;
1888 			 i < gimple_call_num_args (call);
1889 			 i++)
1890 		      {
1891 			tree arg = gimple_call_arg (call, i);
1892 			gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
1893 			tree decl = TREE_OPERAND (arg, 0);
1894 			unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (decl);
1895 			private_size[level] = ((private_size[level] + align - 1)
1896 					       & ~(align - 1));
1897 			unsigned HOST_WIDE_INT decl_size
1898 			  = tree_to_uhwi (TYPE_SIZE_UNIT (TREE_TYPE (decl)));
1899 			private_size[level] += decl_size;
1900 		      }
1901 		  }
1902 	      }
1903 	      break;
1904 	    }
1905 	}
1906     }
1907 
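  /* Retrieve the launch dimensions recorded for this offloaded function.  */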
1908   int dims[GOMP_DIM_MAX];
1909   for (unsigned i = 0; i < GOMP_DIM_MAX; i++)
1910     dims[i] = oacc_get_fn_dim_size (current_function_decl, i);
1911 
1912   /* Find bounds of shared-memory buffer space we can use.  */
1913   unsigned HOST_WIDE_INT bounds_lo = 0, bounds_hi = 0;
1914   if (targetm.goacc.shared_mem_layout)
1915     targetm.goacc.shared_mem_layout (&bounds_lo, &bounds_hi, dims,
1916 				     private_size, reduction_size);
1917 
1918   /* Perform worker partitioning unless we know 'num_workers(1)'.  */
1919   if (dims[GOMP_DIM_WORKER] != 1)
1920     oacc_do_neutering (bounds_lo, bounds_hi);
1921 
1922   return 0;
1923 }
1924 
1925 namespace {
1926 
1927 const pass_data pass_data_omp_oacc_neuter_broadcast =
1928 {
1929   GIMPLE_PASS, /* type */
1930   "omp_oacc_neuter_broadcast", /* name */
1931   OPTGROUP_OMP, /* optinfo_flags */
1932   TV_NONE, /* tv_id */
1933   PROP_cfg, /* properties_required */
1934   0, /* properties_provided */
1935   0, /* properties_destroyed */
1936   0, /* todo_flags_start */
1937   TODO_update_ssa | TODO_cleanup_cfg, /* todo_flags_finish */
1938 };
1939 
1940 class pass_omp_oacc_neuter_broadcast : public gimple_opt_pass
1941 {
1942 public:
1943   pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
1944     : gimple_opt_pass (pass_data_omp_oacc_neuter_broadcast, ctxt)
1945   {}
1946 
1947   /* opt_pass methods: */
1948   virtual bool gate (function *fun)
1949   {
1950     if (!flag_openacc)
1951       return false;
1952 
1953     if (!targetm.goacc.create_worker_broadcast_record)
1954       return false;
1955 
1956     /* Only relevant for OpenACC offloaded functions.  */
1957     tree attr = oacc_get_fn_attrib (fun->decl);
1958     if (!attr)
1959       return false;
1960 
1961     return true;
1962   }
1963 
1964   virtual unsigned int execute (function *)
1965     {
1966       return execute_omp_oacc_neuter_broadcast ();
1967     }
1968 
1969 }; // class pass_omp_oacc_neuter_broadcast
1970 
1971 } // anon namespace
1972 
1973 gimple_opt_pass *
1974 make_pass_omp_oacc_neuter_broadcast (gcc::context *ctxt)
1975 {
1976   return new pass_omp_oacc_neuter_broadcast (ctxt);
1977 }
1978