1 /* Target code for NVPTX.
2    Copyright (C) 2014-2020 Free Software Foundation, Inc.
3    Contributed by Bernd Schmidt <bernds@codesourcery.com>
4 
5    This file is part of GCC.
6 
7    GCC is free software; you can redistribute it and/or modify it
8    under the terms of the GNU General Public License as published
9    by the Free Software Foundation; either version 3, or (at your
10    option) any later version.
11 
12    GCC is distributed in the hope that it will be useful, but WITHOUT
13    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
15    License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with GCC; see the file COPYING3.  If not see
19    <http://www.gnu.org/licenses/>.  */
20 
21 #define IN_TARGET_CODE 1
22 
23 #include "config.h"
24 #include <sstream>
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "cfghooks.h"
32 #include "df.h"
33 #include "memmodel.h"
34 #include "tm_p.h"
35 #include "expmed.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "alias.h"
42 #include "insn-flags.h"
43 #include "output.h"
44 #include "insn-attr.h"
45 #include "flags.h"
46 #include "dojump.h"
47 #include "explow.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "tm-preds.h"
53 #include "tm-constrs.h"
54 #include "langhooks.h"
55 #include "dbxout.h"
56 #include "cfgrtl.h"
57 #include "gimple.h"
58 #include "stor-layout.h"
59 #include "builtins.h"
60 #include "omp-general.h"
61 #include "omp-low.h"
62 #include "omp-offload.h"
63 #include "gomp-constants.h"
64 #include "dumpfile.h"
65 #include "internal-fn.h"
66 #include "gimple-iterator.h"
67 #include "stringpool.h"
68 #include "attribs.h"
69 #include "tree-vrp.h"
70 #include "tree-ssa-operands.h"
71 #include "tree-ssanames.h"
72 #include "gimplify.h"
73 #include "tree-phinodes.h"
74 #include "cfgloop.h"
75 #include "fold-const.h"
76 #include "intl.h"
77 
78 /* This file should be included last.  */
79 #include "target-def.h"
80 
81 #define WORKAROUND_PTXJIT_BUG 1
82 #define WORKAROUND_PTXJIT_BUG_2 1
83 #define WORKAROUND_PTXJIT_BUG_3 1
84 
85 /* The PTX concept CTA (Cooperative Thread Array) maps onto the CUDA
86    concept of a thread block, which has had a maximum of 1024 threads
87    since CUDA version 2.x.  */
88 #define PTX_CTA_SIZE 1024
89 
90 #define PTX_CTA_NUM_BARRIERS 16
91 #define PTX_WARP_SIZE 32
92 
93 #define PTX_PER_CTA_BARRIER 0
94 #define PTX_NUM_PER_CTA_BARRIERS 1
95 #define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
96 #define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)
97 
98 #define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
99 #define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
100 #define PTX_WORKER_LENGTH 32
101 #define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */
102 
103 /* The various PTX memory areas an object might reside in.  */
104 enum nvptx_data_area
105 {
106   DATA_AREA_GENERIC,
107   DATA_AREA_GLOBAL,
108   DATA_AREA_SHARED,
109   DATA_AREA_LOCAL,
110   DATA_AREA_CONST,
111   DATA_AREA_PARAM,
112   DATA_AREA_MAX
113 };
114 
115 /* We record the data area in the target symbol flags.  */
116 #define SYMBOL_DATA_AREA(SYM) \
117   (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
118 		    & 7)
119 #define SET_SYMBOL_DATA_AREA(SYM,AREA) \
120   (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
121 
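/* A minimal sketch of how the encoding round-trips (illustrative only;
   note that SET_SYMBOL_DATA_AREA just ORs the bits in, so it assumes
   the area is set at most once per symbol):

     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);  */
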
122 /* Record the function decls we've written, and the libfuncs and function
123    decls corresponding to them.  */
124 static std::stringstream func_decls;
125 
126 struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
127 {
128   static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
129   static bool equal (rtx a, rtx b) { return a == b; }
130 };
131 
132 static GTY((cache))
133   hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;
134 
135 struct tree_hasher : ggc_cache_ptr_hash<tree_node>
136 {
137   static hashval_t hash (tree t) { return htab_hash_pointer (t); }
138   static bool equal (tree a, tree b) { return a == b; }
139 };
140 
141 static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
142 static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;
143 
144 /* Buffer needed to broadcast across workers and vectors.  This is
145    used for both worker-neutering and worker broadcasting, and
146    vector-neutering and vector broadcasting when vector_length > 32.  It is
147    shared by all functions emitted.  The buffer is placed in shared
148    memory.  It'd be nice if PTX supported common blocks, because then
149    this could be shared across TUs (taking the largest size).  */
150 static unsigned oacc_bcast_size;
151 static unsigned oacc_bcast_partition;
152 static unsigned oacc_bcast_align;
153 static GTY(()) rtx oacc_bcast_sym;
154 
155 /* Buffer needed for worker reductions.  This has to be distinct from
156    the worker broadcast array, as both may be live concurrently.  */
157 static unsigned worker_red_size;
158 static unsigned worker_red_align;
159 static GTY(()) rtx worker_red_sym;
160 
161 /* Buffer needed for vector reductions, when vector_length >
162    PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
163    array, as both may be live concurrently.  */
164 static unsigned vector_red_size;
165 static unsigned vector_red_align;
166 static unsigned vector_red_partition;
167 static GTY(()) rtx vector_red_sym;
168 
169 /* Global lock variable, needed for 128bit worker & gang reductions.  */
170 static GTY(()) tree global_lock_var;
171 
172 /* True if any function references __nvptx_stacks.  */
173 static bool need_softstack_decl;
174 
175 /* True if any function references __nvptx_uni.  */
176 static bool need_unisimt_decl;
177 
178 static int nvptx_mach_max_workers ();
179 
180 /* Allocate a new, cleared machine_function structure.  */
181 
182 static struct machine_function *
183 nvptx_init_machine_status (void)
184 {
185   struct machine_function *p = ggc_cleared_alloc<machine_function> ();
186   p->return_mode = VOIDmode;
187   return p;
188 }
189 
190 /* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
191    and -fopenacc is also enabled.  */
192 
193 static void
194 diagnose_openacc_conflict (bool optval, const char *optname)
195 {
196   if (flag_openacc && optval)
197     error ("option %s is not supported together with %<-fopenacc%>", optname);
198 }
199 
200 /* Implement TARGET_OPTION_OVERRIDE.  */
201 
202 static void
203 nvptx_option_override (void)
204 {
205   init_machine_status = nvptx_init_machine_status;
206 
207   /* Set toplevel_reorder, unless explicitly disabled.  We need
208      reordering so that we emit necessary assembler decls of
209      undeclared variables. */
210   if (!global_options_set.x_flag_toplevel_reorder)
211     flag_toplevel_reorder = 1;
212 
213   debug_nonbind_markers_p = 0;
214 
215   /* Set flag_no_common, unless explicitly disabled.  We fake common
216      using .weak, and that's not entirely accurate, so avoid it
217      unless forced.  */
218   if (!global_options_set.x_flag_no_common)
219     flag_no_common = 1;
220 
221   /* The patch area requires nops, which we don't have.  */
222   if (function_entry_patch_area_size > 0)
223     sorry ("not generating patch area, nops not supported");
224 
225   /* Assumes that it will see only hard registers.  */
226   flag_var_tracking = 0;
227 
228   if (nvptx_optimize < 0)
229     nvptx_optimize = optimize > 0;
230 
231   declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
232   needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
233   declared_libfuncs_htab
234     = hash_table<declared_libfunc_hasher>::create_ggc (17);
235 
236   oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
237   SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
238   oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
239   oacc_bcast_partition = 0;
240 
241   worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
242   SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
243   worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
244 
245   vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
246   SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
247   vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
248   vector_red_partition = 0;
249 
250   diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
251   diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
252   diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");
253 
254   if (TARGET_GOMP)
255     target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
256 }
257 
258 /* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
259    deal with ptx idiosyncrasies.  */
260 
261 const char *
262 nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
263 {
264   switch (mode)
265     {
266     case E_BLKmode:
267       return ".b8";
268     case E_BImode:
269       return ".pred";
270     case E_QImode:
271       if (promote)
272 	return ".u32";
273       else
274 	return ".u8";
275     case E_HImode:
276       return ".u16";
277     case E_SImode:
278       return ".u32";
279     case E_DImode:
280       return ".u64";
281 
282     case E_SFmode:
283       return ".f32";
284     case E_DFmode:
285       return ".f64";
286 
287     case E_V2SImode:
288       return ".v2.u32";
289     case E_V2DImode:
290       return ".v2.u64";
291 
292     default:
293       gcc_unreachable ();
294     }
295 }
296 
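/* For example, nvptx_ptx_type_from_mode (E_SImode, false) yields ".u32",
   and nvptx_ptx_type_from_mode (E_QImode, true) also yields ".u32" rather
   than ".u8", since most PTX instructions cannot operate directly on
   8-bit registers.  */
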
297 /* Encode the PTX data area that DECL (which might not actually be a
298    _DECL) should reside in.  */
299 
300 static void
301 nvptx_encode_section_info (tree decl, rtx rtl, int first)
302 {
303   default_encode_section_info (decl, rtl, first);
304   if (first && MEM_P (rtl))
305     {
306       nvptx_data_area area = DATA_AREA_GENERIC;
307 
308       if (TREE_CONSTANT (decl))
309 	area = DATA_AREA_CONST;
310       else if (TREE_CODE (decl) == VAR_DECL)
311 	{
312 	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
313 	    {
314 	      area = DATA_AREA_SHARED;
315 	      if (DECL_INITIAL (decl))
316 		error ("static initialization of variable %q+D in %<.shared%>"
317 		       " memory is not supported", decl);
318 	    }
319 	  else
320 	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
321 	}
322 
323       SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
324     }
325 }
326 
327 /* Return the PTX name of the data area in which SYM should be
328    placed.  The symbol must have already been processed by
329    nvptx_encode_section_info, or equivalent.  */
330 
331 static const char *
332 section_for_sym (rtx sym)
333 {
334   nvptx_data_area area = SYMBOL_DATA_AREA (sym);
335   /* Same order as nvptx_data_area enum.  */
336   static char const *const areas[] =
337     {"", ".global", ".shared", ".local", ".const", ".param"};
338 
339   return areas[area];
340 }
341 
342 /* Similarly for a decl.  */
343 
344 static const char *
345 section_for_decl (const_tree decl)
346 {
347   return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
348 }
349 
350 /* Check NAME for special function names and redirect them by returning a
351    replacement.  This applies to malloc, free and realloc, for which we
352    want to use libgcc wrappers, and call, which triggers a bug in
353    ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
354    not active in an offload compiler -- the names are all set by the
355    host-side compiler.  */
356 
357 static const char *
358 nvptx_name_replacement (const char *name)
359 {
360   if (strcmp (name, "call") == 0)
361     return "__nvptx_call";
362   if (strcmp (name, "malloc") == 0)
363     return "__nvptx_malloc";
364   if (strcmp (name, "free") == 0)
365     return "__nvptx_free";
366   if (strcmp (name, "realloc") == 0)
367     return "__nvptx_realloc";
368   return name;
369 }
370 
371 /* If MODE should be treated as two registers of an inner mode, return
372    that inner mode.  Otherwise return VOIDmode.  */
373 
374 static machine_mode
375 maybe_split_mode (machine_mode mode)
376 {
377   if (COMPLEX_MODE_P (mode))
378     return GET_MODE_INNER (mode);
379 
380   if (mode == TImode)
381     return DImode;
382 
383   return VOIDmode;
384 }
385 
386 /* Return true if mode should be treated as two registers.  */
387 
388 static bool
389 split_mode_p (machine_mode mode)
390 {
391   return maybe_split_mode (mode) != VOIDmode;
392 }
393 
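/* For example, maybe_split_mode (TImode) is DImode and
   maybe_split_mode (DCmode) is DFmode, so both are handled as register
   pairs, whereas maybe_split_mode (SImode) is VOIDmode and such values
   stay in a single register.  */
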
394 /* Output a register, subreg, or register pair (with optional
395    enclosing braces).  */
396 
397 static void
398 output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
399 	    int subreg_offset = -1)
400 {
401   if (inner_mode == VOIDmode)
402     {
403       if (HARD_REGISTER_NUM_P (regno))
404 	fprintf (file, "%s", reg_names[regno]);
405       else
406 	fprintf (file, "%%r%d", regno);
407     }
408   else if (subreg_offset >= 0)
409     {
410       output_reg (file, regno, VOIDmode);
411       fprintf (file, "$%d", subreg_offset);
412     }
413   else
414     {
415       if (subreg_offset == -1)
416 	fprintf (file, "{");
417       output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
418       fprintf (file, ",");
419       output_reg (file, regno, inner_mode, 0);
420       if (subreg_offset == -1)
421 	fprintf (file, "}");
422     }
423 }
424 
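/* A sketch of the forms printed above (the register number 23 is
   illustrative): a pseudo in a simple mode prints as "%r23", a hard
   register by its name, and a TImode pseudo as the pair
   "{%r23$8,%r23$0}", where the "$N" suffix selects the subregister at
   byte offset N.  */
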
425 /* Emit forking instructions for MASK.  */
426 
427 static void
428 nvptx_emit_forking (unsigned mask, bool is_call)
429 {
430   mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
431 	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
432   if (mask)
433     {
434       rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
435 
436       /* Emit fork at all levels.  This helps form SESE regions, as
437 	 it creates a block with a single successor before entering a
438 	 partitioned region.  That is a good candidate for the end of
439 	 an SESE region.  */
440       emit_insn (gen_nvptx_fork (op));
441       emit_insn (gen_nvptx_forked (op));
442     }
443 }
444 
445 /* Emit joining instructions for MASK.  */
446 
447 static void
448 nvptx_emit_joining (unsigned mask, bool is_call)
449 {
450   mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
451 	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
452   if (mask)
453     {
454       rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));
455 
456       /* Emit joining for all non-call pars to ensure there's a single
457 	 predecessor for the block the join insn ends up in.  This is
458 	 needed for skipping entire loops.  */
459       emit_insn (gen_nvptx_joining (op));
460       emit_insn (gen_nvptx_join (op));
461     }
462 }
463 
464 
465 /* Determine whether MODE and TYPE (possibly NULL) should be passed or
466    returned in memory.  Integer and floating types supported by the
467    machine are passed in registers, everything else is passed in
468    memory.  Complex types are split.  */
469 
470 static bool
471 pass_in_memory (machine_mode mode, const_tree type, bool for_return)
472 {
473   if (type)
474     {
475       if (AGGREGATE_TYPE_P (type))
476 	return true;
477       if (TREE_CODE (type) == VECTOR_TYPE)
478 	return true;
479     }
480 
481   if (!for_return && COMPLEX_MODE_P (mode))
482     /* Complex types are passed as two underlying args.  */
483     mode = GET_MODE_INNER (mode);
484 
485   if (GET_MODE_CLASS (mode) != MODE_INT
486       && GET_MODE_CLASS (mode) != MODE_FLOAT)
487     return true;
488 
489   if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
490     return true;
491 
492   return false;
493 }
494 
495 /* A non-memory argument of mode MODE is being passed; determine the mode it
496    should be promoted to.  This is also used for determining return
497    type promotion.  */
498 
499 static machine_mode
500 promote_arg (machine_mode mode, bool prototyped)
501 {
502   if (!prototyped && mode == SFmode)
503     /* K&R float promotion for unprototyped functions.  */
504     mode = DFmode;
505   else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
506     mode = SImode;
507 
508   return mode;
509 }
510 
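/* For example, a char argument (QImode) is promoted to SImode, and an
   SFmode argument to an unprototyped callee is promoted to DFmode,
   mirroring the C default argument promotions.  */
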
511 /* A non-memory return type of MODE is being returned.  Determine the
512    mode it should be promoted to.  */
513 
514 static machine_mode
515 promote_return (machine_mode mode)
516 {
517   return promote_arg (mode, true);
518 }
519 
520 /* Implement TARGET_FUNCTION_ARG.  */
521 
522 static rtx
523 nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
524 {
525   if (arg.end_marker_p () || !arg.named)
526     return NULL_RTX;
527 
528   return gen_reg_rtx (arg.mode);
529 }
530 
531 /* Implement TARGET_FUNCTION_INCOMING_ARG.  */
532 
533 static rtx
534 nvptx_function_incoming_arg (cumulative_args_t cum_v,
535 			     const function_arg_info &arg)
536 {
537   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
538 
539   if (arg.end_marker_p () || !arg.named)
540     return NULL_RTX;
541 
542   /* No need to deal with split modes here, the only case that can
543      happen is complex modes and those are dealt with by
544      TARGET_SPLIT_COMPLEX_ARG.  */
545   return gen_rtx_UNSPEC (arg.mode,
546 			 gen_rtvec (1, GEN_INT (cum->count)),
547 			 UNSPEC_ARG_REG);
548 }
549 
550 /* Implement TARGET_FUNCTION_ARG_ADVANCE.  */
551 
552 static void
553 nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
554 {
555   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
556 
557   cum->count++;
558 }
559 
560 /* Implement TARGET_FUNCTION_ARG_BOUNDARY.
561 
562    For nvptx this is only used for variadic args.  The type has already
563    been promoted and/or converted to an invisible reference.  */
564 
565 static unsigned
566 nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
567 {
568   return GET_MODE_ALIGNMENT (mode);
569 }
570 
571 /* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.
572 
573    For nvptx, we know how to handle functions declared as stdarg: by
574    passing an extra pointer to the unnamed arguments.  However, the
575    Fortran frontend can produce a different situation, where a
576    function pointer is declared with no arguments, but the actual
577    function and calls to it take more arguments.  In that case, we
578    want to ensure the call matches the definition of the function.  */
579 
580 static bool
581 nvptx_strict_argument_naming (cumulative_args_t cum_v)
582 {
583   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
584 
585   return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
586 }
587 
588 /* Implement TARGET_LIBCALL_VALUE.  */
589 
590 static rtx
591 nvptx_libcall_value (machine_mode mode, const_rtx)
592 {
593   if (!cfun || !cfun->machine->doing_call)
594     /* Pretend to return in a hard reg for early uses before pseudos can be
595        generated.  */
596     return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
597 
598   return gen_reg_rtx (mode);
599 }
600 
601 /* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
602    where function FUNC returns or receives a value of data type TYPE.  */
603 
604 static rtx
605 nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
606 		      bool outgoing)
607 {
608   machine_mode mode = promote_return (TYPE_MODE (type));
609 
610   if (outgoing)
611     {
612       gcc_assert (cfun);
613       cfun->machine->return_mode = mode;
614       return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
615     }
616 
617   return nvptx_libcall_value (mode, NULL_RTX);
618 }
619 
620 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */
621 
622 static bool
623 nvptx_function_value_regno_p (const unsigned int regno)
624 {
625   return regno == NVPTX_RETURN_REGNUM;
626 }
627 
628 /* Types with a mode other than those supported by the machine are passed by
629    reference in memory.  */
630 
631 static bool
632 nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
633 {
634   return pass_in_memory (arg.mode, arg.type, false);
635 }
636 
637 /* Implement TARGET_RETURN_IN_MEMORY.  */
638 
639 static bool
640 nvptx_return_in_memory (const_tree type, const_tree)
641 {
642   return pass_in_memory (TYPE_MODE (type), type, true);
643 }
644 
645 /* Implement TARGET_PROMOTE_FUNCTION_MODE.  */
646 
647 static machine_mode
648 nvptx_promote_function_mode (const_tree type, machine_mode mode,
649 			     int *ARG_UNUSED (punsignedp),
650 			     const_tree funtype, int for_return)
651 {
652   return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
653 }
654 
655 /* Helper for write_arg.  Emit a single PTX argument of MODE, either
656    in a prototype, or as a copy in a function prologue.  ARGNO is the
657    index of this argument in the PTX function.  FOR_REG is negative
658    if we're emitting the PTX prototype, zero if we're copying to an
659    argument register, and greater than zero if we're copying to a
660    specific hard register.  */
661 
662 static int
663 write_arg_mode (std::stringstream &s, int for_reg, int argno,
664 		machine_mode mode)
665 {
666   const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
667 
668   if (for_reg < 0)
669     {
670       /* Writing PTX prototype.  */
671       s << (argno ? ", " : " (");
672       s << ".param" << ptx_type << " %in_ar" << argno;
673     }
674   else
675     {
676       s << "\t.reg" << ptx_type << " ";
677       if (for_reg)
678 	s << reg_names[for_reg];
679       else
680 	s << "%ar" << argno;
681       s << ";\n";
682       if (argno >= 0)
683 	{
684 	  s << "\tld.param" << ptx_type << " ";
685 	  if (for_reg)
686 	    s << reg_names[for_reg];
687 	  else
688 	    s << "%ar" << argno;
689 	  s << ", [%in_ar" << argno << "];\n";
690 	}
691     }
692   return argno + 1;
693 }
694 
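/* A sketch of the text emitted above for SImode argument 0 (register
   numbering is illustrative).  In a prototype:

     " (.param.u32 %in_ar0"

   and in a function prologue, copying to an argument register:

     .reg.u32 %ar0;
     ld.param.u32 %ar0, [%in_ar0];  */
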
695 /* Process function parameter TYPE to emit one or more PTX
696    arguments.  S, FOR_REG and ARGNO are as for write_arg_mode.  PROTOTYPED
697    is true if this is a prototyped function rather than an old-style
698    C declaration.  Returns the next argument number to use.
699 
700    The promotion behavior here must match the regular GCC function
701    parameter marshalling machinery.  */
702 
703 static int
704 write_arg_type (std::stringstream &s, int for_reg, int argno,
705 		tree type, bool prototyped)
706 {
707   machine_mode mode = TYPE_MODE (type);
708 
709   if (mode == VOIDmode)
710     return argno;
711 
712   if (pass_in_memory (mode, type, false))
713     mode = Pmode;
714   else
715     {
716       bool split = TREE_CODE (type) == COMPLEX_TYPE;
717 
718       if (split)
719 	{
720 	  /* Complex types are sent as two separate args.  */
721 	  type = TREE_TYPE (type);
722 	  mode = TYPE_MODE (type);
723 	  prototyped = true;
724 	}
725 
726       mode = promote_arg (mode, prototyped);
727       if (split)
728 	argno = write_arg_mode (s, for_reg, argno, mode);
729     }
730 
731   return write_arg_mode (s, for_reg, argno, mode);
732 }
733 
734 /* Emit a PTX return as a prototype or function prologue declaration
735    for MODE.  */
736 
737 static void
738 write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
739 {
740   const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
741   const char *pfx = "\t.reg";
742   const char *sfx = ";\n";
743 
744   if (for_proto)
745     pfx = "(.param", sfx = "_out) ";
746 
747   s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
748 }
749 
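/* For an SImode return value this emits "(.param.u32 %value_out) " in a
   prototype and ".reg.u32 %value;" in a prologue, assuming the usual
   "%value" name for NVPTX_RETURN_REGNUM.  */
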
750 /* Process a function return TYPE to emit a PTX return as a prototype
751    or function prologue declaration.  Returns true if return is via an
752    additional pointer parameter.  The promotion behavior here must
753    match the regular GCC function return marshalling.  */
754 
755 static bool
756 write_return_type (std::stringstream &s, bool for_proto, tree type)
757 {
758   machine_mode mode = TYPE_MODE (type);
759 
760   if (mode == VOIDmode)
761     return false;
762 
763   bool return_in_mem = pass_in_memory (mode, type, true);
764 
765   if (return_in_mem)
766     {
767       if (for_proto)
768 	return return_in_mem;
769 
770       /* Named return values can cause us to return a pointer as well
771 	 as expect an argument for the return location.  This is
772 	 optimization-level specific, so no caller can make use of
773 	 this data, but more importantly for us, we must ensure it
774 	 doesn't change the PTX prototype.  */
775       mode = (machine_mode) cfun->machine->return_mode;
776 
777       if (mode == VOIDmode)
778 	return return_in_mem;
779 
780       /* Clear return_mode to inhibit copy of retval to non-existent
781 	 retval parameter.  */
782       cfun->machine->return_mode = VOIDmode;
783     }
784   else
785     mode = promote_return (mode);
786 
787   write_return_mode (s, for_proto, mode);
788 
789   return return_in_mem;
790 }
791 
792 /* Look for attributes in ATTRS that would indicate we must write a function
793    as a .entry kernel rather than a .func.  Return true if one is found.  */
794 
795 static bool
796 write_as_kernel (tree attrs)
797 {
798   return (lookup_attribute ("kernel", attrs) != NULL_TREE
799 	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
800 	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
801   /* For OpenMP target regions, the corresponding kernel entry is emitted from
802      write_omp_entry as a separate function.  */
803 }
804 
805 /* Emit a linker marker for a function decl or defn.  */
806 
807 static void
808 write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
809 		 const char *name)
810 {
811   s << "\n// BEGIN";
812   if (globalize)
813     s << " GLOBAL";
814   s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
815   s << name << "\n";
816 }
817 
818 /* Emit a linker marker for a variable decl or defn.  */
819 
820 static void
821 write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
822 {
823   fprintf (file, "\n// BEGIN%s VAR %s: ",
824 	   globalize ? " GLOBAL" : "",
825 	   is_defn ? "DEF" : "DECL");
826   assemble_name_raw (file, name);
827   fputs ("\n", file);
828 }
829 
830 /* Write a .func or .kernel declaration or definition along with
831    a helper comment for use by ld.  S is the stream to write to, DECL
832    the decl for the function with name NAME.   For definitions, emit
833    a declaration too.  */
834 
835 static const char *
836 write_fn_proto (std::stringstream &s, bool is_defn,
837 		const char *name, const_tree decl)
838 {
839   if (is_defn)
840     /* Emit a declaration. The PTX assembler gets upset without it.   */
841     name = write_fn_proto (s, false, name, decl);
842   else
843     {
844       /* Avoid repeating the name replacement.  */
845       name = nvptx_name_replacement (name);
846       if (name[0] == '*')
847 	name++;
848     }
849 
850   write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);
851 
852   /* PTX declaration.  */
853   if (DECL_EXTERNAL (decl))
854     s << ".extern ";
855   else if (TREE_PUBLIC (decl))
856     s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
857   s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");
858 
859   tree fntype = TREE_TYPE (decl);
860   tree result_type = TREE_TYPE (fntype);
861 
862   /* atomic_compare_exchange_$n builtins have an exceptional calling
863      convention.  */
864   int not_atomic_weak_arg = -1;
865   if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
866     switch (DECL_FUNCTION_CODE (decl))
867       {
868       case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
869       case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
870       case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
871       case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
872       case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
873 	/* These atomics skip the 'weak' parm in an actual library
874 	   call.  We must skip it in the prototype too.  */
875 	not_atomic_weak_arg = 3;
876 	break;
877 
878       default:
879 	break;
880       }
881 
882   /* Declare the result.  */
883   bool return_in_mem = write_return_type (s, true, result_type);
884 
885   s << name;
886 
887   int argno = 0;
888 
889   /* Emit argument list.  */
890   if (return_in_mem)
891     argno = write_arg_type (s, -1, argno, ptr_type_node, true);
892 
893   /* We get:
894      NULL in TYPE_ARG_TYPES, for old-style functions
895      NULL in DECL_ARGUMENTS, for builtin functions without another
896        declaration.
897      So we have to pick the best one we have.  */
898   tree args = TYPE_ARG_TYPES (fntype);
899   bool prototyped = true;
900   if (!args)
901     {
902       args = DECL_ARGUMENTS (decl);
903       prototyped = false;
904     }
905 
906   for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
907     {
908       tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
909 
910       if (not_atomic_weak_arg)
911 	argno = write_arg_type (s, -1, argno, type, prototyped);
912       else
913 	gcc_assert (type == boolean_type_node);
914     }
915 
916   if (stdarg_p (fntype))
917     argno = write_arg_type (s, -1, argno, ptr_type_node, true);
918 
919   if (DECL_STATIC_CHAIN (decl))
920     argno = write_arg_type (s, -1, argno, ptr_type_node, true);
921 
922   if (!argno && strcmp (name, "main") == 0)
923     {
924       argno = write_arg_type (s, -1, argno, integer_type_node, true);
925       argno = write_arg_type (s, -1, argno, ptr_type_node, true);
926     }
927 
928   if (argno)
929     s << ")";
930 
931   s << (is_defn ? "\n" : ";\n");
932 
933   return name;
934 }
935 
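/* Putting the pieces together, a sketch of the declaration emitted for
   a hypothetical public function "int f (int)":

     // BEGIN GLOBAL FUNCTION DECL: f
     .visible .func (.param.u32 %value_out) f (.param.u32 %in_ar0);  */
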
936 /* Construct a function declaration from a call insn.  This can be
937    necessary for two reasons - either we have an indirect call which
938    requires a .callprototype declaration, or we have a libcall
939    generated by emit_library_call for which no decl exists.  */
940 
941 static void
942 write_fn_proto_from_insn (std::stringstream &s, const char *name,
943 			  rtx result, rtx pat)
944 {
945   if (!name)
946     {
947       s << "\t.callprototype ";
948       name = "_";
949     }
950   else
951     {
952       name = nvptx_name_replacement (name);
953       write_fn_marker (s, false, true, name);
954       s << "\t.extern .func ";
955     }
956 
957   if (result != NULL_RTX)
958     write_return_mode (s, true, GET_MODE (result));
959 
960   s << name;
961 
962   int arg_end = XVECLEN (pat, 0);
963   for (int i = 1; i < arg_end; i++)
964     {
965       /* We don't have to deal with mode splitting & promotion here,
966 	 as that was already done when generating the call
967 	 sequence.  */
968       machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));
969 
970       write_arg_mode (s, -1, i - 1, mode);
971     }
972   if (arg_end != 1)
973     s << ")";
974   s << ";\n";
975 }
976 
977 /* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
978    table and write a ptx prototype.  These are emitted at end of
979    compilation.  */
980 
981 static void
982 nvptx_record_fndecl (tree decl)
983 {
984   tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
985   if (*slot == NULL)
986     {
987       *slot = decl;
988       const char *name = get_fnname_from_decl (decl);
989       write_fn_proto (func_decls, false, name, decl);
990     }
991 }
992 
993 /* Record a libcall or unprototyped external function. CALLEE is the
994    SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
995    declaration for it.  */
996 
997 static void
998 nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
999 {
1000   rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
1001   if (*slot == NULL)
1002     {
1003       *slot = callee;
1004 
1005       const char *name = XSTR (callee, 0);
1006       write_fn_proto_from_insn (func_decls, name, retval, pat);
1007     }
1008 }
1009 
1010 /* DECL is an external FUNCTION_DECL, that we're referencing.  If it
1011    is prototyped, record it now.  Otherwise record it as needed at end
1012    of compilation, when we might have more information about it.  */
1013 
1014 void
1015 nvptx_record_needed_fndecl (tree decl)
1016 {
1017   if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
1018     {
1019       tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
1020       if (*slot == NULL)
1021 	*slot = decl;
1022     }
1023   else
1024     nvptx_record_fndecl (decl);
1025 }
1026 
1027 /* SYM is a SYMBOL_REF.  If it refers to an external function, record
1028    it as needed.  */
1029 
1030 static void
1031 nvptx_maybe_record_fnsym (rtx sym)
1032 {
1033   tree decl = SYMBOL_REF_DECL (sym);
1034 
1035   if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
1036     nvptx_record_needed_fndecl (decl);
1037 }
1038 
1039 /* Emit a local array to hold some part of a conventional stack frame
1040    and initialize REGNO to point to it.  If the size is zero, it'll
1041    never be valid to dereference, so we can simply initialize to
1042    zero.  */
1043 
1044 static void
1045 init_frame (FILE *file, int regno, unsigned align, unsigned size)
1046 {
1047   if (size)
1048     fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
1049 	     align, reg_names[regno], size);
1050   fprintf (file, "\t.reg.u%d %s;\n",
1051 	   POINTER_SIZE, reg_names[regno]);
1052   fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
1053 		  :  "\tmov.u%d %s, 0;\n"),
1054 	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
1055 }
1056 
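/* For a 16-byte frame aligned to 8 bytes, assuming 64-bit pointers and
   the usual "%frame" register name, this emits:

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;  */
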
1057 /* Emit soft stack frame setup sequence.  */
1058 
1059 static void
1060 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1061 {
1062   /* Maintain 64-bit stack alignment.  */
1063   unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1064   size = ROUND_UP (size, keep_align);
1065   int bits = POINTER_SIZE;
1066   const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1067   const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1068   const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1069   const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1070   fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1071   fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1072   fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1073   fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1074   fprintf (file, "\t{\n");
1075   fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1076   fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1077   fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1078   fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1079   fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1080 	   bits == 64 ? ".wide" : ".lo", bits / 8);
1081   fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1082 
1083   /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
1084   fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1085 
1086   /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
1087   fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1088 	   bits, reg_sspprev, reg_sspslot);
1089 
1090   /* Initialize %frame = %sspprev - size.  */
1091   fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1092 	   bits, reg_frame, reg_sspprev, size);
1093 
1094   /* Apply alignment, if larger than 64.  */
1095   if (alignment > keep_align)
1096     fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1097 	     bits, reg_frame, reg_frame, -alignment);
1098 
1099   size = crtl->outgoing_args_size;
1100   gcc_assert (size % keep_align == 0);
1101 
1102   /* Initialize %stack.  */
1103   fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1104 	   bits, reg_stack, reg_frame, size);
1105 
1106   if (!crtl->is_leaf)
1107     fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1108 	     bits, reg_sspslot, reg_stack);
1109   fprintf (file, "\t}\n");
1110   cfun->machine->has_softstack = true;
1111   need_softstack_decl = true;
1112 }
1113 
1114 /* Emit code to initialize the REGNO predicate register so that it is
1115    true iff we are not lane zero on the NAME axis.  */
1116 
1117 static void
1118 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1119 {
1120   fprintf (file, "\t{\n");
1121   fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1122   if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1123     {
1124       fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1125       fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1126     }
1127   fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1128   fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1129   if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1130     {
1131       fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1132       fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1133       fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1134 	       "// vector reduction buffer\n",
1135 	       REGNO (cfun->machine->red_partition),
1136 	       vector_red_partition);
1137     }
1138   /* Verify vector_red_size.  */
1139   gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1140 	      <= vector_red_size);
1141   fprintf (file, "\t}\n");
1142 }
1143 
1144 /* Emit code to initialize OpenACC worker broadcast and synchronization
1145    registers.  */
1146 
1147 static void
1148 nvptx_init_oacc_workers (FILE *file)
1149 {
1150   fprintf (file, "\t{\n");
1151   fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1152   if (cfun->machine->bcast_partition)
1153     {
1154       fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1155       fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1156     }
1157   fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1158   if (cfun->machine->bcast_partition)
1159     {
1160       fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1161       fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1162       fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1163       fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1164 	       "// vector broadcast offset\n",
1165 	       REGNO (cfun->machine->bcast_partition),
1166 	       oacc_bcast_partition);
1167     }
1168   /* Verify oacc_bcast_size.  */
1169   gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1170 	      <= oacc_bcast_size);
1171   if (cfun->machine->sync_bar)
1172     fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1173 	     "// vector synchronization barrier\n",
1174 	     REGNO (cfun->machine->sync_bar));
1175   fprintf (file, "\t}\n");
1176 }
1177 
1178 /* Emit code to initialize predicate and master lane index registers for
1179    -muniform-simt code generation variant.  */
1180 
1181 static void
1182 nvptx_init_unisimt_predicate (FILE *file)
1183 {
1184   cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1185   int loc = REGNO (cfun->machine->unisimt_location);
1186   int bits = POINTER_SIZE;
1187   fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1188   fprintf (file, "\t{\n");
1189   fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1190   fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1191   fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1192   fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1193 	   bits == 64 ? ".wide" : ".lo");
1194   fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1195   fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1196   if (cfun->machine->unisimt_predicate)
1197     {
1198       int master = REGNO (cfun->machine->unisimt_master);
1199       int pred = REGNO (cfun->machine->unisimt_predicate);
1200       fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1201       fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1202       /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
1203       fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1204       /* Compute predicate as 'tid.x == master'.  */
1205       fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1206     }
1207   fprintf (file, "\t}\n");
1208   need_unisimt_decl = true;
1209 }
1210 
1211 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1212 
1213    extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1214    void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1215    {
1216      __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1217      __nvptx_uni[tid.y] = 0;
1218      gomp_nvptx_main (ORIG, arg);
1219    }
1220    ORIG itself should not be emitted as a PTX .entry function.  */
1221 
1222 static void
1223 write_omp_entry (FILE *file, const char *name, const char *orig)
1224 {
1225   static bool gomp_nvptx_main_declared;
1226   if (!gomp_nvptx_main_declared)
1227     {
1228       gomp_nvptx_main_declared = true;
1229       write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1230       func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1231         << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1232     }
1233   /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
1234 #define NTID_Y "%ntid.y"
1235 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1236  (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1237 {\n\
1238 	.reg.u32 %r<3>;\n\
1239 	.reg.u" PS " %R<4>;\n\
1240 	mov.u32 %r0, %tid.y;\n\
1241 	mov.u32 %r1, " NTID_Y ";\n\
1242 	mov.u32 %r2, %ctaid.x;\n\
1243 	cvt.u" PS ".u32 %R1, %r0;\n\
1244 	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1245 	mov.u" PS " %R0, __nvptx_stacks;\n\
1246 	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1247 	ld.param.u" PS " %R2, [%stack];\n\
1248 	ld.param.u" PS " %R3, [%sz];\n\
1249 	add.u" PS " %R2, %R2, %R3;\n\
1250 	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1251 	st.shared.u" PS " [%R0], %R2;\n\
1252 	mov.u" PS " %R0, __nvptx_uni;\n\
1253 	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1254 	mov.u32 %r0, 0;\n\
1255 	st.shared.u32 [%R0], %r0;\n\
1256 	mov.u" PS " %R0, \0;\n\
1257 	ld.param.u" PS " %R1, [%arg];\n\
1258 	{\n\
1259 		.param.u" PS " %P<2>;\n\
1260 		st.param.u" PS " [%P0], %R0;\n\
1261 		st.param.u" PS " [%P1], %R1;\n\
1262 		call.uni gomp_nvptx_main, (%P0, %P1);\n\
1263 	}\n\
1264 	ret.uni;\n\
1265 }\n"
1266   static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1267   static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32  ");
1268 #undef ENTRY_TEMPLATE
1269 #undef NTID_Y
1270   const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1271   /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
1272   const char *entry_2 = entry_1 + strlen (entry64) + 1;
1273   fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1274   need_softstack_decl = need_unisimt_decl = true;
1275 }
1276 
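/* To make the splice above concrete (a sketch; "foo" is hypothetical):
   the "\0" embedded in the template splits it into ENTRY_1 and ENTRY_2,
   so with name "foo" and orig "foo$impl" the single fprintf produces

     .visible .entry foo (.param.u64 %arg, ...)
     {
	...
	mov.u64 %R0, foo$impl;
	...
     }

   i.e. the original function's address is patched into the "mov" whose
   result is then passed to gomp_nvptx_main.  */
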
1277 /* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
1278    function, including local var decls and copies from the arguments to
1279    local regs.  */
1280 
1281 void
1282 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1283 {
1284   tree fntype = TREE_TYPE (decl);
1285   tree result_type = TREE_TYPE (fntype);
1286   int argno = 0;
1287 
1288   if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1289       && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1290     {
1291       char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1292       sprintf (buf, "%s$impl", name);
1293       write_omp_entry (file, name, buf);
1294       name = buf;
1295     }
1296   /* We construct the initial part of the function into a string
1297      stream, in order to share the prototype writing code.  */
1298   std::stringstream s;
1299   write_fn_proto (s, true, name, decl);
1300   s << "{\n";
1301 
1302   bool return_in_mem = write_return_type (s, false, result_type);
1303   if (return_in_mem)
1304     argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1305 
1306   /* Declare and initialize incoming arguments.  */
1307   tree args = TYPE_ARG_TYPES (fntype);
1308   bool prototyped = true;
1309   if (!args)
1310     {
1311       args = DECL_ARGUMENTS (decl);
1312       prototyped = false;
1313     }
1314 
1315   for (; args != NULL_TREE; args = TREE_CHAIN (args))
1316     {
1317       tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1318 
1319       argno = write_arg_type (s, 0, argno, type, prototyped);
1320     }
1321 
1322   if (stdarg_p (fntype))
1323     argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1324 			    true);
1325 
1326   if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1327     write_arg_type (s, STATIC_CHAIN_REGNUM,
1328 		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1329 		    true);
1330 
1331   fprintf (file, "%s", s.str().c_str());
1332 
1333   /* Usually 'crtl->is_leaf' is computed during register allocator
1334      initialization (which is not done on NVPTX) or for pressure-sensitive
1335    optimizations.  Initialize it here unless it is already set.  */
1336   if (!crtl->is_leaf)
1337     crtl->is_leaf = leaf_function_p ();
1338 
1339   HOST_WIDE_INT sz = get_frame_size ();
1340   bool need_frameptr = sz || cfun->machine->has_chain;
1341   int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1342   if (!TARGET_SOFT_STACK)
1343     {
1344       /* Declare a local var for outgoing varargs.  */
1345       if (cfun->machine->has_varadic)
1346 	init_frame (file, STACK_POINTER_REGNUM,
1347 		    UNITS_PER_WORD, crtl->outgoing_args_size);
1348 
1349       /* Declare a local variable for the frame.  Force its size to be
1350 	 DImode-compatible.  */
1351       if (need_frameptr)
1352 	init_frame (file, FRAME_POINTER_REGNUM, alignment,
1353 		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1354     }
1355   else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1356 	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
1357     init_softstack_frame (file, alignment, sz);
1358 
1359   if (cfun->machine->has_simtreg)
1360     {
1361       unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1362       unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1363       align = MAX (align, GET_MODE_SIZE (DImode));
1364       if (!crtl->is_leaf || cfun->calls_alloca)
1365 	simtsz = HOST_WIDE_INT_M1U;
1366       if (simtsz == HOST_WIDE_INT_M1U)
1367 	simtsz = nvptx_softstack_size;
1368       if (cfun->machine->has_softstack)
1369 	simtsz += POINTER_SIZE / 8;
1370       simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1371       if (align > GET_MODE_SIZE (DImode))
1372 	simtsz += align - GET_MODE_SIZE (DImode);
1373       if (simtsz)
1374 	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1375 		HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1376     }
1377 
1378   /* Restore the vector reduction partition register, if necessary.
1379      FIXME: Find out when and why this is necessary, and fix it.  */
1380   if (cfun->machine->red_partition)
1381     regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1382       = cfun->machine->red_partition;
1383 
1384   /* Declare the pseudos we have as ptx registers.  */
1385   int maxregs = max_reg_num ();
1386   for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1387     {
1388       if (regno_reg_rtx[i] != const0_rtx)
1389 	{
1390 	  machine_mode mode = PSEUDO_REGNO_MODE (i);
1391 	  machine_mode split = maybe_split_mode (mode);
1392 
1393 	  if (split_mode_p (mode))
1394 	    mode = split;
1395 	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1396 	  output_reg (file, i, split, -2);
1397 	  fprintf (file, ";\n");
1398 	}
1399     }
1400 
1401   /* Emit axis predicates. */
1402   if (cfun->machine->axis_predicate[0])
1403     nvptx_init_axis_predicate (file,
1404 			       REGNO (cfun->machine->axis_predicate[0]), "y");
1405   if (cfun->machine->axis_predicate[1])
1406     nvptx_init_axis_predicate (file,
1407 			       REGNO (cfun->machine->axis_predicate[1]), "x");
1408   if (cfun->machine->unisimt_predicate
1409       || (cfun->machine->has_simtreg && !crtl->is_leaf))
1410     nvptx_init_unisimt_predicate (file);
1411   if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1412     nvptx_init_oacc_workers (file);
1413 }
1414 
1415 /* Output code for switching uniform-simt state.  ENTERING indicates whether
1416    we are entering or leaving a non-uniform execution region.  */
1417 
1418 static void
1419 nvptx_output_unisimt_switch (FILE *file, bool entering)
1420 {
1421   if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1422     return;
1423   fprintf (file, "\t{\n");
1424   fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1425   fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1426   if (!crtl->is_leaf)
1427     {
1428       int loc = REGNO (cfun->machine->unisimt_location);
1429       fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1430     }
1431   if (cfun->machine->unisimt_predicate)
1432     {
1433       int master = REGNO (cfun->machine->unisimt_master);
1434       int pred = REGNO (cfun->machine->unisimt_predicate);
1435       fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1436       fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1437 	       master, entering ? "%ustmp2" : "0");
1438       fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1439     }
1440   fprintf (file, "\t}\n");
1441 }
1442 
1443 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1444    ENTERING indicates whether we are entering or leaving non-uniform execution.
1445    PTR is the register pointing to allocated storage; it is assigned to on
1446    entering and used to restore state on leaving.  SIZE and ALIGN are used only
1447    on entering.  */
1448 
1449 static void
1450 nvptx_output_softstack_switch (FILE *file, bool entering,
1451 			       rtx ptr, rtx size, rtx align)
1452 {
1453   gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1454   if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1455     return;
1456   int bits = POINTER_SIZE, regno = REGNO (ptr);
1457   fprintf (file, "\t{\n");
1458   if (entering)
1459     {
1460       fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1461 	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1462 	       cfun->machine->simt_stack_size);
1463       fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1464       if (CONST_INT_P (size))
1465 	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1466 		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1467       else
1468 	output_reg (file, REGNO (size), VOIDmode);
1469       fputs (";\n", file);
1470       if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1471 	fprintf (file,
1472 		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1473 		 bits, regno, regno, UINTVAL (align));
1474     }
1475   if (cfun->machine->has_softstack)
1476     {
1477       const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1478       if (entering)
1479 	{
1480 	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1481 		   bits, regno, bits / 8, reg_stack);
1482 	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1483 		   bits, reg_stack, regno, bits / 8);
1484 	}
1485       else
1486 	{
1487 	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1488 		   bits, reg_stack, regno, bits / 8);
1489 	}
1490       nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1491     }
1492   fprintf (file, "\t}\n");
1493 }
1494 
1495 /* Output code to enter non-uniform execution region.  DEST is a register
1496    to hold a per-lane allocation given by SIZE and ALIGN.  */
1497 
1498 const char *
1499 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1500 {
1501   nvptx_output_unisimt_switch (asm_out_file, true);
1502   nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1503   return "";
1504 }
1505 
1506 /* Output code to leave non-uniform execution region.  SRC is the register
1507    holding per-lane storage previously allocated by omp_simt_enter insn.  */
1508 
1509 const char *
1510 nvptx_output_simt_exit (rtx src)
1511 {
1512   nvptx_output_unisimt_switch (asm_out_file, false);
1513   nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1514   return "";
1515 }
1516 
1517 /* Output instruction that sets soft stack pointer in shared memory to the
1518    value in register given by SRC_REGNO.  */
1519 
1520 const char *
1521 nvptx_output_set_softstack (unsigned src_regno)
1522 {
1523   if (cfun->machine->has_softstack && !crtl->is_leaf)
1524     {
1525       fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1526 	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1527       output_reg (asm_out_file, src_regno, VOIDmode);
1528       fprintf (asm_out_file, ";\n");
1529     }
1530   return "";
1531 }
1532 /* Output a return instruction.  Also copy the return value to its outgoing
1533    location.  */
1534 
1535 const char *
1536 nvptx_output_return (void)
1537 {
1538   machine_mode mode = (machine_mode)cfun->machine->return_mode;
1539 
1540   if (mode != VOIDmode)
1541     fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1542 	     nvptx_ptx_type_from_mode (mode, false),
1543 	     reg_names[NVPTX_RETURN_REGNUM],
1544 	     reg_names[NVPTX_RETURN_REGNUM]);
1545 
1546   return "ret;";
1547 }
1548 
1549 /* Terminate a function by writing a closing brace to FILE.  */
1550 
1551 void
1552 nvptx_function_end (FILE *file)
1553 {
1554   fprintf (file, "}\n");
1555 }
1556 
1557 /* Decide whether we can make a sibling call to a function.  For ptx, we
1558    can't.  */
1559 
1560 static bool
1561 nvptx_function_ok_for_sibcall (tree, tree)
1562 {
1563   return false;
1564 }
1565 
1566 /* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */
1567 
1568 static rtx
1569 nvptx_get_drap_rtx (void)
1570 {
1571   if (TARGET_SOFT_STACK && stack_realign_drap)
1572     return arg_pointer_rtx;
1573   return NULL_RTX;
1574 }
1575 
1576 /* Implement the TARGET_CALL_ARGS hook.  Record information about one
1577    argument to the next call.  */
1578 
1579 static void
1580 nvptx_call_args (rtx arg, tree fntype)
1581 {
1582   if (!cfun->machine->doing_call)
1583     {
1584       cfun->machine->doing_call = true;
1585       cfun->machine->is_varadic = false;
1586       cfun->machine->num_args = 0;
1587 
1588       if (fntype && stdarg_p (fntype))
1589 	{
1590 	  cfun->machine->is_varadic = true;
1591 	  cfun->machine->has_varadic = true;
1592 	  cfun->machine->num_args++;
1593 	}
1594     }
1595 
1596   if (REG_P (arg) && arg != pc_rtx)
1597     {
1598       cfun->machine->num_args++;
1599       cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1600 						  cfun->machine->call_args);
1601     }
1602 }
1603 
1604 /* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
1605    information we recorded.  */
1606 
1607 static void
1608 nvptx_end_call_args (void)
1609 {
1610   cfun->machine->doing_call = false;
1611   free_EXPR_LIST_list (&cfun->machine->call_args);
1612 }
1613 
1614 /* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
1615    track of whether calls involving static chains or varargs were seen
1616    in the current function.
1617    For libcalls, maintain a hash table of decls we have seen, and
1618    record a function decl for later when encountering a new one.  */
1619 
1620 void
1621 nvptx_expand_call (rtx retval, rtx address)
1622 {
1623   rtx callee = XEXP (address, 0);
1624   rtx varargs = NULL_RTX;
1625   unsigned parallel = 0;
1626 
1627   if (!call_insn_operand (callee, Pmode))
1628     {
1629       callee = force_reg (Pmode, callee);
1630       address = change_address (address, QImode, callee);
1631     }
1632 
1633   if (GET_CODE (callee) == SYMBOL_REF)
1634     {
1635       tree decl = SYMBOL_REF_DECL (callee);
1636       if (decl != NULL_TREE)
1637 	{
1638 	  if (DECL_STATIC_CHAIN (decl))
1639 	    cfun->machine->has_chain = true;
1640 
1641 	  tree attr = oacc_get_fn_attrib (decl);
1642 	  if (attr)
1643 	    {
1644 	      tree dims = TREE_VALUE (attr);
1645 
1646 	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1647 	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1648 		{
1649 		  if (TREE_PURPOSE (dims)
1650 		      && !integer_zerop (TREE_PURPOSE (dims)))
1651 		    break;
1652 		  /* Not on this axis.  */
1653 		  parallel ^= GOMP_DIM_MASK (ix);
1654 		  dims = TREE_CHAIN (dims);
1655 		}
1656 	    }
1657 	}
1658     }
1659 
1660   unsigned nargs = cfun->machine->num_args;
1661   if (cfun->machine->is_varadic)
1662     {
1663       varargs = gen_reg_rtx (Pmode);
1664       emit_move_insn (varargs, stack_pointer_rtx);
1665     }
1666 
1667   rtvec vec = rtvec_alloc (nargs + 1);
1668   rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1669   int vec_pos = 0;
1670 
1671   rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1672   rtx tmp_retval = retval;
1673   if (retval)
1674     {
1675       if (!nvptx_register_operand (retval, GET_MODE (retval)))
1676 	tmp_retval = gen_reg_rtx (GET_MODE (retval));
1677       call = gen_rtx_SET (tmp_retval, call);
1678     }
1679   XVECEXP (pat, 0, vec_pos++) = call;
1680 
1681   /* Construct the call insn, including a USE for each argument pseudo
1682      register.  These will be used when printing the insn.  */
1683   for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1684     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1685 
1686   if (varargs)
1687     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1688 
1689   gcc_assert (vec_pos == XVECLEN (pat, 0));
1690 
1691   nvptx_emit_forking (parallel, true);
1692   emit_call_insn (pat);
1693   nvptx_emit_joining (parallel, true);
1694 
1695   if (tmp_retval != retval)
1696     emit_move_insn (retval, tmp_retval);
1697 }
1698 
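/* Schematically, for a call with one register argument and a return
   value, the PARALLEL built above looks like (a sketch, with invented
   pseudo register numbers):

     (parallel [(set (reg:SI 22)
		     (call (mem:QI (symbol_ref "foo")) (const_int 0)))
		(use (reg:SI 23))])

   The USEs keep the argument registers live up to the call, and are
   consumed when the insn is printed.  */
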
1699 /* Emit a comparison COMPARE, and return the new test to be used in the
1700    jump.  */
1701 
1702 rtx
1703 nvptx_expand_compare (rtx compare)
1704 {
1705   rtx pred = gen_reg_rtx (BImode);
1706   rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1707 			    XEXP (compare, 0), XEXP (compare, 1));
1708   emit_insn (gen_rtx_SET (pred, cmp));
1709   return gen_rtx_NE (BImode, pred, const0_rtx);
1710 }
1711 
1712 /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
1713 
1714 void
1715 nvptx_expand_oacc_fork (unsigned mode)
1716 {
1717   nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1718 }
1719 
1720 void
1721 nvptx_expand_oacc_join (unsigned mode)
1722 {
1723   nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1724 }
1725 
1726 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1727    objects.  */
1728 
1729 static rtx
1730 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1731 {
1732   rtx res;
1733 
1734   switch (GET_MODE (src))
1735     {
1736     case E_DImode:
1737       res = gen_unpackdisi2 (dst0, dst1, src);
1738       break;
1739     case E_DFmode:
1740       res = gen_unpackdfsi2 (dst0, dst1, src);
1741       break;
1742     default: gcc_unreachable ();
1743     }
1744   return res;
1745 }
1746 
1747 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1748    object.  */
1749 
1750 static rtx
1751 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1752 {
1753   rtx res;
1754 
1755   switch (GET_MODE (dst))
1756     {
1757     case E_DImode:
1758       res = gen_packsidi2 (dst, src0, src1);
1759       break;
1760     case E_DFmode:
1761       res = gen_packsidf2 (dst, src0, src1);
1762       break;
1763     default: gcc_unreachable ();
1764     }
1765   return res;
1766 }
1767 
1768 /* Generate an instruction or sequence to shuffle SRC into DST across
1769    the lanes of a single warp, using shuffle kind KIND and lane operand IDX.  */
1770 
1771 rtx
1772 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1773 {
1774   rtx res;
1775 
1776   switch (GET_MODE (dst))
1777     {
1778     case E_SImode:
1779       res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
1780       break;
1781     case E_SFmode:
1782       res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
1783       break;
1784     case E_DImode:
1785     case E_DFmode:
1786       {
1787 	rtx tmp0 = gen_reg_rtx (SImode);
1788 	rtx tmp1 = gen_reg_rtx (SImode);
1789 
1790 	start_sequence ();
1791 	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
1792 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
1793 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
1794 	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
1795 	res = get_insns ();
1796 	end_sequence ();
1797       }
1798       break;
1799     case E_BImode:
1800       {
1801 	rtx tmp = gen_reg_rtx (SImode);
1802 
1803 	start_sequence ();
1804 	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
1805 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1806 	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
1807 	res = get_insns ();
1808 	end_sequence ();
1809       }
1810       break;
1811     case E_QImode:
1812     case E_HImode:
1813       {
1814 	rtx tmp = gen_reg_rtx (SImode);
1815 
1816 	start_sequence ();
1817 	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
1818 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
1819 	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
1820 						    tmp)));
1821 	res = get_insns ();
1822 	end_sequence ();
1823       }
1824       break;
1825 
1826     default:
1827       gcc_unreachable ();
1828     }
1829   return res;
1830 }
1831 
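/* For DImode and DFmode the shuffle above decomposes into 32-bit
   halves; schematically:

     unpack	tmp0, tmp1 <- src	// split into two SImode halves
     shuffle	tmp0 <- tmp0		// shuffle each half independently
     shuffle	tmp1 <- tmp1
     pack	dst <- tmp0, tmp1	// recombine the shuffled halves
   */
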
1832 /* Generate an instruction or sequence to broadcast register REG
1833    across the vectors of a single warp.  */
1834 
1835 static rtx
1836 nvptx_gen_warp_bcast (rtx reg)
1837 {
1838   return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
1839 }
1840 
1841 /* Structure used when generating a worker-level spill or fill.  */
1842 
1843 struct broadcast_data_t
1844 {
1845   rtx base;  /* Register holding base addr of buffer.  */
1846   rtx ptr;  /* Iteration var, if needed.  */
1847   unsigned offset; /* Offset into worker buffer.  */
1848 };
1849 
1850 /* Direction of the spill/fill and looping setup/teardown indicator.  */
1851 
1852 enum propagate_mask
1853   {
1854     PM_read = 1 << 0,
1855     PM_write = 1 << 1,
1856     PM_loop_begin = 1 << 2,
1857     PM_loop_end = 1 << 3,
1858 
1859     PM_read_write = PM_read | PM_write
1860   };
1861 
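/* Note the direction convention: PM_read reads the register, spilling
   it into the broadcast buffer, while PM_write writes the register,
   filling it from the buffer.  */
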
1862 /* Generate instruction(s) to spill or fill register REG to/from the
1863    worker broadcast array.  PM indicates what is to be done, REP
1864    how many loop iterations will be executed (0 for not a loop).  */
1865 
1866 static rtx
1867 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
1868 			broadcast_data_t *data, bool vector)
1869 {
1870   rtx res;
1871   machine_mode mode = GET_MODE (reg);
1872 
1873   switch (mode)
1874     {
1875     case E_BImode:
1876       {
1877 	rtx tmp = gen_reg_rtx (SImode);
1878 
1879 	start_sequence ();
1880 	if (pm & PM_read)
1881 	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
1882 	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
1883 	if (pm & PM_write)
1884 	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
1885 	res = get_insns ();
1886 	end_sequence ();
1887       }
1888       break;
1889 
1890     default:
1891       {
1892 	rtx addr = data->ptr;
1893 
1894 	if (!addr)
1895 	  {
1896 	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
1897 
1898 	    oacc_bcast_align = MAX (oacc_bcast_align, align);
1899 	    data->offset = ROUND_UP (data->offset, align);
1900 	    addr = data->base;
1901 	    gcc_assert (data->base != NULL);
1902 	    if (data->offset)
1903 	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
1904 	  }
1905 
1906 	addr = gen_rtx_MEM (mode, addr);
1907 	if (pm == PM_read)
1908 	  res = gen_rtx_SET (addr, reg);
1909 	else if (pm == PM_write)
1910 	  res = gen_rtx_SET (reg, addr);
1911 	else
1912 	  gcc_unreachable ();
1913 
1914 	if (data->ptr)
1915 	  {
1916 	    /* We're using a ptr, increment it.  */
1917 	    start_sequence ();
1918 
1919 	    emit_insn (res);
1920 	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
1921 				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
1922 	    res = get_insns ();
1923 	    end_sequence ();
1924 	  }
1925 	else
1926 	  rep = 1;
1927 	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
1928       }
1929       break;
1930     }
1931   return res;
1932 }
1933 
1934 /* Returns true if X is a valid address for use in a memory reference.  */
1935 
1936 static bool
1937 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
1938 {
1939   enum rtx_code code = GET_CODE (x);
1940 
1941   switch (code)
1942     {
1943     case REG:
1944       return true;
1945 
1946     case PLUS:
1947       if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
1948 	return true;
1949       return false;
1950 
1951     case CONST:
1952     case SYMBOL_REF:
1953     case LABEL_REF:
1954       return true;
1955 
1956     default:
1957       return false;
1958     }
1959 }
1960 
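/* Thus, for example, addresses of the forms

     (reg:DI 23)
     (plus:DI (reg:DI 23) (const_int 16))
     (symbol_ref:DI ("x"))

   are accepted, matching PTX's [reg], [reg+offset] and [var]
   addressing; anything else (auto-increment, scaled index, ...) is
   rejected.  */
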
1961 /* Machinery to output constant initializers.  When beginning an
1962    initializer, we decide on a fragment size (which is visible in ptx
1963    in the type used), and then all initializer data is buffered until
1964    a fragment is filled and ready to be written out.  */
1965 
1966 static struct
1967 {
1968   unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
1969   unsigned HOST_WIDE_INT val; /* Current fragment value.  */
1970   unsigned HOST_WIDE_INT remaining; /*  Remaining bytes to be written
1971 					out.  */
1972   unsigned size;  /* Fragment size to accumulate.  */
1973   unsigned offset;  /* Offset within current fragment.  */
1974   bool started;   /* Whether we've output any initializer.  */
1975 } init_frag;
1976 
1977 /* The current fragment is full,  write it out.  SYM may provide a
1978    symbolic reference we should output,  in which case the fragment
1979    value is the addend.  */
1980 
1981 static void
1982 output_init_frag (rtx sym)
1983 {
1984   fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
1985   unsigned HOST_WIDE_INT val = init_frag.val;
1986 
1987   init_frag.started = true;
1988   init_frag.val = 0;
1989   init_frag.offset = 0;
1990   init_frag.remaining--;
1991 
1992   if (sym)
1993     {
1994       bool function = (SYMBOL_REF_DECL (sym)
1995 		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
1996       if (!function)
1997 	fprintf (asm_out_file, "generic(");
1998       output_address (VOIDmode, sym);
1999       if (!function)
2000 	fprintf (asm_out_file, ")");
2001       if (val)
2002 	fprintf (asm_out_file, " + ");
2003     }
2004 
2005   if (!sym || val)
2006     fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2007 }
2008 
2009 /* Add value VAL of size SIZE to the data we're emitting, and keep
2010    writing out chunks as they fill up.  */
2011 
2012 static void
2013 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2014 {
2015   val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1;
2016 
2017   for (unsigned part = 0; size; size -= part)
2018     {
2019       val >>= part * BITS_PER_UNIT;
2020       part = init_frag.size - init_frag.offset;
2021       part = MIN (part, size);
2022 
2023       unsigned HOST_WIDE_INT partial
2024 	= val << (init_frag.offset * BITS_PER_UNIT);
2025       init_frag.val |= partial & init_frag.mask;
2026       init_frag.offset += part;
2027 
2028       if (init_frag.offset == init_frag.size)
2029 	output_init_frag (NULL);
2030     }
2031 }
2032 
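/* As a worked example (assuming a fragment size of 4): assembling the
   two 16-bit values 0x1234 and 0x5678, in that order, accumulates
   0x1234 at offset 0 and 0x5678 at offset 2, so init_frag.val becomes
   0x56781234 and is flushed as a single 32-bit initializer element.  */
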
2033 /* Target hook for assembling integer object X of size SIZE.  */
2034 
2035 static bool
2036 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2037 {
2038   HOST_WIDE_INT val = 0;
2039 
2040   switch (GET_CODE (x))
2041     {
2042     default:
2043       /* Let the generic machinery figure it out, usually for a
2044 	 CONST_WIDE_INT.  */
2045       return false;
2046 
2047     case CONST_INT:
2048       nvptx_assemble_value (INTVAL (x), size);
2049       break;
2050 
2051     case CONST:
2052       x = XEXP (x, 0);
2053       gcc_assert (GET_CODE (x) == PLUS);
2054       val = INTVAL (XEXP (x, 1));
2055       x = XEXP (x, 0);
2056       gcc_assert (GET_CODE (x) == SYMBOL_REF);
2057       /* FALLTHROUGH */
2058 
2059     case SYMBOL_REF:
2060       gcc_assert (size == init_frag.size);
2061       if (init_frag.offset)
2062 	sorry ("cannot emit unaligned pointers in ptx assembly");
2063 
2064       nvptx_maybe_record_fnsym (x);
2065       init_frag.val = val;
2066       output_init_frag (x);
2067       break;
2068     }
2069 
2070   return true;
2071 }
2072 
2073 /* Output SIZE zero bytes.  We ignore the FILE argument since the
2074    functions we're calling to perform the output just use
2075    asm_out_file.  */
2076 
2077 void
2078 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2079 {
2080   /* Finish the current fragment, if it's started.  */
2081   if (init_frag.offset)
2082     {
2083       unsigned part = init_frag.size - init_frag.offset;
2084       part = MIN (part, (unsigned)size);
2085       size -= part;
2086       nvptx_assemble_value (0, part);
2087     }
2088 
2089   /* If this skip doesn't terminate the initializer, write as many
2090      remaining pieces as possible directly.  */
2091   if (size < init_frag.remaining * init_frag.size)
2092     {
2093       while (size >= init_frag.size)
2094 	{
2095 	  size -= init_frag.size;
2096 	  output_init_frag (NULL_RTX);
2097 	}
2098       if (size)
2099 	nvptx_assemble_value (0, size);
2100     }
2101 }
2102 
2103 /* Output a string STR with length SIZE.  As in nvptx_output_skip we
2104    ignore the FILE arg.  */
2105 
2106 void
2107 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2108 {
2109   for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2110     nvptx_assemble_value (str[i], 1);
2111 }
2112 
2113 /* Return true if TYPE is a record type whose last field is an array
2114    without a given dimension, i.e. a flexible array member.  */
2115 
2116 static bool
2117 flexible_array_member_type_p (const_tree type)
2118 {
2119   if (TREE_CODE (type) != RECORD_TYPE)
2120     return false;
2121 
2122   const_tree last_field = NULL_TREE;
2123   for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2124     last_field = f;
2125 
2126   if (!last_field)
2127     return false;
2128 
2129   const_tree last_field_type = TREE_TYPE (last_field);
2130   if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2131     return false;
2132 
2133   return (! TYPE_DOMAIN (last_field_type)
2134 	  || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2135 }
2136 
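/* For example, a record such as

     struct line { int length; char contents[]; };

   is matched by the predicate above: its trailing array type has no
   upper bound in its domain.  */
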
2137 /* Emit a PTX variable decl and prepare for emission of its
2138    initializer.  NAME is the symbol name and SECTION the PTX data
2139    area.  The type is TYPE, the object size is SIZE and the alignment
2140    is ALIGN.  The caller has already emitted any indentation and
2141    linkage specifier.  It is responsible for any initializer,
2142    terminating ; and newline.  SIZE is in bytes, ALIGN is in bits --
2143    confusingly this is the opposite way round from what PTX wants!  */
2144 
2145 static void
2146 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2147 			   const_tree type, HOST_WIDE_INT size, unsigned align,
2148 			   bool undefined = false)
2149 {
2150   bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2151     && (TYPE_DOMAIN (type) == NULL_TREE);
2152 
2153   if (undefined && flexible_array_member_type_p (type))
2154     {
2155       size = 0;
2156       atype = true;
2157     }
2158 
2159   while (TREE_CODE (type) == ARRAY_TYPE)
2160     type = TREE_TYPE (type);
2161 
2162   if (TREE_CODE (type) == VECTOR_TYPE
2163       || TREE_CODE (type) == COMPLEX_TYPE)
2164     /* Neither vector nor complex types can contain the other.  */
2165     type = TREE_TYPE (type);
2166 
2167   unsigned elt_size = int_size_in_bytes (type);
2168 
2169   /* Largest mode we're prepared to accept.  For BLKmode types we
2170      don't know if it'll contain pointer constants, so have to choose
2171      pointer size, otherwise we can choose DImode.  */
2172   machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2173 
2174   elt_size |= GET_MODE_SIZE (elt_mode);
2175   elt_size &= -elt_size; /* Extract LSB set.  */
2176 
2177   init_frag.size = elt_size;
2178   /* Avoid undefined shift behavior by using '2'.  */
2179   init_frag.mask = ((unsigned HOST_WIDE_INT)2
2180 		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
2181   init_frag.val = 0;
2182   init_frag.offset = 0;
2183   init_frag.started = false;
2184   /* Size might not be a multiple of elt size, if there's an
2185      initialized trailing struct array with smaller type than
2186      elt_size. */
2187   init_frag.remaining = (size + elt_size - 1) / elt_size;
2188 
2189   fprintf (file, "%s .align %d .u%d ",
2190 	   section, align / BITS_PER_UNIT,
2191 	   elt_size * BITS_PER_UNIT);
2192   assemble_name (file, name);
2193 
2194   if (size)
2195     /* We make everything an array, to simplify any initialization
2196        emission.  */
2197     fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining);
2198   else if (atype)
2199     fprintf (file, "[]");
2200 }
2201 
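/* As an illustration, a 16-byte, 64-bit-aligned initialized global
   might begin as (a sketch; the section string and sizes depend on
   the decl):

     .global .align 8 .u64 name[2]

   with any initializer appended afterwards as " = { a, b };" by the
   fragment machinery above.  */
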
2202 /* Called when the initializer for a decl has been completely output through
2203    combinations of the three functions above.  */
2204 
2205 static void
2206 nvptx_assemble_decl_end (void)
2207 {
2208   if (init_frag.offset)
2209     /* This can happen with a packed struct with trailing array member.  */
2210     nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2211   fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2212 }
2213 
2214 /* Output an uninitialized common or file-scope variable.  */
2215 
2216 void
2217 nvptx_output_aligned_decl (FILE *file, const char *name,
2218 			   const_tree decl, HOST_WIDE_INT size, unsigned align)
2219 {
2220   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2221 
2222   /* If this is public, it is common.  The nearest thing we have to
2223      common is weak.  */
2224   fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2225 
2226   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2227 			     TREE_TYPE (decl), size, align);
2228   nvptx_assemble_decl_end ();
2229 }
2230 
2231 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
2232    writing a constant variable EXP with NAME and SIZE and its
2233    initializer to FILE.  */
2234 
2235 static void
2236 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2237 				 const_tree exp, HOST_WIDE_INT obj_size)
2238 {
2239   write_var_marker (file, true, false, name);
2240 
2241   fprintf (file, "\t");
2242 
2243   tree type = TREE_TYPE (exp);
2244   nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2245 			     TYPE_ALIGN (type));
2246 }
2247 
2248 /* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
2249    a variable DECL with NAME to FILE.  */
2250 
2251 void
2252 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2253 {
2254   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2255 
2256   fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2257 			  : DECL_WEAK (decl) ? ".weak " : ".visible "));
2258 
2259   tree type = TREE_TYPE (decl);
2260   HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2261   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2262 			     type, obj_size, DECL_ALIGN (decl));
2263 }
2264 
2265 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */
2266 
2267 static void
2268 nvptx_globalize_label (FILE *, const char *)
2269 {
2270 }
2271 
2272 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
2273    declaration only for variable DECL with NAME to FILE.  */
2274 
2275 static void
2276 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2277 {
2278   /* The middle end can place constant pool decls into the varpool as
2279      undefined.  Until that is fixed, catch the problem here.  */
2280   if (DECL_IN_CONSTANT_POOL (decl))
2281     return;
2282 
2283   /* We support weak definitions, and hence have the right
2284      ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
2285   if (DECL_WEAK (decl))
2286     error_at (DECL_SOURCE_LOCATION (decl),
2287 	      "PTX does not support weak declarations"
2288 	      " (only weak definitions)");
2289   write_var_marker (file, false, TREE_PUBLIC (decl), name);
2290 
2291   fprintf (file, "\t.extern ");
2292   tree size = DECL_SIZE_UNIT (decl);
2293   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2294 			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2295 			     DECL_ALIGN (decl), true);
2296   nvptx_assemble_decl_end ();
2297 }
2298 
2299 /* Output a pattern for a move instruction.  */
2300 
2301 const char *
2302 nvptx_output_mov_insn (rtx dst, rtx src)
2303 {
2304   machine_mode dst_mode = GET_MODE (dst);
2305   machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2306 			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
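  /* For a non-SUBREG source (e.g. a constant, whose mode may be
     VOIDmode), assume the destination's inner mode.  */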
2307   machine_mode src_inner = (GET_CODE (src) == SUBREG
2308 			    ? GET_MODE (XEXP (src, 0)) : dst_mode);
2309 
2310   rtx sym = src;
2311   if (GET_CODE (sym) == CONST)
2312     sym = XEXP (XEXP (sym, 0), 0);
2313   if (SYMBOL_REF_P (sym))
2314     {
2315       if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2316 	return "%.\tcvta%D1%t0\t%0, %1;";
2317       nvptx_maybe_record_fnsym (sym);
2318     }
2319 
2320   if (src_inner == dst_inner)
2321     return "%.\tmov%t0\t%0, %1;";
2322 
2323   if (CONSTANT_P (src))
2324     return (GET_MODE_CLASS (dst_inner) == MODE_INT
2325 	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2326 	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2327 
2328   if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2329     {
2330       if (GET_MODE_BITSIZE (dst_mode) == 128
2331 	  && GET_MODE_BITSIZE (GET_MODE (src)) == 128)
2332 	{
2333 	  /* mov.b128 is not supported.  */
2334 	  if (dst_inner == V2DImode && src_inner == TImode)
2335 	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2336 	  else if (dst_inner == TImode && src_inner == V2DImode)
2337 	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2338 
2339 	  gcc_unreachable ();
2340 	}
2341       return "%.\tmov.b%T0\t%0, %1;";
2342     }
2343 
2344   return "%.\tcvt%t0%t1\t%0, %1;";
2345 }
2346 
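/* After operand substitution the templates above become PTX such as
   (a sketch, with invented register numbers):

     mov.u32	%r1, %r2;	// same inner mode
     mov.b64	%r3, %r4;	// same size, different mode class
     cvt.u64.u32	%r3, %r1;	// size-changing move
   */
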
2347 static void nvptx_print_operand (FILE *, rtx, int);
2348 
2349 /* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
2350    involves writing .param declarations and in/out copies into them.  For
2351    indirect calls, also write the .callprototype.  */
2352 
2353 const char *
2354 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2355 {
2356   char buf[16];
2357   static int labelno;
2358   bool needs_tgt = register_operand (callee, Pmode);
2359   rtx pat = PATTERN (insn);
2360   if (GET_CODE (pat) == COND_EXEC)
2361     pat = COND_EXEC_CODE (pat);
2362   int arg_end = XVECLEN (pat, 0);
2363   tree decl = NULL_TREE;
2364 
2365   fprintf (asm_out_file, "\t{\n");
2366   if (result != NULL)
2367     fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2368 	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
2369 	     reg_names[NVPTX_RETURN_REGNUM]);
2370 
2371   /* Ensure we have a ptx declaration in the output if necessary.  */
2372   if (GET_CODE (callee) == SYMBOL_REF)
2373     {
2374       decl = SYMBOL_REF_DECL (callee);
2375       if (!decl
2376 	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2377 	nvptx_record_libfunc (callee, result, pat);
2378       else if (DECL_EXTERNAL (decl))
2379 	nvptx_record_fndecl (decl);
2380     }
2381 
2382   if (needs_tgt)
2383     {
2384       ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2385       labelno++;
2386       ASM_OUTPUT_LABEL (asm_out_file, buf);
2387       std::stringstream s;
2388       write_fn_proto_from_insn (s, NULL, result, pat);
2389       fputs (s.str().c_str(), asm_out_file);
2390     }
2391 
2392   for (int argno = 1; argno < arg_end; argno++)
2393     {
2394       rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2395       machine_mode mode = GET_MODE (t);
2396       const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2397 
2398       /* Mode splitting has already been done.  */
2399       fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2400 	       "\t\tst.param%s [%%out_arg%d], ",
2401 	       ptx_type, argno, ptx_type, argno);
2402       output_reg (asm_out_file, REGNO (t), VOIDmode);
2403       fprintf (asm_out_file, ";\n");
2404     }
2405 
2406   /* The '.' stands for the call's predicate, if any.  */
2407   nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2408   fprintf (asm_out_file, "\t\tcall ");
2409   if (result != NULL_RTX)
2410     fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2411 
2412   if (decl)
2413     {
2414       const char *name = get_fnname_from_decl (decl);
2415       name = nvptx_name_replacement (name);
2416       assemble_name (asm_out_file, name);
2417     }
2418   else
2419     output_address (VOIDmode, callee);
2420 
2421   const char *open = "(";
2422   for (int argno = 1; argno < arg_end; argno++)
2423     {
2424       fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2425       open = "";
2426     }
2427   if (decl && DECL_STATIC_CHAIN (decl))
2428     {
2429       fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2430       open = "";
2431     }
2432   if (!open[0])
2433     fprintf (asm_out_file, ")");
2434 
2435   if (needs_tgt)
2436     {
2437       fprintf (asm_out_file, ", ");
2438       assemble_name (asm_out_file, buf);
2439     }
2440   fprintf (asm_out_file, ";\n");
2441 
2442   if (find_reg_note (insn, REG_NORETURN, NULL))
2443     {
2444       /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2445 	 the flow control barrier they imply.  It can seg fault if it
2446 	 encounters what looks like an unexitable loop.  Emit a trailing
2447 	 trap and exit, which it does grok.  */
2448       fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2449       fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2450     }
2451 
2452   if (result)
2453     {
2454       static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2455 
2456       if (!rval[0])
2457 	/* We must escape the '%' that starts RETURN_REGNUM.  */
2458 	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2459 		 reg_names[NVPTX_RETURN_REGNUM]);
2460       return rval;
2461     }
2462 
2463   return "}";
2464 }
2465 
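/* Taken together, a call with one argument and a 32-bit result prints
   roughly as (a sketch, with invented register numbers and assuming
   the return register prints as "%value"):

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32	%r24, [%value_in];
	}
   */
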
2466 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */
2467 
2468 static bool
2469 nvptx_print_operand_punct_valid_p (unsigned char c)
2470 {
2471   return c == '.' || c == '#';
2472 }
2473 
2474 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */
2475 
2476 static void
2477 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2478 {
2479   rtx off;
2480   if (GET_CODE (x) == CONST)
2481     x = XEXP (x, 0);
2482   switch (GET_CODE (x))
2483     {
2484     case PLUS:
2485       off = XEXP (x, 1);
2486       output_address (VOIDmode, XEXP (x, 0));
2487       fprintf (file, "+");
2488       output_address (VOIDmode, off);
2489       break;
2490 
2491     case SYMBOL_REF:
2492     case LABEL_REF:
2493       output_addr_const (file, x);
2494       break;
2495 
2496     default:
2497       gcc_assert (GET_CODE (x) != MEM);
2498       nvptx_print_operand (file, x, 0);
2499       break;
2500     }
2501 }
2502 
2503 /* Write assembly language output for the address ADDR to FILE.  */
2504 
2505 static void
2506 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2507 {
2508   nvptx_print_address_operand (file, addr, mode);
2509 }
2510 
2511 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2512 
2513    Meaning of CODE:
2514    . -- print the predicate for the instruction or an empty string for an
2515         unconditional one.
2516    # -- print a rounding mode for the instruction
2517 
2518    A -- print a data area for a MEM
2519    c -- print an opcode suffix for a comparison operator, including a type code
2520    D -- print a data area for a MEM operand
2521    S -- print a shuffle kind specified by CONST_INT
2522    t -- print a type opcode suffix, promoting QImode to 32 bits
2523    T -- print a type size in bits
2524    u -- print a type opcode suffix without promotions
   H -- print the high part of a register holding a split double-word mode
   L -- print the low part of such a register
   j -- print the operand prefixed with "@", as an instruction predicate
   J -- print the operand prefixed with "@!", as a negated predicate.  */
2525 
2526 static void
2527 nvptx_print_operand (FILE *file, rtx x, int code)
2528 {
2529   if (code == '.')
2530     {
2531       x = current_insn_predicate;
2532       if (x)
2533 	{
2534 	  fputs ("@", file);
2535 	  if (GET_CODE (x) == EQ)
2536 	    fputs ("!", file);
2537 	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2538 	}
2539       return;
2540     }
2541   else if (code == '#')
2542     {
2543       fputs (".rn", file);
2544       return;
2545     }
2546 
2547   enum rtx_code x_code = GET_CODE (x);
2548   machine_mode mode = GET_MODE (x);
2549 
2550   switch (code)
2551     {
2552     case 'A':
2553       x = XEXP (x, 0);
2554       /* FALLTHROUGH.  */
2555 
2556     case 'D':
2557       if (GET_CODE (x) == CONST)
2558 	x = XEXP (x, 0);
2559       if (GET_CODE (x) == PLUS)
2560 	x = XEXP (x, 0);
2561 
2562       if (GET_CODE (x) == SYMBOL_REF)
2563 	fputs (section_for_sym (x), file);
2564       break;
2565 
2566     case 't':
2567     case 'u':
2568       if (x_code == SUBREG)
2569 	{
2570 	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2571 	  if (VECTOR_MODE_P (inner_mode)
2572 	      && (GET_MODE_SIZE (mode)
2573 		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2574 	    mode = GET_MODE_INNER (inner_mode);
2575 	  else if (split_mode_p (inner_mode))
2576 	    mode = maybe_split_mode (inner_mode);
2577 	  else
2578 	    mode = inner_mode;
2579 	}
2580       fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2581       break;
2582 
2583     case 'H':
2584     case 'L':
2585       {
2586 	rtx inner_x = SUBREG_REG (x);
2587 	machine_mode inner_mode = GET_MODE (inner_x);
2588 	machine_mode split = maybe_split_mode (inner_mode);
2589 
2590 	output_reg (file, REGNO (inner_x), split,
2591 		    (code == 'H'
2592 		     ? GET_MODE_SIZE (inner_mode) / 2
2593 		     : 0));
2594       }
2595       break;
2596 
2597     case 'S':
2598       {
2599 	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2600 	/* Same order as nvptx_shuffle_kind.  */
2601 	static const char *const kinds[] =
2602 	  {".up", ".down", ".bfly", ".idx"};
2603 	fputs (kinds[kind], file);
2604       }
2605       break;
2606 
2607     case 'T':
2608       fprintf (file, "%d", GET_MODE_BITSIZE (mode));
2609       break;
2610 
2611     case 'j':
2612       fprintf (file, "@");
2613       goto common;
2614 
2615     case 'J':
2616       fprintf (file, "@!");
2617       goto common;
2618 
2619     case 'c':
2620       mode = GET_MODE (XEXP (x, 0));
2621       switch (x_code)
2622 	{
2623 	case EQ:
2624 	  fputs (".eq", file);
2625 	  break;
2626 	case NE:
2627 	  if (FLOAT_MODE_P (mode))
2628 	    fputs (".neu", file);
2629 	  else
2630 	    fputs (".ne", file);
2631 	  break;
2632 	case LE:
2633 	case LEU:
2634 	  fputs (".le", file);
2635 	  break;
2636 	case GE:
2637 	case GEU:
2638 	  fputs (".ge", file);
2639 	  break;
2640 	case LT:
2641 	case LTU:
2642 	  fputs (".lt", file);
2643 	  break;
2644 	case GT:
2645 	case GTU:
2646 	  fputs (".gt", file);
2647 	  break;
2648 	case LTGT:
2649 	  fputs (".ne", file);
2650 	  break;
2651 	case UNEQ:
2652 	  fputs (".equ", file);
2653 	  break;
2654 	case UNLE:
2655 	  fputs (".leu", file);
2656 	  break;
2657 	case UNGE:
2658 	  fputs (".geu", file);
2659 	  break;
2660 	case UNLT:
2661 	  fputs (".ltu", file);
2662 	  break;
2663 	case UNGT:
2664 	  fputs (".gtu", file);
2665 	  break;
2666 	case UNORDERED:
2667 	  fputs (".nan", file);
2668 	  break;
2669 	case ORDERED:
2670 	  fputs (".num", file);
2671 	  break;
2672 	default:
2673 	  gcc_unreachable ();
2674 	}
2675       if (FLOAT_MODE_P (mode)
2676 	  || x_code == EQ || x_code == NE
2677 	  || x_code == GEU || x_code == GTU
2678 	  || x_code == LEU || x_code == LTU)
2679 	fputs (nvptx_ptx_type_from_mode (mode, true), file);
2680       else
2681 	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
2682       break;
2683     default:
2684     common:
2685       switch (x_code)
2686 	{
2687 	case SUBREG:
2688 	  {
2689 	    rtx inner_x = SUBREG_REG (x);
2690 	    machine_mode inner_mode = GET_MODE (inner_x);
2691 	    machine_mode split = maybe_split_mode (inner_mode);
2692 
2693 	    if (VECTOR_MODE_P (inner_mode)
2694 		&& (GET_MODE_SIZE (mode)
2695 		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2696 	      {
2697 		output_reg (file, REGNO (inner_x), VOIDmode);
2698 		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
2699 	      }
2700 	    else if (split_mode_p (inner_mode)
2701 		&& (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
2702 	      output_reg (file, REGNO (inner_x), split);
2703 	    else
2704 	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
2705 	  }
2706 	  break;
2707 
2708 	case REG:
2709 	  output_reg (file, REGNO (x), maybe_split_mode (mode));
2710 	  break;
2711 
2712 	case MEM:
2713 	  fputc ('[', file);
2714 	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
2715 	  fputc (']', file);
2716 	  break;
2717 
2718 	case CONST_INT:
2719 	  output_addr_const (file, x);
2720 	  break;
2721 
2722 	case CONST:
2723 	case SYMBOL_REF:
2724 	case LABEL_REF:
2725 	  /* We could use output_addr_const, but that can print things like
2726 	     "x-8", which breaks ptxas.  Need to ensure it is output as
2727 	     "x+-8".  */
2728 	  nvptx_print_address_operand (file, x, VOIDmode);
2729 	  break;
2730 
2731 	case CONST_DOUBLE:
2732 	  long vals[2];
2733 	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
2734 	  vals[0] &= 0xffffffff;
2735 	  vals[1] &= 0xffffffff;
2736 	  if (mode == SFmode)
2737 	    fprintf (file, "0f%08lx", vals[0]);
2738 	  else
2739 	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
2740 	  break;
2741 
2742 	case CONST_VECTOR:
2743 	  {
2744 	    unsigned n = CONST_VECTOR_NUNITS (x);
2745 	    fprintf (file, "{ ");
2746 	    for (unsigned i = 0; i < n; ++i)
2747 	      {
2748 		if (i != 0)
2749 		  fprintf (file, ", ");
2750 
2751 		rtx elem = CONST_VECTOR_ELT (x, i);
2752 		output_addr_const (file, elem);
2753 	      }
2754 	    fprintf (file, " }");
2755 	  }
2756 	  break;
2757 
2758 	default:
2759 	  output_addr_const (file, x);
2760 	}
2761     }
2762 }
2763 
2764 /* Record replacement regs used to deal with subreg operands.  */
2765 struct reg_replace
2766 {
2767   rtx replacement[MAX_RECOG_OPERANDS];
2768   machine_mode mode;
2769   int n_allocated;
2770   int n_in_use;
2771 };
2772 
2773 /* Allocate or reuse a replacement in R and return the rtx.  */
2774 
2775 static rtx
2776 get_replacement (struct reg_replace *r)
2777 {
2778   if (r->n_allocated == r->n_in_use)
2779     r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
2780   return r->replacement[r->n_in_use++];
2781 }
2782 
2783 /* Clean up subreg operands.  In ptx assembly, everything is typed, and
2784    the presence of subregs would break the rules for most instructions.
2785    Replace them with a suitable new register of the right size, plus
2786    conversion copyin/copyout instructions.  */
2787 
2788 static void
2789 nvptx_reorg_subreg (void)
2790 {
2791   struct reg_replace qiregs, hiregs, siregs, diregs;
2792   rtx_insn *insn, *next;
2793 
2794   qiregs.n_allocated = 0;
2795   hiregs.n_allocated = 0;
2796   siregs.n_allocated = 0;
2797   diregs.n_allocated = 0;
2798   qiregs.mode = QImode;
2799   hiregs.mode = HImode;
2800   siregs.mode = SImode;
2801   diregs.mode = DImode;
2802 
2803   for (insn = get_insns (); insn; insn = next)
2804     {
2805       next = NEXT_INSN (insn);
2806       if (!NONDEBUG_INSN_P (insn)
2807 	  || asm_noperands (PATTERN (insn)) >= 0
2808 	  || GET_CODE (PATTERN (insn)) == USE
2809 	  || GET_CODE (PATTERN (insn)) == CLOBBER)
2810 	continue;
2811 
2812       qiregs.n_in_use = 0;
2813       hiregs.n_in_use = 0;
2814       siregs.n_in_use = 0;
2815       diregs.n_in_use = 0;
2816       extract_insn (insn);
2817       enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
2818 
2819       for (int i = 0; i < recog_data.n_operands; i++)
2820 	{
2821 	  rtx op = recog_data.operand[i];
2822 	  if (GET_CODE (op) != SUBREG)
2823 	    continue;
2824 
2825 	  rtx inner = SUBREG_REG (op);
2826 
2827 	  machine_mode outer_mode = GET_MODE (op);
2828 	  machine_mode inner_mode = GET_MODE (inner);
2829 	  gcc_assert (s_ok);
2830 	  if (s_ok
2831 	      && (GET_MODE_PRECISION (inner_mode)
2832 		  >= GET_MODE_PRECISION (outer_mode)))
2833 	    continue;
2834 	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
2835 	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
2836 				   : outer_mode == HImode ? &hiregs
2837 				   : outer_mode == SImode ? &siregs
2838 				   : &diregs);
2839 	  rtx new_reg = get_replacement (r);
2840 
2841 	  if (recog_data.operand_type[i] != OP_OUT)
2842 	    {
2843 	      enum rtx_code code;
2844 	      if (GET_MODE_PRECISION (inner_mode)
2845 		  < GET_MODE_PRECISION (outer_mode))
2846 		code = ZERO_EXTEND;
2847 	      else
2848 		code = TRUNCATE;
2849 
2850 	      rtx pat = gen_rtx_SET (new_reg,
2851 				     gen_rtx_fmt_e (code, outer_mode, inner));
2852 	      emit_insn_before (pat, insn);
2853 	    }
2854 
2855 	  if (recog_data.operand_type[i] != OP_IN)
2856 	    {
2857 	      enum rtx_code code;
2858 	      if (GET_MODE_PRECISION (inner_mode)
2859 		  < GET_MODE_PRECISION (outer_mode))
2860 		code = TRUNCATE;
2861 	      else
2862 		code = ZERO_EXTEND;
2863 
2864 	      rtx pat = gen_rtx_SET (inner,
2865 				     gen_rtx_fmt_e (code, inner_mode, new_reg));
2866 	      emit_insn_after (pat, insn);
2867 	    }
2868 	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
2869 	}
2870     }
2871 }
2872 
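/* For example (schematically), an operand such as

     (subreg:DI (reg:SI 23) 0)

   is replaced by a fresh DImode register; for inputs a

     (set (reg:DI new) (zero_extend:DI (reg:SI 23)))

   is emitted before the insn, and for outputs a truncating copy back
   into the SImode register is emitted after it.  */
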
2873 /* Return a SImode "master lane index" register for uniform-simt, allocating on
2874    first use.  */
2875 
2876 static rtx
2877 nvptx_get_unisimt_master ()
2878 {
2879   rtx &master = cfun->machine->unisimt_master;
2880   return master ? master : master = gen_reg_rtx (SImode);
2881 }
2882 
2883 /* Return a BImode "predicate" register for uniform-simt, similar to above.  */
2884 
2885 static rtx
2886 nvptx_get_unisimt_predicate ()
2887 {
2888   rtx &pred = cfun->machine->unisimt_predicate;
2889   return pred ? pred : pred = gen_reg_rtx (BImode);
2890 }
2891 
2892 /* Return true if given call insn references one of the functions provided by
2893    the CUDA runtime: malloc, free, vprintf.  */
2894 
2895 static bool
2896 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
2897 {
2898   rtx pat = PATTERN (insn);
2899   gcc_checking_assert (GET_CODE (pat) == PARALLEL);
2900   pat = XVECEXP (pat, 0, 0);
2901   if (GET_CODE (pat) == SET)
2902     pat = SET_SRC (pat);
2903   gcc_checking_assert (GET_CODE (pat) == CALL
2904 		       && GET_CODE (XEXP (pat, 0)) == MEM);
2905   rtx addr = XEXP (XEXP (pat, 0), 0);
2906   if (GET_CODE (addr) != SYMBOL_REF)
2907     return false;
2908   const char *name = XSTR (addr, 0);
2909   /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
2910      references with forced assembler name refer to PTX syscalls.  For vprintf,
2911      accept both normal and forced-assembler-name references.  */
2912   return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
2913 	  || !strcmp (name, "*malloc")
2914 	  || !strcmp (name, "*free"));
2915 }
2916 
2917 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
2918    propagate its value from lane MASTER to current lane.  */
2919 
2920 static void
2921 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
2922 {
2923   rtx reg;
2924   if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set)))
2925     emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn);
2926 }
2927 
2928 /* Adjust code for uniform-simt code generation variant by making atomics and
2929    "syscalls" conditionally executed, and inserting shuffle-based propagation
2930    for registers being set.  */
2931 
2932 static void
2933 nvptx_reorg_uniform_simt ()
2934 {
2935   rtx_insn *insn, *next;
2936 
2937   for (insn = get_insns (); insn; insn = next)
2938     {
2939       next = NEXT_INSN (insn);
2940       if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
2941 	  && !(NONJUMP_INSN_P (insn)
2942 	       && GET_CODE (PATTERN (insn)) == PARALLEL
2943 	       && get_attr_atomic (insn)))
2944 	continue;
2945       rtx pat = PATTERN (insn);
2946       rtx master = nvptx_get_unisimt_master ();
2947       for (int i = 0; i < XVECLEN (pat, 0); i++)
2948 	nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
2949       rtx pred = nvptx_get_unisimt_predicate ();
2950       pred = gen_rtx_NE (BImode, pred, const0_rtx);
2951       pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
2952       validate_change (insn, &PATTERN (insn), pat, false);
2953     }
2954 }
2955 
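/* Schematically, an atomic insn

     (parallel [... atomic ...])

   is rewritten as

     (cond_exec (ne master-lane-predicate (const_int 0))
		(parallel [... atomic ...]))

   followed by warp shuffles that propagate each register the insn
   sets from the master lane to the other lanes.  */
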
2956 /* Offloading function attributes.  */
2957 
2958 struct offload_attrs
2959 {
2960   unsigned mask;
2961   int num_gangs;
2962   int num_workers;
2963   int vector_length;
2964 };
2965 
2966 /* Define entries for cfun->machine->axis_dim.  */
2967 
2968 #define MACH_VECTOR_LENGTH 0
2969 #define MACH_MAX_WORKERS 1
2970 
2971 static void populate_offload_attrs (offload_attrs *oa);
2972 
2973 static void
2974 init_axis_dim (void)
2975 {
2976   offload_attrs oa;
2977   int max_workers;
2978 
2979   populate_offload_attrs (&oa);
2980 
2981   if (oa.num_workers == 0)
2982     max_workers = PTX_CTA_SIZE / oa.vector_length;
2983   else
2984     max_workers = oa.num_workers;
2985 
2986   cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
2987   cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
2988   cfun->machine->axis_dim_init_p = true;
2989 }
2990 
2991 static int ATTRIBUTE_UNUSED
2992 nvptx_mach_max_workers ()
2993 {
2994   if (!cfun->machine->axis_dim_init_p)
2995     init_axis_dim ();
2996   return cfun->machine->axis_dim[MACH_MAX_WORKERS];
2997 }
2998 
2999 static int ATTRIBUTE_UNUSED
3000 nvptx_mach_vector_length ()
3001 {
3002   if (!cfun->machine->axis_dim_init_p)
3003     init_axis_dim ();
3004   return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3005 }
3006 
3007 /* Loop structure of the function.  The entire function is described as
3008    a NULL loop.  */
3009 
3010 struct parallel
3011 {
3012   /* Parent parallel.  */
3013   parallel *parent;
3014 
3015   /* Next sibling parallel.  */
3016   parallel *next;
3017 
3018   /* First child parallel.  */
3019   parallel *inner;
3020 
3021   /* Partitioning mask of the parallel.  */
3022   unsigned mask;
3023 
3024   /* Partitioning used within inner parallels. */
3025   unsigned inner_mask;
3026 
3027   /* Location of parallel forked and join.  The forked is the first
3028      block in the parallel and the join is the first block after
3029      the partition.  */
3030   basic_block forked_block;
3031   basic_block join_block;
3032 
3033   rtx_insn *forked_insn;
3034   rtx_insn *join_insn;
3035 
3036   rtx_insn *fork_insn;
3037   rtx_insn *joining_insn;
3038 
3039   /* Basic blocks in this parallel, but not in child parallels.  The
3040      FORKED and JOINING blocks are in the partition.  The FORK and JOIN
3041      blocks are not.  */
3042   auto_vec<basic_block> blocks;
3043 
3044 public:
3045   parallel (parallel *parent, unsigned mask);
3046   ~parallel ();
3047 };
3048 
3049 /* Constructor links the new parallel into its parent's chain of
3050    children.  */
3051 
3052 parallel::parallel (parallel *parent_, unsigned mask_)
3053   :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3054 {
3055   forked_block = join_block = 0;
3056   forked_insn = join_insn = 0;
3057   fork_insn = joining_insn = 0;
3058 
3059   if (parent)
3060     {
3061       next = parent->inner;
3062       parent->inner = this;
3063     }
3064 }
3065 
3066 parallel::~parallel ()
3067 {
3068   delete inner;
3069   delete next;
3070 }
3071 
3072 /* Map of basic blocks to insns.  */
3073 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3074 
3075 /* A tuple of an insn of interest and the BB in which it resides.  */
3076 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3077 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3078 
3079 /* Split basic blocks so that each forked and join unspec is at the
3080    start of its basic block.  Thus afterwards each block will have a
3081    single partitioning mode.  We also do the same for return insns,
3082    as they are executed by every thread.  Populate MAP with head and
3083    tail blocks.  We also clear the BB visited flag, which is used
3084    when finding partitions.  */
3086 
3087 static void
3088 nvptx_split_blocks (bb_insn_map_t *map)
3089 {
3090   insn_bb_vec_t worklist;
3091   basic_block block;
3092   rtx_insn *insn;
3093 
3094   /* Locate all the reorg instructions of interest.  */
3095   FOR_ALL_BB_FN (block, cfun)
3096     {
3097       bool seen_insn = false;
3098 
3099       /* Clear visited flag, for use by the parallel locator.  */
3100       block->flags &= ~BB_VISITED;
3101 
3102       FOR_BB_INSNS (block, insn)
3103 	{
3104 	  if (!INSN_P (insn))
3105 	    continue;
3106 	  switch (recog_memoized (insn))
3107 	    {
3108 	    default:
3109 	      seen_insn = true;
3110 	      continue;
3111 	    case CODE_FOR_nvptx_forked:
3112 	    case CODE_FOR_nvptx_join:
3113 	      break;
3114 
3115 	    case CODE_FOR_return:
3116 	      /* We also need to split just before return insns, as
3117 		 that insn needs executing by all threads, but the
3118 		 block it is in probably does not.  */
3119 	      break;
3120 	    }
3121 
3122 	  if (seen_insn)
3123 	    /* We've found an instruction that must be at the start of
3124 	       a block, but isn't.  Add it to the worklist.  */
3125 	    worklist.safe_push (insn_bb_t (insn, block));
3126 	  else
3127 	    /* It was already the first instruction.  Just add it to
3128 	       the map.  */
3129 	    map->get_or_insert (block) = insn;
3130 	  seen_insn = true;
3131 	}
3132     }
3133 
3134   /* Split blocks on the worklist.  */
3135   unsigned ix;
3136   insn_bb_t *elt;
3137   basic_block remap = 0;
3138   for (ix = 0; worklist.iterate (ix, &elt); ix++)
3139     {
3140       if (remap != elt->second)
3141 	{
3142 	  block = elt->second;
3143 	  remap = block;
3144 	}
3145 
3146       /* Split block before insn.  The insn is in the new block.  */
3147       edge e = split_block (block, PREV_INSN (elt->first));
3148 
3149       block = e->dest;
3150       map->get_or_insert (block) = elt->first;
3151     }
3152 }
3153 
3154 /* Return true if MASK contains parallelism that requires shared
3155    memory to broadcast.  */
3156 
3157 static bool
3158 nvptx_needs_shared_bcast (unsigned mask)
3159 {
3160   bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3161   bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3162     && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3163 
3164   return worker || large_vector;
3165 }
3166 
3167 /* BLOCK is a basic block containing a head or tail instruction.
3168    Locate the associated prehead or pretail instruction, which must be
3169    in the single predecessor block.  */
3170 
3171 static rtx_insn *
3172 nvptx_discover_pre (basic_block block, int expected)
3173 {
3174   gcc_assert (block->preds->length () == 1);
3175   basic_block pre_block = (*block->preds)[0]->src;
3176   rtx_insn *pre_insn;
3177 
3178   for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3179        pre_insn = PREV_INSN (pre_insn))
3180     gcc_assert (pre_insn != BB_HEAD (pre_block));
3181 
3182   gcc_assert (recog_memoized (pre_insn) == expected);
3183   return pre_insn;
3184 }
3185 
3186 /* Dump this parallel and all its inner parallels.  */
3187 
3188 static void
3189 nvptx_dump_pars (parallel *par, unsigned depth)
3190 {
3191   fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3192 	   depth, par->mask,
3193 	   par->forked_block ? par->forked_block->index : -1,
3194 	   par->join_block ? par->join_block->index : -1);
3195 
3196   fprintf (dump_file, "    blocks:");
3197 
3198   basic_block block;
3199   for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3200     fprintf (dump_file, " %d", block->index);
3201   fprintf (dump_file, "\n");
3202   if (par->inner)
3203     nvptx_dump_pars (par->inner, depth + 1);
3204 
3205   if (par->next)
3206     nvptx_dump_pars (par->next, depth);
3207 }
3208 
3209 /* If BLOCK contains a fork/join marker, process it to create or
3210    terminate a loop structure.  Add this block to the current loop,
3211    and then walk successor blocks.  */
3212 
3213 static parallel *
3214 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3215 {
3216   if (block->flags & BB_VISITED)
3217     return par;
3218   block->flags |= BB_VISITED;
3219 
3220   if (rtx_insn **endp = map->get (block))
3221     {
3222       rtx_insn *end = *endp;
3223 
3224       /* This is a block head or tail, or return instruction.  */
3225       switch (recog_memoized (end))
3226 	{
3227 	case CODE_FOR_return:
3228 	  /* Return instructions are in their own block, and we
3229 	     don't need to do anything more.  */
3230 	  return par;
3231 
3232 	case CODE_FOR_nvptx_forked:
3233 	  /* Loop head, create a new inner loop and add it into
3234 	     our parent's child list.  */
3235 	  {
3236 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3237 
3238 	    gcc_assert (mask);
3239 	    par = new parallel (par, mask);
3240 	    par->forked_block = block;
3241 	    par->forked_insn = end;
3242 	    if (nvptx_needs_shared_bcast (mask))
3243 	      par->fork_insn
3244 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3245 	  }
3246 	  break;
3247 
3248 	case CODE_FOR_nvptx_join:
3249 	  /* A loop tail.  Finish the current loop and return to
3250 	     parent.  */
3251 	  {
3252 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3253 
3254 	    gcc_assert (par->mask == mask);
3255 	    gcc_assert (par->join_block == NULL);
3256 	    par->join_block = block;
3257 	    par->join_insn = end;
3258 	    if (nvptx_needs_shared_bcast (mask))
3259 	      par->joining_insn
3260 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3261 	    par = par->parent;
3262 	  }
3263 	  break;
3264 
3265 	default:
3266 	  gcc_unreachable ();
3267 	}
3268     }
3269 
3270   if (par)
3271     /* Add this block onto the current loop's list of blocks.  */
3272     par->blocks.safe_push (block);
3273   else
3274     /* This must be the entry block.  Create a NULL parallel.  */
3275     par = new parallel (0, 0);
3276 
3277   /* Walk successor blocks.  */
3278   edge e;
3279   edge_iterator ei;
3280 
3281   FOR_EACH_EDGE (e, ei, block->succs)
3282     nvptx_find_par (map, par, e->dest);
3283 
3284   return par;
3285 }
3286 
3287 /* DFS walk the CFG looking for fork & join markers.  Construct
3288    loop structures as we go.  MAP is a mapping of basic blocks
3289    to head & tail markers, discovered when splitting blocks.  This
3290    speeds up the discovery.  We rely on the BB visited flag having
3291    been cleared when splitting blocks.  */
3292 
3293 static parallel *
3294 nvptx_discover_pars (bb_insn_map_t *map)
3295 {
3296   basic_block block;
3297 
3298   /* Mark exit blocks as visited.  */
3299   block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3300   block->flags |= BB_VISITED;
3301 
3302   /* And entry block as not.  */
3303   block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3304   block->flags &= ~BB_VISITED;
3305 
3306   parallel *par = nvptx_find_par (map, 0, block);
3307 
3308   if (dump_file)
3309     {
3310       fprintf (dump_file, "\nLoops\n");
3311       nvptx_dump_pars (par, 0);
3312       fprintf (dump_file, "\n");
3313     }
3314 
3315   return par;
3316 }
3317 
3318 /* Analyse a group of BBs within a partitioned region and create N
3319    Single-Entry-Single-Exit regions.  Some of those regions will be
3320    trivial ones consisting of a single BB.  The blocks of a
3321    partitioned region might form a set of disjoint graphs -- because
3322    the region encloses a differently partitioned subregion.
3323 
3324    We use the linear time algorithm described in 'Finding Regions Fast:
3325    Single Entry Single Exit and Control Regions in Linear Time'
3326    Johnson, Pearson & Pingali.  That algorithm deals with complete
3327    CFGs, where a back edge is inserted from END to START, and thus the
3328    problem becomes one of finding equivalent loops.
3329 
3330    In this case we have a partial CFG.  We complete it by redirecting
3331    any incoming edge to the graph to be from an arbitrary external BB,
3332    and similarly redirecting any outgoing edge to be to that BB.
3333    Thus we end up with a closed graph.
3334 
3335    The algorithm works by building a spanning tree of an undirected
3336    graph and keeping track of back edges from nodes further from the
3337    root in the tree to nodes nearer to the root in the tree.  In the
3338    description below, the root is up and the tree grows downwards.
3339 
3340    We avoid having to deal with degenerate back-edges to the same
3341    block, by splitting each BB into 3 -- one for input edges, one for
3342    the node itself and one for the output edges.  Such back edges are
3343    referred to as 'Brackets'.  Cycle equivalent nodes will have the
3344    same set of brackets.
3345 
3346    Determining bracket equivalency is done by maintaining a list of
3347    brackets in such a manner that the list length and final bracket
3348    uniquely identify the set.
3349 
3350    We use coloring to mark all BBs with cycle equivalency with the
3351    same color.  This is the output of the 'Finding Regions Fast'
3352    algorithm.  Notice it doesn't actually find the set of nodes within
3353    a particular region, just unordered sets of nodes that are the
3354    entries and exits of SESE regions.
3355 
3356    After determining cycle equivalency, we need to find the minimal
3357    set of SESE regions.  Do this with a DFS coloring walk of the
3358    complete graph.  We're either 'looking' or 'coloring'.  When
3359    looking, and we're in the subgraph, we start coloring the color of
3360    the current node, and remember that node as the start of the
3361    current color's SESE region.  Every time we go to a new node, we
3362    decrement the count of nodes with that color.  If it reaches zero,
3363    we remember that node as the end of the current color's SESE region
3364    and return to 'looking'.  Otherwise we color the node the current
3365    color.
3366 
3367    This way we end up with coloring the inside of non-trivial SESE
3368    regions with the color of that region.  */
3369 
3370 /* A pair of BBs.  We use this to represent SESE regions.  */
3371 typedef std::pair<basic_block, basic_block> bb_pair_t;
3372 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3373 
3374 /* A node in the undirected CFG.  The discriminator SECOND indicates just
3375    above or just below the BB indicated by FIRST.  */
3376 typedef std::pair<basic_block, int> pseudo_node_t;
3377 
3378 /* A bracket indicates an edge towards the root of the spanning tree of the
3379    undirected graph.  Each bracket has a color, determined
3380    from the current set of brackets.  */
3381 struct bracket
3382 {
3383   pseudo_node_t back; /* Back target */
3384 
3385   /* Current color and size of set.  */
3386   unsigned color;
3387   unsigned size;
3388 
3389   bracket (pseudo_node_t back_)
3390   : back (back_), color (~0u), size (~0u)
3391   {
3392   }
3393 
3394   unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3395   {
3396     if (length != size)
3397       {
3398 	size = length;
3399 	color = color_counts.length ();
3400 	color_counts.quick_push (0);
3401       }
3402     color_counts[color]++;
3403     return color;
3404   }
3405 };
3406 
3407 typedef auto_vec<bracket> bracket_vec_t;
3408 
3409 /* Basic block info for finding SESE regions.    */
3410 
3411 struct bb_sese
3412 {
3413   int node;  /* Node number in spanning tree.  */
3414   int parent; /* Parent node number.  */
3415 
3416   /* The algorithm splits each node A into Ai, A', Ao. The incoming
3417      edges arrive at pseudo-node Ai and the outgoing edges leave at
3418      pseudo-node Ao.  We have to remember which way we arrived at a
3419      particular node when generating the spanning tree.  dir > 0 means
3420      we arrived at Ai, dir < 0 means we arrived at Ao.  */
3421   int dir;
3422 
3423   /* Lowest numbered pseudo-node reached via a backedge from this
3424      node, or any descendant.  */
3425   pseudo_node_t high;
3426 
3427   int color;  /* Cycle-equivalence color  */
3428 
3429   /* Stack of brackets for this node.  */
3430   bracket_vec_t brackets;
3431 
3432   bb_sese (unsigned node_, unsigned p, int dir_)
3433   :node (node_), parent (p), dir (dir_)
3434   {
3435   }
3436   ~bb_sese ();
3437 
3438   /* Push a bracket ending at BACK.  */
3439   void push (const pseudo_node_t &back)
3440   {
3441     if (dump_file)
3442       fprintf (dump_file, "Pushing backedge %d:%+d\n",
3443 	       back.first ? back.first->index : 0, back.second);
3444     brackets.safe_push (bracket (back));
3445   }
3446 
3447   void append (bb_sese *child);
3448   void remove (const pseudo_node_t &);
3449 
3450   /* Set node's color.  */
3451   void set_color (auto_vec<unsigned> &color_counts)
3452   {
3453     color = brackets.last ().get_color (color_counts, brackets.length ());
3454   }
3455 };
3456 
3457 bb_sese::~bb_sese ()
3458 {
3459 }
3460 
3461 /* Destructively append CHILD's brackets.  */
3462 
3463 void
3464 bb_sese::append (bb_sese *child)
3465 {
3466   if (int len = child->brackets.length ())
3467     {
3468       int ix;
3469 
3470       if (dump_file)
3471 	{
3472 	  for (ix = 0; ix < len; ix++)
3473 	    {
3474 	      const pseudo_node_t &pseudo = child->brackets[ix].back;
3475 	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3476 		       child->node, pseudo.first ? pseudo.first->index : 0,
3477 		       pseudo.second);
3478 	    }
3479 	}
3480       if (!brackets.length ())
3481 	std::swap (brackets, child->brackets);
3482       else
3483 	{
3484 	  brackets.reserve (len);
3485 	  for (ix = 0; ix < len; ix++)
3486 	    brackets.quick_push (child->brackets[ix]);
3487 	}
3488     }
3489 }
3490 
3491 /* Remove brackets that terminate at PSEUDO.  */
3492 
3493 void
3494 bb_sese::remove (const pseudo_node_t &pseudo)
3495 {
3496   unsigned removed = 0;
3497   int len = brackets.length ();
3498 
3499   for (int ix = 0; ix < len; ix++)
3500     {
3501       if (brackets[ix].back == pseudo)
3502 	{
3503 	  if (dump_file)
3504 	    fprintf (dump_file, "Removing backedge %d:%+d\n",
3505 		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
3506 	  removed++;
3507 	}
3508       else if (removed)
3509 	brackets[ix-removed] = brackets[ix];
3510     }
3511   while (removed--)
3512     brackets.pop ();
3513 }
3514 
3515 /* Accessors for BB's aux pointer.  */
3516 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3517 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3518 
3519 /* DFS walk creating SESE data structures.  Only cover nodes with
3520    BB_VISITED set.  Append discovered blocks to LIST.  We number in
3521    increments of 3 so that the above and below pseudo nodes of a
3522    block numbered N can be implicitly numbered N-1 and N+1.  */
3523 
3524 static int
3525 nvptx_sese_number (int n, int p, int dir, basic_block b,
3526 		   auto_vec<basic_block> *list)
3527 {
3528   if (BB_GET_SESE (b))
3529     return n;
3530 
3531   if (dump_file)
3532     fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
3533 	     b->index, n, p, dir);
3534 
3535   BB_SET_SESE (b, new bb_sese (n, p, dir));
3536   p = n;
3537 
3538   n += 3;
3539   list->quick_push (b);
3540 
3541   /* First walk the nodes on the 'other side' of this node, then walk
3542      the nodes on the same side.  */
3543   for (unsigned ix = 2; ix; ix--)
3544     {
3545       vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
3546       size_t offset = (dir > 0 ? offsetof (edge_def, dest)
3547 		       : offsetof (edge_def, src));
3548       edge e;
3549       edge_iterator ei;
3550 
3551       FOR_EACH_EDGE (e, ei, edges)
3552 	{
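	  /* TARGET is E->dest or E->src, as selected by OFFSET.  */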
3553 	  basic_block target = *(basic_block *)((char *)e + offset);
3554 
3555 	  if (target->flags & BB_VISITED)
3556 	    n = nvptx_sese_number (n, p, dir, target, list);
3557 	}
3558       dir = -dir;
3559     }
3560   return n;
3561 }
3562 
3563 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
3564    EDGES are the outgoing edges and OFFSET is the offset to the src
3565    or dst block on the edges.  */
3566 
3567 static void
3568 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
3569 		   vec<edge, va_gc> *edges, size_t offset)
3570 {
3571   edge e;
3572   edge_iterator ei;
3573   int hi_back = depth;
3574   pseudo_node_t node_back (NULL, depth);
3575   int hi_child = depth;
3576   pseudo_node_t node_child (NULL, depth);
3577   basic_block child = NULL;
3578   unsigned num_children = 0;
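       /* Combined sign of this pseudo-node walk's direction and the
          direction in which the spanning-tree walk reached ME; used
          below to orient the child and ancestor tests.  */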
3579   int usd = -dir * sese->dir;
3580 
3581   if (dump_file)
3582     fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
3583 	     me->index, sese->node, dir);
3584 
3585   if (dir < 0)
3586     {
3587       /* This is the above pseudo-child.  It has the BB itself as an
3588 	 additional child node.  */
3589       node_child = sese->high;
3590       hi_child = node_child.second;
3591       if (node_child.first)
3592 	hi_child += BB_GET_SESE (node_child.first)->node;
3593       num_children++;
3594     }
3595 
3596   /* Examine each edge.
3597      - if it is a child (a) append its bracket list and (b) record
3598           whether it is the child with the highest reaching bracket.
3599      - if it is an edge to an ancestor, record whether it's the highest
3600           reaching backlink.  */
3601   FOR_EACH_EDGE (e, ei, edges)
3602     {
3603       basic_block target = *(basic_block *)((char *)e + offset);
3604 
3605       if (bb_sese *t_sese = BB_GET_SESE (target))
3606 	{
3607 	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
3608 	    {
3609 	      /* Child node.  Append its bracket list.  */
3610 	      num_children++;
3611 	      sese->append (t_sese);
3612 
3613 	      /* Compare its hi value.  */
3614 	      int t_hi = t_sese->high.second;
3615 
3616 	      if (basic_block child_hi_block = t_sese->high.first)
3617 		t_hi += BB_GET_SESE (child_hi_block)->node;
3618 
3619 	      if (hi_child > t_hi)
3620 		{
3621 		  hi_child = t_hi;
3622 		  node_child = t_sese->high;
3623 		  child = target;
3624 		}
3625 	    }
3626 	  else if (t_sese->node < sese->node + dir
3627 		   && !(dir < 0 && sese->parent == t_sese->node))
3628 	    {
3629 	      /* Non-parental ancestor node -- a backlink.  */
3630 	      int d = usd * t_sese->dir;
3631 	      int back = t_sese->node + d;
3632 
3633 	      if (hi_back > back)
3634 		{
3635 		  hi_back = back;
3636 		  node_back = pseudo_node_t (target, d);
3637 		}
3638 	    }
3639 	}
3640       else
3641 	{ /* Fallen off graph, backlink to entry node.  */
3642 	  hi_back = 0;
3643 	  node_back = pseudo_node_t (NULL, 0);
3644 	}
3645     }
3646 
3647   /* Remove any brackets that terminate at this pseudo node.  */
3648   sese->remove (pseudo_node_t (me, dir));
3649 
3650   /* Now push any backlinks from this pseudo node.  */
3651   FOR_EACH_EDGE (e, ei, edges)
3652     {
3653       basic_block target = *(basic_block *)((char *)e + offset);
3654       if (bb_sese *t_sese = BB_GET_SESE (target))
3655 	{
3656 	  if (t_sese->node < sese->node + dir
3657 	      && !(dir < 0 && sese->parent == t_sese->node))
3658 	    /* Non-parental ancestor node - backedge from me.  */
3659 	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
3660 	}
3661       else
3662 	{
3663 	  /* Back edge to the entry node.  */
3664 	  sese->push (pseudo_node_t (NULL, 0));
3665 	}
3666     }
3667 
3668   /* If this node leads directly or indirectly to a no-return region
3669      of the graph, then fake a backedge to the entry node.  */
3670   if (!sese->brackets.length () || !edges || !edges->length ())
3671     {
3672       hi_back = 0;
3673       node_back = pseudo_node_t (NULL, 0);
3674       sese->push (node_back);
3675     }
3676 
3677   /* Record the highest reaching backedge from us or a descendant.  */
3678   sese->high = hi_back < hi_child ? node_back : node_child;
3679 
3680   if (num_children > 1)
3681     {
3682       /* There is more than one child -- this is a Y shaped piece of
3683 	 spanning tree.  We have to insert a fake backedge from this
3684 	 node to the highest ancestor reached by not-the-highest
3685 	 reaching child.  Note that there may be multiple children
3686 	 with backedges to the same highest node.  That's ok and we
3687 	 insert the edge to that highest node.  */
3688       hi_child = depth;
3689       if (dir < 0 && child)
3690 	{
3691 	  node_child = sese->high;
3692 	  hi_child = node_child.second;
3693 	  if (node_child.first)
3694 	    hi_child += BB_GET_SESE (node_child.first)->node;
3695 	}
3696 
3697       FOR_EACH_EDGE (e, ei, edges)
3698 	{
3699 	  basic_block target = *(basic_block *)((char *)e + offset);
3700 
3701 	  if (target == child)
3702 	    /* Ignore the highest child. */
3703 	    continue;
3704 
3705 	  bb_sese *t_sese = BB_GET_SESE (target);
3706 	  if (!t_sese)
3707 	    continue;
3708 	  if (t_sese->parent != sese->node)
3709 	    /* Not a child. */
3710 	    continue;
3711 
3712 	  /* Compare its hi value.  */
3713 	  int t_hi = t_sese->high.second;
3714 
3715 	  if (basic_block child_hi_block = t_sese->high.first)
3716 	    t_hi += BB_GET_SESE (child_hi_block)->node;
3717 
3718 	  if (hi_child > t_hi)
3719 	    {
3720 	      hi_child = t_hi;
3721 	      node_child = t_sese->high;
3722 	    }
3723 	}
3724 
3725       sese->push (node_child);
3726     }
3727 }
3728 
3729 
3730 /* DFS walk of BB graph.  Color node BLOCK according to COLORING then
3731    proceed to successors.  Set SESE entry and exit nodes of
3732    REGIONS.  */
3733 
3734 static void
3735 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
3736 		  basic_block block, int coloring)
3737 {
3738   bb_sese *sese = BB_GET_SESE (block);
3739 
3740   if (block->flags & BB_VISITED)
3741     {
3742       /* If we've already encountered this block, either we must not
3743 	 be coloring, or it must have been colored the current color.  */
3744       gcc_assert (coloring < 0 || (sese && coloring == sese->color));
3745       return;
3746     }
3747 
3748   block->flags |= BB_VISITED;
3749 
3750   if (sese)
3751     {
3752       if (coloring < 0)
3753 	{
3754 	  /* Start coloring a region.  */
3755 	  regions[sese->color].first = block;
3756 	  coloring = sese->color;
3757 	}
3758 
3759       if (!--color_counts[sese->color] && sese->color == coloring)
3760 	{
3761 	  /* Found final block of SESE region.  */
3762 	  regions[sese->color].second = block;
3763 	  coloring = -1;
3764 	}
3765       else
3766 	/* Color the node, so we can assert on revisiting the node
3767 	   that the graph is indeed SESE.  */
3768 	sese->color = coloring;
3769     }
3770   else
3771     /* Fallen off the subgraph, we cannot be coloring.  */
3772     gcc_assert (coloring < 0);
3773 
3774   /* Walk each successor block.  */
3775   if (block->succs && block->succs->length ())
3776     {
3777       edge e;
3778       edge_iterator ei;
3779 
3780       FOR_EACH_EDGE (e, ei, block->succs)
3781 	nvptx_sese_color (color_counts, regions, e->dest, coloring);
3782     }
3783   else
3784     gcc_assert (coloring < 0);
3785 }
3786 
3787 /* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
3788    end up with NULL entries in it.  */
3789 
3790 static void
3791 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
3792 {
3793   basic_block block;
3794   int ix;
3795 
3796   /* First clear each BB of the whole function.  */
3797   FOR_ALL_BB_FN (block, cfun)
3798     {
3799       block->flags &= ~BB_VISITED;
3800       BB_SET_SESE (block, 0);
3801     }
3802 
3803   /* Mark blocks in the function that are in this graph.  */
3804   for (ix = 0; blocks.iterate (ix, &block); ix++)
3805     block->flags |= BB_VISITED;
3806 
3807   /* Counts of nodes assigned to each color.  There cannot be more
3808      colors than blocks (and hopefully there will be fewer).  */
3809   auto_vec<unsigned> color_counts;
3810   color_counts.reserve (blocks.length ());
3811 
3812   /* Worklist of nodes in the spanning tree.  Again, there cannot be
3813      more nodes in the tree than blocks (there will be fewer if the
3814      CFG of blocks is disjoint).  */
3815   auto_vec<basic_block> spanlist;
3816   spanlist.reserve (blocks.length ());
3817 
3818   /* Make sure every block has its cycle class determined.  */
3819   for (ix = 0; blocks.iterate (ix, &block); ix++)
3820     {
3821       if (BB_GET_SESE (block))
3822 	/* We already met this block in an earlier graph solve.  */
3823 	continue;
3824 
3825       if (dump_file)
3826 	fprintf (dump_file, "Searching graph starting at %d\n", block->index);
3827 
3828       /* Number the nodes reachable from BLOCK in initial DFS order.  */
3829       int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
3830 
3831       /* Now walk in reverse DFS order to find cycle equivalents.  */
3832       while (spanlist.length ())
3833 	{
3834 	  block = spanlist.pop ();
3835 	  bb_sese *sese = BB_GET_SESE (block);
3836 
3837 	  /* Do the pseudo node below.  */
3838 	  nvptx_sese_pseudo (block, sese, depth, +1,
3839 			     sese->dir > 0 ? block->succs : block->preds,
3840 			     (sese->dir > 0 ? offsetof (edge_def, dest)
3841 			      : offsetof (edge_def, src)));
3842 	  sese->set_color (color_counts);
3843 	  /* Do the pseudo node above.  */
3844 	  nvptx_sese_pseudo (block, sese, depth, -1,
3845 			     sese->dir < 0 ? block->succs : block->preds,
3846 			     (sese->dir < 0 ? offsetof (edge_def, dest)
3847 			      : offsetof (edge_def, src)));
3848 	}
3849       if (dump_file)
3850 	fprintf (dump_file, "\n");
3851     }
3852 
3853   if (dump_file)
3854     {
3855       unsigned count;
3856       const char *comma = "";
3857 
3858       fprintf (dump_file, "Found %d cycle equivalents\n",
3859 	       color_counts.length ());
3860       for (ix = 0; color_counts.iterate (ix, &count); ix++)
3861 	{
3862 	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
3863 
3864 	  comma = "";
3865 	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
3866 	    if (BB_GET_SESE (block)->color == ix)
3867 	      {
3868 		block->flags |= BB_VISITED;
3869 		fprintf (dump_file, "%s%d", comma, block->index);
3870 		comma = ",";
3871 	      }
3872 	  fprintf (dump_file, "}");
3873 	  comma = ", ";
3874 	}
3875       fprintf (dump_file, "\n");
3876     }
3877 
3878   /* Now we've colored every block in the subgraph.  We now need to
3879      determine the minimal set of SESE regions that cover that
3880      subgraph.  Do this with a DFS walk of the complete function.
3881      During the walk we're either 'looking' or 'coloring'.  When we
3882      reach the last node of a particular color, we stop coloring and
3883      return to looking.  */
3884 
3885   /* There cannot be more SESE regions than colors.  */
3886   regions.reserve (color_counts.length ());
3887   for (ix = color_counts.length (); ix--;)
3888     regions.quick_push (bb_pair_t (0, 0));
3889 
3890   for (ix = 0; blocks.iterate (ix, &block); ix++)
3891     block->flags &= ~BB_VISITED;
3892 
3893   nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
3894 
3895   if (dump_file)
3896     {
3897       const char *comma = "";
3898       int len = regions.length ();
3899 
3900       fprintf (dump_file, "SESE regions:");
3901       for (ix = 0; ix != len; ix++)
3902 	{
3903 	  basic_block from = regions[ix].first;
3904 	  basic_block to = regions[ix].second;
3905 
3906 	  if (from)
3907 	    {
3908 	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
3909 	      if (to != from)
3910 		fprintf (dump_file, "->%d", to->index);
3911 
3912 	      int color = BB_GET_SESE (from)->color;
3913 
3914 	      /* Print the blocks within the region (excluding ends).  */
3915 	      FOR_EACH_BB_FN (block, cfun)
3916 		{
3917 		  bb_sese *sese = BB_GET_SESE (block);
3918 
3919 		  if (sese && sese->color == color
3920 		      && block != from && block != to)
3921 		    fprintf (dump_file, ".%d", block->index);
3922 		}
3923 	      fprintf (dump_file, "}");
3924 	    }
3925 	  comma = ",";
3926 	}
3927       fprintf (dump_file, "\n\n");
3928     }
3929 
3930   for (ix = 0; blocks.iterate (ix, &block); ix++)
3931     delete BB_GET_SESE (block);
3932 }
3933 
3934 #undef BB_SET_SESE
3935 #undef BB_GET_SESE
3936 
3937 /* Propagate live state at the start of a partitioned region.  IS_CALL
3938    indicates whether the propagation is for a (partitioned) call
3939    instruction.  BLOCK provides the live register information, and
3940    might not contain INSN. Propagation is inserted just after INSN. RW
3941    indicates whether we are reading and/or writing state.  This
3942    separation is needed for worker-level propagation, where we
3943    essentially do a spill & fill.  FN is the underlying worker
3944    function used to generate the propagation instructions for a
3945    single register.  DATA is user data.
3946 
3947    Returns true if we didn't emit any instructions.
3948 
3949    We propagate the live register set for non-calls and the entire
3950    frame for calls and non-calls.  We could do better by (a)
3951    propagating just the live set that is used within the partitioned
3952    regions and (b) only propagating stack entries that are used.  The
3953    latter might be quite hard to determine.  */
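
     /* An illustrative sketch of the frame-copying loop emitted by
        nvptx_propagate below, assuming a multi-word frame (FS > 1):

             ptr = frame;
             idx = FS;
         loop:
             idx -= 1;
             tmp = *ptr;                  (if RW & PM_read)
             fn (tmp, RW, FS, DATA, ...)  (propagate one DImode word)
             *ptr = tmp;                  (if RW & PM_write)
             ptr += sizeof (DImode);
             if (idx != 0) goto loop;  */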
3954 
3955 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
3956 
3957 static bool
3958 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
3959 		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
3960 {
3961   bitmap live = DF_LIVE_IN (block);
3962   bitmap_iterator iterator;
3963   unsigned ix;
3964   bool empty = true;
3965 
3966   /* Copy the frame array.  */
3967   HOST_WIDE_INT fs = get_frame_size ();
3968   if (fs)
3969     {
3970       rtx tmp = gen_reg_rtx (DImode);
3971       rtx idx = NULL_RTX;
3972       rtx ptr = gen_reg_rtx (Pmode);
3973       rtx pred = NULL_RTX;
3974       rtx_code_label *label = NULL;
3975 
3976       empty = false;
3977       /* The frame size might not be DImode compatible, but the frame
3978 	 array's declaration will be.  So it's ok to round up here.  */
3979       fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
3980       /* Detect single iteration loop.  */
3981       if (fs == 1)
3982 	fs = 0;
3983 
3984       start_sequence ();
3985       emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
3986       if (fs)
3987 	{
3988 	  idx = gen_reg_rtx (SImode);
3989 	  pred = gen_reg_rtx (BImode);
3990 	  label = gen_label_rtx ();
3991 
3992 	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
3993 	  /* Allow worker function to initialize anything needed.  */
3994 	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
3995 	  if (init)
3996 	    emit_insn (init);
3997 	  emit_label (label);
3998 	  LABEL_NUSES (label)++;
3999 	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4000 	}
4001       if (rw & PM_read)
4002 	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4003       emit_insn (fn (tmp, rw, fs, data, vector));
4004       if (rw & PM_write)
4005 	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4006       if (fs)
4007 	{
4008 	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4009 	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4010 	  emit_insn (gen_br_true_uni (pred, label));
4011 	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4012 	  if (fini)
4013 	    emit_insn (fini);
4014 	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4015 	}
4016       emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4017       emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4018       rtx cpy = get_insns ();
4019       end_sequence ();
4020       insn = emit_insn_after (cpy, insn);
4021     }
4022 
4023   if (!is_call)
4024     /* Copy live registers.  */
4025     EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4026       {
4027 	rtx reg = regno_reg_rtx[ix];
4028 
4029 	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4030 	  {
4031 	    rtx bcast = fn (reg, rw, 0, data, vector);
4032 
4033 	    insn = emit_insn_after (bcast, insn);
4034 	    empty = false;
4035 	  }
4036       }
4037   return empty;
4038 }
4039 
4040 /* Worker for nvptx_warp_propagate.  */
4041 
4042 static rtx
4043 warp_prop_gen (rtx reg, propagate_mask pm,
4044 	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4045 	       bool ARG_UNUSED (vector))
4046 {
4047   if (!(pm & PM_read_write))
4048     return 0;
4049 
4050   return nvptx_gen_warp_bcast (reg);
4051 }
4052 
4053 /* Propagate state that is live at start of BLOCK across the vectors
4054    of a single warp.  Propagation is inserted just after INSN.
4055    IS_CALL and the return value are as for nvptx_propagate.  */
4056 
4057 static bool
4058 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4059 {
4060   return nvptx_propagate (is_call, block, insn, PM_read_write,
4061 			  warp_prop_gen, 0, false);
4062 }
4063 
4064 /* Worker for nvptx_shared_propagate.  */
4065 
4066 static rtx
4067 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4068 		 bool vector)
4069 {
4070   broadcast_data_t *data = (broadcast_data_t *)data_;
4071 
4072   if (pm & PM_loop_begin)
4073     {
4074       /* Starting a loop, initialize pointer.  */
4075       unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4076 
4077       oacc_bcast_align = MAX (oacc_bcast_align, align);
4078       data->offset = ROUND_UP (data->offset, align);
4079 
4080       data->ptr = gen_reg_rtx (Pmode);
4081 
4082       return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4083     }
4084   else if (pm & PM_loop_end)
4085     {
4086       rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4087       data->ptr = NULL_RTX;
4088       return clobber;
4089     }
4090   else
4091     return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4092 }
4093 
4094 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
4095    indicates if this is just before partitioned mode (do spill), or
4096    just after it starts (do fill).  Sequence is inserted just after
4097    INSN.  IS_CALL and the return value are as for nvptx_propagate.  */
4098 
4099 static bool
4100 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4101 			rtx_insn *insn, bool vector)
4102 {
4103   broadcast_data_t data;
4104 
4105   data.base = gen_reg_rtx (Pmode);
4106   data.offset = 0;
4107   data.ptr = NULL_RTX;
4108 
4109   bool empty = nvptx_propagate (is_call, block, insn,
4110 				pre_p ? PM_read : PM_write, shared_prop_gen,
4111 				&data, vector);
4112   gcc_assert (empty == !data.offset);
4113   if (data.offset)
4114     {
4115       rtx bcast_sym = oacc_bcast_sym;
4116 
4117       /* Stuff was emitted, initialize the base pointer now.  */
4118       if (vector && nvptx_mach_max_workers () > 1)
4119 	{
4120 	  if (!cfun->machine->bcast_partition)
4121 	    {
4122 	      /* It would be nice to place this register in
4123 		 DATA_AREA_SHARED.  */
4124 	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4125 	    }
4126 	  if (!cfun->machine->sync_bar)
4127 	    cfun->machine->sync_bar = gen_reg_rtx (SImode);
4128 
4129 	  bcast_sym = cfun->machine->bcast_partition;
4130 	}
4131 
4132       rtx init = gen_rtx_SET (data.base, bcast_sym);
4133       emit_insn_after (init, insn);
4134 
4135       unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4136       unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4137 			   ? nvptx_mach_max_workers () + 1
4138 			   : 1);
4139 
4140       oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4141       oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4142     }
4143   return empty;
4144 }
4145 
4146 /* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
4147    which is an integer or a register.  THREADS is the number of threads
4148    controlled by the barrier, with zero meaning all threads of the CTA.  */
4149 
4150 static rtx
4151 nvptx_cta_sync (rtx lock, int threads)
4152 {
4153   return gen_nvptx_barsync (lock, GEN_INT (threads));
4154 }
4155 
4156 #if WORKAROUND_PTXJIT_BUG
4157 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4158    real insns.  */
4159 
4160 static rtx_insn *
4161 bb_first_real_insn (basic_block bb)
4162 {
4163   rtx_insn *insn;
4164 
4165   /* Find the first real insn in BB.  */
4166   FOR_BB_INSNS (bb, insn)
4167     if (INSN_P (insn))
4168       return insn;
4169 
4170   return 0;
4171 }
4172 #endif
4173 
4174 /* Return true if INSN needs neutering.  */
4175 
4176 static bool
4177 needs_neutering_p (rtx_insn *insn)
4178 {
4179   if (!INSN_P (insn))
4180     return false;
4181 
4182   switch (recog_memoized (insn))
4183     {
4184     case CODE_FOR_nvptx_fork:
4185     case CODE_FOR_nvptx_forked:
4186     case CODE_FOR_nvptx_joining:
4187     case CODE_FOR_nvptx_join:
4188     case CODE_FOR_nvptx_barsync:
4189       return false;
4190     default:
4191       return true;
4192     }
4193 }
4194 
4195 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.  */
4196 
4197 static bool
4198 verify_neutering_jumps (basic_block from,
4199 			rtx_insn *vector_jump, rtx_insn *worker_jump,
4200 			rtx_insn *vector_label, rtx_insn *worker_label)
4201 {
4202   basic_block bb = from;
4203   rtx_insn *insn = BB_HEAD (bb);
4204   bool seen_worker_jump = false;
4205   bool seen_vector_jump = false;
4206   bool seen_worker_label = false;
4207   bool seen_vector_label = false;
4208   bool worker_neutered = false;
4209   bool vector_neutered = false;
4210   while (true)
4211     {
4212       if (insn == worker_jump)
4213 	{
4214 	  seen_worker_jump = true;
4215 	  worker_neutered = true;
4216 	  gcc_assert (!vector_neutered);
4217 	}
4218       else if (insn == vector_jump)
4219 	{
4220 	  seen_vector_jump = true;
4221 	  vector_neutered = true;
4222 	}
4223       else if (insn == worker_label)
4224 	{
4225 	  seen_worker_label = true;
4226 	  gcc_assert (worker_neutered);
4227 	  worker_neutered = false;
4228 	}
4229       else if (insn == vector_label)
4230 	{
4231 	  seen_vector_label = true;
4232 	  gcc_assert (vector_neutered);
4233 	  vector_neutered = false;
4234 	}
4235       else if (INSN_P (insn))
4236 	switch (recog_memoized (insn))
4237 	  {
4238 	  case CODE_FOR_nvptx_barsync:
4239 	    gcc_assert (!vector_neutered && !worker_neutered);
4240 	    break;
4241 	  default:
4242 	    break;
4243 	  }
4244 
4245       if (insn != BB_END (bb))
4246 	insn = NEXT_INSN (insn);
4247       else if (JUMP_P (insn) && single_succ_p (bb)
4248 	       && !seen_vector_jump && !seen_worker_jump)
4249 	{
4250 	  bb = single_succ (bb);
4251 	  insn = BB_HEAD (bb);
4252 	}
4253       else
4254 	break;
4255     }
4256 
4257   gcc_assert (!(vector_jump && !seen_vector_jump));
4258   gcc_assert (!(worker_jump && !seen_worker_jump));
4259 
4260   if (seen_vector_label || seen_worker_label)
4261     {
4262       gcc_assert (!(vector_label && !seen_vector_label));
4263       gcc_assert (!(worker_label && !seen_worker_label));
4264 
4265       return true;
4266     }
4267 
4268   return false;
4269 }
4270 
4271 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  */
4272 
4273 static void
4274 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4275 			 rtx_insn *worker_label)
4276 {
4277   basic_block bb = to;
4278   rtx_insn *insn = BB_END (bb);
4279   bool seen_worker_label = false;
4280   bool seen_vector_label = false;
4281   while (true)
4282     {
4283       if (insn == worker_label)
4284 	{
4285 	  seen_worker_label = true;
4286 	  gcc_assert (!seen_vector_label);
4287 	}
4288       else if (insn == vector_label)
4289 	seen_vector_label = true;
4290       else if (INSN_P (insn))
4291 	switch (recog_memoized (insn))
4292 	  {
4293 	  case CODE_FOR_nvptx_barsync:
4294 	    gcc_assert (!seen_vector_label && !seen_worker_label);
4295 	    break;
4296 	  }
4297 
4298       if (insn != BB_HEAD (bb))
4299 	insn = PREV_INSN (insn);
4300       else
4301 	break;
4302     }
4303 
4304   gcc_assert (!(vector_label && !seen_vector_label));
4305   gcc_assert (!(worker_label && !seen_worker_label));
4306 }
4307 
4308 /* Single neutering according to MASK.  FROM is the incoming block and
4309    TO is the outgoing block.  These may be the same block.  Insert at
4310    start of FROM:
4311 
4312      if (tid.<axis>) goto end.
4313 
4314    and insert before ending branch of TO (if there is such an insn):
4315 
4316      end:
4317      <possibly-broadcast-cond>
4318      <branch>
4319 
4320    We currently only use different FROM and TO when skipping an entire
4321    loop.  We could do more if we detected superblocks.  */
4322 
4323 static void
4324 nvptx_single (unsigned mask, basic_block from, basic_block to)
4325 {
4326   rtx_insn *head = BB_HEAD (from);
4327   rtx_insn *tail = BB_END (to);
4328   unsigned skip_mask = mask;
4329 
4330   while (true)
4331     {
4332       /* Find first insn of from block.  */
4333       while (head != BB_END (from) && !needs_neutering_p (head))
4334 	head = NEXT_INSN (head);
4335 
4336       if (from == to)
4337 	break;
4338 
4339       if (!(JUMP_P (head) && single_succ_p (from)))
4340 	break;
4341 
4342       basic_block jump_target = single_succ (from);
4343       if (!single_pred_p (jump_target))
4344 	break;
4345 
4346       from = jump_target;
4347       head = BB_HEAD (from);
4348     }
4349 
4350   /* Find last insn of TO block.  */
4351   rtx_insn *limit = from == to ? head : BB_HEAD (to);
4352   while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4353     tail = PREV_INSN (tail);
4354 
4355   /* Detect if tail is a branch.  */
4356   rtx tail_branch = NULL_RTX;
4357   rtx cond_branch = NULL_RTX;
4358   if (tail && INSN_P (tail))
4359     {
4360       tail_branch = PATTERN (tail);
4361       if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4362 	tail_branch = NULL_RTX;
4363       else
4364 	{
4365 	  cond_branch = SET_SRC (tail_branch);
4366 	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4367 	    cond_branch = NULL_RTX;
4368 	}
4369     }
4370 
4371   if (tail == head)
4372     {
4373       /* If this is empty, do nothing.  */
4374       if (!head || !needs_neutering_p (head))
4375 	return;
4376 
4377       if (cond_branch)
4378 	{
4379 	  /* If we're only doing vector single, there's no need to
4380 	     emit skip code because we'll not insert anything.  */
4381 	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4382 	    skip_mask = 0;
4383 	}
4384       else if (tail_branch)
4385 	/* Block with only unconditional branch.  Nothing to do.  */
4386 	return;
4387     }
4388 
4389   /* Insert the vector test inside the worker test.  */
4390   unsigned mode;
4391   rtx_insn *before = tail;
4392   rtx_insn *neuter_start = NULL;
4393   rtx_insn *worker_label = NULL, *vector_label = NULL;
4394   rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4395   rtx_insn *warp_sync = NULL;
4396   for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4397     if (GOMP_DIM_MASK (mode) & skip_mask)
4398       {
4399 	rtx_code_label *label = gen_label_rtx ();
4400 	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4401 	rtx_insn **mode_jump
4402 	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4403 	rtx_insn **mode_label
4404 	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4405 
4406 	if (!pred)
4407 	  {
4408 	    pred = gen_reg_rtx (BImode);
4409 	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4410 	  }
4411 
4412 	rtx br;
4413 	if (mode == GOMP_DIM_VECTOR)
4414 	  br = gen_br_true (pred, label);
4415 	else
4416 	  br = gen_br_true_uni (pred, label);
4417 	if (neuter_start)
4418 	  neuter_start = emit_insn_after (br, neuter_start);
4419 	else
4420 	  neuter_start = emit_insn_before (br, head);
4421 	*mode_jump = neuter_start;
4422 
4423 	LABEL_NUSES (label)++;
4424 	rtx_insn *label_insn;
4425 	if (tail_branch)
4426 	  {
4427 	    label_insn = emit_label_before (label, before);
4428 	    if (mode == GOMP_DIM_VECTOR)
4429 	      {
4430 		if (TARGET_PTX_6_0)
4431 		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4432 					       label_insn);
4433 		else
4434 		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4435 					       label_insn);
4436 	      }
4437 	    before = label_insn;
4438 	  }
4439 	else
4440 	  {
4441 	    label_insn = emit_label_after (label, tail);
4442 	    if (mode == GOMP_DIM_VECTOR)
4443 	      {
4444 		if (TARGET_PTX_6_0)
4445 		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4446 					       label_insn);
4447 		else
4448 		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4449 					       label_insn);
4450 	      }
4451 	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4452 		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4453 	      emit_insn_after (gen_exit (), label_insn);
4454 	  }
4455 
4456 	*mode_label = label_insn;
4457       }
4458 
4459   /* Now deal with propagating the branch condition.  */
4460   if (cond_branch)
4461     {
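           /* The predicate register tested by the branch condition; it is
              this value that must be made uniform across all threads.  */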
4462       rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4463 
4464       if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4465 	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4466 	{
4467 	  /* Vector mode only, do a shuffle.  */
4468 #if WORKAROUND_PTXJIT_BUG
4469 	  /* The branch condition %rcond is propagated like this:
4470 
4471 		{
4472 		    .reg .u32 %x;
4473 		    mov.u32 %x,%tid.x;
4474 		    setp.ne.u32 %rnotvzero,%x,0;
4475 		 }
4476 
4477 		 @%rnotvzero bra Lskip;
4478 		 setp.<op>.<type> %rcond,op1,op2;
4479 		 Lskip:
4480 		 selp.u32 %rcondu32,1,0,%rcond;
4481 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4482 		 setp.ne.u32 %rcond,%rcondu32,0;
4483 
4484 	     There seems to be a bug in the ptx JIT compiler (observed at driver
4485 	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
4486 	     unless %rcond is initialized to something before 'bra Lskip'.  The
4487 	     bug is not observed with ptxas from cuda 8.0.61.
4488 
4489 	     It is true that the code is non-trivial: at Lskip, %rcond is
4490 	     uninitialized in threads 1-31, and after the selp the same holds
4491 	     for %rcondu32.  But shfl propagates the defined value in thread 0
4492 	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
4493 	     0-31, and after the setp.ne %rcond is defined in threads 0-31.
4494 
4495 	     There is nothing in the PTX spec to suggest that this is wrong, or
4496 	     to explain why the extra initialization is needed.  So, we classify
4497 	     it as a JIT bug, and the extra initialization as workaround:
4498 
4499 		{
4500 		    .reg .u32 %x;
4501 		    mov.u32 %x,%tid.x;
4502 		    setp.ne.u32 %rnotvzero,%x,0;
4503 		}
4504 
4505 		+.reg .pred %rcond2;
4506 		+setp.eq.u32 %rcond2, 1, 0;
4507 
4508 		 @%rnotvzero bra Lskip;
4509 		 setp.<op>.<type> %rcond,op1,op2;
4510 		+mov.pred %rcond2, %rcond;
4511 		 Lskip:
4512 		+mov.pred %rcond, %rcond2;
4513 		 selp.u32 %rcondu32,1,0,%rcond;
4514 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4515 		 setp.ne.u32 %rcond,%rcondu32,0;
4516 	  */
4517 	  rtx_insn *label = PREV_INSN (tail);
4518 	  if (label == warp_sync)
4519 	    label = PREV_INSN (label);
4520 	  gcc_assert (label && LABEL_P (label));
4521 	  rtx tmp = gen_reg_rtx (BImode);
4522 	  emit_insn_before (gen_movbi (tmp, const0_rtx),
4523 			    bb_first_real_insn (from));
4524 	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4525 	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4526 #endif
4527 	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
4528 	}
4529       else
4530 	{
4531 	  /* Includes worker mode, do spill & fill.  By construction
4532 	     we should never have worker mode only.  */
4533 	  broadcast_data_t data;
4534 	  unsigned size = GET_MODE_SIZE (SImode);
4535 	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
4536 	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
4537 	  rtx barrier = GEN_INT (0);
4538 	  int threads = 0;
4539 
4540 	  data.base = oacc_bcast_sym;
4541 	  data.ptr = 0;
4542 
4543 	  bool use_partitioning_p = (vector && !worker
4544 				     && nvptx_mach_max_workers () > 1
4545 				     && cfun->machine->bcast_partition);
4546 	  if (use_partitioning_p)
4547 	    {
4548 	      data.base = cfun->machine->bcast_partition;
4549 	      barrier = cfun->machine->sync_bar;
4550 	      threads = nvptx_mach_vector_length ();
4551 	    }
4552 	  gcc_assert (data.base != NULL);
4553 	  gcc_assert (barrier);
4554 
4555 	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
4556 	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4557 			       ? nvptx_mach_max_workers () + 1
4558 			       : 1);
4559 
4560 	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4561 	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4562 
4563 	  data.offset = 0;
4564 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
4565 						    vector),
4566 			    before);
4567 
4568 	  /* Barrier so other workers can see the write.  */
4569 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4570 	  data.offset = 0;
4571 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
4572 						    vector),
4573 			    tail);
4574 	  /* This barrier is needed to avoid worker zero clobbering
4575 	     the broadcast buffer before all the other workers have
4576 	     had a chance to read this instance of it.  */
4577 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
4578 	}
4579 
4580       extract_insn (tail);
4581       rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
4582 				 UNSPEC_BR_UNIFIED);
4583       validate_change (tail, recog_data.operand_loc[0], unsp, false);
4584     }
4585 
4586   bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
4587 					    vector_label, worker_label);
4588   if (!seen_label)
4589     verify_neutering_labels (to, vector_label, worker_label);
4590 }
4591 
4592 /* PAR is a parallel that is being skipped in its entirety according to
4593    MASK.  Treat this as skipping a superblock starting at forked
4594    and ending at joining.  */
4595 
4596 static void
4597 nvptx_skip_par (unsigned mask, parallel *par)
4598 {
4599   basic_block tail = par->join_block;
4600   gcc_assert (tail->preds->length () == 1);
4601 
4602   basic_block pre_tail = (*tail->preds)[0]->src;
4603   gcc_assert (pre_tail->succs->length () == 1);
4604 
4605   nvptx_single (mask, par->forked_block, pre_tail);
4606 }
4607 
4608 /* If PAR has a single inner parallel and PAR itself only contains
4609    empty entry and exit blocks, swallow the inner PAR.  */
4610 
4611 static void
4612 nvptx_optimize_inner (parallel *par)
4613 {
4614   parallel *inner = par->inner;
4615 
4616   /* We mustn't be the outer dummy par.  */
4617   if (!par->mask)
4618     return;
4619 
4620   /* We must have a single inner par.  */
4621   if (!inner || inner->next)
4622     return;
4623 
4624   /* We must only contain 2 blocks ourselves -- the head and tail of
4625      the inner par.  */
4626   if (par->blocks.length () != 2)
4627     return;
4628 
4629   /* The partitioning of the two parallels must be disjoint.  As we
4630      only have vector and worker partitioning, this is sufficient to
4631      guarantee the pars have adjacent partitioning.  */
4632   if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
4633     /* This indicates malformed code generation.  */
4634     return;
4635 
4636   /* The outer forked insn should be immediately followed by the inner
4637      fork insn.  */
4638   rtx_insn *forked = par->forked_insn;
4639   rtx_insn *fork = BB_END (par->forked_block);
4640 
4641   if (NEXT_INSN (forked) != fork)
4642     return;
4643   gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
4644 
4645   /* The outer joining insn must immediately follow the inner join
4646      insn.  */
4647   rtx_insn *joining = par->joining_insn;
4648   rtx_insn *join = inner->join_insn;
4649   if (NEXT_INSN (join) != joining)
4650     return;
4651 
4652   /* Preconditions met.  Swallow the inner par.  */
4653   if (dump_file)
4654     fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
4655 	     inner->mask, inner->forked_block->index,
4656 	     inner->join_block->index,
4657 	     par->mask, par->forked_block->index, par->join_block->index);
4658 
4659   par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
4660 
4661   par->blocks.reserve (inner->blocks.length ());
4662   while (inner->blocks.length ())
4663     par->blocks.quick_push (inner->blocks.pop ());
4664 
4665   par->inner = inner->inner;
4666   inner->inner = NULL;
4667 
4668   delete inner;
4669 }
4670 
4671 /* Process the parallel PAR and all its contained
4672    parallels.  We do everything but the neutering.  Return mask of
4673    partitioned modes used within this parallel.  */
4674 
4675 static unsigned
4676 nvptx_process_pars (parallel *par)
4677 {
4678   if (nvptx_optimize)
4679     nvptx_optimize_inner (par);
4680 
4681   unsigned inner_mask = par->mask;
4682 
4683   /* Do the inner parallels first.  */
4684   if (par->inner)
4685     {
4686       par->inner_mask = nvptx_process_pars (par->inner);
4687       inner_mask |= par->inner_mask;
4688     }
4689 
4690   bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
4691   bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
4692   bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4693 		      && nvptx_mach_vector_length () > PTX_WARP_SIZE);
4694 
4695   if (worker || large_vector)
4696     {
4697       nvptx_shared_propagate (false, is_call, par->forked_block,
4698 			      par->forked_insn, !worker);
4699       bool no_prop_p
4700 	= nvptx_shared_propagate (true, is_call, par->forked_block,
4701 				  par->fork_insn, !worker);
4702       bool empty_loop_p
4703 	= !is_call && (NEXT_INSN (par->forked_insn)
4704 		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
4705       rtx barrier = GEN_INT (0);
4706       int threads = 0;
4707 
4708       if (!worker && cfun->machine->sync_bar)
4709 	{
4710 	  barrier = cfun->machine->sync_bar;
4711 	  threads = nvptx_mach_vector_length ();
4712 	}
4713 
4714       if (no_prop_p && empty_loop_p)
4715 	;
4716       else if (no_prop_p && is_call)
4717 	;
4718       else
4719 	{
4720 	  /* Insert begin and end synchronizations.  */
4721 	  emit_insn_before (nvptx_cta_sync (barrier, threads),
4722 			    par->forked_insn);
4723 	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
4724 	}
4725     }
4726   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
4727     nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
4728 
4729   /* Now do siblings.  */
4730   if (par->next)
4731     inner_mask |= nvptx_process_pars (par->next);
4732   return inner_mask;
4733 }
4734 
4735 /* Neuter the parallel described by PAR.  We recurse in depth-first
4736    order.  MODES are the partitioning of the execution and OUTER is
4737    the partitioning of the parallels we are contained in.  */
4738 
4739 static void
4740 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
4741 {
4742   unsigned me = (par->mask
4743 		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
4744 		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
4745   unsigned skip_mask = 0, neuter_mask = 0;
4746 
4747   if (par->inner)
4748     nvptx_neuter_pars (par->inner, modes, outer | me);
4749 
4750   for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4751     {
4752       if ((outer | me) & GOMP_DIM_MASK (mode))
4753 	{} /* Mode is partitioned: no neutering.  */
4754       else if (!(modes & GOMP_DIM_MASK (mode)))
4755 	{} /* Mode is not used: nothing to do.  */
4756       else if (par->inner_mask & GOMP_DIM_MASK (mode)
4757 	       || !par->forked_insn)
4758 	/* Partitioned in inner parallels, or we're not partitioned
4759 	   at all: neuter individual blocks.  */
4760 	neuter_mask |= GOMP_DIM_MASK (mode);
4761       else if (!par->parent || !par->parent->forked_insn
4762 	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
4763 	/* Parent isn't a parallel, or already contains this
4764 	   partitioning: skip the parallel at this level.  */
4765 	skip_mask |= GOMP_DIM_MASK (mode);
4766       else
4767 	{} /* Parent will skip this parallel itself.  */
4768     }
4769 
4770   if (neuter_mask)
4771     {
4772       int ix, len;
4773 
4774       if (nvptx_optimize)
4775 	{
4776 	  /* Neuter whole SESE regions.  */
4777 	  bb_pair_vec_t regions;
4778 
4779 	  nvptx_find_sese (par->blocks, regions);
4780 	  len = regions.length ();
4781 	  for (ix = 0; ix != len; ix++)
4782 	    {
4783 	      basic_block from = regions[ix].first;
4784 	      basic_block to = regions[ix].second;
4785 
4786 	      if (from)
4787 		nvptx_single (neuter_mask, from, to);
4788 	      else
4789 		gcc_assert (!to);
4790 	    }
4791 	}
4792       else
4793 	{
4794 	  /* Neuter each BB individually.  */
4795 	  len = par->blocks.length ();
4796 	  for (ix = 0; ix != len; ix++)
4797 	    {
4798 	      basic_block block = par->blocks[ix];
4799 
4800 	      nvptx_single (neuter_mask, block, block);
4801 	    }
4802 	}
4803     }
4804 
4805   if (skip_mask)
4806     nvptx_skip_par (skip_mask, par);
4807 
4808   if (par->next)
4809     nvptx_neuter_pars (par->next, modes, outer);
4810 }
4811 
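     /* Populate OA with the launch dimensions recorded in the oacc
        function attribute of the current function, and compute the
        mask of the dimensions that are partitioned.  */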
4812 static void
4813 populate_offload_attrs (offload_attrs *oa)
4814 {
4815   tree attr = oacc_get_fn_attrib (current_function_decl);
4816   tree dims = TREE_VALUE (attr);
4817   unsigned ix;
4818 
4819   oa->mask = 0;
4820 
4821   for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
4822     {
4823       tree t = TREE_VALUE (dims);
4824       int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
4825       tree allowed = TREE_PURPOSE (dims);
4826 
4827       if (size != 1 && !(allowed && integer_zerop (allowed)))
4828 	oa->mask |= GOMP_DIM_MASK (ix);
4829 
4830       switch (ix)
4831 	{
4832 	case GOMP_DIM_GANG:
4833 	  oa->num_gangs = size;
4834 	  break;
4835 
4836 	case GOMP_DIM_WORKER:
4837 	  oa->num_workers = size;
4838 	  break;
4839 
4840 	case GOMP_DIM_VECTOR:
4841 	  oa->vector_length = size;
4842 	  break;
4843 	}
4844     }
4845 }
4846 
4847 #if WORKAROUND_PTXJIT_BUG_2
4848 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
4849    is needed in the nvptx target because the branches generated for
4850    partitioning are NONJUMP_INSN_P, not JUMP_P.  */
4851 
4852 static rtx
4853 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
4854 {
4855   rtx pat;
4856   if ((strict && !JUMP_P (insn))
4857       || (!strict && !INSN_P (insn)))
4858     return NULL_RTX;
4859   pat = PATTERN (insn);
4860 
4861   /* The set is allowed to appear either as the insn pattern or
4862      the first set in a PARALLEL.  */
4863   if (GET_CODE (pat) == PARALLEL)
4864     pat = XVECEXP (pat, 0, 0);
4865   if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
4866     return pat;
4867 
4868   return NULL_RTX;
4869 }
4870 
4871 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */
4872 
4873 static rtx
4874 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
4875 {
4876   rtx x = nvptx_pc_set (insn, strict);
4877 
4878   if (!x)
4879     return NULL_RTX;
4880   x = SET_SRC (x);
4881   if (GET_CODE (x) == LABEL_REF)
4882     return x;
4883   if (GET_CODE (x) != IF_THEN_ELSE)
4884     return NULL_RTX;
4885   if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
4886     return XEXP (x, 1);
4887   if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
4888     return XEXP (x, 2);
4889   return NULL_RTX;
4890 }
4891 
4892 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
4893    insn in between the branch and the label.  This works around a JIT bug
4894    observed at driver version 384.111, at -O0 for sm_50.  */
4895 
4896 static void
4897 prevent_branch_around_nothing (void)
4898 {
4899   rtx_insn *seen_label = NULL;
4900   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4901     {
4902       if (INSN_P (insn) && condjump_p (insn))
4903 	{
4904 	  seen_label = label_ref_label (nvptx_condjump_label (insn, false));
4905 	  continue;
4906 	}
4907 
4908       if (seen_label == NULL)
4909 	continue;
4910 
4911       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4912 	continue;
4913 
4914       if (INSN_P (insn))
4915 	switch (recog_memoized (insn))
4916 	  {
4917 	  case CODE_FOR_nvptx_fork:
4918 	  case CODE_FOR_nvptx_forked:
4919 	  case CODE_FOR_nvptx_joining:
4920 	  case CODE_FOR_nvptx_join:
4921 	    continue;
4922 	  default:
4923 	    seen_label = NULL;
4924 	    continue;
4925 	  }
4926 
4927       if (LABEL_P (insn) && insn == seen_label)
4928 	emit_insn_before (gen_fake_nop (), insn);
4929 
4930       seen_label = NULL;
4931     }
4932 }
4933 #endif
4934 
4935 #ifdef WORKAROUND_PTXJIT_BUG_3
4936 /* Insert two membar.cta insns in between two consecutive bar.sync insns.  This
4937    works around a hang observed at driver version 390.48 for sm_50.  */
4938 
4939 static void
4940 workaround_barsyncs (void)
4941 {
4942   bool seen_barsync = false;
4943   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
4944     {
4945       if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
4946 	{
4947 	  if (seen_barsync)
4948 	    {
4949 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
4950 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
4951 	    }
4952 
4953 	  seen_barsync = true;
4954 	  continue;
4955 	}
4956 
4957       if (!seen_barsync)
4958 	continue;
4959 
4960       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
4961 	continue;
4962       else if (INSN_P (insn))
4963 	switch (recog_memoized (insn))
4964 	  {
4965 	  case CODE_FOR_nvptx_fork:
4966 	  case CODE_FOR_nvptx_forked:
4967 	  case CODE_FOR_nvptx_joining:
4968 	  case CODE_FOR_nvptx_join:
4969 	    continue;
4970 	  default:
4971 	    break;
4972 	  }
4973 
4974       seen_barsync = false;
4975     }
4976 }
4977 #endif
4978 
4979 /* PTX-specific reorganization
4980    - Split blocks at fork and join instructions
4981    - Compute live registers
4982    - Mark now-unused registers, so function begin doesn't declare
4983    unused registers.
4984    - Insert state propagation when entering partitioned mode
4985    - Insert neutering instructions when in single mode
4986    - Replace subregs with suitable sequences.
4987 */
4988 
4989 static void
4990 nvptx_reorg (void)
4991 {
4992   /* We are freeing block_for_insn in the toplev to keep compatibility
4993      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
4994   compute_bb_for_insn ();
4995 
4996   thread_prologue_and_epilogue_insns ();
4997 
4998   /* Split blocks and record interesting unspecs.  */
4999   bb_insn_map_t bb_insn_map;
5000 
5001   nvptx_split_blocks (&bb_insn_map);
5002 
5003   /* Compute live regs.  */
5004   df_clear_flags (DF_LR_RUN_DCE);
5005   df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
5006   df_live_add_problem ();
5007   df_live_set_all_dirty ();
5008   df_analyze ();
5009   regstat_init_n_sets_and_refs ();
5010 
5011   if (dump_file)
5012     df_dump (dump_file);
5013 
5014   /* Mark unused regs as unused.  */
5015   int max_regs = max_reg_num ();
5016   for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
5017     if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5018       regno_reg_rtx[i] = const0_rtx;
5019 
5020   /* Determine launch dimensions of the function.  If it is not an
5021      offloaded function (i.e. this is a regular compiler), the
5022      function has no neutering.  */
5023   tree attr = oacc_get_fn_attrib (current_function_decl);
5024   if (attr)
5025     {
5026       /* If we determined this mask before RTL expansion, we could
5027 	 elide emission of some levels of forks and joins.  */
5028       offload_attrs oa;
5029 
5030       populate_offload_attrs (&oa);
5031 
5032       /* If there is worker neutering, there must be vector
5033 	 neutering.  Otherwise the hardware will fail.  */
5034       gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5035 		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5036 
5037       /* Discover & process partitioned regions.  */
5038       parallel *pars = nvptx_discover_pars (&bb_insn_map);
5039       nvptx_process_pars (pars);
5040       nvptx_neuter_pars (pars, oa.mask, 0);
5041       delete pars;
5042     }
5043 
5044   /* Replace subregs.  */
5045   nvptx_reorg_subreg ();
5046 
5047   if (TARGET_UNIFORM_SIMT)
5048     nvptx_reorg_uniform_simt ();
5049 
5050 #if WORKAROUND_PTXJIT_BUG_2
5051   prevent_branch_around_nothing ();
5052 #endif
5053 
5054 #ifdef WORKAROUND_PTXJIT_BUG_3
5055   workaround_barsyncs ();
5056 #endif
5057 
5058   regstat_free_n_sets_and_refs ();
5059 
5060   df_finish_pass (true);
5061 }
5062 
5063 /* Handle a "kernel" attribute; arguments as in
5064    struct attribute_spec.handler.  */
5065 
5066 static tree
5067 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5068 			       int ARG_UNUSED (flags), bool *no_add_attrs)
5069 {
5070   tree decl = *node;
5071 
5072   if (TREE_CODE (decl) != FUNCTION_DECL)
5073     {
5074       error ("%qE attribute only applies to functions", name);
5075       *no_add_attrs = true;
5076     }
5077   else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5078     {
5079       error ("%qE attribute requires a void return type", name);
5080       *no_add_attrs = true;
5081     }
5082 
5083   return NULL_TREE;
5084 }
5085 
5086 /* Handle a "shared" attribute; arguments as in
5087    struct attribute_spec.handler.  */
5088 
5089 static tree
5090 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5091 			       int ARG_UNUSED (flags), bool *no_add_attrs)
5092 {
5093   tree decl = *node;
5094 
5095   if (TREE_CODE (decl) != VAR_DECL)
5096     {
5097       error ("%qE attribute only applies to variables", name);
5098       *no_add_attrs = true;
5099     }
5100   else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5101     {
5102       error ("%qE attribute not allowed with auto storage class", name);
5103       *no_add_attrs = true;
5104     }
5105 
5106   return NULL_TREE;
5107 }
5108 
5109 /* Table of valid machine attributes.  */
5110 static const struct attribute_spec nvptx_attribute_table[] =
5111 {
5112   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5113        affects_type_identity, handler, exclude } */
5114   { "kernel", 0, 0, true, false,  false, false, nvptx_handle_kernel_attribute,
5115     NULL },
5116   { "shared", 0, 0, true, false,  false, false, nvptx_handle_shared_attribute,
5117     NULL },
5118   { NULL, 0, 0, false, false, false, false, NULL, NULL }
5119 };
5120 
5121 /* Limit vector alignments to BIGGEST_ALIGNMENT.  */
5122 
5123 static HOST_WIDE_INT
5124 nvptx_vector_alignment (const_tree type)
5125 {
5126   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
5127 
5128   return MIN (align, BIGGEST_ALIGNMENT);
5129 }
5130 
5131 /* Indicate that INSN cannot be duplicated.   */
5132 
5133 static bool
5134 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5135 {
5136   switch (recog_memoized (insn))
5137     {
5138     case CODE_FOR_nvptx_shufflesi:
5139     case CODE_FOR_nvptx_shufflesf:
5140     case CODE_FOR_nvptx_barsync:
5141     case CODE_FOR_nvptx_fork:
5142     case CODE_FOR_nvptx_forked:
5143     case CODE_FOR_nvptx_joining:
5144     case CODE_FOR_nvptx_join:
5145       return true;
5146     default:
5147       return false;
5148     }
5149 }
5150 
5151 /* Section anchors do not work.  Initialization for flag_section_anchors
5152    probes the existence of the anchoring target hooks and prevents
5153    anchoring if they don't exist.  However, we may be used together with
5154    a host-side compiler that does support anchoring, and hence see
5155    the anchor flag set (as it's not recalculated).  So provide an
5156    implementation denying anchoring.  */
5157 
5158 static bool
5159 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5160 {
5161   return false;
5162 }
5163 
5164 /* Record a symbol for mkoffload to enter into the mapping table.  */
5165 
5166 static void
5167 nvptx_record_offload_symbol (tree decl)
5168 {
5169   switch (TREE_CODE (decl))
5170     {
5171     case VAR_DECL:
5172       fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5173 	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5174       break;
5175 
5176     case FUNCTION_DECL:
5177       {
5178 	tree attr = oacc_get_fn_attrib (decl);
5179 	/* OpenMP offloading does not set this attribute.  */
5180 	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5181 
5182 	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5183 		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5184 
5185 	for (; dims; dims = TREE_CHAIN (dims))
5186 	  {
5187 	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5188 
5189 	    gcc_assert (!TREE_PURPOSE (dims));
5190 	    fprintf (asm_out_file, ", %#x", size);
5191 	  }
5192 
5193 	fprintf (asm_out_file, "\n");
5194       }
5195       break;
5196 
5197     default:
5198       gcc_unreachable ();
5199     }
5200 }
5201 
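/* For example (names and dimensions illustrative only), the records
   emitted for a variable and an OpenACC offload function look like:

     //:VAR_MAP "a_mapped_var"
     //:FUNC_MAP "main$_omp_fn$0", 0x1, 0x1, 0x20

   where the trailing values are the gang, worker and vector launch
   dimensions printed in hex; mkoffload parses these lines to build the
   host-side mapping table.  */
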
5202 /* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
5203    at the start of a file.  */
5204 
5205 static void
5206 nvptx_file_start (void)
5207 {
5208   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5209   fputs ("\t.version\t3.1\n", asm_out_file);
5210   if (TARGET_SM35)
5211     fputs ("\t.target\tsm_35\n", asm_out_file);
5212   else
5213     fputs ("\t.target\tsm_30\n", asm_out_file);
5214   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5215   fputs ("// END PREAMBLE\n", asm_out_file);
5216 }
5217 
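/* For a 64-bit configuration without -misa=sm_35, the emitted preamble
   reads:

     // BEGIN PREAMBLE
     .version 3.1
     .target sm_30
     .address_size 64
     // END PREAMBLE
*/
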
5218 /* Emit a declaration for a worker and vector-level buffer in .shared
5219    memory.  */
5220 
5221 static void
5222 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5223 {
5224   const char *name = XSTR (sym, 0);
5225 
5226   write_var_marker (file, true, false, name);
5227   fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5228 	   align, name, size);
5229 }
5230 
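/* For instance, a 128-byte buffer with 8-byte alignment named
   __oacc_bcast (name illustrative) is declared as:

     .shared .align 8 .u8 __oacc_bcast[128];
*/
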
5231 /* Write out the function declarations we've collected and declare storage
5232    for the broadcast buffer.  */
5233 
5234 static void
5235 nvptx_file_end (void)
5236 {
5237   hash_table<tree_hasher>::iterator iter;
5238   tree decl;
5239   FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5240     nvptx_record_fndecl (decl);
5241   fputs (func_decls.str ().c_str (), asm_out_file);
5242 
5243   if (oacc_bcast_size)
5244     write_shared_buffer (asm_out_file, oacc_bcast_sym,
5245 			 oacc_bcast_align, oacc_bcast_size);
5246 
5247   if (worker_red_size)
5248     write_shared_buffer (asm_out_file, worker_red_sym,
5249 			 worker_red_align, worker_red_size);
5250 
5251   if (vector_red_size)
5252     write_shared_buffer (asm_out_file, vector_red_sym,
5253 			 vector_red_align, vector_red_size);
5254 
5255   if (need_softstack_decl)
5256     {
5257       write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5258       /* 32 is the maximum number of warps in a block.  Even though it's an
5259          external declaration, emit the array size explicitly; otherwise, it
5260          may fail at PTX JIT time if the definition is later in link order.  */
5261       fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5262 	       POINTER_SIZE);
5263     }
5264   if (need_unisimt_decl)
5265     {
5266       write_var_marker (asm_out_file, false, true, "__nvptx_uni");
5267       fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
5268     }
5269 }
5270 
5271 /* Expander for the shuffle builtins.  */
5272 
5273 static rtx
5274 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
5275 {
5276   if (ignore)
5277     return target;
5278 
5279   rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
5280 			 NULL_RTX, mode, EXPAND_NORMAL);
5281   if (!REG_P (src))
5282     src = copy_to_mode_reg (mode, src);
5283 
5284   rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
5285 			 NULL_RTX, SImode, EXPAND_NORMAL);
5286 	rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
5287 			NULL_RTX, SImode, EXPAND_NORMAL);
5288 
5289   if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
5290     idx = copy_to_mode_reg (SImode, idx);
5291 
5292   rtx pat = nvptx_gen_shuffle (target, src, idx,
5293 			       (nvptx_shuffle_kind) INTVAL (op));
5294   if (pat)
5295     emit_insn (pat);
5296 
5297   return target;
5298 }
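
/* A minimal usage sketch of the builtin this expands (assuming the
   SHUFFLE_DOWN kind from enum nvptx_shuffle_kind):

     unsigned v = ...;
     unsigned from_above = __builtin_nvptx_shuffle (v, 1, SHUFFLE_DOWN);

   which becomes a single shfl instruction reading V from another lane
   of the warp.  */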
5299 
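/* Output the code initializing DST with the address of this function's
   vector reduction buffer partition: the partition base held in
   cfun->machine->red_partition, plus OFFSET bytes.  */
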
5300 const char *
5301 nvptx_output_red_partition (rtx dst, rtx offset)
5302 {
5303   const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
5304   const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
5305 
5306   if (offset == const0_rtx)
5307     fprintf (asm_out_file, zero_offset, REGNO (dst),
5308 	     REGNO (cfun->machine->red_partition));
5309   else
5310     fprintf (asm_out_file, with_offset, REGNO (dst),
5311 	     REGNO (cfun->machine->red_partition), UINTVAL (offset));
5312 
5313   return "";
5314 }
5315 
5316 /* Shared-memory reduction address expander.  */
5317 
5318 static rtx
5319 nvptx_expand_shared_addr (tree exp, rtx target,
5320 			  machine_mode ARG_UNUSED (mode), int ignore,
5321 			  int vector)
5322 {
5323   if (ignore)
5324     return target;
5325 
5326   unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
5327   unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
5328   unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
5329   rtx addr = worker_red_sym;
5330 
5331   if (vector)
5332     {
5333       offload_attrs oa;
5334 
5335       populate_offload_attrs (&oa);
5336 
5337       unsigned int psize = ROUND_UP (size + offset, align);
5338       unsigned int pnum = nvptx_mach_max_workers ();
5339       vector_red_partition = MAX (vector_red_partition, psize);
5340       vector_red_size = MAX (vector_red_size, psize * pnum);
5341       vector_red_align = MAX (vector_red_align, align);
5342 
5343       if (cfun->machine->red_partition == NULL)
5344 	cfun->machine->red_partition = gen_reg_rtx (Pmode);
5345 
5346       addr = gen_reg_rtx (Pmode);
5347       emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
5348     }
5349   else
5350     {
5351       worker_red_align = MAX (worker_red_align, align);
5352       worker_red_size = MAX (worker_red_size, size + offset);
5353 
5354       if (offset)
5355 	{
5356 	  addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
5357 	  addr = gen_rtx_CONST (Pmode, addr);
5358 	}
5359     }
5360 
5361   emit_move_insn (target, addr);
5362   return target;
5363 }
5364 
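/* Worked example for the vector case above (illustrative numbers): a
   slot with size 4, offset 0 and align 8 yields
   psize = ROUND_UP (4 + 0, 8) = 8, and with nvptx_mach_max_workers ()
   returning 16 the shared buffer grows to at least 8 * 16 = 128 bytes,
   one psize-sized partition per worker.  */
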
5365 /* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
5366    not require taking the address of any object, other than the memory
5367    cell being operated on.  */
5368 
5369 static rtx
5370 nvptx_expand_cmp_swap (tree exp, rtx target,
5371 		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
5372 {
5373   machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
5374 
5375   if (!target)
5376     target = gen_reg_rtx (mode);
5377 
5378   rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
5379 			 NULL_RTX, Pmode, EXPAND_NORMAL);
5380   rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
5381 			 NULL_RTX, mode, EXPAND_NORMAL);
5382   rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
5383 			 NULL_RTX, mode, EXPAND_NORMAL);
5384   rtx pat;
5385 
5386   mem = gen_rtx_MEM (mode, mem);
5387   if (!REG_P (cmp))
5388     cmp = copy_to_mode_reg (mode, cmp);
5389   if (!REG_P (src))
5390     src = copy_to_mode_reg (mode, src);
5391 
5392   if (mode == SImode)
5393     pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
5394   else
5395     pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
5396 
5397   emit_insn (pat);
5398 
5399   return target;
5400 }
5401 
5402 
5403 /* Codes for all the NVPTX builtins.  */
5404 enum nvptx_builtins
5405 {
5406   NVPTX_BUILTIN_SHUFFLE,
5407   NVPTX_BUILTIN_SHUFFLELL,
5408   NVPTX_BUILTIN_WORKER_ADDR,
5409   NVPTX_BUILTIN_VECTOR_ADDR,
5410   NVPTX_BUILTIN_CMP_SWAP,
5411   NVPTX_BUILTIN_CMP_SWAPLL,
5412   NVPTX_BUILTIN_MAX
5413 };
5414 
5415 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
5416 
5417 /* Return the NVPTX builtin for CODE.  */
5418 
5419 static tree
5420 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
5421 {
5422   if (code >= NVPTX_BUILTIN_MAX)
5423     return error_mark_node;
5424 
5425   return nvptx_builtin_decls[code];
5426 }
5427 
5428 /* Set up all builtin functions for this target.  */
5429 
5430 static void
5431 nvptx_init_builtins (void)
5432 {
5433 #define DEF(ID, NAME, T)						\
5434   (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
5435    = add_builtin_function ("__builtin_nvptx_" NAME,			\
5436 			   build_function_type_list T,			\
5437 			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
5438 #define ST sizetype
5439 #define UINT unsigned_type_node
5440 #define LLUINT long_long_unsigned_type_node
5441 #define PTRVOID ptr_type_node
5442 
5443   DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
5444   DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
5445   DEF (WORKER_ADDR, "worker_addr",
5446        (PTRVOID, ST, UINT, UINT, NULL_TREE));
5447   DEF (VECTOR_ADDR, "vector_addr",
5448        (PTRVOID, ST, UINT, UINT, NULL_TREE));
5449   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
5450   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
5451 
5452 #undef DEF
5453 #undef ST
5454 #undef UINT
5455 #undef LLUINT
5456 #undef PTRVOID
5457 }
5458 
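/* The net effect is as if the following declarations were in scope
   (sketch; size_t stands in for sizetype):

     unsigned __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
     unsigned long long __builtin_nvptx_shufflell (unsigned long long,
						   unsigned, unsigned);
     void *__builtin_nvptx_worker_addr (size_t, unsigned, unsigned);
     void *__builtin_nvptx_vector_addr (size_t, unsigned, unsigned);
     unsigned __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
     unsigned long long __builtin_nvptx_cmp_swapll (void *,
						    unsigned long long,
						    unsigned long long);
*/
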
5459 /* Expand an expression EXP that calls a built-in function,
5460    with result going to TARGET if that's convenient
5461    (and in mode MODE if that's convenient).
5462    SUBTARGET may be used as the target for computing one of EXP's operands.
5463    IGNORE is nonzero if the value is to be ignored.  */
5464 
5465 static rtx
5466 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
5467 		      machine_mode mode, int ignore)
5468 {
5469   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
5470   switch (DECL_MD_FUNCTION_CODE (fndecl))
5471     {
5472     case NVPTX_BUILTIN_SHUFFLE:
5473     case NVPTX_BUILTIN_SHUFFLELL:
5474       return nvptx_expand_shuffle (exp, target, mode, ignore);
5475 
5476     case NVPTX_BUILTIN_WORKER_ADDR:
5477       return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
5478 
5479     case NVPTX_BUILTIN_VECTOR_ADDR:
5480       return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
5481 
5482     case NVPTX_BUILTIN_CMP_SWAP:
5483     case NVPTX_BUILTIN_CMP_SWAPLL:
5484       return nvptx_expand_cmp_swap (exp, target, mode, ignore);
5485 
5486     default: gcc_unreachable ();
5487     }
5488 }
5489 
5490 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
5491 
5492 static int
5493 nvptx_simt_vf ()
5494 {
5495   return PTX_WARP_SIZE;
5496 }
5497 
5498 /* Return 1 if TRAIT NAME is present in the OpenMP context's
5499    device trait set, return 0 if not present in any OpenMP context in the
5500    whole translation unit, or -1 if not present in the current OpenMP context
5501    but might be present in another OpenMP context in the same TU.  */
5502 
5503 int
5504 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
5505 				const char *name)
5506 {
5507   switch (trait)
5508     {
5509     case omp_device_kind:
5510       return strcmp (name, "gpu") == 0;
5511     case omp_device_arch:
5512       return strcmp (name, "nvptx") == 0;
5513     case omp_device_isa:
5514       if (strcmp (name, "sm_30") == 0)
5515 	return !TARGET_SM35;
5516       if (strcmp (name, "sm_35") == 0)
5517 	return TARGET_SM35;
5518       return 0;
5519     default:
5520       gcc_unreachable ();
5521     }
5522 }
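
/* For example (illustrative), an OpenMP context selector such as
   device={arch("nvptx")} matches the arch trait above, while
   device={isa("sm_35")} matches only when compiling with
   -misa=sm_35.  */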
5523 
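/* Return true iff the vector length L is usable on this target:
   positive and a whole number of warps.  */
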
5524 static bool
5525 nvptx_wellformed_vector_length_p (int l)
5526 {
5527   gcc_assert (l > 0);
5528   return l % PTX_WARP_SIZE == 0;
5529 }
5530 
5531 static void
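/* Clamp the per-axis launch dimensions DIMS to the limits of this
   target: vector length, worker count, CTA size and barrier count.  */
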
5532 nvptx_apply_dim_limits (int dims[])
5533 {
5534   /* Check that the vector_length is not too large.  */
5535   if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
5536     dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
5537 
5538   /* Check that the number of workers is not too large.  */
5539   if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
5540     dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
5541 
5542   /* Ensure that num_worker * vector_length <= cta size.  */
5543   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5544       && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
5545     dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5546 
5547   /* If we need a per-worker barrier ...  */
5548   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
5549       && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5550     /* Don't use more barriers than available.  */
5551     dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
5552 				 PTX_NUM_PER_WORKER_BARRIERS);
5553 }
5554 
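/* For instance (illustrative), dims = { gangs, 32 workers, 64 vector }
   has 32 * 64 = 2048 > PTX_CTA_SIZE, so the vector length is reduced
   to PTX_WARP_SIZE (32); had the vector length stayed above a warp,
   the worker count would additionally be capped at the 15 available
   per-worker barriers (PTX_NUM_PER_WORKER_BARRIERS).  */
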
5555 /* Return true if FNDECL contains calls to vector-partitionable routines.  */
5556 
5557 static bool
5558 has_vector_partitionable_routine_calls_p (tree fndecl)
5559 {
5560   if (!fndecl)
5561     return false;
5562 
5563   basic_block bb;
5564   FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
5565     for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
5566 	 gsi_next_nondebug (&i))
5567       {
5568 	gimple *stmt = gsi_stmt (i);
5569 	if (gimple_code (stmt) != GIMPLE_CALL)
5570 	  continue;
5571 
5572 	tree callee = gimple_call_fndecl (stmt);
5573 	if (!callee)
5574 	  continue;
5575 
5576 	tree attrs = oacc_get_fn_attrib (callee);
5577 	if (attrs == NULL_TREE)
5578 	  return false;
5579 
5580 	int partition_level = oacc_fn_attrib_level (attrs);
5581 	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
5582 	if (!seq_routine_p)
5583 	  return true;
5584       }
5585 
5586   return false;
5587 }
5588 
5589 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
5590    DIMS has changed.  */
5591 
5592 static void
5593 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
5594 {
5595   bool oacc_default_dims_p = false;
5596   bool oacc_min_dims_p = false;
5597   bool offload_region_p = false;
5598   bool routine_p = false;
5599   bool routine_seq_p = false;
5600   int default_vector_length = -1;
5601 
5602   if (decl == NULL_TREE)
5603     {
5604       if (fn_level == -1)
5605 	oacc_default_dims_p = true;
5606       else if (fn_level == -2)
5607 	oacc_min_dims_p = true;
5608       else
5609 	gcc_unreachable ();
5610     }
5611   else if (fn_level == -1)
5612     offload_region_p = true;
5613   else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
5614     {
5615       routine_p = true;
5616       routine_seq_p = fn_level == GOMP_DIM_MAX;
5617     }
5618   else
5619     gcc_unreachable ();
5620 
5621   if (oacc_min_dims_p)
5622     {
5623       gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
5624       gcc_assert (dims[GOMP_DIM_WORKER] == 1);
5625       gcc_assert (dims[GOMP_DIM_GANG] == 1);
5626 
5627       dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5628       return;
5629     }
5630 
5631   if (routine_p)
5632     {
5633       if (!routine_seq_p)
5634 	dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5635 
5636       return;
5637     }
5638 
5639   if (oacc_default_dims_p)
5640     {
5641       /* -1  : not set
5642 	  0  : set at runtime, f.i. -fopenacc-dims=-
5643          >= 1: set at compile time, f.i. -fopenacc-dims=1.  */
5644       gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5645       gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5646       gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5647 
5648       /* But -fopenacc-dims=- is not yet supported on trunk.  */
5649       gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5650       gcc_assert (dims[GOMP_DIM_WORKER] != 0);
5651       gcc_assert (dims[GOMP_DIM_GANG] != 0);
5652     }
5653 
5654   if (offload_region_p)
5655     {
5656       /* -1   : not set
5657 	  0   : set using variable, f.i. num_gangs (n)
5658 	  >= 1: set using constant, f.i. num_gangs (1).  */
5659       gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
5660       gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
5661       gcc_assert (dims[GOMP_DIM_GANG] >= -1);
5662     }
5663 
5664   if (offload_region_p)
5665     default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
5666   else
5667     /* oacc_default_dims_p.  */
5668     default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
5669 
5670   int old_dims[GOMP_DIM_MAX];
5671   unsigned int i;
5672   for (i = 0; i < GOMP_DIM_MAX; ++i)
5673     old_dims[i] = dims[i];
5674 
5675   const char *vector_reason = NULL;
5676   if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
5677     {
5678       default_vector_length = PTX_WARP_SIZE;
5679 
5680       if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
5681 	{
5682 	  vector_reason = G_("using vector_length (%d) due to call to"
5683 			     " vector-partitionable routine, ignoring %d");
5684 	  dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
5685 	}
5686     }
5687 
5688   if (dims[GOMP_DIM_VECTOR] == 0)
5689     {
5690       vector_reason = G_("using vector_length (%d), ignoring runtime setting");
5691       dims[GOMP_DIM_VECTOR] = default_vector_length;
5692     }
5693 
5694   if (dims[GOMP_DIM_VECTOR] > 0
5695       && !nvptx_wellformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
5696     dims[GOMP_DIM_VECTOR] = default_vector_length;
5697 
5698   nvptx_apply_dim_limits (dims);
5699 
5700   if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
5701     warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5702 		vector_reason != NULL
5703 		? vector_reason
5704 		: G_("using vector_length (%d), ignoring %d"),
5705 		dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
5706 
5707   if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
5708     warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
5709 		G_("using num_workers (%d), ignoring %d"),
5710 		dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
5711 
5712   if (oacc_default_dims_p)
5713     {
5714       if (dims[GOMP_DIM_VECTOR] < 0)
5715 	dims[GOMP_DIM_VECTOR] = default_vector_length;
5716       if (dims[GOMP_DIM_WORKER] < 0)
5717 	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
5718       if (dims[GOMP_DIM_GANG] < 0)
5719 	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
5720       nvptx_apply_dim_limits (dims);
5721     }
5722 
5723   if (offload_region_p)
5724     {
5725       for (i = 0; i < GOMP_DIM_MAX; i++)
5726 	{
5727 	  if (!(dims[i] < 0))
5728 	    continue;
5729 
5730 	  if ((used & GOMP_DIM_MASK (i)) == 0)
5731 	    /* Function oacc_validate_dims will apply the minimal dimension.  */
5732 	    continue;
5733 
5734 	  dims[i] = (i == GOMP_DIM_VECTOR
5735 		     ? default_vector_length
5736 		     : oacc_get_default_dim (i));
5737 	}
5738 
5739       nvptx_apply_dim_limits (dims);
5740     }
5741 }
5742 
5743 /* Validate compute dimensions of an OpenACC offload or routine, fill
5744    in non-unity defaults.  FN_LEVEL indicates the level at which a
5745    routine might spawn a loop.  It is negative for non-routines.  If
5746    DECL is null, we are validating the default dimensions.  */
5747 
5748 static bool
5749 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
5750 {
5751   int old_dims[GOMP_DIM_MAX];
5752   unsigned int i;
5753 
5754   for (i = 0; i < GOMP_DIM_MAX; ++i)
5755     old_dims[i] = dims[i];
5756 
5757   nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
5758 
5759   gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
5760   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
5761     gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
5762 
5763   for (i = 0; i < GOMP_DIM_MAX; ++i)
5764     if (old_dims[i] != dims[i])
5765       return true;
5766 
5767   return false;
5768 }
5769 
5770 /* Return maximum dimension size, or zero for unbounded.  */
5771 
5772 static int
5773 nvptx_dim_limit (int axis)
5774 {
5775   switch (axis)
5776     {
5777     case GOMP_DIM_VECTOR:
5778       return PTX_MAX_VECTOR_LENGTH;
5779 
5780     default:
5781       break;
5782     }
5783   return 0;
5784 }
5785 
5786 /* Determine whether fork & joins are needed.  */
5787 
5788 static bool
5789 nvptx_goacc_fork_join (gcall *call, const int dims[],
5790 		       bool ARG_UNUSED (is_fork))
5791 {
5792   tree arg = gimple_call_arg (call, 2);
5793   unsigned axis = TREE_INT_CST_LOW (arg);
5794 
5795   /* We only care about worker and vector partitioning.  */
5796   if (axis < GOMP_DIM_WORKER)
5797     return false;
5798 
5799   /* If the size is 1, there's no partitioning.  */
5800   if (dims[axis] == 1)
5801     return false;
5802 
5803   return true;
5804 }
5805 
5806 /* Generate a PTX builtin function call that returns the address in
5807    the worker (or, when VECTOR is true, vector) reduction buffer at
5808    OFFSET.  TYPE is the type of the data at that location.  */
5809 
5810 static tree
5811 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
5812 {
5813   enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
5814   if (vector)
5815     addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
5816   machine_mode mode = TYPE_MODE (type);
5817   tree fndecl = nvptx_builtin_decl (addr_dim, true);
5818   tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
5819   tree align = build_int_cst (unsigned_type_node,
5820 			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
5821   tree call = build_call_expr (fndecl, 3, offset, size, align);
5822 
5823   return fold_convert (build_pointer_type (type), call);
5824 }
5825 
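/* The returned tree is equivalent to (sketch, for an accumulator of
   some type T):

     (T *) __builtin_nvptx_worker_addr (offset, sizeof (T),
					__alignof__ (T));

   with __builtin_nvptx_vector_addr used instead when VECTOR is set.  */
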
5826 /* Emit a SHFL.DOWN using lane offset SHIFT of VAR into DEST_VAR.  This
5827    function will cast the variable if necessary.  */
5828 
5829 static void
5830 nvptx_generate_vector_shuffle (location_t loc,
5831 			       tree dest_var, tree var, unsigned shift,
5832 			       gimple_seq *seq)
5833 {
5834   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
5835   tree_code code = NOP_EXPR;
5836   tree arg_type = unsigned_type_node;
5837   tree var_type = TREE_TYPE (var);
5838   tree dest_type = var_type;
5839 
5840   if (TREE_CODE (var_type) == COMPLEX_TYPE)
5841     var_type = TREE_TYPE (var_type);
5842 
5843   if (TREE_CODE (var_type) == REAL_TYPE)
5844     code = VIEW_CONVERT_EXPR;
5845 
5846   if (TYPE_SIZE (var_type)
5847       == TYPE_SIZE (long_long_unsigned_type_node))
5848     {
5849       fn = NVPTX_BUILTIN_SHUFFLELL;
5850       arg_type = long_long_unsigned_type_node;
5851     }
5852 
5853   tree call = nvptx_builtin_decl (fn, true);
5854   tree bits = build_int_cst (unsigned_type_node, shift);
5855   tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
5856   tree expr;
5857 
5858   if (var_type != dest_type)
5859     {
5860       /* Do real and imaginary parts separately.  */
5861       tree real = fold_build1 (REALPART_EXPR, var_type, var);
5862       real = fold_build1 (code, arg_type, real);
5863       real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5864       real = fold_build1 (code, var_type, real);
5865 
5866       tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5867       imag = fold_build1 (code, arg_type, imag);
5868       imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5869       imag = fold_build1 (code, var_type, imag);
5870 
5871       expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5872     }
5873   else
5874     {
5875       expr = fold_build1 (code, arg_type, var);
5876       expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5877       expr = fold_build1 (code, dest_type, expr);
5878     }
5879 
5880   gimplify_assign (dest_var, expr, seq);
5881 }
5882 
5883 /* Lazily generate the global lock var decl and return its address.  */
5884 
5885 static tree
5886 nvptx_global_lock_addr ()
5887 {
5888   tree v = global_lock_var;
5889 
5890   if (!v)
5891     {
5892       tree name = get_identifier ("__reduction_lock");
5893       tree type = build_qualified_type (unsigned_type_node,
5894 					TYPE_QUAL_VOLATILE);
5895       v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5896       global_lock_var = v;
5897       DECL_ARTIFICIAL (v) = 1;
5898       DECL_EXTERNAL (v) = 1;
5899       TREE_STATIC (v) = 1;
5900       TREE_PUBLIC (v) = 1;
5901       TREE_USED (v) = 1;
5902       mark_addressable (v);
5903       mark_decl_referenced (v);
5904     }
5905 
5906   return build_fold_addr_expr (v);
5907 }
5908 
5909 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5910    GSI.  We use a lockless scheme for nearly all cases, which looks
5911    like:
5912      actual = initval(OP);
5913      do {
5914        guess = actual;
5915        write = guess OP myval;
5916        actual = cmp&swap (ptr, guess, write)
5917      } while (actual bit-different-to guess);
5918    return write;
5919 
5920    This relies on a cmp&swap instruction, which is available for 32-
5921    and 64-bit types.  Larger types must use a locking scheme.  */
5922 
5923 static tree
5924 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5925 		       tree ptr, tree var, tree_code op)
5926 {
5927   unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5928   tree_code code = NOP_EXPR;
5929   tree arg_type = unsigned_type_node;
5930   tree var_type = TREE_TYPE (var);
5931 
5932   if (TREE_CODE (var_type) == COMPLEX_TYPE
5933       || TREE_CODE (var_type) == REAL_TYPE)
5934     code = VIEW_CONVERT_EXPR;
5935 
5936   if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5937     {
5938       arg_type = long_long_unsigned_type_node;
5939       fn = NVPTX_BUILTIN_CMP_SWAPLL;
5940     }
5941 
5942   tree swap_fn = nvptx_builtin_decl (fn, true);
5943 
5944   gimple_seq init_seq = NULL;
5945   tree init_var = make_ssa_name (arg_type);
5946   tree init_expr = omp_reduction_init_op (loc, op, var_type);
5947   init_expr = fold_build1 (code, arg_type, init_expr);
5948   gimplify_assign (init_var, init_expr, &init_seq);
5949   gimple *init_end = gimple_seq_last (init_seq);
5950 
5951   gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5952 
5953   /* Split the block just after the init stmts.  */
5954   basic_block pre_bb = gsi_bb (*gsi);
5955   edge pre_edge = split_block (pre_bb, init_end);
5956   basic_block loop_bb = pre_edge->dest;
5957   pre_bb = pre_edge->src;
5958   /* Reset the iterator.  */
5959   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5960 
5961   tree expect_var = make_ssa_name (arg_type);
5962   tree actual_var = make_ssa_name (arg_type);
5963   tree write_var = make_ssa_name (arg_type);
5964 
5965   /* Build and insert the reduction calculation.  */
5966   gimple_seq red_seq = NULL;
5967   tree write_expr = fold_build1 (code, var_type, expect_var);
5968   write_expr = fold_build2 (op, var_type, write_expr, var);
5969   write_expr = fold_build1 (code, arg_type, write_expr);
5970   gimplify_assign (write_var, write_expr, &red_seq);
5971 
5972   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
5973 
5974   /* Build & insert the cmp&swap sequence.  */
5975   gimple_seq latch_seq = NULL;
5976   tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
5977 					ptr, expect_var, write_var);
5978   gimplify_assign (actual_var, swap_expr, &latch_seq);
5979 
5980   gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
5981 				   NULL_TREE, NULL_TREE);
5982   gimple_seq_add_stmt (&latch_seq, cond);
5983 
5984   gimple *latch_end = gimple_seq_last (latch_seq);
5985   gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
5986 
5987   /* Split the block just after the latch stmts.  */
5988   edge post_edge = split_block (loop_bb, latch_end);
5989   basic_block post_bb = post_edge->dest;
5990   loop_bb = post_edge->src;
5991   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5992 
5993   post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
5994   post_edge->probability = profile_probability::even ();
5995   edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
5996   loop_edge->probability = profile_probability::even ();
5997   set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
5998   set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
5999 
6000   gphi *phi = create_phi_node (expect_var, loop_bb);
6001   add_phi_arg (phi, init_var, pre_edge, loc);
6002   add_phi_arg (phi, actual_var, loop_edge, loc);
6003 
6004   loop *loop = alloc_loop ();
6005   loop->header = loop_bb;
6006   loop->latch = loop_bb;
6007   add_loop (loop, loop_bb->loop_father);
6008 
6009   return fold_build1 (code, var_type, write_var);
6010 }
6011 
6012 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
6013    GSI.  This is necessary for types larger than 64 bits, where there
6014    is no cmp&swap instruction to implement a lockless scheme.  We use
6015    a lock variable in global memory.
6016 
6017    while (cmp&swap (&lock_var, 0, 1))
6018      continue;
6019    T accum = *ptr;
6020    accum = accum OP var;
6021    *ptr = accum;
6022    cmp&swap (&lock_var, 1, 0);
6023    return accum;
6024 
6025    A lock in global memory is necessary to force execution engine
6026    descheduling and avoid resource starvation that can occur if the
6027    lock is in .shared memory.  */
6028 
6029 static tree
6030 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6031 		       tree ptr, tree var, tree_code op)
6032 {
6033   tree var_type = TREE_TYPE (var);
6034   tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6035   tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6036   tree uns_locked = build_int_cst (unsigned_type_node, 1);
6037 
6038   /* Split the block just before the gsi.  Insert a gimple nop to make
6039      this easier.  */
6040   gimple *nop = gimple_build_nop ();
6041   gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6042   basic_block entry_bb = gsi_bb (*gsi);
6043   edge entry_edge = split_block (entry_bb, nop);
6044   basic_block lock_bb = entry_edge->dest;
6045   /* Reset the iterator.  */
6046   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6047 
6048   /* Build and insert the locking sequence.  */
6049   gimple_seq lock_seq = NULL;
6050   tree lock_var = make_ssa_name (unsigned_type_node);
6051   tree lock_expr = nvptx_global_lock_addr ();
6052   lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6053 				   uns_unlocked, uns_locked);
6054   gimplify_assign (lock_var, lock_expr, &lock_seq);
6055   gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6056 				   NULL_TREE, NULL_TREE);
6057   gimple_seq_add_stmt (&lock_seq, cond);
6058   gimple *lock_end = gimple_seq_last (lock_seq);
6059   gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6060 
6061   /* Split the block just after the lock sequence.  */
6062   edge locked_edge = split_block (lock_bb, lock_end);
6063   basic_block update_bb = locked_edge->dest;
6064   lock_bb = locked_edge->src;
6065   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6066 
6067   /* Create the lock loop ... */
6068   locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6069   locked_edge->probability = profile_probability::even ();
6070   edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6071   loop_edge->probability = profile_probability::even ();
6072   set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6073   set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6074 
6075   /* ... and the loop structure.  */
6076   loop *lock_loop = alloc_loop ();
6077   lock_loop->header = lock_bb;
6078   lock_loop->latch = lock_bb;
6079   lock_loop->nb_iterations_estimate = 1;
6080   lock_loop->any_estimate = true;
6081   add_loop (lock_loop, entry_bb->loop_father);
6082 
6083   /* Build and insert the reduction calculation.  */
6084   gimple_seq red_seq = NULL;
6085   tree acc_in = make_ssa_name (var_type);
6086   tree ref_in = build_simple_mem_ref (ptr);
6087   TREE_THIS_VOLATILE (ref_in) = 1;
6088   gimplify_assign (acc_in, ref_in, &red_seq);
6089 
6090   tree acc_out = make_ssa_name (var_type);
6091   tree update_expr = fold_build2 (op, var_type, ref_in, var);
6092   gimplify_assign (acc_out, update_expr, &red_seq);
6093 
6094   tree ref_out = build_simple_mem_ref (ptr);
6095   TREE_THIS_VOLATILE (ref_out) = 1;
6096   gimplify_assign (ref_out, acc_out, &red_seq);
6097 
6098   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6099 
6100   /* Build & insert the unlock sequence.  */
6101   gimple_seq unlock_seq = NULL;
6102   tree unlock_expr = nvptx_global_lock_addr ();
6103   unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6104 				     uns_locked, uns_unlocked);
6105   gimplify_and_add (unlock_expr, &unlock_seq);
6106   gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6107 
6108   return acc_out;
6109 }
6110 
6111 /* Emit a sequence to update a reduction accumulator at *PTR with the
6112    value held in VAR using operator OP.  Return the updated value.
6113 
6114    TODO: optimize for atomic ops and independent complex ops.  */
6115 
6116 static tree
6117 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6118 			tree ptr, tree var, tree_code op)
6119 {
6120   tree type = TREE_TYPE (var);
6121   tree size = TYPE_SIZE (type);
6122 
6123   if (size == TYPE_SIZE (unsigned_type_node)
6124       || size == TYPE_SIZE (long_long_unsigned_type_node))
6125     return nvptx_lockless_update (loc, gsi, ptr, var, op);
6126   else
6127     return nvptx_lockfull_update (loc, gsi, ptr, var, op);
6128 }
6129 
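/* E.g. int, float and double accumulators (32 or 64 bits) take the
   lockless path; a 16-byte accumulator such as complex double must
   take the lock-based path.  */
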
6130 /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
6131 
6132 static void
6133 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
6134 {
6135   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6136   tree lhs = gimple_call_lhs (call);
6137   tree var = gimple_call_arg (call, 2);
6138   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6139   gimple_seq seq = NULL;
6140 
6141   push_gimplify_context (true);
6142 
6143   if (level != GOMP_DIM_GANG)
6144     {
6145       /* Copy the receiver object.  */
6146       tree ref_to_res = gimple_call_arg (call, 1);
6147 
6148       if (!integer_zerop (ref_to_res))
6149 	var = build_simple_mem_ref (ref_to_res);
6150     }
6151 
6152   if (level == GOMP_DIM_WORKER
6153       || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6154     {
6155       /* Store incoming value to worker reduction buffer.  */
6156       tree offset = gimple_call_arg (call, 5);
6157       tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6158 					     level == GOMP_DIM_VECTOR);
6159       tree ptr = make_ssa_name (TREE_TYPE (call));
6160 
6161       gimplify_assign (ptr, call, &seq);
6162       tree ref = build_simple_mem_ref (ptr);
6163       TREE_THIS_VOLATILE (ref) = 1;
6164       gimplify_assign (ref, var, &seq);
6165     }
6166 
6167   if (lhs)
6168     gimplify_assign (lhs, var, &seq);
6169 
6170   pop_gimplify_context (NULL);
6171   gsi_replace_with_seq (&gsi, seq, true);
6172 }
6173 
6174 /* NVPTX implementation of GOACC_REDUCTION_INIT.  */
6175 
6176 static void
6177 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
6178 {
6179   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6180   tree lhs = gimple_call_lhs (call);
6181   tree var = gimple_call_arg (call, 2);
6182   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6183   enum tree_code rcode
6184     = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6185   tree init = omp_reduction_init_op (gimple_location (call), rcode,
6186 				     TREE_TYPE (var));
6187   gimple_seq seq = NULL;
6188 
6189   push_gimplify_context (true);
6190 
6191   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6192     {
6193       /* Initialize the non-zero vector lanes to INIT_VAL (OP).  */
6194       tree tid = make_ssa_name (integer_type_node);
6195       tree dim_vector = gimple_call_arg (call, 3);
6196       gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6197 						     dim_vector);
6198       gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6199 					     NULL_TREE, NULL_TREE);
6200 
6201       gimple_call_set_lhs (tid_call, tid);
6202       gimple_seq_add_stmt (&seq, tid_call);
6203       gimple_seq_add_stmt (&seq, cond_stmt);
6204 
6205       /* Split the block just after the call.  */
6206       edge init_edge = split_block (gsi_bb (gsi), call);
6207       basic_block init_bb = init_edge->dest;
6208       basic_block call_bb = init_edge->src;
6209 
6210       /* Fixup flags from call_bb to init_bb.  */
6211       init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
6212       init_edge->probability = profile_probability::even ();
6213 
6214       /* Set the initialization stmts.  */
6215       gimple_seq init_seq = NULL;
6216       tree init_var = make_ssa_name (TREE_TYPE (var));
6217       gimplify_assign (init_var, init, &init_seq);
6218       gsi = gsi_start_bb (init_bb);
6219       gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6220 
6221       /* Split block just after the init stmt.  */
6222       gsi_prev (&gsi);
6223       edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6224       basic_block dst_bb = inited_edge->dest;
6225 
6226       /* Create false edge from call_bb to dst_bb.  */
6227       edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
6228       nop_edge->probability = profile_probability::even ();
6229 
6230       /* Create phi node in dst block.  */
6231       gphi *phi = create_phi_node (lhs, dst_bb);
6232       add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6233       add_phi_arg (phi, var, nop_edge, gimple_location (call));
6234 
6235       /* Reset dominator of dst bb.  */
6236       set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
6237 
6238       /* Reset the gsi.  */
6239       gsi = gsi_for_stmt (call);
6240     }
6241   else
6242     {
6243       if (level == GOMP_DIM_GANG)
6244 	{
6245 	  /* If there's no receiver object, propagate the incoming VAR.  */
6246 	  tree ref_to_res = gimple_call_arg (call, 1);
6247 	  if (integer_zerop (ref_to_res))
6248 	    init = var;
6249 	}
6250 
6251       if (lhs != NULL_TREE)
6252 	gimplify_assign (lhs, init, &seq);
6253     }
6254 
6255   pop_gimplify_context (NULL);
6256   gsi_replace_with_seq (&gsi, seq, true);
6257 }
6258 
6259 /* NVPTX implementation of GOACC_REDUCTION_FINI.  */
6260 
6261 static void
6262 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
6263 {
6264   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6265   tree lhs = gimple_call_lhs (call);
6266   tree ref_to_res = gimple_call_arg (call, 1);
6267   tree var = gimple_call_arg (call, 2);
6268   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6269   enum tree_code op
6270     = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6271   gimple_seq seq = NULL;
6272   tree r = NULL_TREE;
6273 
6274   push_gimplify_context (true);
6275 
6276   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6277     {
6278       /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
6279 	 but that requires a method of emitting a unified jump at the
6280 	 gimple level.  */
6281       for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
6282 	{
6283 	  tree other_var = make_ssa_name (TREE_TYPE (var));
6284 	  nvptx_generate_vector_shuffle (gimple_location (call),
6285 					 other_var, var, shfl, &seq);
6286 
6287 	  r = make_ssa_name (TREE_TYPE (var));
6288 	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
6289 					   var, other_var), &seq);
6290 	  var = r;
6291 	}
6292     }
6293   else
6294     {
6295       tree accum = NULL_TREE;
6296 
6297       if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
6298 	{
6299 	  /* Get reduction buffer address.  */
6300 	  tree offset = gimple_call_arg (call, 5);
6301 	  tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6302 						 level == GOMP_DIM_VECTOR);
6303 	  tree ptr = make_ssa_name (TREE_TYPE (call));
6304 
6305 	  gimplify_assign (ptr, call, &seq);
6306 	  accum = ptr;
6307 	}
6308       else if (integer_zerop (ref_to_res))
6309 	r = var;
6310       else
6311 	accum = ref_to_res;
6312 
6313       if (accum)
6314 	{
6315 	  /* UPDATE the accumulator.  */
6316 	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
6317 	  seq = NULL;
6318 	  r = nvptx_reduction_update (gimple_location (call), &gsi,
6319 				      accum, var, op);
6320 	}
6321     }
6322 
6323   if (lhs)
6324     gimplify_assign (lhs, r, &seq);
6325   pop_gimplify_context (NULL);
6326 
6327   gsi_replace_with_seq (&gsi, seq, true);
6328 }
6329 
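/* For the warp-sized vector case above, the emitted sequence is the
   classic log2 reduction tree (sketch):

     other = shuffle_down (var, 16);  var = var OP other;
     other = shuffle_down (var, 8);   var = var OP other;
     ...
     other = shuffle_down (var, 1);   var = var OP other;

   after which lane 0 holds the reduction over the whole warp.  */
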
6330 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */
6331 
6332 static void
6333 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
6334 {
6335   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6336   tree lhs = gimple_call_lhs (call);
6337   tree var = gimple_call_arg (call, 2);
6338   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6339   gimple_seq seq = NULL;
6340 
6341   push_gimplify_context (true);
6342   if (level == GOMP_DIM_WORKER
6343       || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6344     {
6345       /* Read the worker reduction buffer.  */
6346       tree offset = gimple_call_arg (call, 5);
6347       tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6348 					     level == GOMP_DIM_VECTOR);
6349       tree ptr = make_ssa_name (TREE_TYPE (call));
6350 
6351       gimplify_assign (ptr, call, &seq);
6352       var = build_simple_mem_ref (ptr);
6353       TREE_THIS_VOLATILE (var) = 1;
6354     }
6355 
6356   if (level != GOMP_DIM_GANG)
6357     {
6358       /* Write to the receiver object.  */
6359       tree ref_to_res = gimple_call_arg (call, 1);
6360 
6361       if (!integer_zerop (ref_to_res))
6362 	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
6363     }
6364 
6365   if (lhs)
6366     gimplify_assign (lhs, var, &seq);
6367 
6368   pop_gimplify_context (NULL);
6369 
6370   gsi_replace_with_seq (&gsi, seq, true);
6371 }
6372 
6373 /* NVPTX reduction expander.  */
6374 
6375 static void
6376 nvptx_goacc_reduction (gcall *call)
6377 {
6378   unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
6379   offload_attrs oa;
6380 
6381   populate_offload_attrs (&oa);
6382 
6383   switch (code)
6384     {
6385     case IFN_GOACC_REDUCTION_SETUP:
6386       nvptx_goacc_reduction_setup (call, &oa);
6387       break;
6388 
6389     case IFN_GOACC_REDUCTION_INIT:
6390       nvptx_goacc_reduction_init (call, &oa);
6391       break;
6392 
6393     case IFN_GOACC_REDUCTION_FINI:
6394       nvptx_goacc_reduction_fini (call, &oa);
6395       break;
6396 
6397     case IFN_GOACC_REDUCTION_TEARDOWN:
6398       nvptx_goacc_reduction_teardown (call, &oa);
6399       break;
6400 
6401     default:
6402       gcc_unreachable ();
6403     }
6404 }
6405 
6406 static bool
6407 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
6408 			      rtx x ATTRIBUTE_UNUSED)
6409 {
6410   return true;
6411 }
6412 
6413 static bool
6414 nvptx_vector_mode_supported (machine_mode mode)
6415 {
6416   return (mode == V2SImode
6417 	  || mode == V2DImode);
6418 }
6419 
6420 /* Return the preferred mode for vectorizing scalar MODE.  */
6421 
6422 static machine_mode
6423 nvptx_preferred_simd_mode (scalar_mode mode)
6424 {
6425   switch (mode)
6426     {
6427     case E_DImode:
6428       return V2DImode;
6429     case E_SImode:
6430       return V2SImode;
6431 
6432     default:
6433       return default_preferred_simd_mode (mode);
6434     }
6435 }
6436 
6437 unsigned int
6438 nvptx_data_alignment (const_tree type, unsigned int basic_align)
6439 {
6440   if (TREE_CODE (type) == INTEGER_TYPE)
6441     {
6442       unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
6443       if (size == GET_MODE_SIZE (TImode))
6444 	return GET_MODE_BITSIZE (maybe_split_mode (TImode));
6445     }
6446 
6447   return basic_align;
6448 }
6449 
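/* E.g. a 128-bit integer (TImode) is accessed as two 64-bit halves on
   this target, so it is aligned to 64 bits, the bitsize of the split
   mode, rather than to its full 128-bit size.  */
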
6450 /* Implement TARGET_MODES_TIEABLE_P.  */
6451 
6452 static bool
6453 nvptx_modes_tieable_p (machine_mode, machine_mode)
6454 {
6455   return false;
6456 }
6457 
6458 /* Implement TARGET_HARD_REGNO_NREGS.  */
6459 
6460 static unsigned int
6461 nvptx_hard_regno_nregs (unsigned int, machine_mode)
6462 {
6463   return 1;
6464 }
6465 
6466 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
6467 
6468 static bool
6469 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
6470 {
6471   return false;
6472 }
6473 
6474 static GTY(()) tree nvptx_previous_fndecl;
6475 
6476 static void
6477 nvptx_set_current_function (tree fndecl)
6478 {
6479   if (!fndecl || fndecl == nvptx_previous_fndecl)
6480     return;
6481 
6482   nvptx_previous_fndecl = fndecl;
6483   vector_red_partition = 0;
6484   oacc_bcast_partition = 0;
6485 }
6486 
6487 #undef TARGET_OPTION_OVERRIDE
6488 #define TARGET_OPTION_OVERRIDE nvptx_option_override
6489 
6490 #undef TARGET_ATTRIBUTE_TABLE
6491 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
6492 
6493 #undef TARGET_LRA_P
6494 #define TARGET_LRA_P hook_bool_void_false
6495 
6496 #undef TARGET_LEGITIMATE_ADDRESS_P
6497 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
6498 
6499 #undef  TARGET_PROMOTE_FUNCTION_MODE
6500 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
6501 
6502 #undef TARGET_FUNCTION_ARG
6503 #define TARGET_FUNCTION_ARG nvptx_function_arg
6504 #undef TARGET_FUNCTION_INCOMING_ARG
6505 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
6506 #undef TARGET_FUNCTION_ARG_ADVANCE
6507 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
6508 #undef TARGET_FUNCTION_ARG_BOUNDARY
6509 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
6510 #undef TARGET_PASS_BY_REFERENCE
6511 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
6512 #undef TARGET_FUNCTION_VALUE_REGNO_P
6513 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
6514 #undef TARGET_FUNCTION_VALUE
6515 #define TARGET_FUNCTION_VALUE nvptx_function_value
6516 #undef TARGET_LIBCALL_VALUE
6517 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
6518 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
6519 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
6520 #undef TARGET_GET_DRAP_RTX
6521 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
6522 #undef TARGET_SPLIT_COMPLEX_ARG
6523 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
6524 #undef TARGET_RETURN_IN_MEMORY
6525 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
6526 #undef TARGET_OMIT_STRUCT_RETURN_REG
6527 #define TARGET_OMIT_STRUCT_RETURN_REG true
6528 #undef TARGET_STRICT_ARGUMENT_NAMING
6529 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
6530 #undef TARGET_CALL_ARGS
6531 #define TARGET_CALL_ARGS nvptx_call_args
6532 #undef TARGET_END_CALL_ARGS
6533 #define TARGET_END_CALL_ARGS nvptx_end_call_args
6534 
6535 #undef TARGET_ASM_FILE_START
6536 #define TARGET_ASM_FILE_START nvptx_file_start
6537 #undef TARGET_ASM_FILE_END
6538 #define TARGET_ASM_FILE_END nvptx_file_end
6539 #undef TARGET_ASM_GLOBALIZE_LABEL
6540 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
6541 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
6542 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
6543 #undef  TARGET_PRINT_OPERAND
6544 #define TARGET_PRINT_OPERAND nvptx_print_operand
6545 #undef  TARGET_PRINT_OPERAND_ADDRESS
6546 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
6547 #undef  TARGET_PRINT_OPERAND_PUNCT_VALID_P
6548 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
6549 #undef TARGET_ASM_INTEGER
6550 #define TARGET_ASM_INTEGER nvptx_assemble_integer
6551 #undef TARGET_ASM_DECL_END
6552 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
6553 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
6554 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
6555 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
6556 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
6557 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
6558 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
6559 
6560 #undef TARGET_MACHINE_DEPENDENT_REORG
6561 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
6562 #undef TARGET_NO_REGISTER_ALLOCATION
6563 #define TARGET_NO_REGISTER_ALLOCATION true
6564 
6565 #undef TARGET_ENCODE_SECTION_INFO
6566 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
6567 #undef TARGET_RECORD_OFFLOAD_SYMBOL
6568 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
6569 
6570 #undef TARGET_VECTOR_ALIGNMENT
6571 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
6572 
6573 #undef TARGET_CANNOT_COPY_INSN_P
6574 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
6575 
6576 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
6577 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
6578 
6579 #undef TARGET_INIT_BUILTINS
6580 #define TARGET_INIT_BUILTINS nvptx_init_builtins
6581 #undef TARGET_EXPAND_BUILTIN
6582 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
6583 #undef  TARGET_BUILTIN_DECL
6584 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
6585 
6586 #undef TARGET_SIMT_VF
6587 #define TARGET_SIMT_VF nvptx_simt_vf
6588 
6589 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
6590 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
6591 
6592 #undef TARGET_GOACC_VALIDATE_DIMS
6593 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
6594 
6595 #undef TARGET_GOACC_DIM_LIMIT
6596 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
6597 
6598 #undef TARGET_GOACC_FORK_JOIN
6599 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
6600 
6601 #undef TARGET_GOACC_REDUCTION
6602 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
6603 
6604 #undef TARGET_CANNOT_FORCE_CONST_MEM
6605 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
6606 
6607 #undef TARGET_VECTOR_MODE_SUPPORTED_P
6608 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
6609 
6610 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
6611 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
6612     nvptx_preferred_simd_mode
6613 
6614 #undef TARGET_MODES_TIEABLE_P
6615 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
6616 
6617 #undef TARGET_HARD_REGNO_NREGS
6618 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
6619 
6620 #undef TARGET_CAN_CHANGE_MODE_CLASS
6621 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
6622 
6623 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
6624 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
6625 
6626 #undef TARGET_SET_CURRENT_FUNCTION
6627 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
6628 
6629 struct gcc_target targetm = TARGET_INITIALIZER;
6630 
6631 #include "gt-nvptx.h"
6632