xref: /netbsd-src/external/gpl3/gcc/dist/gcc/config/nvptx/nvptx.cc (revision b1e838363e3c6fc78a55519254d99869742dd33c)
/* Target code for NVPTX.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-offload.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"
#include "opts.h"
#include "tree-pretty-print.h"
#include "rtl-iter.h"
#include "cgraph.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* The PTX concept CTA (Concurrent Thread Array) maps onto the CUDA
   concept of a thread block, which has had a maximum of 1024 threads
   since CUDA version 2.x.  */
#define PTX_CTA_SIZE 1024

#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32

#define PTX_PER_CTA_BARRIER 0
#define PTX_NUM_PER_CTA_BARRIERS 1
#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
#define PTX_NUM_PER_WORKER_BARRIERS (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)

#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
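
/* A small usage sketch (illustrative only, not part of the upstream
   sources): the area occupies the three flag bits above
   SYMBOL_FLAG_MACH_DEP_SHIFT, so

     rtx sym = gen_rtx_SYMBOL_REF (Pmode, "x");
     SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED);
     gcc_assert (SYMBOL_DATA_AREA (sym) == DATA_AREA_SHARED);

   Note the setter ORs into the flags rather than masking first, so it
   assumes the field is still zero when called.  */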

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Shared memory block for gang-private variables.  */
static unsigned gang_private_shared_size;
static unsigned gang_private_shared_align;
static GTY(()) rtx gang_private_shared_sym;
static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap;

/* Global lock variable, needed for 128-bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

static int nvptx_mach_max_workers ();

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with %<-fopenacc%>", optname);
}

static enum ptx_version
first_ptx_version_supporting_sm (enum ptx_isa sm)
{
  switch (sm)
    {
    case PTX_ISA_SM30:
      return PTX_VERSION_3_0;
    case PTX_ISA_SM35:
      return PTX_VERSION_3_1;
    case PTX_ISA_SM53:
      return PTX_VERSION_4_2;
    case PTX_ISA_SM70:
      return PTX_VERSION_6_0;
    case PTX_ISA_SM75:
      return PTX_VERSION_6_3;
    case PTX_ISA_SM80:
      return PTX_VERSION_7_0;
    default:
      gcc_unreachable ();
    }
}

static enum ptx_version
default_ptx_version_option (void)
{
  enum ptx_version first
    = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);

  /* Pick a version that supports the sm.  */
  enum ptx_version res = first;

  /* Pick at least 3.1.  This has been the smallest version historically.  */
  res = MAX (res, PTX_VERSION_3_1);

  /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force
     warp convergence.  */
  res = MAX (res, PTX_VERSION_6_0);

  /* Verify that we pick a version that supports the sm.  */
  gcc_assert (first <= res);
  return res;
}
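
/* For example (illustrative): with -misa=sm_30 the first supporting
   version is 3.0, but the MAX clauses above raise the default to 6.0;
   with -misa=sm_80 the first supporting version is already 7.0 and is
   returned unchanged.  */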

static const char *
ptx_version_to_string (enum ptx_version v)
{
  switch (v)
    {
    case PTX_VERSION_3_0:
      return "3.0";
    case PTX_VERSION_3_1:
      return "3.1";
    case PTX_VERSION_4_2:
      return "4.2";
    case PTX_VERSION_6_0:
      return "6.0";
    case PTX_VERSION_6_3:
      return "6.3";
    case PTX_VERSION_7_0:
      return "7.0";
    default:
      gcc_unreachable ();
    }
}

unsigned int
ptx_version_to_number (enum ptx_version v, bool major_p)
{
  switch (v)
    {
    case PTX_VERSION_3_0:
      return major_p ? 3 : 0;
    case PTX_VERSION_3_1:
      return major_p ? 3 : 1;
    case PTX_VERSION_4_2:
      return major_p ? 4 : 2;
    case PTX_VERSION_6_0:
      return major_p ? 6 : 0;
    case PTX_VERSION_6_3:
      return major_p ? 6 : 3;
    case PTX_VERSION_7_0:
      return major_p ? 7 : 0;
    default:
      gcc_unreachable ();
    }
}

static const char *
sm_version_to_string (enum ptx_isa sm)
{
  switch (sm)
    {
#define NVPTX_SM(XX, SEP)			\
      case PTX_ISA_SM ## XX:			\
	return #XX;
#include "nvptx-sm.def"
#undef NVPTX_SM
    default:
      gcc_unreachable ();
    }
}

static void
handle_ptx_version_option (void)
{
  if (!OPTION_SET_P (ptx_version_option)
      || ptx_version_option == PTX_VERSION_default)
    {
      ptx_version_option = default_ptx_version_option ();
      return;
    }

  enum ptx_version first
    = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);

  if (ptx_version_option < first)
    error ("PTX version (%<-mptx%>) needs to be at least %s to support selected"
	   " %<-misa%> (sm_%s)", ptx_version_to_string (first),
	   sm_version_to_string ((enum ptx_isa)ptx_isa_option));
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  handle_ptx_version_option ();

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!OPTION_SET_P (flag_toplevel_reorder))
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!OPTION_SET_P (flag_no_common))
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  HOST_WIDE_INT patch_area_size, patch_area_entry;
  parse_and_check_patch_area (flag_patchable_function_entry, false,
			      &patch_area_size, &patch_area_entry);
  if (patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared");
  SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED);
  gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a PTX type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with PTX idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_HFmode:
      return ".f16";
    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}
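
/* Illustrative uses (derived from the switch above):

     nvptx_ptx_type_from_mode (SImode, false)  -> ".u32"
     nvptx_ptx_type_from_mode (DFmode, false)  -> ".f64"
     nvptx_ptx_type_from_mode (QImode, false)  -> ".u8"
     nvptx_ptx_type_from_mode (QImode, true)   -> ".u32"

   The QImode promotion reflects that PTX computation is done in
   registers of at least 32 bits.  */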

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* Return NULL if NAME contains no dot.  Otherwise return a copy of NAME
   with the dots replaced with dollar signs.  */

static char *
nvptx_replace_dot (const char *name)
{
  if (strchr (name, '.') == NULL)
    return NULL;

  char *p = xstrdup (name);
  for (size_t i = 0; i < strlen (p); ++i)
    if (p[i] == '.')
      p[i] = '$';
  return p;
}
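
/* For example (illustrative): nvptx_name_replacement ("malloc") yields
   "__nvptx_malloc", while nvptx_replace_dot ("foo.bar") returns a
   freshly allocated "foo$bar" (which the caller must free) and returns
   NULL for a name containing no dot.  */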

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}
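
/* For example (illustrative): TImode values are handled as two DImode
   registers, and a complex mode such as DCmode splits into its DFmode
   parts, whereas maybe_split_mode (SImode) is VOIDmode and
   split_mode_p (SImode) is false.  */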

/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}
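
/* Schematically (illustrative): for pseudo 23 holding a TImode value
   split into DImode halves, this prints "%r23$8,%r23$0" when
   SUBREG_OFFSET is -2 (used for register declarations) and
   "{%r23$8,%r23$0}" when it is -1 (an operand wanting enclosing
   braces).  */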

/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}


/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

/* A non-memory argument of mode MODE is being passed; determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}
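
/* For example (illustrative): a struct or vector argument goes to
   memory per pass_in_memory, while promote_arg widens a char or short
   argument to SImode and, for unprototyped functions, applies the
   K&R promotion of float (SFmode) to double (DFmode).  */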

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
{
  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  return gen_reg_rtx (arg.mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v,
			     const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (arg.mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  return pass_in_memory (arg.mode, arg.type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg_type.  Emit a single PTX argument of MODE, either
   in a prototype, or as a copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative if
   we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}
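
/* For example (illustrative): for argument 0 of PTX type .u32, the
   prototype path (FOR_REG < 0) appends " (.param.u32 %in_ar0", while
   the prologue path (FOR_REG == 0) appends:

       .reg.u32 %ar0;
       ld.param.u32 %ar0, [%in_ar0];  */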

/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}
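
/* For example (illustrative): a _Complex double parameter becomes two
   .f64 arguments via the split above, and an aggregate that
   pass_in_memory sends to memory degenerates to a single pointer-mode
   (Pmode) argument.  */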

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
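
/* For example (illustrative, assuming the conventional "%value" name
   for NVPTX_RETURN_REGNUM): for SImode this emits
   "(.param.u32 %value_out) " in a prototype and ".reg.u32 %value;" in
   a function prologue.  */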

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}
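
/* For example (illustrative): these markers bracket each entity for the
   nvptx linker, e.g. "// BEGIN GLOBAL FUNCTION DEF: foo" before a
   public function definition and "// BEGIN GLOBAL VAR DECL: bar"
   before a public variable declaration.  */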

/* Helper function for write_fn_proto.  */

static void
write_fn_proto_1 (std::stringstream &s, bool is_defn,
		  const char *name, const_tree decl)
{
  if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL)
    write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (argno < 2 && strcmp (name, "main") == 0)
    {
      if (argno == 0)
	argno = write_arg_type (s, -1, argno, integer_type_node, true);

      if (argno == 1)
	argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");
}
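
/* For example (illustrative): a public, prototyped "int add (int, int)"
   declaration produces roughly

     // BEGIN GLOBAL FUNCTION DECL: add
     .visible .func (.param.u32 %value_out) add (.param.u32 %in_ar0, .param.u32 %in_ar1);

   again assuming the conventional "%value" register naming.  */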

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static void
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  const char *replacement = nvptx_name_replacement (name);
  char *replaced_dots = NULL;
  if (replacement != name)
    name = replacement;
  else
    {
      replaced_dots = nvptx_replace_dot (name);
      if (replaced_dots)
	name = replaced_dots;
    }
  if (name[0] == '*')
    name++;

  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    write_fn_proto_1 (s, false, name, decl);

  write_fn_proto_1 (s, is_defn, name, decl);

  if (replaced_dots)
    XDELETE (replaced_dots);
}

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons: either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  char *replaced_dots = NULL;

  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
	name = replacement;
      else
	{
	  replaced_dots = nvptx_replace_dot (name);
	  if (replaced_dots)
	    name = replaced_dots;
	}
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;
  if (replaced_dots)
    XDELETE (replaced_dots);

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}
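
/* For example (illustrative): an indirect call to a function taking one
   u64 argument and returning u32 gets the anonymous prototype

     .callprototype (.param.u32 %value_out) _ (.param.u64 %in_ar0);  */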

/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a PTX prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a PTX
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
	     align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
	   POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
		  : "\tmov.u%d %s, 0;\n"),
	   POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
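
/* For example (illustrative, on a 64-bit target where
   FRAME_POINTER_REGNUM conventionally prints as "%frame"): a 16-byte
   frame aligned to 8 comes out as

     .local .align 8 .b8 %frame_ar[16];
     .reg.u64 %frame;
     cvta.local.u64 %frame, %frame_ar;

   while a zero-sized frame skips the array and instead initializes the
   register with "mov.u64 %frame, 0;".  */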
1246 
1247 /* Emit soft stack frame setup sequence.  */
1248 
1249 static void
init_softstack_frame(FILE * file,unsigned alignment,HOST_WIDE_INT size)1250 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1251 {
1252   /* Maintain 64-bit stack alignment.  */
1253   unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1254   size = ROUND_UP (size, keep_align);
1255   int bits = POINTER_SIZE;
1256   const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1257   const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1258   const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1259   const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1260   fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1261   fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1262   fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1263   fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1264   fprintf (file, "\t{\n");
1265   fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1266   fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1267   fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1268   fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1269   fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1270 	   bits == 64 ? ".wide" : ".lo", bits / 8);
1271   fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1272 
1273   /* Initialize %sspslot = &__nvptx_stacks[tid.y].  */
1274   fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1275 
1276   /* Initialize %sspprev = __nvptx_stacks[tid.y].  */
1277   fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1278 	   bits, reg_sspprev, reg_sspslot);
1279 
1280   /* Initialize %frame = %sspprev - size.  */
1281   fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1282 	   bits, reg_frame, reg_sspprev, size);
1283 
1284   /* Apply alignment, if larger than 64.  */
1285   if (alignment > keep_align)
1286     fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1287 	     bits, reg_frame, reg_frame, -alignment);
1288 
1289   size = crtl->outgoing_args_size;
1290   gcc_assert (size % keep_align == 0);
1291 
1292   /* Initialize %stack.  */
1293   fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1294 	   bits, reg_stack, reg_frame, size);
1295 
1296   if (!crtl->is_leaf)
1297     fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1298 	     bits, reg_sspslot, reg_stack);
1299   fprintf (file, "\t}\n");
1300   cfun->machine->has_softstack = true;
1301   need_softstack_decl = true;
1302 }
1303 
1304 /* Emit code to initialize the REGNO predicate register to indicate
1305    whether we are not lane zero on the NAME axis.  */
1306 
1307 static void
nvptx_init_axis_predicate(FILE * file,int regno,const char * name)1308 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1309 {
1310   fprintf (file, "\t{\n");
1311   fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1312   if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1313     {
1314       fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1315       fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1316     }
1317   fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1318   fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1319   if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1320     {
1321       fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1322       fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1323       fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1324 	       "// vector reduction buffer\n",
1325 	       REGNO (cfun->machine->red_partition),
1326 	       vector_red_partition);
1327     }
1328   /* Verify vector_red_size.  */
1329   gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1330 	      <= vector_red_size);
1331   fprintf (file, "\t}\n");
1332 }
1333 
1334 /* Emit code to initialize OpenACC worker broadcast and synchronization
1335    registers.  */
1336 
1337 static void
nvptx_init_oacc_workers(FILE * file)1338 nvptx_init_oacc_workers (FILE *file)
1339 {
1340   fprintf (file, "\t{\n");
1341   fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1342   if (cfun->machine->bcast_partition)
1343     {
1344       fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1345       fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1346     }
1347   fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1348   if (cfun->machine->bcast_partition)
1349     {
1350       fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1351       fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1352       fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1353       fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1354 	       "// vector broadcast offset\n",
1355 	       REGNO (cfun->machine->bcast_partition),
1356 	       oacc_bcast_partition);
1357     }
1358   /* Verify oacc_bcast_size.  */
1359   gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1360 	      <= oacc_bcast_size);
1361   if (cfun->machine->sync_bar)
1362     fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1363 	     "// vector synchronization barrier\n",
1364 	     REGNO (cfun->machine->sync_bar));
1365   fprintf (file, "\t}\n");
1366 }
1367 
1368 /* Emit code to initialize predicate and master lane index registers for
1369    -muniform-simt code generation variant.  */
1370 
1371 static void
nvptx_init_unisimt_predicate(FILE * file)1372 nvptx_init_unisimt_predicate (FILE *file)
1373 {
1374   cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1375   int loc = REGNO (cfun->machine->unisimt_location);
1376   int bits = POINTER_SIZE;
1377   fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1378   fprintf (file, "\t{\n");
1379   fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1380   fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1381   fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1382   fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1383 	   bits == 64 ? ".wide" : ".lo");
1384   fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1385   fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1386   if (cfun->machine->unisimt_predicate)
1387     {
1388       int master = REGNO (cfun->machine->unisimt_master);
1389       int pred = REGNO (cfun->machine->unisimt_predicate);
1390       fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1391       if (cfun->machine->unisimt_outside_simt_predicate)
1392 	{
1393 	  int pred_outside_simt
1394 	    = REGNO (cfun->machine->unisimt_outside_simt_predicate);
1395 	  fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, 0;\n",
1396 		   pred_outside_simt, master);
1397 	}
1398       fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1399       /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'.  */
1400       fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1401       /* Compute predicate as 'tid.x == master'.  */
1402       fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1403     }
1404   fprintf (file, "\t}\n");
1405   need_unisimt_decl = true;
1406 }
1407 
1408 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1409 
1410    extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1411    void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1412    {
1413      __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1414      __nvptx_uni[tid.y] = 0;
1415      gomp_nvptx_main (ORIG, arg);
1416    }
1417    ORIG itself should not be emitted as a PTX .entry function.  */
1418 
1419 static void
write_omp_entry(FILE * file,const char * name,const char * orig)1420 write_omp_entry (FILE *file, const char *name, const char *orig)
1421 {
1422   static bool gomp_nvptx_main_declared;
1423   if (!gomp_nvptx_main_declared)
1424     {
1425       gomp_nvptx_main_declared = true;
1426       write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1427       func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1428         << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1429     }
1430   /* PR79332.  Single out this string; it confuses gcc.pot generation.  */
1431 #define NTID_Y "%ntid.y"
1432 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1433  (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1434 {\n\
1435 	.reg.u32 %r<3>;\n\
1436 	.reg.u" PS " %R<4>;\n\
1437 	mov.u32 %r0, %tid.y;\n\
1438 	mov.u32 %r1, " NTID_Y ";\n\
1439 	mov.u32 %r2, %ctaid.x;\n\
1440 	cvt.u" PS ".u32 %R1, %r0;\n\
1441 	" MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1442 	mov.u" PS " %R0, __nvptx_stacks;\n\
1443 	" MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1444 	ld.param.u" PS " %R2, [%stack];\n\
1445 	ld.param.u" PS " %R3, [%sz];\n\
1446 	add.u" PS " %R2, %R2, %R3;\n\
1447 	mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1448 	st.shared.u" PS " [%R0], %R2;\n\
1449 	mov.u" PS " %R0, __nvptx_uni;\n\
1450 	" MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1451 	mov.u32 %r0, 0;\n\
1452 	st.shared.u32 [%R0], %r0;\n\
1453 	mov.u" PS " %R0, \0;\n\
1454 	ld.param.u" PS " %R1, [%arg];\n\
1455 	{\n\
1456 		.param.u" PS " %P<2>;\n\
1457 		st.param.u" PS " [%P0], %R0;\n\
1458 		st.param.u" PS " [%P1], %R1;\n\
1459 		call.uni gomp_nvptx_main, (%P0, %P1);\n\
1460 	}\n\
1461 	ret.uni;\n\
1462 }\n"
1463   static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1464   static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32  ");
1465 #undef ENTRY_TEMPLATE
1466 #undef NTID_Y
1467   const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1468   /* Position ENTRY_2 after the embedded nul using strlen of the prefix.  */
1469   const char *entry_2 = entry_1 + strlen (entry64) + 1;
1470   fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1471   need_softstack_decl = need_unisimt_decl = true;
1472 }
1473 
1474 /* Implement ASM_DECLARE_FUNCTION_NAME.  Writes the start of a ptx
1475    function, including local var decls and copies from the arguments to
1476    local regs.  */
1477 
1478 void
nvptx_declare_function_name(FILE * file,const char * name,const_tree decl)1479 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1480 {
1481   tree fntype = TREE_TYPE (decl);
1482   tree result_type = TREE_TYPE (fntype);
1483   int argno = 0;
1484 
1485   if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1486       && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1487     {
1488       char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1489       sprintf (buf, "%s$impl", name);
1490       write_omp_entry (file, name, buf);
1491       name = buf;
1492     }
1493   /* We construct the initial part of the function into a string
1494      stream, in order to share the prototype writing code.  */
1495   std::stringstream s;
1496   write_fn_proto (s, true, name, decl);
1497   s << "{\n";
1498 
1499   bool return_in_mem = write_return_type (s, false, result_type);
1500   if (return_in_mem)
1501     argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1502 
1503   /* Declare and initialize incoming arguments.  */
1504   tree args = TYPE_ARG_TYPES (fntype);
1505   bool prototyped = true;
1506   if (!args)
1507     {
1508       args = DECL_ARGUMENTS (decl);
1509       prototyped = false;
1510     }
1511 
1512   for (; args != NULL_TREE; args = TREE_CHAIN (args))
1513     {
1514       tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1515 
1516       argno = write_arg_type (s, 0, argno, type, prototyped);
1517     }
1518 
1519   if (stdarg_p (fntype))
1520     argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1521 			    true);
1522 
1523   if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1524     write_arg_type (s, STATIC_CHAIN_REGNUM,
1525 		    DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1526 		    true);
1527 
1528   fprintf (file, "%s", s.str().c_str());
1529 
1530   /* Usually 'crtl->is_leaf' is computed during register allocator
1531      initialization (which is not done on NVPTX) or for pressure-sensitive
1532      optimizations.  Initialize it here, except if already set.  */
1533   if (!crtl->is_leaf)
1534     crtl->is_leaf = leaf_function_p ();
1535 
1536   HOST_WIDE_INT sz = get_frame_size ();
1537   bool need_frameptr = sz || cfun->machine->has_chain;
1538   int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1539   if (!TARGET_SOFT_STACK)
1540     {
1541       /* Declare a local var for outgoing varargs.  */
1542       if (cfun->machine->has_varadic)
1543 	init_frame (file, STACK_POINTER_REGNUM,
1544 		    UNITS_PER_WORD, crtl->outgoing_args_size);
1545 
1546       /* Declare a local variable for the frame.  Force its size to be
1547 	 DImode-compatible.  */
1548       if (need_frameptr)
1549 	init_frame (file, FRAME_POINTER_REGNUM, alignment,
1550 		    ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1551     }
1552   else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1553 	   || (cfun->machine->has_simtreg && !crtl->is_leaf))
1554     init_softstack_frame (file, alignment, sz);
1555 
1556   if (cfun->machine->has_simtreg)
1557     {
1558       unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1559       unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1560       align = MAX (align, GET_MODE_SIZE (DImode));
1561       if (!crtl->is_leaf || cfun->calls_alloca)
1562 	simtsz = HOST_WIDE_INT_M1U;
1563       if (simtsz == HOST_WIDE_INT_M1U)
1564 	simtsz = nvptx_softstack_size;
1565       if (cfun->machine->has_softstack)
1566 	simtsz += POINTER_SIZE / 8;
1567       simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1568       if (align > GET_MODE_SIZE (DImode))
1569 	simtsz += align - GET_MODE_SIZE (DImode);
1570       if (simtsz)
1571 	fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1572 		HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1573     }
1574 
1575   /* Restore the vector reduction partition register, if necessary.
1576      FIXME: Find out when and why this is necessary, and fix it.  */
1577   if (cfun->machine->red_partition)
1578     regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1579       = cfun->machine->red_partition;
1580 
1581   /* Declare the pseudos we have as ptx registers.  */
1582   int maxregs = max_reg_num ();
1583   for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1584     {
1585       if (regno_reg_rtx[i] != const0_rtx)
1586 	{
1587 	  machine_mode mode = PSEUDO_REGNO_MODE (i);
1588 	  machine_mode split = maybe_split_mode (mode);
1589 
1590 	  if (split_mode_p (mode))
1591 	    mode = split;
1592 	  fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1593 	  output_reg (file, i, split, -2);
1594 	  fprintf (file, ";\n");
1595 	}
1596     }
1597 
1598   /* Emit axis predicates.  */
1599   if (cfun->machine->axis_predicate[0])
1600     nvptx_init_axis_predicate (file,
1601 			       REGNO (cfun->machine->axis_predicate[0]), "y");
1602   if (cfun->machine->axis_predicate[1])
1603     nvptx_init_axis_predicate (file,
1604 			       REGNO (cfun->machine->axis_predicate[1]), "x");
1605   if (cfun->machine->unisimt_predicate
1606       || (cfun->machine->has_simtreg && !crtl->is_leaf))
1607     nvptx_init_unisimt_predicate (file);
1608   if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1609     nvptx_init_oacc_workers (file);
1610 }
1611 
1612 /* Output code for switching uniform-simt state.  ENTERING indicates whether
1613    we are entering or leaving a non-uniform execution region.  */
1614 
1615 static void
1616 nvptx_output_unisimt_switch (FILE *file, bool entering)
1617 {
1618   if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1619     return;
1620   fprintf (file, "\t{\n");
1621   fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1622   fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1623   if (cfun->machine->unisimt_outside_simt_predicate)
1624     {
1625       int pred_outside_simt
1626 	= REGNO (cfun->machine->unisimt_outside_simt_predicate);
1627       fprintf (file, "\t\tmov.pred %%r%d, %d;\n", pred_outside_simt,
1628 	       entering ? 0 : 1);
1629     }
1630   if (!crtl->is_leaf)
1631     {
1632       int loc = REGNO (cfun->machine->unisimt_location);
1633       fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1634     }
1635   if (cfun->machine->unisimt_predicate)
1636     {
1637       int master = REGNO (cfun->machine->unisimt_master);
1638       int pred = REGNO (cfun->machine->unisimt_predicate);
1639       fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1640       fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1641 	       master, entering ? "%ustmp2" : "0");
1642       fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1643     }
1644   fprintf (file, "\t}\n");
1645 }
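
/* As an illustrative sketch (register numbers are hypothetical), entering
   a non-uniform region in a non-leaf function that uses the uniform-simt
   predicate emits PTX roughly like:

	{
		.reg.u32 %ustmp2;
		mov.u32 %ustmp2, -1;
		st.shared.u32 [%r20], %ustmp2;
		mov.u32 %ustmp2, %laneid;
		mov.u32 %r21, %ustmp2;
		setp.eq.u32 %r22, %r21, %ustmp2;
	}

   On leaving, 0 is stored instead and the master lane reverts to 0.  */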
1646 
1647 /* Output code for allocating per-lane storage and switching the soft-stack
1648    pointer.  ENTERING indicates whether we are entering or leaving non-uniform
1649    execution.  PTR is the register pointing to the allocated storage; it is
1650    assigned to on entering and used to restore state on leaving.  SIZE and
1651    ALIGN are used only on entering.  */
1652 
1653 static void
1654 nvptx_output_softstack_switch (FILE *file, bool entering,
1655 			       rtx ptr, rtx size, rtx align)
1656 {
1657   gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1658   if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1659     return;
1660   int bits = POINTER_SIZE, regno = REGNO (ptr);
1661   fprintf (file, "\t{\n");
1662   if (entering)
1663     {
1664       fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1665 	       HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1666 	       cfun->machine->simt_stack_size);
1667       fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1668       if (CONST_INT_P (size))
1669 	fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1670 		 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1671       else
1672 	output_reg (file, REGNO (size), VOIDmode);
1673       fputs (";\n", file);
1674       if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1675 	fprintf (file,
1676 		 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1677 		 bits, regno, regno, UINTVAL (align));
1678     }
1679   if (cfun->machine->has_softstack)
1680     {
1681       const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1682       if (entering)
1683 	{
1684 	  fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1685 		   bits, regno, bits / 8, reg_stack);
1686 	  fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1687 		   bits, reg_stack, regno, bits / 8);
1688 	}
1689       else
1690 	{
1691 	  fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1692 		   bits, reg_stack, regno, bits / 8);
1693 	}
1694       nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1695     }
1696   fprintf (file, "\t}\n");
1697 }
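
/* A hypothetical sketch of the PTX emitted on entering, for a 64-bit
   configuration with a 512-byte %simtstack_ar, a 64-byte allocation and
   soft stack in use (register numbers are invented):

	{
		cvta.local.u64 %r30, %simtstack_ar + 512;
		sub.u64 %r30, %r30, 64;
		st.u64 [%r30 + -8], %stack;
		sub.u64 %stack, %r30, 8;
	}

   plus a store of the new %stack into the shared soft-stack slot; on
   leaving, only the saved %stack is reloaded and stored back.  */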
1698 
1699 /* Output code to enter a non-uniform execution region.  DEST is a register
1700    to hold a per-lane allocation given by SIZE and ALIGN.  */
1701 
1702 const char *
1703 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1704 {
1705   nvptx_output_unisimt_switch (asm_out_file, true);
1706   nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1707   return "";
1708 }
1709 
1710 /* Output code to leave a non-uniform execution region.  SRC is the register
1711    holding per-lane storage previously allocated by the omp_simt_enter insn.  */
1712 
1713 const char *
1714 nvptx_output_simt_exit (rtx src)
1715 {
1716   nvptx_output_unisimt_switch (asm_out_file, false);
1717   nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1718   return "";
1719 }
1720 
1721 /* Output an instruction that sets the soft-stack pointer in shared memory
1722    to the value in the register given by SRC_REGNO.  */
1723 
1724 const char *
1725 nvptx_output_set_softstack (unsigned src_regno)
1726 {
1727   if (cfun->machine->has_softstack && !crtl->is_leaf)
1728     {
1729       fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1730 	       POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1731       output_reg (asm_out_file, src_regno, VOIDmode);
1732       fprintf (asm_out_file, ";\n");
1733     }
1734   return "";
1735 }
1736 /* Output a return instruction.  Also copy the return value to its outgoing
1737    location.  */
1738 
1739 const char *
1740 nvptx_output_return (void)
1741 {
1742   machine_mode mode = (machine_mode)cfun->machine->return_mode;
1743 
1744   if (mode != VOIDmode)
1745     fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1746 	     nvptx_ptx_type_from_mode (mode, false),
1747 	     reg_names[NVPTX_RETURN_REGNUM],
1748 	     reg_names[NVPTX_RETURN_REGNUM]);
1749 
1750   return "ret;";
1751 }
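
/* For example, a function returning a 32-bit value would typically end
   with

	st.param.u32	[%value_out], %value;
	ret;

   where %value names the return-value register (the exact type suffix
   depends on the return mode).  */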
1752 
1753 /* Terminate a function by writing a closing brace to FILE.  */
1754 
1755 void
1756 nvptx_function_end (FILE *file)
1757 {
1758   fprintf (file, "}\n");
1759 }
1760 
1761 /* Decide whether we can make a sibling call to a function.  For ptx, we
1762    can't.  */
1763 
1764 static bool
1765 nvptx_function_ok_for_sibcall (tree, tree)
1766 {
1767   return false;
1768 }
1769 
1770 /* Return Dynamic ReAlignment Pointer RTX.  For PTX there isn't any.  */
1771 
1772 static rtx
1773 nvptx_get_drap_rtx (void)
1774 {
1775   if (TARGET_SOFT_STACK && stack_realign_drap)
1776     return arg_pointer_rtx;
1777   return NULL_RTX;
1778 }
1779 
1780 /* Implement the TARGET_CALL_ARGS hook.  Record information about one
1781    argument to the next call.  */
1782 
1783 static void
1784 nvptx_call_args (rtx arg, tree fntype)
1785 {
1786   if (!cfun->machine->doing_call)
1787     {
1788       cfun->machine->doing_call = true;
1789       cfun->machine->is_varadic = false;
1790       cfun->machine->num_args = 0;
1791 
1792       if (fntype && stdarg_p (fntype))
1793 	{
1794 	  cfun->machine->is_varadic = true;
1795 	  cfun->machine->has_varadic = true;
1796 	  cfun->machine->num_args++;
1797 	}
1798     }
1799 
1800   if (REG_P (arg) && arg != pc_rtx)
1801     {
1802       cfun->machine->num_args++;
1803       cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1804 						  cfun->machine->call_args);
1805     }
1806 }
1807 
1808 /* Implement the corresponding END_CALL_ARGS hook.  Clear and free the
1809    information we recorded.  */
1810 
1811 static void
1812 nvptx_end_call_args (void)
1813 {
1814   cfun->machine->doing_call = false;
1815   free_EXPR_LIST_list (&cfun->machine->call_args);
1816 }
1817 
1818 /* Emit the sequence for a call to ADDRESS, setting RETVAL.  Keep
1819    track of whether calls involving static chains or varargs were seen
1820    in the current function.
1821    For libcalls, maintain a hash table of decls we have seen, and
1822    record a function decl for later when encountering a new one.  */
1823 
1824 void
1825 nvptx_expand_call (rtx retval, rtx address)
1826 {
1827   rtx callee = XEXP (address, 0);
1828   rtx varargs = NULL_RTX;
1829   unsigned parallel = 0;
1830 
1831   if (!call_insn_operand (callee, Pmode))
1832     {
1833       callee = force_reg (Pmode, callee);
1834       address = change_address (address, QImode, callee);
1835     }
1836 
1837   if (GET_CODE (callee) == SYMBOL_REF)
1838     {
1839       tree decl = SYMBOL_REF_DECL (callee);
1840       if (decl != NULL_TREE)
1841 	{
1842 	  if (DECL_STATIC_CHAIN (decl))
1843 	    cfun->machine->has_chain = true;
1844 
1845 	  tree attr = oacc_get_fn_attrib (decl);
1846 	  if (attr)
1847 	    {
1848 	      tree dims = TREE_VALUE (attr);
1849 
1850 	      parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1851 	      for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1852 		{
1853 		  if (TREE_PURPOSE (dims)
1854 		      && !integer_zerop (TREE_PURPOSE (dims)))
1855 		    break;
1856 		  /* Not on this axis.  */
1857 		  parallel ^= GOMP_DIM_MASK (ix);
1858 		  dims = TREE_CHAIN (dims);
1859 		}
1860 	    }
1861 	}
1862     }
1863 
1864   unsigned nargs = cfun->machine->num_args;
1865   if (cfun->machine->is_varadic)
1866     {
1867       varargs = gen_reg_rtx (Pmode);
1868       emit_move_insn (varargs, stack_pointer_rtx);
1869     }
1870 
1871   rtvec vec = rtvec_alloc (nargs + 1);
1872   rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1873   int vec_pos = 0;
1874 
1875   rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1876   rtx tmp_retval = retval;
1877   if (retval)
1878     {
1879       if (!nvptx_register_operand (retval, GET_MODE (retval)))
1880 	tmp_retval = gen_reg_rtx (GET_MODE (retval));
1881       call = gen_rtx_SET (tmp_retval, call);
1882     }
1883   XVECEXP (pat, 0, vec_pos++) = call;
1884 
1885   /* Construct the call insn, including a USE for each argument pseudo
1886      register.  These will be used when printing the insn.  */
1887   for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1888     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1889 
1890   if (varargs)
1891     XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1892 
1893   gcc_assert (vec_pos == XVECLEN (pat, 0));
1894 
1895   nvptx_emit_forking (parallel, true);
1896   emit_call_insn (pat);
1897   nvptx_emit_joining (parallel, true);
1898 
1899   if (tmp_retval != retval)
1900     emit_move_insn (retval, tmp_retval);
1901 }
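
/* The pattern built above has the shape (a sketch, one register argument):

     (parallel [(set (reg:SI 24)
		     (call (mem:QI (symbol_ref "foo")) (const_int 0)))
		(use (reg:SI 23))])

   with one more (use ...) appended for the varargs pointer when the
   callee is variadic.  */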
1902 
1903 /* Emit a comparison COMPARE, and return the new test to be used in the
1904    jump.  */
1905 
1906 rtx
1907 nvptx_expand_compare (rtx compare)
1908 {
1909   rtx pred = gen_reg_rtx (BImode);
1910   rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1911 			    XEXP (compare, 0), XEXP (compare, 1));
1912   emit_insn (gen_rtx_SET (pred, cmp));
1913   return gen_rtx_NE (BImode, pred, const0_rtx);
1914 }
1915 
1916 /* Expand the oacc fork & join primitive into ptx-required unspecs.  */
1917 
1918 void
1919 nvptx_expand_oacc_fork (unsigned mode)
1920 {
1921   nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1922 }
1923 
1924 void
1925 nvptx_expand_oacc_join (unsigned mode)
1926 {
1927   nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1928 }
1929 
1930 /* Generate instruction(s) to unpack a 64-bit object into two 32-bit
1931    objects.  */
1932 
1933 static rtx
1934 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1935 {
1936   rtx res;
1937 
1938   switch (GET_MODE (src))
1939     {
1940     case E_DImode:
1941       res = gen_unpackdisi2 (dst0, dst1, src);
1942       break;
1943     case E_DFmode:
1944       res = gen_unpackdfsi2 (dst0, dst1, src);
1945       break;
1946     default: gcc_unreachable ();
1947     }
1948   return res;
1949 }
1950 
1951 /* Generate instruction(s) to pack two 32-bit objects into a 64-bit
1952    object.  */
1953 
1954 static rtx
1955 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1956 {
1957   rtx res;
1958 
1959   switch (GET_MODE (dst))
1960     {
1961     case E_DImode:
1962       res = gen_packsidi2 (dst, src0, src1);
1963       break;
1964     case E_DFmode:
1965       res = gen_packsidf2 (dst, src0, src1);
1966       break;
1967     default: gcc_unreachable ();
1968     }
1969   return res;
1970 }
1971 
1972 /* Generate an instruction or sequence to shuffle register SRC into DST
1973    within a single warp, selecting lanes via IDX and shuffle kind KIND.  */
1974 
1975 rtx
1976 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1977 {
1978   rtx res;
1979 
1980   switch (GET_MODE (dst))
1981     {
1982       case E_DCmode:
1983       case E_CDImode:
1984 	{
1985 	  gcc_assert (GET_CODE (dst) == CONCAT);
1986 	  gcc_assert (GET_CODE (src) == CONCAT);
1987 	  rtx dst_real = XEXP (dst, 0);
1988 	  rtx dst_imag = XEXP (dst, 1);
1989 	  rtx src_real = XEXP (src, 0);
1990 	  rtx src_imag = XEXP (src, 1);
1991 
1992 	  start_sequence ();
1993 	  emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind));
1994 	  emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind));
1995 	  res = get_insns ();
1996 	  end_sequence ();
1997 	}
1998 	break;
1999     case E_SImode:
2000       res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
2001       break;
2002     case E_SFmode:
2003       res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
2004       break;
2005     case E_DImode:
2006     case E_DFmode:
2007       {
2008 	rtx tmp0 = gen_reg_rtx (SImode);
2009 	rtx tmp1 = gen_reg_rtx (SImode);
2010 
2011 	start_sequence ();
2012 	emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
2013 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2014 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2015 	emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
2016 	res = get_insns ();
2017 	end_sequence ();
2018       }
2019       break;
2020     case E_V2SImode:
2021       {
2022 	rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
2023 	rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
2024 	rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
2025 	rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
2026 	rtx tmp0 = gen_reg_rtx (SImode);
2027 	rtx tmp1 = gen_reg_rtx (SImode);
2028 	start_sequence ();
2029 	emit_insn (gen_movsi (tmp0, src0));
2030 	emit_insn (gen_movsi (tmp1, src1));
2031 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2032 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2033 	emit_insn (gen_movsi (dst0, tmp0));
2034 	emit_insn (gen_movsi (dst1, tmp1));
2035 	res = get_insns ();
2036 	end_sequence ();
2037       }
2038       break;
2039     case E_V2DImode:
2040       {
2041 	rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
2042 	rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
2043 	rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
2044 	rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
2045 	rtx tmp0 = gen_reg_rtx (DImode);
2046 	rtx tmp1 = gen_reg_rtx (DImode);
2047 	start_sequence ();
2048 	emit_insn (gen_movdi (tmp0, src0));
2049 	emit_insn (gen_movdi (tmp1, src1));
2050 	emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2051 	emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2052 	emit_insn (gen_movdi (dst0, tmp0));
2053 	emit_insn (gen_movdi (dst1, tmp1));
2054 	res = get_insns ();
2055 	end_sequence ();
2056       }
2057       break;
2058     case E_BImode:
2059       {
2060 	rtx tmp = gen_reg_rtx (SImode);
2061 
2062 	start_sequence ();
2063 	emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
2064 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
2065 	emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
2066 	res = get_insns ();
2067 	end_sequence ();
2068       }
2069       break;
2070     case E_QImode:
2071     case E_HImode:
2072       {
2073 	rtx tmp = gen_reg_rtx (SImode);
2074 
2075 	start_sequence ();
2076 	emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
2077 	emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
2078 	emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
2079 						    tmp)));
2080 	res = get_insns ();
2081 	end_sequence ();
2082       }
2083       break;
2084 
2085     default:
2086       gcc_unreachable ();
2087     }
2088   return res;
2089 }
2090 
2091 /* Generate an instruction or sequence to broadcast register REG
2092    across the vectors of a single warp.  */
2093 
2094 static rtx
2095 nvptx_gen_warp_bcast (rtx reg)
2096 {
2097   return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
2098 }
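
/* As an illustrative sketch (register numbers are hypothetical), a warp
   broadcast of a DImode value decomposes into an unpack, two 32-bit
   index shuffles from lane 0, and a repack, roughly:

	mov.b64	{%r10, %r11}, %r12;
	shfl.idx.b32	%r10, %r10, 0, 31;
	shfl.idx.b32	%r11, %r11, 0, 31;
	mov.b64	%r12, {%r10, %r11};

   (using the shfl.sync form with a member mask on newer PTX ISAs).  */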
2099 
2100 /* Structure used when generating a worker-level spill or fill.  */
2101 
2102 struct broadcast_data_t
2103 {
2104   rtx base;  /* Register holding base addr of buffer.  */
2105   rtx ptr;  /* Iteration var,  if needed.  */
2106   unsigned offset; /* Offset into worker buffer.  */
2107 };
2108 
2109 /* Direction of the spill/fill and looping setup/teardown indicator.  */
2110 
2111 enum propagate_mask
2112   {
2113     PM_read = 1 << 0,
2114     PM_write = 1 << 1,
2115     PM_loop_begin = 1 << 2,
2116     PM_loop_end = 1 << 3,
2117 
2118     PM_read_write = PM_read | PM_write
2119   };
2120 
2121 /* Generate instruction(s) to spill or fill register REG to/from the
2122    worker broadcast array.  PM indicates what is to be done, REP
2123    how many loop iterations will be executed (0 for not a loop).  */
2124 
2125 static rtx
2126 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
2127 			broadcast_data_t *data, bool vector)
2128 {
2129   rtx  res;
2130   machine_mode mode = GET_MODE (reg);
2131 
2132   switch (mode)
2133     {
2134     case E_BImode:
2135       {
2136 	rtx tmp = gen_reg_rtx (SImode);
2137 
2138 	start_sequence ();
2139 	if (pm & PM_read)
2140 	  emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
2141 	emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
2142 	if (pm & PM_write)
2143 	  emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
2144 	res = get_insns ();
2145 	end_sequence ();
2146       }
2147       break;
2148 
2149     default:
2150       {
2151 	rtx addr = data->ptr;
2152 
2153 	if (!addr)
2154 	  {
2155 	    unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
2156 
2157 	    oacc_bcast_align = MAX (oacc_bcast_align, align);
2158 	    data->offset = ROUND_UP (data->offset, align);
2159 	    addr = data->base;
2160 	    gcc_assert (data->base != NULL);
2161 	    if (data->offset)
2162 	      addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
2163 	  }
2164 
2165 	addr = gen_rtx_MEM (mode, addr);
2166 	if (pm == PM_read)
2167 	  res = gen_rtx_SET (addr, reg);
2168 	else if (pm == PM_write)
2169 	  res = gen_rtx_SET (reg, addr);
2170 	else
2171 	  gcc_unreachable ();
2172 
2173 	if (data->ptr)
2174 	  {
2175 	    /* We're using a ptr, increment it.  */
2176 	    start_sequence ();
2177 
2178 	    emit_insn (res);
2179 	    emit_insn (gen_adddi3 (data->ptr, data->ptr,
2180 				   GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
2181 	    res = get_insns ();
2182 	    end_sequence ();
2183 	  }
2184 	else
2185 	  rep = 1;
2186 	data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
2187       }
2188       break;
2189     }
2190   return res;
2191 }
2192 
2193 /* Returns true if X is a valid address for use in a memory reference.  */
2194 
2195 static bool
2196 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
2197 {
2198   enum rtx_code code = GET_CODE (x);
2199 
2200   switch (code)
2201     {
2202     case REG:
2203       return true;
2204 
2205     case PLUS:
2206       if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
2207 	return true;
2208       return false;
2209 
2210     case CONST:
2211     case SYMBOL_REF:
2212     case LABEL_REF:
2213       return true;
2214 
2215     default:
2216       return false;
2217     }
2218 }
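
/* In practice this means a memory operand may be a bare register, a
   register plus a constant offset, or a symbolic address.  For example
   (a sketch of accepted RTL forms):

     (reg:DI 25)
     (plus:DI (reg:DI 25) (const_int 16))
     (symbol_ref:DI ("arr"))

   Reg+reg addressing is rejected, so such addresses must first be
   computed into a single register.  */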
2219 
2220 /* Machinery to output constant initializers.  When beginning an
2221    initializer, we decide on a fragment size (which is visible in ptx
2222    in the type used), and then all initializer data is buffered until
2223    a fragment is filled and ready to be written out.  */
2224 
2225 static struct
2226 {
2227   unsigned HOST_WIDE_INT mask; /* Mask for storing fragment.  */
2228   unsigned HOST_WIDE_INT val; /* Current fragment value.  */
2229   unsigned HOST_WIDE_INT remaining; /*  Remaining bytes to be written
2230 					out.  */
2231   unsigned size;  /* Fragment size to accumulate.  */
2232   unsigned offset;  /* Offset within current fragment.  */
2233   bool started;   /* Whether we've output any initializer.  */
2234 } init_frag;
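
/* A worked example: for "int x[2] = { 1, 2 };" the fragment size chosen
   is 4, so the decl is emitted as ".u32 x[2]" and each value is flushed
   as its fragment fills, producing " = { 1, 2 };" (see
   nvptx_assemble_decl_begin below for how the fragment size is chosen).  */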
2235 
2236 /* The current fragment is full; write it out.  SYM may provide a
2237    symbolic reference we should output, in which case the fragment
2238    value is the addend.  */
2239 
2240 static void
2241 output_init_frag (rtx sym)
2242 {
2243   fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
2244   unsigned HOST_WIDE_INT val = init_frag.val;
2245 
2246   init_frag.started = true;
2247   init_frag.val = 0;
2248   init_frag.offset = 0;
2249   init_frag.remaining--;
2250 
2251   if (sym)
2252     {
2253       bool function = (SYMBOL_REF_DECL (sym)
2254 		       && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
2255       if (!function)
2256 	fprintf (asm_out_file, "generic(");
2257       output_address (VOIDmode, sym);
2258       if (!function)
2259 	fprintf (asm_out_file, ")");
2260       if (val)
2261 	fprintf (asm_out_file, " + ");
2262     }
2263 
2264   if (!sym || val)
2265     fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2266 }
2267 
2268 /* Add value VAL of size SIZE to the data we're emitting, and keep
2269    writing out chunks as they fill up.  */
2270 
2271 static void
2272 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2273 {
2274   bool negative_p
2275     = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));
2276 
2277   /* Avoid undefined behaviour.  */
2278   if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
2279     val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;
2280 
2281   for (unsigned part = 0; size; size -= part)
2282     {
2283       if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
2284 	/* Avoid undefined behaviour.  */
2285 	val = negative_p ? -1 : 0;
2286       else
2287 	val >>= (part * BITS_PER_UNIT);
2288       part = init_frag.size - init_frag.offset;
2289       part = MIN (part, size);
2290 
2291       unsigned HOST_WIDE_INT partial
2292 	= val << (init_frag.offset * BITS_PER_UNIT);
2293       init_frag.val |= partial & init_frag.mask;
2294       init_frag.offset += part;
2295 
2296       if (init_frag.offset == init_frag.size)
2297 	output_init_frag (NULL);
2298     }
2299 }
2300 
2301 /* Target hook for assembling integer object X of size SIZE.  */
2302 
2303 static bool
2304 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2305 {
2306   HOST_WIDE_INT val = 0;
2307 
2308   switch (GET_CODE (x))
2309     {
2310     default:
2311       /* Let the generic machinery figure it out, usually for a
2312 	 CONST_WIDE_INT.  */
2313       return false;
2314 
2315     case CONST_INT:
2316       nvptx_assemble_value (INTVAL (x), size);
2317       break;
2318 
2319     case CONST:
2320       x = XEXP (x, 0);
2321       gcc_assert (GET_CODE (x) == PLUS);
2322       val = INTVAL (XEXP (x, 1));
2323       x = XEXP (x, 0);
2324       gcc_assert (GET_CODE (x) == SYMBOL_REF);
2325       gcc_fallthrough (); /* FALLTHROUGH */
2326 
2327     case SYMBOL_REF:
2328       gcc_assert (size == init_frag.size);
2329       if (init_frag.offset)
2330 	sorry ("cannot emit unaligned pointers in ptx assembly");
2331 
2332       nvptx_maybe_record_fnsym (x);
2333       init_frag.val = val;
2334       output_init_frag (x);
2335       break;
2336     }
2337 
2338   return true;
2339 }
2340 
2341 /* Output SIZE zero bytes.  We ignore the FILE argument since the
2342    functions we're calling to perform the output just use
2343    asm_out_file.  */
2344 
2345 void
2346 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2347 {
2348   /* Finish the current fragment, if it's started.  */
2349   if (init_frag.offset)
2350     {
2351       unsigned part = init_frag.size - init_frag.offset;
2352       part = MIN (part, (unsigned)size);
2353       size -= part;
2354       nvptx_assemble_value (0, part);
2355     }
2356 
2357   /* If this skip doesn't terminate the initializer, write as many
2358      remaining pieces as possible directly.  */
2359   if (size < init_frag.remaining * init_frag.size)
2360     {
2361       while (size >= init_frag.size)
2362 	{
2363 	  size -= init_frag.size;
2364 	  output_init_frag (NULL_RTX);
2365 	}
2366       if (size)
2367 	nvptx_assemble_value (0, size);
2368     }
2369 }
2370 
2371 /* Output a string STR with length SIZE.  As in nvptx_output_skip we
2372    ignore the FILE arg.  */
2373 
2374 void
2375 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2376 {
2377   for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2378     nvptx_assemble_value (str[i], 1);
2379 }
2380 
2381 /* Return true if TYPE is a record type whose last field is an array without
2382    a given dimension.  */
2383 
2384 static bool
2385 flexible_array_member_type_p (const_tree type)
2386 {
2387   if (TREE_CODE (type) != RECORD_TYPE)
2388     return false;
2389 
2390   const_tree last_field = NULL_TREE;
2391   for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2392     last_field = f;
2393 
2394   if (!last_field)
2395     return false;
2396 
2397   const_tree last_field_type = TREE_TYPE (last_field);
2398   if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2399     return false;
2400 
2401   return (! TYPE_DOMAIN (last_field_type)
2402 	  || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2403 }
2404 
2405 /* Emit a PTX variable decl and prepare for emission of its
2406    initializer.  NAME is the symbol name and SECTION the PTX data
2407    area.  The type is TYPE, the object size is SIZE and the alignment
2408    is ALIGN.  The caller has already emitted any indentation and linkage
2409    specifier.  It is responsible for any initializer, terminating ;
2410    and newline.  SIZE is in bytes, ALIGN is in bits -- confusingly,
2411    this is the opposite way round from what PTX wants!  */
2412 
2413 static void
2414 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2415 			   const_tree type, HOST_WIDE_INT size, unsigned align,
2416 			   bool undefined = false)
2417 {
2418   bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2419     && (TYPE_DOMAIN (type) == NULL_TREE);
2420 
2421   if (undefined && flexible_array_member_type_p (type))
2422     {
2423       size = 0;
2424       atype = true;
2425     }
2426 
2427   while (TREE_CODE (type) == ARRAY_TYPE)
2428     type = TREE_TYPE (type);
2429 
2430   if (TREE_CODE (type) == VECTOR_TYPE
2431       || TREE_CODE (type) == COMPLEX_TYPE)
2432     /* Neither vector nor complex types can contain the other.  */
2433     type = TREE_TYPE (type);
2434 
2435   unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);
2436 
2437   /* Largest mode we're prepared to accept.  For BLKmode types we
2438      don't know if they'll contain pointer constants, so we have to
2439      choose pointer size; otherwise we can choose DImode.  */
2440   machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2441 
2442   elt_size |= GET_MODE_SIZE (elt_mode);
2443   elt_size &= -elt_size; /* Extract LSB set.  */
2444 
2445   init_frag.size = elt_size;
2446   /* Avoid undefined shift behavior by using '2'.  */
2447   init_frag.mask = ((unsigned HOST_WIDE_INT)2
2448 		    << (elt_size * BITS_PER_UNIT - 1)) - 1;
2449   init_frag.val = 0;
2450   init_frag.offset = 0;
2451   init_frag.started = false;
2452   /* Size might not be a multiple of elt size, if there's an
2453      initialized trailing struct array with smaller type than
2454      elt_size. */
2455   init_frag.remaining = (size + elt_size - 1) / elt_size;
2456 
2457   fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
2458 	   section, align / BITS_PER_UNIT,
2459 	   elt_size * BITS_PER_UNIT);
2460   assemble_name (file, name);
2461 
2462   if (size)
2463     /* We make everything an array, to simplify any initialization
2464        emission.  */
2465     fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
2466   else if (atype)
2467     fprintf (file, "[]");
2468 }
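
/* For instance, a file-scope "int arr[10]" with 4-byte alignment in the
   global data area would begin as

	.global .align 4 .u32 arr[10]

   (an illustrative rendering; the caller supplies the linkage marker,
   any initializer and the terminating semicolon).  */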
2469 
2470 /* Called when the initializer for a decl has been completely output through
2471    combinations of the three functions above.  */
2472 
2473 static void
2474 nvptx_assemble_decl_end (void)
2475 {
2476   if (init_frag.offset)
2477     /* This can happen with a packed struct with trailing array member.  */
2478     nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2479   fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2480 }
2481 
2482 /* Output an uninitialized common or file-scope variable.  */
2483 
2484 void
2485 nvptx_output_aligned_decl (FILE *file, const char *name,
2486 			   const_tree decl, HOST_WIDE_INT size, unsigned align)
2487 {
2488   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2489 
2490   /* If this is public, it is common.  The nearest thing we have to
2491      common is weak.  */
2492   fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2493 
2494   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2495 			     TREE_TYPE (decl), size, align);
2496   nvptx_assemble_decl_end ();
2497 }
2498 
2499 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME.  Begin the process of
2500    writing a constant variable EXP with NAME and SIZE and its
2501    initializer to FILE.  */
2502 
2503 static void
2504 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2505 				 const_tree exp, HOST_WIDE_INT obj_size)
2506 {
2507   write_var_marker (file, true, false, name);
2508 
2509   fprintf (file, "\t");
2510 
2511   tree type = TREE_TYPE (exp);
2512   nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2513 			     TYPE_ALIGN (type));
2514 }
2515 
2516 /* Implement the ASM_DECLARE_OBJECT_NAME macro.  Used to start writing
2517    a variable DECL with NAME to FILE.  */
2518 
2519 void
2520 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2521 {
2522   write_var_marker (file, true, TREE_PUBLIC (decl), name);
2523 
2524   fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2525 			  : DECL_WEAK (decl) ? ".weak " : ".visible "));
2526 
2527   tree type = TREE_TYPE (decl);
2528   HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2529   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2530 			     type, obj_size, DECL_ALIGN (decl));
2531 }
2532 
2533 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing.  */
2534 
2535 static void
2536 nvptx_globalize_label (FILE *, const char *)
2537 {
2538 }
2539 
2540 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL.  Write an extern
2541    declaration only for variable DECL with NAME to FILE.  */
2542 
2543 static void
2544 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2545 {
2546   /* The middle end can place constant pool decls into the varpool as
2547      undefined.  Until that is fixed, catch the problem here.  */
2548   if (DECL_IN_CONSTANT_POOL (decl))
2549     return;
2550 
2551   /* We support weak definitions, and hence have the right
2552      ASM_WEAKEN_DECL definition.  Diagnose the problem here.  */
2553   if (DECL_WEAK (decl))
2554     error_at (DECL_SOURCE_LOCATION (decl),
2555 	      "PTX does not support weak declarations"
2556 	      " (only weak definitions)");
2557   write_var_marker (file, false, TREE_PUBLIC (decl), name);
2558 
2559   fprintf (file, "\t.extern ");
2560   tree size = DECL_SIZE_UNIT (decl);
2561   nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2562 			     TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2563 			     DECL_ALIGN (decl), true);
2564   nvptx_assemble_decl_end ();
2565 }
2566 
2567 /* Output a pattern for a move instruction.  */
2568 
2569 const char *
2570 nvptx_output_mov_insn (rtx dst, rtx src)
2571 {
2572   machine_mode dst_mode = GET_MODE (dst);
2573   machine_mode src_mode = GET_MODE (src);
2574   machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2575 			    ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2576   machine_mode src_inner = (GET_CODE (src) == SUBREG
2577 			    ? GET_MODE (XEXP (src, 0)) : dst_mode);
2578 
2579   rtx sym = src;
2580   if (GET_CODE (sym) == CONST)
2581     sym = XEXP (XEXP (sym, 0), 0);
2582   if (SYMBOL_REF_P (sym))
2583     {
2584       if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2585 	return "%.\tcvta%D1%t0\t%0, %1;";
2586       nvptx_maybe_record_fnsym (sym);
2587     }
2588 
2589   if (src_inner == dst_inner)
2590     return "%.\tmov%t0\t%0, %1;";
2591 
2592   if (CONSTANT_P (src))
2593     return (GET_MODE_CLASS (dst_inner) == MODE_INT
2594 	    && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2595 	    ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2596 
2597   if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2598     {
2599       if (GET_MODE_BITSIZE (dst_mode) == 128
2600 	  && GET_MODE_BITSIZE (src_mode) == 128)
2601 	{
2602 	  /* mov.b128 is not supported.  */
2603 	  if (dst_inner == V2DImode && src_inner == TImode)
2604 	    return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2605 	  else if (dst_inner == TImode && src_inner == V2DImode)
2606 	    return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2607 
2608 	  gcc_unreachable ();
2609 	}
2610       return "%.\tmov.b%T0\t%0, %1;";
2611     }
2612 
2613   if (GET_MODE_BITSIZE (src_inner) == 128
2614       && GET_MODE_BITSIZE (src_mode) == 64)
2615     return "%.\tmov.b%T0\t%0, %1;";
2616 
2617   return "%.\tcvt%t0%t1\t%0, %1;";
2618 }
2619 
2620 /* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL.  */
2621 
2622 static void
2623 nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p)
2624 {
2625   bool post_p = !pre_p;
2626 
2627   switch (memmodel)
2628     {
2629     case MEMMODEL_RELAXED:
2630       return;
2631     case MEMMODEL_CONSUME:
2632     case MEMMODEL_ACQUIRE:
2633     case MEMMODEL_SYNC_ACQUIRE:
2634       if (post_p)
2635 	break;
2636       return;
2637     case MEMMODEL_RELEASE:
2638     case MEMMODEL_SYNC_RELEASE:
2639       if (pre_p)
2640 	break;
2641       return;
2642     case MEMMODEL_ACQ_REL:
2643     case MEMMODEL_SEQ_CST:
2644     case MEMMODEL_SYNC_SEQ_CST:
2645       if (pre_p || post_p)
2646 	break;
2647       return;
2648     default:
2649       gcc_unreachable ();
2650     }
2651 
2652   output_asm_insn ("%.\tmembar%B0;", mem_operand);
2653 }
2654 
2655 const char *
2656 nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos,
2657 			  int memmodel_pos)
2658 {
2659   nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2660 			true);
2661   output_asm_insn (asm_template, operands);
2662   nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2663 			false);
2664   return "";
2665 }
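
/* As a sketch (register numbers are invented), a seq-cst atomic add on
   generic -- and hence potentially global -- memory is bracketed by two
   barriers:

	membar.sys;
	atom.add.u32	%r26, [%r25], %r27;
	membar.sys;

   whereas a relaxed atomic emits no barriers at all.  */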
2666 
2667 static void nvptx_print_operand (FILE *, rtx, int);
2668 
2669 /* Output INSN, which is a call to CALLEE with result RESULT.  For ptx, this
2670    involves writing .param declarations and in/out copies into them.  For
2671    indirect calls, also write the .callprototype.  */
2672 
2673 const char *
2674 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2675 {
2676   char buf[16];
2677   static int labelno;
2678   bool needs_tgt = register_operand (callee, Pmode);
2679   rtx pat = PATTERN (insn);
2680   if (GET_CODE (pat) == COND_EXEC)
2681     pat = COND_EXEC_CODE (pat);
2682   int arg_end = XVECLEN (pat, 0);
2683   tree decl = NULL_TREE;
2684 
2685   fprintf (asm_out_file, "\t{\n");
2686   if (result != NULL)
2687     fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2688 	     nvptx_ptx_type_from_mode (GET_MODE (result), false),
2689 	     reg_names[NVPTX_RETURN_REGNUM]);
2690 
2691   /* Ensure we have a ptx declaration in the output if necessary.  */
2692   if (GET_CODE (callee) == SYMBOL_REF)
2693     {
2694       decl = SYMBOL_REF_DECL (callee);
2695       if (!decl
2696 	  || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2697 	nvptx_record_libfunc (callee, result, pat);
2698       else if (DECL_EXTERNAL (decl))
2699 	nvptx_record_fndecl (decl);
2700     }
2701 
2702   if (needs_tgt)
2703     {
2704       ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2705       labelno++;
2706       ASM_OUTPUT_LABEL (asm_out_file, buf);
2707       std::stringstream s;
2708       write_fn_proto_from_insn (s, NULL, result, pat);
2709       fputs (s.str().c_str(), asm_out_file);
2710     }
2711 
2712   for (int argno = 1; argno < arg_end; argno++)
2713     {
2714       rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2715       machine_mode mode = GET_MODE (t);
2716       const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2717 
2718       /* Mode splitting has already been done.  */
2719       fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2720 	       "\t\tst.param%s [%%out_arg%d], ",
2721 	       ptx_type, argno, ptx_type, argno);
2722       output_reg (asm_out_file, REGNO (t), VOIDmode);
2723       fprintf (asm_out_file, ";\n");
2724     }
2725 
2726   /* The '.' stands for the call's predicate, if any.  */
2727   nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2728   fprintf (asm_out_file, "\t\tcall ");
2729   if (result != NULL_RTX)
2730     fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2731 
2732   if (decl)
2733     {
2734       char *replaced_dots = NULL;
2735       const char *name = get_fnname_from_decl (decl);
2736       const char *replacement = nvptx_name_replacement (name);
2737       if (replacement != name)
2738 	name = replacement;
2739       else
2740 	{
2741 	  replaced_dots = nvptx_replace_dot (name);
2742 	  if (replaced_dots)
2743 	    name = replaced_dots;
2744 	}
2745       assemble_name (asm_out_file, name);
2746       if (replaced_dots)
2747 	XDELETE (replaced_dots);
2748     }
2749   else
2750     output_address (VOIDmode, callee);
2751 
2752   const char *open = "(";
2753   for (int argno = 1; argno < arg_end; argno++)
2754     {
2755       fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2756       open = "";
2757     }
2758   if (decl && DECL_STATIC_CHAIN (decl))
2759     {
2760       fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2761       open = "";
2762     }
2763   if (!open[0])
2764     fprintf (asm_out_file, ")");
2765 
2766   if (needs_tgt)
2767     {
2768       fprintf (asm_out_file, ", ");
2769       assemble_name (asm_out_file, buf);
2770     }
2771   fprintf (asm_out_file, ";\n");
2772 
2773   if (find_reg_note (insn, REG_NORETURN, NULL))
2774     {
2775       /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2776 	 the flow-control barrier they imply.  It can segfault if it
2777 	 encounters what looks like an unexitable loop.  Emit a trailing
2778 	 trap and exit, which it does grok.  */
2779       fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2780       fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2781     }
2782 
2783   if (result)
2784     {
2785       static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2786 
2787       if (!rval[0])
2788 	/* We must escape the '%' that starts RETURN_REGNUM.  */
2789 	sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2790 		 reg_names[NVPTX_RETURN_REGNUM]);
2791       return rval;
2792     }
2793 
2794   return "}";
2795 }
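
/* As an illustration (register numbers are invented), a direct call
   "x = foo (y)" passing and returning 32-bit values comes out roughly as

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32	%r24, [%value_in];
	}

   For an indirect call, a .callprototype is emitted under an internal
   label and that label is appended to the call instruction.  */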
2796 
2797 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */
2798 
2799 static bool
2800 nvptx_print_operand_punct_valid_p (unsigned char c)
2801 {
2802   return c == '.' || c == '#';
2803 }
2804 
2805 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE.  */
2806 
2807 static void
2808 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2809 {
2810   rtx off;
2811   if (GET_CODE (x) == CONST)
2812     x = XEXP (x, 0);
2813   switch (GET_CODE (x))
2814     {
2815     case PLUS:
2816       off = XEXP (x, 1);
2817       output_address (VOIDmode, XEXP (x, 0));
2818       fprintf (file, "+");
2819       output_address (VOIDmode, off);
2820       break;
2821 
2822     case SYMBOL_REF:
2823     case LABEL_REF:
2824       output_addr_const (file, x);
2825       break;
2826 
2827     default:
2828       gcc_assert (GET_CODE (x) != MEM);
2829       nvptx_print_operand (file, x, 0);
2830       break;
2831     }
2832 }
2833 
2834 /* Write assembly language output for the address ADDR to FILE.  */
2835 
2836 static void
2837 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2838 {
2839   nvptx_print_address_operand (file, addr, mode);
2840 }
2841 
2842 static nvptx_data_area
2843 nvptx_mem_data_area (const_rtx x)
2844 {
2845   gcc_assert (GET_CODE (x) == MEM);
2846 
2847   const_rtx addr = XEXP (x, 0);
2848   subrtx_iterator::array_type array;
2849   FOR_EACH_SUBRTX (iter, array, addr, ALL)
2850     if (SYMBOL_REF_P (*iter))
2851       return SYMBOL_DATA_AREA (*iter);
2852 
2853   return DATA_AREA_GENERIC;
2854 }
2855 
2856 bool
2857 nvptx_mem_maybe_shared_p (const_rtx x)
2858 {
2859   nvptx_data_area area = nvptx_mem_data_area (x);
2860   return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
2861 }
2862 
2863 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2864 
2865    Meaning of CODE:
2866    . -- print the predicate for the instruction or an empty string for an
2867         unconditional one.
2868    # -- print a rounding mode for the instruction
2869 
2870    A -- print a data area for a MEM
2871    c -- print an opcode suffix for a comparison operator, including a type code
2872    D -- print a data area for a MEM operand
2873    S -- print a shuffle kind specified by CONST_INT
2874    t -- print a type opcode suffix, promoting QImode to 32 bits
2875    T -- print a type size in bits
2876    u -- print a type opcode suffix without promotions.
2877    x -- print a destination operand that may also be a bit bucket.  */
2878 
2879 static void
2880 nvptx_print_operand (FILE *file, rtx x, int code)
2881 {
2882   if (code == '.')
2883     {
2884       x = current_insn_predicate;
2885       if (x)
2886 	{
2887 	  fputs ("@", file);
2888 	  if (GET_CODE (x) == EQ)
2889 	    fputs ("!", file);
2890 	  output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2891 	}
2892       return;
2893     }
2894   else if (code == '#')
2895     {
2896       fputs (".rn", file);
2897       return;
2898     }
2899 
2900   enum rtx_code x_code = GET_CODE (x);
2901   machine_mode mode = GET_MODE (x);
2902 
2903   switch (code)
2904     {
2905     case 'x':
2906       if (current_output_insn != NULL
2907 	  && find_reg_note (current_output_insn, REG_UNUSED, x) != NULL_RTX)
2908 	{
2909 	  fputs ("_", file);
2910 	  return;
2911 	}
2912       goto common;
2913     case 'B':
2914       if (SYMBOL_REF_P (XEXP (x, 0)))
2915 	switch (SYMBOL_DATA_AREA (XEXP (x, 0)))
2916 	  {
2917 	  case DATA_AREA_GENERIC:
2918 	    /* Assume worst-case: global.  */
2919 	    gcc_fallthrough (); /* FALLTHROUGH.  */
2920 	  case DATA_AREA_GLOBAL:
2921 	    break;
2922 	  case DATA_AREA_SHARED:
2923 	    fputs (".cta", file);
2924 	    return;
2925 	  case DATA_AREA_LOCAL:
2926 	  case DATA_AREA_CONST:
2927 	  case DATA_AREA_PARAM:
2928 	  default:
2929 	    gcc_unreachable ();
2930 	  }
2931 
2932       /* There are two cases where membar.sys differs from membar.gl:
2933 	 - the host accesses global memory (e.g. systemwide atomics);
2934 	 - two or more devices are set up in peer-to-peer mode, and one
2935 	   peer can access the global memory of another peer.
2936 	 Neither is currently supported by OpenMP/OpenACC on nvptx, but
2937 	 that could change, so we default to membar.sys.  We could support
2938 	 this more optimally by adding DATA_AREA_SYS and then emitting
2939 	 .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS.  */
2940       fputs (".sys", file);
2941       return;
2942 
2943     case 'A':
2944       x = XEXP (x, 0);
2945       gcc_fallthrough (); /* FALLTHROUGH. */
2946 
2947     case 'D':
2948       if (GET_CODE (x) == CONST)
2949 	x = XEXP (x, 0);
2950       if (GET_CODE (x) == PLUS)
2951 	x = XEXP (x, 0);
2952 
2953       if (GET_CODE (x) == SYMBOL_REF)
2954 	fputs (section_for_sym (x), file);
2955       break;
2956 
2957     case 't':
2958     case 'u':
2959       if (x_code == SUBREG)
2960 	{
2961 	  machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2962 	  if (VECTOR_MODE_P (inner_mode)
2963 	      && (GET_MODE_SIZE (mode)
2964 		  <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2965 	    mode = GET_MODE_INNER (inner_mode);
2966 	  else if (split_mode_p (inner_mode))
2967 	    mode = maybe_split_mode (inner_mode);
2968 	  else
2969 	    mode = inner_mode;
2970 	}
2971       fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2972       break;
2973 
2974     case 'H':
2975     case 'L':
2976       {
2977 	rtx inner_x = SUBREG_REG (x);
2978 	machine_mode inner_mode = GET_MODE (inner_x);
2979 	machine_mode split = maybe_split_mode (inner_mode);
2980 
2981 	output_reg (file, REGNO (inner_x), split,
2982 		    (code == 'H'
2983 		     ? GET_MODE_SIZE (inner_mode) / 2
2984 		     : 0));
2985       }
2986       break;
2987 
2988     case 'S':
2989       {
2990 	nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2991 	/* Same order as nvptx_shuffle_kind.  */
2992 	static const char *const kinds[] =
2993 	  {".up", ".down", ".bfly", ".idx"};
2994 	fputs (kinds[kind], file);
2995       }
2996       break;
2997 
2998     case 'T':
2999       fprintf (file, "%d", GET_MODE_BITSIZE (mode));
3000       break;
3001 
3002     case 'j':
3003       fprintf (file, "@");
3004       goto common;
3005 
3006     case 'J':
3007       fprintf (file, "@!");
3008       goto common;
3009 
3010     case 'c':
3011       mode = GET_MODE (XEXP (x, 0));
3012       switch (x_code)
3013 	{
3014 	case EQ:
3015 	  fputs (".eq", file);
3016 	  break;
3017 	case NE:
3018 	  if (FLOAT_MODE_P (mode))
3019 	    fputs (".neu", file);
3020 	  else
3021 	    fputs (".ne", file);
3022 	  break;
3023 	case LE:
3024 	case LEU:
3025 	  fputs (".le", file);
3026 	  break;
3027 	case GE:
3028 	case GEU:
3029 	  fputs (".ge", file);
3030 	  break;
3031 	case LT:
3032 	case LTU:
3033 	  fputs (".lt", file);
3034 	  break;
3035 	case GT:
3036 	case GTU:
3037 	  fputs (".gt", file);
3038 	  break;
3039 	case LTGT:
3040 	  fputs (".ne", file);
3041 	  break;
3042 	case UNEQ:
3043 	  fputs (".equ", file);
3044 	  break;
3045 	case UNLE:
3046 	  fputs (".leu", file);
3047 	  break;
3048 	case UNGE:
3049 	  fputs (".geu", file);
3050 	  break;
3051 	case UNLT:
3052 	  fputs (".ltu", file);
3053 	  break;
3054 	case UNGT:
3055 	  fputs (".gtu", file);
3056 	  break;
3057 	case UNORDERED:
3058 	  fputs (".nan", file);
3059 	  break;
3060 	case ORDERED:
3061 	  fputs (".num", file);
3062 	  break;
3063 	default:
3064 	  gcc_unreachable ();
3065 	}
3066       if (FLOAT_MODE_P (mode)
3067 	  || x_code == EQ || x_code == NE
3068 	  || x_code == GEU || x_code == GTU
3069 	  || x_code == LEU || x_code == LTU)
3070 	fputs (nvptx_ptx_type_from_mode (mode, true), file);
3071       else
3072 	fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
3073       break;
3074     default:
3075     common:
3076       switch (x_code)
3077 	{
3078 	case SUBREG:
3079 	  {
3080 	    rtx inner_x = SUBREG_REG (x);
3081 	    machine_mode inner_mode = GET_MODE (inner_x);
3082 	    machine_mode split = maybe_split_mode (inner_mode);
3083 
3084 	    if (VECTOR_MODE_P (inner_mode)
3085 		&& (GET_MODE_SIZE (mode)
3086 		    <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
3087 	      {
3088 		output_reg (file, REGNO (inner_x), VOIDmode);
3089 		fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
3090 	      }
3091 	    else if (split_mode_p (inner_mode)
3092 		&& (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
3093 	      output_reg (file, REGNO (inner_x), split);
3094 	    else
3095 	      output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
3096 	  }
3097 	  break;
3098 
3099 	case REG:
3100 	  output_reg (file, REGNO (x), maybe_split_mode (mode));
3101 	  break;
3102 
3103 	case MEM:
3104 	  fputc ('[', file);
3105 	  nvptx_print_address_operand (file, XEXP (x, 0), mode);
3106 	  fputc (']', file);
3107 	  break;
3108 
3109 	case CONST_INT:
3110 	  output_addr_const (file, x);
3111 	  break;
3112 
3113 	case CONST:
3114 	case SYMBOL_REF:
3115 	case LABEL_REF:
3116 	  /* We could use output_addr_const, but that can print things like
3117 	     "x-8", which breaks ptxas.  Need to ensure it is output as
3118 	     "x+-8".  */
3119 	  nvptx_print_address_operand (file, x, VOIDmode);
3120 	  break;
3121 
3122 	case CONST_DOUBLE:
3123 	  long vals[2];
3124 	  real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
3125 	  vals[0] &= 0xffffffff;
3126 	  vals[1] &= 0xffffffff;
3127 	  if (mode == SFmode)
3128 	    fprintf (file, "0f%08lx", vals[0]);
3129 	  else
3130 	    fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
3131 	  break;
3132 
3133 	case CONST_VECTOR:
3134 	  {
3135 	    unsigned n = CONST_VECTOR_NUNITS (x);
3136 	    fprintf (file, "{ ");
3137 	    for (unsigned i = 0; i < n; ++i)
3138 	      {
3139 		if (i != 0)
3140 		  fprintf (file, ", ");
3141 
3142 		rtx elem = CONST_VECTOR_ELT (x, i);
3143 		output_addr_const (file, elem);
3144 	      }
3145 	    fprintf (file, " }");
3146 	  }
3147 	  break;
3148 
3149 	default:
3150 	  output_addr_const (file, x);
3151 	}
3152     }
3153 }
3154 
3155 /* Record replacement regs used to deal with subreg operands.  */
3156 struct reg_replace
3157 {
3158   rtx replacement[MAX_RECOG_OPERANDS];
3159   machine_mode mode;
3160   int n_allocated;
3161   int n_in_use;
3162 };
3163 
3164 /* Allocate or reuse a replacement in R and return the rtx.  */
3165 
3166 static rtx
3167 get_replacement (struct reg_replace *r)
3168 {
3169   if (r->n_allocated == r->n_in_use)
3170     r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
3171   return r->replacement[r->n_in_use++];
3172 }
3173 
3174 /* Clean up subreg operands.  In ptx assembly, everything is typed, and
3175    the presence of subregs would break the rules for most instructions.
3176    Replace them with a suitable new register of the right size, plus
3177    conversion copyin/copyout instructions.  */
3178 
3179 static void
3180 nvptx_reorg_subreg (void)
3181 {
3182   struct reg_replace qiregs, hiregs, siregs, diregs;
3183   rtx_insn *insn, *next;
3184 
3185   qiregs.n_allocated = 0;
3186   hiregs.n_allocated = 0;
3187   siregs.n_allocated = 0;
3188   diregs.n_allocated = 0;
3189   qiregs.mode = QImode;
3190   hiregs.mode = HImode;
3191   siregs.mode = SImode;
3192   diregs.mode = DImode;
3193 
3194   for (insn = get_insns (); insn; insn = next)
3195     {
3196       next = NEXT_INSN (insn);
3197       if (!NONDEBUG_INSN_P (insn)
3198 	  || asm_noperands (PATTERN (insn)) >= 0
3199 	  || GET_CODE (PATTERN (insn)) == USE
3200 	  || GET_CODE (PATTERN (insn)) == CLOBBER)
3201 	continue;
3202 
3203       qiregs.n_in_use = 0;
3204       hiregs.n_in_use = 0;
3205       siregs.n_in_use = 0;
3206       diregs.n_in_use = 0;
3207       extract_insn (insn);
3208       enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
3209 
3210       for (int i = 0; i < recog_data.n_operands; i++)
3211 	{
3212 	  rtx op = recog_data.operand[i];
3213 	  if (GET_CODE (op) != SUBREG)
3214 	    continue;
3215 
3216 	  rtx inner = SUBREG_REG (op);
3217 
3218 	  machine_mode outer_mode = GET_MODE (op);
3219 	  machine_mode inner_mode = GET_MODE (inner);
3220 	  gcc_assert (s_ok);
3221 	  if (s_ok
3222 	      && (GET_MODE_PRECISION (inner_mode)
3223 		  >= GET_MODE_PRECISION (outer_mode)))
3224 	    continue;
3225 	  gcc_assert (SCALAR_INT_MODE_P (outer_mode));
3226 	  struct reg_replace *r = (outer_mode == QImode ? &qiregs
3227 				   : outer_mode == HImode ? &hiregs
3228 				   : outer_mode == SImode ? &siregs
3229 				   : &diregs);
3230 	  rtx new_reg = get_replacement (r);
3231 
3232 	  if (recog_data.operand_type[i] != OP_OUT)
3233 	    {
3234 	      enum rtx_code code;
3235 	      if (GET_MODE_PRECISION (inner_mode)
3236 		  < GET_MODE_PRECISION (outer_mode))
3237 		code = ZERO_EXTEND;
3238 	      else
3239 		code = TRUNCATE;
3240 
3241 	      rtx pat = gen_rtx_SET (new_reg,
3242 				     gen_rtx_fmt_e (code, outer_mode, inner));
3243 	      emit_insn_before (pat, insn);
3244 	    }
3245 
3246 	  if (recog_data.operand_type[i] != OP_IN)
3247 	    {
3248 	      enum rtx_code code;
3249 	      if (GET_MODE_PRECISION (inner_mode)
3250 		  < GET_MODE_PRECISION (outer_mode))
3251 		code = TRUNCATE;
3252 	      else
3253 		code = ZERO_EXTEND;
3254 
3255 	      rtx pat = gen_rtx_SET (inner,
3256 				     gen_rtx_fmt_e (code, inner_mode, new_reg));
3257 	      emit_insn_after (pat, insn);
3258 	    }
3259 	  validate_change (insn, recog_data.operand_loc[i], new_reg, false);
3260 	}
3261     }
3262 }
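
/* A hypothetical before/after: an insn reading (subreg:SI (reg:QI 100) 0)
   is rewritten to read a fresh (reg:SI 200), with

     (set (reg:SI 200) (zero_extend:SI (reg:QI 100)))

   emitted before it; if the subreg were written instead, the inverse
   truncating copy back into (reg:QI 100) would be emitted after the
   insn.  */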
3263 
3264 /* Return a SImode "master lane index" register for uniform-simt, allocating on
3265    first use.  */
3266 
3267 static rtx
3268 nvptx_get_unisimt_master ()
3269 {
3270   rtx &master = cfun->machine->unisimt_master;
3271   return master ? master : master = gen_reg_rtx (SImode);
3272 }
3273 
3274 /* Return a BImode "predicate" register for uniform-simt, similar to above.  */
3275 
3276 static rtx
3277 nvptx_get_unisimt_predicate ()
3278 {
3279   rtx &pred = cfun->machine->unisimt_predicate;
3280   return pred ? pred : pred = gen_reg_rtx (BImode);
3281 }
3282 
3283 static rtx
3284 nvptx_get_unisimt_outside_simt_predicate ()
3285 {
3286   rtx &pred = cfun->machine->unisimt_outside_simt_predicate;
3287   return pred ? pred : pred = gen_reg_rtx (BImode);
3288 }
3289 
3290 /* Return true if the given call insn references one of the functions
3291    provided by the CUDA runtime: malloc, free, vprintf.  */
3292 
3293 static bool
3294 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
3295 {
3296   rtx pat = PATTERN (insn);
3297   gcc_checking_assert (GET_CODE (pat) == PARALLEL);
3298   pat = XVECEXP (pat, 0, 0);
3299   if (GET_CODE (pat) == SET)
3300     pat = SET_SRC (pat);
3301   gcc_checking_assert (GET_CODE (pat) == CALL
3302 		       && GET_CODE (XEXP (pat, 0)) == MEM);
3303   rtx addr = XEXP (XEXP (pat, 0), 0);
3304   if (GET_CODE (addr) != SYMBOL_REF)
3305     return false;
3306   const char *name = XSTR (addr, 0);
3307   /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
3308      references with forced assembler name refer to PTX syscalls.  For vprintf,
3309      accept both normal and forced-assembler-name references.  */
3310   return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
3311 	  || !strcmp (name, "*malloc")
3312 	  || !strcmp (name, "*free"));
3313 }
3314 
3315 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
3316    propagate its value from lane MASTER to current lane.  */
3317 
3318 static bool
3319 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
3320 {
3321   rtx reg;
3322   if (GET_CODE (set) == SET
3323       && REG_P (reg = SET_DEST (set))
3324       && find_reg_note (insn, REG_UNUSED, reg) == NULL_RTX)
3325     {
3326       emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX),
3327 		       insn);
3328       return true;
3329     }
3330 
3331   return false;
3332 }
3333 
3334 static void
3335 predicate_insn (rtx_insn *insn, rtx pred)
3336 {
3337   rtx pat = PATTERN (insn);
3338   pred = gen_rtx_NE (BImode, pred, const0_rtx);
3339   pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
3340   bool changed_p = validate_change (insn, &PATTERN (insn), pat, false);
3341   gcc_assert (changed_p);
3342 }
3343 
3344 /* Adjust code for uniform-simt code generation variant by making atomics and
3345    "syscalls" conditionally executed, and inserting shuffle-based propagation
3346    for registers being set.  */
3347 
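/* As an editorial sketch (not from the original sources) of what this
   produces: an atomic such as

	atom.add.u32	%r1, [%addr], %r2;

   is predicated so that only the master lane executes it, and the
   result is then shuffled to the other lanes, roughly

	@%unisimt_pred	atom.add.u32	%r1, [%addr], %r2;
	shfl.idx.b32	%r1, %r1, %unisimt_master, 31;

   where %unisimt_pred and %unisimt_master stand for the registers
   returned by the helpers above; the exact shuffle and warp-sync form
   depends on the PTX ISA version targeted.  */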
3348 static void
3349 nvptx_reorg_uniform_simt ()
3350 {
3351   rtx_insn *insn, *next;
3352 
3353   for (insn = get_insns (); insn; insn = next)
3354     {
3355       next = NEXT_INSN (insn);
3356 
3357       /* Skip NOTE, USE, etc.  */
3358       if (!INSN_P (insn) || recog_memoized (insn) == -1)
3359 	continue;
3360 
3361       if (CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
3362 	{
3363 	  /* Handle syscall.  */
3364 	}
3365       else if (get_attr_atomic (insn))
3366 	{
3367 	  /* Handle atomic insn.  */
3368 	}
3369       else
3370 	continue;
3371 
3372       rtx pat = PATTERN (insn);
3373       rtx master = nvptx_get_unisimt_master ();
3374       bool shuffle_p = false;
3375       switch (GET_CODE (pat))
3376        {
3377        case PARALLEL:
3378 	 for (int i = 0; i < XVECLEN (pat, 0); i++)
3379 	   shuffle_p
3380 	     |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
3381 	 break;
3382        case SET:
3383 	 shuffle_p |= nvptx_unisimt_handle_set (pat, insn, master);
3384 	 break;
3385        default:
3386 	 gcc_unreachable ();
3387        }
3388 
3389       if (shuffle_p && TARGET_PTX_6_0)
3390 	{
3391 	  /* The shuffle is a sync, so uniformity is guaranteed.  */
3392 	}
3393       else
3394 	{
3395 	  if (TARGET_PTX_6_0)
3396 	    {
3397 	      gcc_assert (!shuffle_p);
3398 	      /* Emit after the insn, to guarantee uniformity.  */
3399 	      emit_insn_after (gen_nvptx_warpsync (), insn);
3400 	    }
3401 	  else
3402 	    {
3403 	      /* Emit after the insn (and before the shuffle, if there are any)
3404 		 to check uniformity.  */
3405 	      emit_insn_after (gen_nvptx_uniform_warp_check (), insn);
3406 	    }
3407 	}
3408 
3409       rtx pred = nvptx_get_unisimt_predicate ();
3410       predicate_insn (insn, pred);
3411 
3412       pred = NULL_RTX;
3413       for (rtx_insn *post = NEXT_INSN (insn); post != next;
3414 	   post = NEXT_INSN (post))
3415 	{
3416 	  if (pred == NULL_RTX)
3417 	    pred = nvptx_get_unisimt_outside_simt_predicate ();
3418 	  predicate_insn (post, pred);
3419 	}
3420     }
3421 }
3422 
3423 /* Offloading function attributes.  */
3424 
3425 struct offload_attrs
3426 {
3427   unsigned mask;
3428   int num_gangs;
3429   int num_workers;
3430   int vector_length;
3431 };
3432 
3433 /* Define entries for cfun->machine->axis_dim.  */
3434 
3435 #define MACH_VECTOR_LENGTH 0
3436 #define MACH_MAX_WORKERS 1
3437 
3438 static void populate_offload_attrs (offload_attrs *oa);
3439 
3440 static void
3441 init_axis_dim (void)
3442 {
3443   offload_attrs oa;
3444   int max_workers;
3445 
3446   populate_offload_attrs (&oa);
3447 
3448   if (oa.num_workers == 0)
3449     max_workers = PTX_CTA_SIZE / oa.vector_length;
3450   else
3451     max_workers = oa.num_workers;
3452 
3453   cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3454   cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3455   cfun->machine->axis_dim_init_p = true;
3456 }
3457 
3458 static int ATTRIBUTE_UNUSED
3459 nvptx_mach_max_workers ()
3460 {
3461   if (!cfun->machine->axis_dim_init_p)
3462     init_axis_dim ();
3463   return cfun->machine->axis_dim[MACH_MAX_WORKERS];
3464 }
3465 
3466 static int ATTRIBUTE_UNUSED
3467 nvptx_mach_vector_length ()
3468 {
3469   if (!cfun->machine->axis_dim_init_p)
3470     init_axis_dim ();
3471   return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3472 }
3473 
3474 /* Loop structure of the function.  The entire function is described as
3475    a NULL loop.  */
3476 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'.  */
3477 
3478 struct parallel
3479 {
3480   /* Parent parallel.  */
3481   parallel *parent;
3482 
3483   /* Next sibling parallel.  */
3484   parallel *next;
3485 
3486   /* First child parallel.  */
3487   parallel *inner;
3488 
3489   /* Partitioning mask of the parallel.  */
3490   unsigned mask;
3491 
3492   /* Partitioning used within inner parallels. */
3493   unsigned inner_mask;
3494 
3495   /* Location of parallel forked and join.  The forked is the first
3496      block in the parallel and the join is the first block after
3497      the partition.  */
3498   basic_block forked_block;
3499   basic_block join_block;
3500 
3501   rtx_insn *forked_insn;
3502   rtx_insn *join_insn;
3503 
3504   rtx_insn *fork_insn;
3505   rtx_insn *joining_insn;
3506 
3507   /* Basic blocks in this parallel, but not in child parallels.  The
3508      FORKED and JOINING blocks are in the partition.  The FORK and JOIN
3509      blocks are not.  */
3510   auto_vec<basic_block> blocks;
3511 
3512 public:
3513   parallel (parallel *parent, unsigned mode);
3514   ~parallel ();
3515 };
3516 
3517 /* Constructor links the new parallel into its parent's chain of
3518    children.  */
3519 
3520 parallel::parallel (parallel *parent_, unsigned mask_)
3521   :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3522 {
3523   forked_block = join_block = 0;
3524   forked_insn = join_insn = 0;
3525   fork_insn = joining_insn = 0;
3526 
3527   if (parent)
3528     {
3529       next = parent->inner;
3530       parent->inner = this;
3531     }
3532 }
3533 
3534 parallel::~parallel ()
3535 {
3536   delete inner;
3537   delete next;
3538 }
3539 
3540 /* Map of basic blocks to insns.  */
3541 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3542 
3543 /* A tuple of an insn of interest and the BB in which it resides.  */
3544 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3545 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3546 
3547 /* Split basic blocks such that the forked and join unspecs are at
3548    the start of their basic blocks.  Thus afterwards each block will
3549    have a single partitioning mode.  We also do the same for return
3550    insns, as they are executed by every thread.  Return the
3551    partitioning mode of the function as a whole.  Populate MAP with
3552    head and tail blocks.  We also clear the BB visited flag, which is
3553    used when finding partitions.  */
3554 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'.  */
3555 
3556 static void
3557 nvptx_split_blocks (bb_insn_map_t *map)
3558 {
3559   insn_bb_vec_t worklist;
3560   basic_block block;
3561   rtx_insn *insn;
3562 
3563   /* Locate all the reorg instructions of interest.  */
3564   FOR_ALL_BB_FN (block, cfun)
3565     {
3566       bool seen_insn = false;
3567 
3568       /* Clear visited flag, for use by the parallel locator.  */
3569       block->flags &= ~BB_VISITED;
3570 
3571       FOR_BB_INSNS (block, insn)
3572 	{
3573 	  if (!INSN_P (insn))
3574 	    continue;
3575 	  switch (recog_memoized (insn))
3576 	    {
3577 	    default:
3578 	      seen_insn = true;
3579 	      continue;
3580 	    case CODE_FOR_nvptx_forked:
3581 	    case CODE_FOR_nvptx_join:
3582 	      break;
3583 
3584 	    case CODE_FOR_return:
3585 	      /* We also need to split just before return insns, as
3586 		 that insn needs executing by all threads, but the
3587 		 block it is in probably does not.  */
3588 	      break;
3589 	    }
3590 
3591 	  if (seen_insn)
3592 	    /* We've found an instruction that must be at the start of
3593 	       a block, but isn't.  Add it to the worklist.  */
3594 	    worklist.safe_push (insn_bb_t (insn, block));
3595 	  else
3596 	    /* It was already the first instruction.  Just add it to
3597 	       the map.  */
3598 	    map->get_or_insert (block) = insn;
3599 	  seen_insn = true;
3600 	}
3601     }
3602 
3603   /* Split blocks on the worklist.  */
3604   unsigned ix;
3605   insn_bb_t *elt;
3606   basic_block remap = 0;
3607   for (ix = 0; worklist.iterate (ix, &elt); ix++)
3608     {
3609       if (remap != elt->second)
3610 	{
3611 	  block = elt->second;
3612 	  remap = block;
3613 	}
3614 
3615       /* Split block before insn.  The insn is in the new block.  */
3616       edge e = split_block (block, PREV_INSN (elt->first));
3617 
3618       block = e->dest;
3619       map->get_or_insert (block) = elt->first;
3620     }
3621 }
3622 
3623 /* Return true if MASK contains parallelism that requires shared
3624    memory to broadcast.  */
3625 
3626 static bool
3627 nvptx_needs_shared_bcast (unsigned mask)
3628 {
3629   bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3630   bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3631     && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3632 
3633   return worker || large_vector;
3634 }
3635 
3636 /* BLOCK is a basic block containing a head or tail instruction.
3637    Locate the associated prehead or pretail instruction, which must be
3638    in the single predecessor block.  */
3639 
3640 static rtx_insn *
3641 nvptx_discover_pre (basic_block block, int expected)
3642 {
3643   gcc_assert (block->preds->length () == 1);
3644   basic_block pre_block = (*block->preds)[0]->src;
3645   rtx_insn *pre_insn;
3646 
3647   for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3648        pre_insn = PREV_INSN (pre_insn))
3649     gcc_assert (pre_insn != BB_HEAD (pre_block));
3650 
3651   gcc_assert (recog_memoized (pre_insn) == expected);
3652   return pre_insn;
3653 }
3654 
3655 /* Dump this parallel and all its inner parallels.  */
3656 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'.  */
3657 
3658 static void
3659 nvptx_dump_pars (parallel *par, unsigned depth)
3660 {
3661   fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3662 	   depth, par->mask,
3663 	   par->forked_block ? par->forked_block->index : -1,
3664 	   par->join_block ? par->join_block->index : -1);
3665 
3666   fprintf (dump_file, "    blocks:");
3667 
3668   basic_block block;
3669   for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3670     fprintf (dump_file, " %d", block->index);
3671   fprintf (dump_file, "\n");
3672   if (par->inner)
3673     nvptx_dump_pars (par->inner, depth + 1);
3674 
3675   if (par->next)
3676     nvptx_dump_pars (par->next, depth);
3677 }
3678 
3679 /* If BLOCK contains a fork/join marker, process it to create or
3680    terminate a loop structure.  Add this block to the current loop,
3681    and then walk successor blocks.   */
3682 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'.  */
3683 
3684 static parallel *
3685 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3686 {
3687   if (block->flags & BB_VISITED)
3688     return par;
3689   block->flags |= BB_VISITED;
3690 
3691   if (rtx_insn **endp = map->get (block))
3692     {
3693       rtx_insn *end = *endp;
3694 
3695       /* This is a block head or tail, or return instruction.  */
3696       switch (recog_memoized (end))
3697 	{
3698 	case CODE_FOR_return:
3699 	  /* Return instructions are in their own block, and we
3700 	     don't need to do anything more.  */
3701 	  return par;
3702 
3703 	case CODE_FOR_nvptx_forked:
3704 	  /* Loop head, create a new inner loop and add it into
3705 	     our parent's child list.  */
3706 	  {
3707 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3708 
3709 	    gcc_assert (mask);
3710 	    par = new parallel (par, mask);
3711 	    par->forked_block = block;
3712 	    par->forked_insn = end;
3713 	    if (nvptx_needs_shared_bcast (mask))
3714 	      par->fork_insn
3715 		= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3716 	  }
3717 	  break;
3718 
3719 	case CODE_FOR_nvptx_join:
3720 	  /* A loop tail.  Finish the current loop and return to
3721 	     parent.  */
3722 	  {
3723 	    unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3724 
3725 	    gcc_assert (par->mask == mask);
3726 	    gcc_assert (par->join_block == NULL);
3727 	    par->join_block = block;
3728 	    par->join_insn = end;
3729 	    if (nvptx_needs_shared_bcast (mask))
3730 	      par->joining_insn
3731 		= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3732 	    par = par->parent;
3733 	  }
3734 	  break;
3735 
3736 	default:
3737 	  gcc_unreachable ();
3738 	}
3739     }
3740 
3741   if (par)
3742     /* Add this block onto the current loop's list of blocks.  */
3743     par->blocks.safe_push (block);
3744   else
3745     /* This must be the entry block.  Create a NULL parallel.  */
3746     par = new parallel (0, 0);
3747 
3748   /* Walk successor blocks.  */
3749   edge e;
3750   edge_iterator ei;
3751 
3752   FOR_EACH_EDGE (e, ei, block->succs)
3753     nvptx_find_par (map, par, e->dest);
3754 
3755   return par;
3756 }
3757 
3758 /* DFS walk the CFG looking for fork & join markers.  Construct
3759    loop structures as we go.  MAP is a mapping of basic blocks
3760    to head & tail markers, discovered when splitting blocks.  This
3761    speeds up the discovery.  We rely on the BB visited flag having
3762    been cleared when splitting blocks.  */
3763 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'.  */
3764 
3765 static parallel *
3766 nvptx_discover_pars (bb_insn_map_t *map)
3767 {
3768   basic_block block;
3769 
3770   /* Mark exit blocks as visited.  */
3771   block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3772   block->flags |= BB_VISITED;
3773 
3774   /* And entry block as not.  */
3775   block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3776   block->flags &= ~BB_VISITED;
3777 
3778   parallel *par = nvptx_find_par (map, 0, block);
3779 
3780   if (dump_file)
3781     {
3782       fprintf (dump_file, "\nLoops\n");
3783       nvptx_dump_pars (par, 0);
3784       fprintf (dump_file, "\n");
3785     }
3786 
3787   return par;
3788 }
3789 
3790 /* Analyse a group of BBs within a partitioned region and create N
3791    Single-Entry-Single-Exit regions.  Some of those regions will be
3792    trivial ones consisting of a single BB.  The blocks of a
3793    partitioned region might form a set of disjoint graphs -- because
3794    the region encloses a differently partitioned subregion.
3795 
3796    We use the linear time algorithm described in 'Finding Regions Fast:
3797    Single Entry Single Exit and control Regions in Linear Time'
3798    Johnson, Pearson & Pingali.  That algorithm deals with complete
3799    CFGs, where a back edge is inserted from END to START, and thus the
3800    problem becomes one of finding equivalent loops.
3801 
3802    In this case we have a partial CFG.  We complete it by redirecting
3803    any incoming edge to the graph to be from an arbitrary external BB,
3804    and similarly redirecting any outgoing edge to be to that BB.
3805    Thus we end up with a closed graph.
3806 
3807    The algorithm works by building a spanning tree of an undirected
3808    graph and keeping track of back edges from nodes further from the
3809    root in the tree to nodes nearer to the root in the tree.  In the
3810    description below, the root is up and the tree grows downwards.
3811 
3812    We avoid having to deal with degenerate back-edges to the same
3813    block, by splitting each BB into 3 -- one for input edges, one for
3814    the node itself and one for the output edges.  Such back edges are
3815    referred to as 'Brackets'.  Cycle equivalent nodes will have the
3816    same set of brackets.
3817 
3818    Determining bracket equivalency is done by maintaining a list of
3819    brackets in such a manner that the list length and final bracket
3820    uniquely identify the set.
3821 
3822    We use coloring to mark all BBs with cycle equivalency with the
3823    same color.  This is the output of the 'Finding Regions Fast'
3824    algorithm.  Notice it doesn't actually find the set of nodes within
3825    a particular region, just unorderd sets of nodes that are the
3826    entries and exits of SESE regions.
3827 
3828    After determining cycle equivalency, we need to find the minimal
3829    set of SESE regions.  Do this with a DFS coloring walk of the
3830    complete graph.  We're either 'looking' or 'coloring'.  When
3831    looking, and we're in the subgraph, we start coloring the color of
3832    the current node, and remember that node as the start of the
3833    current color's SESE region.  Every time we go to a new node, we
3834    decrement the count of nodes with that color.  If it reaches zero,
3835    we remember that node as the end of the current color's SESE region
3836    and return to 'looking'.  Otherwise we color the node the current
3837    color.
3838 
3839    This way we end up with coloring the inside of non-trivial SESE
3840    regions with the color of that region.  */
3841 
3842 /* A pair of BBs.  We use this to represent SESE regions.  */
3843 typedef std::pair<basic_block, basic_block> bb_pair_t;
3844 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3845 
3846 /* A node in the undirected CFG.  The discriminator SECOND indicates just
3847    above or just below the BB indicated by FIRST.  */
3848 typedef std::pair<basic_block, int> pseudo_node_t;
3849 
3850 /* A bracket indicates an edge towards the root of the spanning tree of the
3851    undirected graph.  Each bracket has a color, determined
3852    from the current set of brackets.  */
3853 struct bracket
3854 {
3855   pseudo_node_t back; /* Back target */
3856 
3857   /* Current color and size of set.  */
3858   unsigned color;
3859   unsigned size;
3860 
3861   bracket (pseudo_node_t back_)
3862   : back (back_), color (~0u), size (~0u)
3863   {
3864   }
3865 
3866   unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3867   {
3868     if (length != size)
3869       {
3870 	size = length;
3871 	color = color_counts.length ();
3872 	color_counts.quick_push (0);
3873       }
3874     color_counts[color]++;
3875     return color;
3876   }
3877 };
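
/* A brief editorial note on get_color: two nodes are cycle equivalent
   iff their bracket stacks have the same topmost bracket and the same
   length.  get_color encodes that pair by allocating a fresh color the
   first time a given (bracket, length) combination is seen, caching it
   in the bracket, and reusing it on later queries with the same
   length; color_counts records how many nodes received each color.  */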
3878 
3879 typedef auto_vec<bracket> bracket_vec_t;
3880 
3881 /* Basic block info for finding SESE regions.    */
3882 
3883 struct bb_sese
3884 {
3885   int node;  /* Node number in spanning tree.  */
3886   int parent; /* Parent node number.  */
3887 
3888   /* The algorithm splits each node A into Ai, A', Ao. The incoming
3889      edges arrive at pseudo-node Ai and the outgoing edges leave at
3890      pseudo-node Ao.  We have to remember which way we arrived at a
3891      particular node when generating the spanning tree.  dir > 0 means
3892      we arrived at Ai, dir < 0 means we arrived at Ao.  */
3893   int dir;
3894 
3895   /* Lowest numbered pseudo-node reached via a backedge from this
3896      node, or any descendant.  */
3897   pseudo_node_t high;
3898 
3899   int color;  /* Cycle-equivalence color  */
3900 
3901   /* Stack of brackets for this node.  */
3902   bracket_vec_t brackets;
3903 
3904   bb_sese (unsigned node_, unsigned p, int dir_)
3905   :node (node_), parent (p), dir (dir_)
3906   {
3907   }
3908   ~bb_sese ();
3909 
3910   /* Push a bracket ending at BACK.  */
3911   void push (const pseudo_node_t &back)
3912   {
3913     if (dump_file)
3914       fprintf (dump_file, "Pushing backedge %d:%+d\n",
3915 	       back.first ? back.first->index : 0, back.second);
3916     brackets.safe_push (bracket (back));
3917   }
3918 
3919   void append (bb_sese *child);
3920   void remove (const pseudo_node_t &);
3921 
3922   /* Set node's color.  */
3923   void set_color (auto_vec<unsigned> &color_counts)
3924   {
3925     color = brackets.last ().get_color (color_counts, brackets.length ());
3926   }
3927 };
3928 
3929 bb_sese::~bb_sese ()
3930 {
3931 }
3932 
3933 /* Destructively append CHILD's brackets.  */
3934 
3935 void
3936 bb_sese::append (bb_sese *child)
3937 {
3938   if (int len = child->brackets.length ())
3939     {
3940       int ix;
3941 
3942       if (dump_file)
3943 	{
3944 	  for (ix = 0; ix < len; ix++)
3945 	    {
3946 	      const pseudo_node_t &pseudo = child->brackets[ix].back;
3947 	      fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3948 		       child->node, pseudo.first ? pseudo.first->index : 0,
3949 		       pseudo.second);
3950 	    }
3951 	}
3952       if (!brackets.length ())
3953 	std::swap (brackets, child->brackets);
3954       else
3955 	{
3956 	  brackets.reserve (len);
3957 	  for (ix = 0; ix < len; ix++)
3958 	    brackets.quick_push (child->brackets[ix]);
3959 	}
3960     }
3961 }
3962 
3963 /* Remove brackets that terminate at PSEUDO.  */
3964 
3965 void
3966 bb_sese::remove (const pseudo_node_t &pseudo)
3967 {
3968   unsigned removed = 0;
3969   int len = brackets.length ();
3970 
3971   for (int ix = 0; ix < len; ix++)
3972     {
3973       if (brackets[ix].back == pseudo)
3974 	{
3975 	  if (dump_file)
3976 	    fprintf (dump_file, "Removing backedge %d:%+d\n",
3977 		     pseudo.first ? pseudo.first->index : 0, pseudo.second);
3978 	  removed++;
3979 	}
3980       else if (removed)
3981 	brackets[ix-removed] = brackets[ix];
3982     }
3983   while (removed--)
3984     brackets.pop ();
3985 }
3986 
3987 /* Accessors for BB's aux pointer.  */
3988 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3989 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3990 
3991 /* DFS walk creating SESE data structures.  Only cover nodes with
3992    BB_VISITED set.  Append discovered blocks to LIST.  We number in
3993    increments of 3 so that the above and below pseudo nodes can be
3994    implicitly numbered too.  */
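/* For example (editorial note): the walk is seeded with N == 2 from
   nvptx_find_sese, so a block assigned node number K has implicit
   pseudo-nodes K-1 and K+1 for the sides on which it was entered and
   left; nvptx_sese_pseudo addresses them as the node number +/- 1.  */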
3995 
3996 static int
3997 nvptx_sese_number (int n, int p, int dir, basic_block b,
3998 		   auto_vec<basic_block> *list)
3999 {
4000   if (BB_GET_SESE (b))
4001     return n;
4002 
4003   if (dump_file)
4004     fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
4005 	     b->index, n, p, dir);
4006 
4007   BB_SET_SESE (b, new bb_sese (n, p, dir));
4008   p = n;
4009 
4010   n += 3;
4011   list->quick_push (b);
4012 
4013   /* First walk the nodes on the 'other side' of this node, then walk
4014      the nodes on the same side.  */
4015   for (unsigned ix = 2; ix; ix--)
4016     {
4017       vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
4018       size_t offset = (dir > 0 ? offsetof (edge_def, dest)
4019 		       : offsetof (edge_def, src));
4020       edge e;
4021       edge_iterator ei;
4022 
4023       FOR_EACH_EDGE (e, ei, edges)
4024 	{
4025 	  basic_block target = *(basic_block *)((char *)e + offset);
4026 
4027 	  if (target->flags & BB_VISITED)
4028 	    n = nvptx_sese_number (n, p, dir, target, list);
4029 	}
4030       dir = -dir;
4031     }
4032   return n;
4033 }
4034 
4035 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
4036    EDGES are the outgoing edges and OFFSET is the offset to the src
4037    or dst block on the edges.   */
4038 
4039 static void
4040 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
4041 		   vec<edge, va_gc> *edges, size_t offset)
4042 {
4043   edge e;
4044   edge_iterator ei;
4045   int hi_back = depth;
4046   pseudo_node_t node_back (nullptr, depth);
4047   int hi_child = depth;
4048   pseudo_node_t node_child (nullptr, depth);
4049   basic_block child = NULL;
4050   unsigned num_children = 0;
4051   int usd = -dir * sese->dir;
4052 
4053   if (dump_file)
4054     fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
4055 	     me->index, sese->node, dir);
4056 
4057   if (dir < 0)
4058     {
4059       /* This is the above pseudo-child.  It has the BB itself as an
4060 	 additional child node.  */
4061       node_child = sese->high;
4062       hi_child = node_child.second;
4063       if (node_child.first)
4064 	hi_child += BB_GET_SESE (node_child.first)->node;
4065       num_children++;
4066     }
4067 
4068   /* Examine each edge.
4069      - if it is a child (a) append its bracket list and (b) record
4070           whether it is the child with the highest reaching bracket.
4071      - if it is an edge to an ancestor, record whether it's the highest
4072           reaching backlink.  */
4073   FOR_EACH_EDGE (e, ei, edges)
4074     {
4075       basic_block target = *(basic_block *)((char *)e + offset);
4076 
4077       if (bb_sese *t_sese = BB_GET_SESE (target))
4078 	{
4079 	  if (t_sese->parent == sese->node && !(t_sese->dir + usd))
4080 	    {
4081 	      /* Child node.  Append its bracket list. */
4082 	      num_children++;
4083 	      sese->append (t_sese);
4084 
4085 	      /* Compare its hi value.  */
4086 	      int t_hi = t_sese->high.second;
4087 
4088 	      if (basic_block child_hi_block = t_sese->high.first)
4089 		t_hi += BB_GET_SESE (child_hi_block)->node;
4090 
4091 	      if (hi_child > t_hi)
4092 		{
4093 		  hi_child = t_hi;
4094 		  node_child = t_sese->high;
4095 		  child = target;
4096 		}
4097 	    }
4098 	  else if (t_sese->node < sese->node + dir
4099 		   && !(dir < 0 && sese->parent == t_sese->node))
4100 	    {
4101 	      /* Non-parental ancestor node -- a backlink.  */
4102 	      int d = usd * t_sese->dir;
4103 	      int back = t_sese->node + d;
4104 
4105 	      if (hi_back > back)
4106 		{
4107 		  hi_back = back;
4108 		  node_back = pseudo_node_t (target, d);
4109 		}
4110 	    }
4111 	}
4112       else
4113 	{ /* Fallen off graph, backlink to entry node.  */
4114 	  hi_back = 0;
4115 	  node_back = pseudo_node_t (nullptr, 0);
4116 	}
4117     }
4118 
4119   /* Remove any brackets that terminate at this pseudo node.  */
4120   sese->remove (pseudo_node_t (me, dir));
4121 
4122   /* Now push any backlinks from this pseudo node.  */
4123   FOR_EACH_EDGE (e, ei, edges)
4124     {
4125       basic_block target = *(basic_block *)((char *)e + offset);
4126       if (bb_sese *t_sese = BB_GET_SESE (target))
4127 	{
4128 	  if (t_sese->node < sese->node + dir
4129 	      && !(dir < 0 && sese->parent == t_sese->node))
4130 	    /* Non-parental ancestor node - backedge from me.  */
4131 	    sese->push (pseudo_node_t (target, usd * t_sese->dir));
4132 	}
4133       else
4134 	{
4135 	  /* Back edge to entry node.  */
4136 	  sese->push (pseudo_node_t (nullptr, 0));
4137 	}
4138     }
4139 
4140   /* If this node leads directly or indirectly to a no-return region of
4141      the graph, then fake a backedge to entry node.  */
4142   if (!sese->brackets.length () || !edges || !edges->length ())
4143     {
4144       hi_back = 0;
4145       node_back = pseudo_node_t (nullptr, 0);
4146       sese->push (node_back);
4147     }
4148 
4149   /* Record the highest reaching backedge from us or a descendant.  */
4150   sese->high = hi_back < hi_child ? node_back : node_child;
4151 
4152   if (num_children > 1)
4153     {
4154       /* There is more than one child -- this is a Y shaped piece of
4155 	 spanning tree.  We have to insert a fake backedge from this
4156 	 node to the highest ancestor reached by not-the-highest
4157 	 reaching child.  Note that there may be multiple children
4158 	 with backedges to the same highest node.  That's ok and we
4159 	 insert the edge to that highest node.  */
4160       hi_child = depth;
4161       if (dir < 0 && child)
4162 	{
4163 	  node_child = sese->high;
4164 	  hi_child = node_child.second;
4165 	  if (node_child.first)
4166 	    hi_child += BB_GET_SESE (node_child.first)->node;
4167 	}
4168 
4169       FOR_EACH_EDGE (e, ei, edges)
4170 	{
4171 	  basic_block target = *(basic_block *)((char *)e + offset);
4172 
4173 	  if (target == child)
4174 	    /* Ignore the highest child. */
4175 	    continue;
4176 
4177 	  bb_sese *t_sese = BB_GET_SESE (target);
4178 	  if (!t_sese)
4179 	    continue;
4180 	  if (t_sese->parent != sese->node)
4181 	    /* Not a child. */
4182 	    continue;
4183 
4184 	  /* Compare its hi value.  */
4185 	  int t_hi = t_sese->high.second;
4186 
4187 	  if (basic_block child_hi_block = t_sese->high.first)
4188 	    t_hi += BB_GET_SESE (child_hi_block)->node;
4189 
4190 	  if (hi_child > t_hi)
4191 	    {
4192 	      hi_child = t_hi;
4193 	      node_child = t_sese->high;
4194 	    }
4195 	}
4196 
4197       sese->push (node_child);
4198     }
4199 }
4200 
4201 
4202 /* DFS walk of BB graph.  Color node BLOCK according to COLORING then
4203    proceed to successors.  Set SESE entry and exit nodes of
4204    REGIONS.  */
4205 
4206 static void
4207 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
4208 		  basic_block block, int coloring)
4209 {
4210   bb_sese *sese = BB_GET_SESE (block);
4211 
4212   if (block->flags & BB_VISITED)
4213     {
4214       /* If we've already encountered this block, either we must not
4215 	 be coloring, or it must have been colored the current color.  */
4216       gcc_assert (coloring < 0 || (sese && coloring == sese->color));
4217       return;
4218     }
4219 
4220   block->flags |= BB_VISITED;
4221 
4222   if (sese)
4223     {
4224       if (coloring < 0)
4225 	{
4226 	  /* Start coloring a region.  */
4227 	  regions[sese->color].first = block;
4228 	  coloring = sese->color;
4229 	}
4230 
4231       if (!--color_counts[sese->color] && sese->color == coloring)
4232 	{
4233 	  /* Found final block of SESE region.  */
4234 	  regions[sese->color].second = block;
4235 	  coloring = -1;
4236 	}
4237       else
4238 	/* Color the node, so we can assert on revisiting the node
4239 	   that the graph is indeed SESE.  */
4240 	sese->color = coloring;
4241     }
4242   else
4243     /* Fallen off the subgraph, we cannot be coloring.  */
4244     gcc_assert (coloring < 0);
4245 
4246   /* Walk each successor block.  */
4247   if (block->succs && block->succs->length ())
4248     {
4249       edge e;
4250       edge_iterator ei;
4251 
4252       FOR_EACH_EDGE (e, ei, block->succs)
4253 	nvptx_sese_color (color_counts, regions, e->dest, coloring);
4254     }
4255   else
4256     gcc_assert (coloring < 0);
4257 }
4258 
4259 /* Find minimal set of SESE regions covering BLOCKS.  REGIONS might
4260    end up with NULL entries in it.  */
4261 
4262 static void
4263 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
4264 {
4265   basic_block block;
4266   int ix;
4267 
4268   /* First clear each BB of the whole function.  */
4269   FOR_ALL_BB_FN (block, cfun)
4270     {
4271       block->flags &= ~BB_VISITED;
4272       BB_SET_SESE (block, 0);
4273     }
4274 
4275   /* Mark blocks in the function that are in this graph.  */
4276   for (ix = 0; blocks.iterate (ix, &block); ix++)
4277     block->flags |= BB_VISITED;
4278 
4279   /* Counts of nodes assigned to each color.  There cannot be more
4280      colors than blocks (and hopefully there will be fewer).  */
4281   auto_vec<unsigned> color_counts;
4282   color_counts.reserve (blocks.length ());
4283 
4284   /* Worklist of nodes in the spanning tree.  Again, there cannot be
4285      more nodes in the tree than blocks (there will be fewer if the
4286      CFG of blocks is disjoint).  */
4287   auto_vec<basic_block> spanlist;
4288   spanlist.reserve (blocks.length ());
4289 
4290   /* Make sure every block has its cycle class determined.  */
4291   for (ix = 0; blocks.iterate (ix, &block); ix++)
4292     {
4293       if (BB_GET_SESE (block))
4294 	/* We already met this block in an earlier graph solve.  */
4295 	continue;
4296 
4297       if (dump_file)
4298 	fprintf (dump_file, "Searching graph starting at %d\n", block->index);
4299 
4300       /* Number the nodes reachable from BLOCK in initial DFS order.  */
4301       int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
4302 
4303       /* Now walk in reverse DFS order to find cycle equivalents.  */
4304       while (spanlist.length ())
4305 	{
4306 	  block = spanlist.pop ();
4307 	  bb_sese *sese = BB_GET_SESE (block);
4308 
4309 	  /* Do the pseudo node below.  */
4310 	  nvptx_sese_pseudo (block, sese, depth, +1,
4311 			     sese->dir > 0 ? block->succs : block->preds,
4312 			     (sese->dir > 0 ? offsetof (edge_def, dest)
4313 			      : offsetof (edge_def, src)));
4314 	  sese->set_color (color_counts);
4315 	  /* Do the pseudo node above.  */
4316 	  nvptx_sese_pseudo (block, sese, depth, -1,
4317 			     sese->dir < 0 ? block->succs : block->preds,
4318 			     (sese->dir < 0 ? offsetof (edge_def, dest)
4319 			      : offsetof (edge_def, src)));
4320 	}
4321       if (dump_file)
4322 	fprintf (dump_file, "\n");
4323     }
4324 
4325   if (dump_file)
4326     {
4327       unsigned count;
4328       const char *comma = "";
4329 
4330       fprintf (dump_file, "Found %d cycle equivalents\n",
4331 	       color_counts.length ());
4332       for (ix = 0; color_counts.iterate (ix, &count); ix++)
4333 	{
4334 	  fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
4335 
4336 	  comma = "";
4337 	  for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
4338 	    if (BB_GET_SESE (block)->color == ix)
4339 	      {
4340 		block->flags |= BB_VISITED;
4341 		fprintf (dump_file, "%s%d", comma, block->index);
4342 		comma=",";
4343 	      }
4344 	  fprintf (dump_file, "}");
4345 	  comma = ", ";
4346 	}
4347       fprintf (dump_file, "\n");
4348     }
4349 
4350   /* Now we've colored every block in the subgraph.  We now need to
4351      determine the minimal set of SESE regions that cover that
4352      subgraph.  Do this with a DFS walk of the complete function.
4353      During the walk we're either 'looking' or 'coloring'.  When we
4354      reach the last node of a particular color, we stop coloring and
4355      return to looking.  */
4356 
4357   /* There cannot be more SESE regions than colors.  */
4358   regions.reserve (color_counts.length ());
4359   for (ix = color_counts.length (); ix--;)
4360     regions.quick_push (bb_pair_t (0, 0));
4361 
4362   for (ix = 0; blocks.iterate (ix, &block); ix++)
4363     block->flags &= ~BB_VISITED;
4364 
4365   nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
4366 
4367   if (dump_file)
4368     {
4369       const char *comma = "";
4370       int len = regions.length ();
4371 
4372       fprintf (dump_file, "SESE regions:");
4373       for (ix = 0; ix != len; ix++)
4374 	{
4375 	  basic_block from = regions[ix].first;
4376 	  basic_block to = regions[ix].second;
4377 
4378 	  if (from)
4379 	    {
4380 	      fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
4381 	      if (to != from)
4382 		fprintf (dump_file, "->%d", to->index);
4383 
4384 	      int color = BB_GET_SESE (from)->color;
4385 
4386 	      /* Print the blocks within the region (excluding ends).  */
4387 	      FOR_EACH_BB_FN (block, cfun)
4388 		{
4389 		  bb_sese *sese = BB_GET_SESE (block);
4390 
4391 		  if (sese && sese->color == color
4392 		      && block != from && block != to)
4393 		    fprintf (dump_file, ".%d", block->index);
4394 		}
4395 	      fprintf (dump_file, "}");
4396 	    }
4397 	  comma = ",";
4398 	}
4399       fprintf (dump_file, "\n\n");
4400     }
4401 
4402   for (ix = 0; blocks.iterate (ix, &block); ix++)
4403     delete BB_GET_SESE (block);
4404 }
4405 
4406 #undef BB_SET_SESE
4407 #undef BB_GET_SESE
4408 
4409 /* Propagate live state at the start of a partitioned region.  IS_CALL
4410    indicates whether the propagation is for a (partitioned) call
4411    instruction.  BLOCK provides the live register information, and
4412    might not contain INSN.  Propagation is inserted just after INSN.  RW
4413    indicates whether we are reading and/or writing state.  This
4414    separation is needed for worker-level propagation where we
4415    essentially do a spill & fill.  FN is the underlying worker
4416    function to generate the propagation instructions for a single
4417    register.  DATA is user data.
4418 
4419    Returns true if we didn't emit any instructions.
4420 
4421    We propagate the live register set for non-calls and the entire
4422    frame for calls and non-calls.  We could do better by (a)
4423    propagating just the live set that is used within the partitioned
4424    regions and (b) only propagating stack entries that are used.  The
4425    latter might be quite hard to determine.  */
4426 
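/* Schematically, the frame-copying loop emitted by nvptx_propagate is
   (an editorial sketch in C-like pseudo-code, not the actual RTL):

	ptr = frame_pointer;
	idx = frame_size_in_DImode_units;
     loop:
	idx -= 1;
	tmp = *ptr;		// if RW includes PM_read
	fn (tmp, rw, ...);	// propagate one DImode word
	*ptr = tmp;		// if RW includes PM_write
	ptr += 8;
	if (idx != 0) goto loop;

   with the loop collapsed to a single iteration when the frame is one
   word, and FN called with PM_loop_begin/PM_loop_end around it.  */
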
4427 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
4428 
4429 static bool
4430 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
4431 		 propagate_mask rw, propagator_fn fn, void *data, bool vector)
4432 {
4433   bitmap live = DF_LIVE_IN (block);
4434   bitmap_iterator iterator;
4435   unsigned ix;
4436   bool empty = true;
4437 
4438   /* Copy the frame array.  */
4439   HOST_WIDE_INT fs = get_frame_size ();
4440   if (fs)
4441     {
4442       rtx tmp = gen_reg_rtx (DImode);
4443       rtx idx = NULL_RTX;
4444       rtx ptr = gen_reg_rtx (Pmode);
4445       rtx pred = NULL_RTX;
4446       rtx_code_label *label = NULL;
4447 
4448       empty = false;
4449       /* The frame size might not be DImode compatible, but the frame
4450 	 array's declaration will be.  So it's ok to round up here.  */
4451       fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
4452       /* Detect single iteration loop. */
4453       if (fs == 1)
4454 	fs = 0;
4455 
4456       start_sequence ();
4457       emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
4458       if (fs)
4459 	{
4460 	  idx = gen_reg_rtx (SImode);
4461 	  pred = gen_reg_rtx (BImode);
4462 	  label = gen_label_rtx ();
4463 
4464 	  emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
4465 	  /* Allow worker function to initialize anything needed.  */
4466 	  rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
4467 	  if (init)
4468 	    emit_insn (init);
4469 	  emit_label (label);
4470 	  LABEL_NUSES (label)++;
4471 	  emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4472 	}
4473       if (rw & PM_read)
4474 	emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4475       emit_insn (fn (tmp, rw, fs, data, vector));
4476       if (rw & PM_write)
4477 	emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4478       if (fs)
4479 	{
4480 	  emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4481 	  emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4482 	  emit_insn (gen_br_true_uni (pred, label));
4483 	  rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4484 	  if (fini)
4485 	    emit_insn (fini);
4486 	  emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4487 	}
4488       emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4489       emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4490       rtx cpy = get_insns ();
4491       end_sequence ();
4492       insn = emit_insn_after (cpy, insn);
4493     }
4494 
4495   if (!is_call)
4496     /* Copy live registers.  */
4497     EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4498       {
4499 	rtx reg = regno_reg_rtx[ix];
4500 
4501 	if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4502 	  {
4503 	    rtx bcast = fn (reg, rw, 0, data, vector);
4504 
4505 	    insn = emit_insn_after (bcast, insn);
4506 	    empty = false;
4507 	  }
4508       }
4509   return empty;
4510 }
4511 
4512 /* Worker for nvptx_warp_propagate.  */
4513 
4514 static rtx
4515 warp_prop_gen (rtx reg, propagate_mask pm,
4516 	       unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4517 	       bool ARG_UNUSED (vector))
4518 {
4519   if (!(pm & PM_read_write))
4520     return 0;
4521 
4522   return nvptx_gen_warp_bcast (reg);
4523 }
4524 
4525 /* Propagate state that is live at start of BLOCK across the vectors
4526    of a single warp.  Propagation is inserted just after INSN.
4527    IS_CALL and return as for nvptx_propagate.  */
4528 
4529 static bool
4530 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4531 {
4532   return nvptx_propagate (is_call, block, insn, PM_read_write,
4533 			  warp_prop_gen, 0, false);
4534 }
4535 
4536 /* Worker for nvptx_shared_propagate.  */
4537 
4538 static rtx
4539 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4540 		 bool vector)
4541 {
4542   broadcast_data_t *data = (broadcast_data_t *)data_;
4543 
4544   if (pm & PM_loop_begin)
4545     {
4546       /* Starting a loop, initialize pointer.    */
4547       unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4548 
4549       oacc_bcast_align = MAX (oacc_bcast_align, align);
4550       data->offset = ROUND_UP (data->offset, align);
4551 
4552       data->ptr = gen_reg_rtx (Pmode);
4553 
4554       return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4555     }
4556   else if (pm & PM_loop_end)
4557     {
4558       rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4559       data->ptr = NULL_RTX;
4560       return clobber;
4561     }
4562   else
4563     return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4564 }
4565 
4566 /* Spill or fill live state that is live at start of BLOCK.  PRE_P
4567    indicates if this is just before partitioned mode (do spill), or
4568    just after it starts (do fill).  Sequence is inserted just after
4569    INSN.  IS_CALL and return as for nvptx_propagate.  */
4570 
4571 static bool
4572 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4573 			rtx_insn *insn, bool vector)
4574 {
4575   broadcast_data_t data;
4576 
4577   data.base = gen_reg_rtx (Pmode);
4578   data.offset = 0;
4579   data.ptr = NULL_RTX;
4580 
4581   bool empty = nvptx_propagate (is_call, block, insn,
4582 				pre_p ? PM_read : PM_write, shared_prop_gen,
4583 				&data, vector);
4584   gcc_assert (empty == !data.offset);
4585   if (data.offset)
4586     {
4587       rtx bcast_sym = oacc_bcast_sym;
4588 
4589       /* Stuff was emitted, initialize the base pointer now.  */
4590       if (vector && nvptx_mach_max_workers () > 1)
4591 	{
4592 	  if (!cfun->machine->bcast_partition)
4593 	    {
4594 	      /* It would be nice to place this register in
4595 		 DATA_AREA_SHARED.  */
4596 	      cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4597 	    }
4598 	  if (!cfun->machine->sync_bar)
4599 	    cfun->machine->sync_bar = gen_reg_rtx (SImode);
4600 
4601 	  bcast_sym = cfun->machine->bcast_partition;
4602 	}
4603 
4604       rtx init = gen_rtx_SET (data.base, bcast_sym);
4605       emit_insn_after (init, insn);
4606 
4607       unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4608       unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4609 			   ? nvptx_mach_max_workers () + 1
4610 			   : 1);
4611 
4612       oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4613       oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4614     }
4615   return empty;
4616 }
4617 
4618 /* Emit a CTA-level synchronization barrier.  LOCK is the barrier number,
4619    which is an integer or a register.  THREADS is the number of threads
4620    controlled by the barrier.  */
4621 
4622 static rtx
4623 nvptx_cta_sync (rtx lock, int threads)
4624 {
4625   return gen_nvptx_barsync (lock, GEN_INT (threads));
4626 }
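
/* As a rough editorial sketch (assuming the nvptx_barsync pattern):
   with LOCK 1 and THREADS 64 the insn above emits PTX along the lines
   of

	bar.sync	1, 64;

   i.e. barrier #1 synchronizing 64 threads; the exact mnemonic comes
   from the machine description.  */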
4627 
4628 #if WORKAROUND_PTXJIT_BUG
4629 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4630    real insns.  */
4631 
4632 static rtx_insn *
4633 bb_first_real_insn (basic_block bb)
4634 {
4635   rtx_insn *insn;
4636 
4637   /* Find first real insn of BB.  */
4638   FOR_BB_INSNS (bb, insn)
4639     if (INSN_P (insn))
4640       return insn;
4641 
4642   return 0;
4643 }
4644 #endif
4645 
4646 /* Return true if INSN needs neutering.  */
4647 
4648 static bool
4649 needs_neutering_p (rtx_insn *insn)
4650 {
4651   if (!INSN_P (insn))
4652     return false;
4653 
4654   switch (recog_memoized (insn))
4655     {
4656     case CODE_FOR_nvptx_fork:
4657     case CODE_FOR_nvptx_forked:
4658     case CODE_FOR_nvptx_joining:
4659     case CODE_FOR_nvptx_join:
4660     case CODE_FOR_nvptx_barsync:
4661       return false;
4662     default:
4663       return true;
4664     }
4665 }
4666 
4667 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM.  */
4668 
4669 static bool
4670 verify_neutering_jumps (basic_block from,
4671 			rtx_insn *vector_jump, rtx_insn *worker_jump,
4672 			rtx_insn *vector_label, rtx_insn *worker_label)
4673 {
4674   basic_block bb = from;
4675   rtx_insn *insn = BB_HEAD (bb);
4676   bool seen_worker_jump = false;
4677   bool seen_vector_jump = false;
4678   bool seen_worker_label = false;
4679   bool seen_vector_label = false;
4680   bool worker_neutered = false;
4681   bool vector_neutered = false;
4682   while (true)
4683     {
4684       if (insn == worker_jump)
4685 	{
4686 	  seen_worker_jump = true;
4687 	  worker_neutered = true;
4688 	  gcc_assert (!vector_neutered);
4689 	}
4690       else if (insn == vector_jump)
4691 	{
4692 	  seen_vector_jump = true;
4693 	  vector_neutered = true;
4694 	}
4695       else if (insn == worker_label)
4696 	{
4697 	  seen_worker_label = true;
4698 	  gcc_assert (worker_neutered);
4699 	  worker_neutered = false;
4700 	}
4701       else if (insn == vector_label)
4702 	{
4703 	  seen_vector_label = true;
4704 	  gcc_assert (vector_neutered);
4705 	  vector_neutered = false;
4706 	}
4707       else if (INSN_P (insn))
4708 	switch (recog_memoized (insn))
4709 	  {
4710 	  case CODE_FOR_nvptx_barsync:
4711 	    gcc_assert (!vector_neutered && !worker_neutered);
4712 	    break;
4713 	  default:
4714 	    break;
4715 	  }
4716 
4717       if (insn != BB_END (bb))
4718 	insn = NEXT_INSN (insn);
4719       else if (JUMP_P (insn) && single_succ_p (bb)
4720 	       && !seen_vector_jump && !seen_worker_jump)
4721 	{
4722 	  bb = single_succ (bb);
4723 	  insn = BB_HEAD (bb);
4724 	}
4725       else
4726 	break;
4727     }
4728 
4729   gcc_assert (!(vector_jump && !seen_vector_jump));
4730   gcc_assert (!(worker_jump && !seen_worker_jump));
4731 
4732   if (seen_vector_label || seen_worker_label)
4733     {
4734       gcc_assert (!(vector_label && !seen_vector_label));
4735       gcc_assert (!(worker_label && !seen_worker_label));
4736 
4737       return true;
4738     }
4739 
4740   return false;
4741 }
4742 
4743 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO.  */
4744 
4745 static void
4746 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4747 			 rtx_insn *worker_label)
4748 {
4749   basic_block bb = to;
4750   rtx_insn *insn = BB_END (bb);
4751   bool seen_worker_label = false;
4752   bool seen_vector_label = false;
4753   while (true)
4754     {
4755       if (insn == worker_label)
4756 	{
4757 	  seen_worker_label = true;
4758 	  gcc_assert (!seen_vector_label);
4759 	}
4760       else if (insn == vector_label)
4761 	seen_vector_label = true;
4762       else if (INSN_P (insn))
4763 	switch (recog_memoized (insn))
4764 	  {
4765 	  case CODE_FOR_nvptx_barsync:
4766 	    gcc_assert (!seen_vector_label && !seen_worker_label);
4767 	    break;
4768 	  }
4769 
4770       if (insn != BB_HEAD (bb))
4771 	insn = PREV_INSN (insn);
4772       else
4773 	break;
4774     }
4775 
4776   gcc_assert (!(vector_label && !seen_vector_label));
4777   gcc_assert (!(worker_label && !seen_worker_label));
4778 }
4779 
4780 /* Single neutering according to MASK.  FROM is the incoming block and
4781    TO is the outgoing block.  These may be the same block.  Insert at
4782    start of FROM:
4783 
4784      if (tid.<axis>) goto end.
4785 
4786    and insert before ending branch of TO (if there is such an insn):
4787 
4788      end:
4789      <possibly-broadcast-cond>
4790      <branch>
4791 
4792    We currently only use different FROM and TO when skipping an entire
4793    loop.  We could do more if we detected superblocks.  */
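
/* An editorial sketch of the PTX this neutering yields for a
   worker-level region (register and label names hypothetical):

	@%y_not_zero	bra.uni	$end;	// only worker zero falls through
	... single-threaded code ...
     $end:
	... broadcast of the branch condition, if TO ends in one ...

   %y_not_zero being the axis predicate tested via br_true_uni.  */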
4794 
4795 static void
4796 nvptx_single (unsigned mask, basic_block from, basic_block to)
4797 {
4798   rtx_insn *head = BB_HEAD (from);
4799   rtx_insn *tail = BB_END (to);
4800   unsigned skip_mask = mask;
4801 
4802   while (true)
4803     {
4804       /* Find first insn of FROM block.  */
4805       while (head != BB_END (from) && !needs_neutering_p (head))
4806 	head = NEXT_INSN (head);
4807 
4808       if (from == to)
4809 	break;
4810 
4811       if (!(JUMP_P (head) && single_succ_p (from)))
4812 	break;
4813 
4814       basic_block jump_target = single_succ (from);
4815       if (!single_pred_p (jump_target))
4816 	break;
4817 
4818       from = jump_target;
4819       head = BB_HEAD (from);
4820     }
4821 
4822   /* Find last insn of TO block.  */
4823   rtx_insn *limit = from == to ? head : BB_HEAD (to);
4824   while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4825     tail = PREV_INSN (tail);
4826 
4827   /* Detect if tail is a branch.  */
4828   rtx tail_branch = NULL_RTX;
4829   rtx cond_branch = NULL_RTX;
4830   if (tail && INSN_P (tail))
4831     {
4832       tail_branch = PATTERN (tail);
4833       if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4834 	tail_branch = NULL_RTX;
4835       else
4836 	{
4837 	  cond_branch = SET_SRC (tail_branch);
4838 	  if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4839 	    cond_branch = NULL_RTX;
4840 	}
4841     }
4842 
4843   if (tail == head)
4844     {
4845       /* If this is empty, do nothing.  */
4846       if (!head || !needs_neutering_p (head))
4847 	return;
4848 
4849       if (cond_branch)
4850 	{
4851 	  /* If we're only doing vector single, there's no need to
4852 	     emit skip code because we'll not insert anything.  */
4853 	  if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4854 	    skip_mask = 0;
4855 	}
4856       else if (tail_branch)
4857 	/* Block with only unconditional branch.  Nothing to do.  */
4858 	return;
4859     }
4860 
4861   /* Insert the vector test inside the worker test.  */
4862   unsigned mode;
4863   rtx_insn *before = tail;
4864   rtx_insn *neuter_start = NULL;
4865   rtx_insn *worker_label = NULL, *vector_label = NULL;
4866   rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4867   rtx_insn *warp_sync = NULL;
4868   for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4869     if (GOMP_DIM_MASK (mode) & skip_mask)
4870       {
4871 	rtx_code_label *label = gen_label_rtx ();
4872 	rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4873 	rtx_insn **mode_jump
4874 	  = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4875 	rtx_insn **mode_label
4876 	  = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4877 
4878 	if (!pred)
4879 	  {
4880 	    pred = gen_reg_rtx (BImode);
4881 	    cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4882 	  }
4883 
4884 	rtx br;
4885 	if (mode == GOMP_DIM_VECTOR)
4886 	  br = gen_br_true (pred, label);
4887 	else
4888 	  br = gen_br_true_uni (pred, label);
4889 	if (neuter_start)
4890 	  neuter_start = emit_insn_after (br, neuter_start);
4891 	else
4892 	  neuter_start = emit_insn_before (br, head);
4893 	*mode_jump = neuter_start;
4894 
4895 	LABEL_NUSES (label)++;
4896 	rtx_insn *label_insn;
4897 	if (tail_branch)
4898 	  {
4899 	    label_insn = emit_label_before (label, before);
4900 	    if (mode == GOMP_DIM_VECTOR)
4901 	      {
4902 		if (TARGET_PTX_6_0)
4903 		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4904 					       label_insn);
4905 		else
4906 		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4907 					       label_insn);
4908 	      }
4909 	    before = label_insn;
4910 	  }
4911 	else
4912 	  {
4913 	    label_insn = emit_label_after (label, tail);
4914 	    if (mode == GOMP_DIM_VECTOR)
4915 	      {
4916 		if (TARGET_PTX_6_0)
4917 		  warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4918 					       label_insn);
4919 		else
4920 		  warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4921 					       label_insn);
4922 	      }
4923 	    if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4924 		&& CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4925 	      emit_insn_after (gen_exit (), label_insn);
4926 	  }
4927 
4928 	*mode_label = label_insn;
4929       }
4930 
4931   /* Now deal with propagating the branch condition.  */
4932   if (cond_branch)
4933     {
4934       rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4935 
4936       if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4937 	  && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4938 	{
4939 	  /* Vector mode only, do a shuffle.  */
4940 #if WORKAROUND_PTXJIT_BUG
4941 	  /* The branch condition %rcond is propagated like this:
4942 
4943 		{
4944 		    .reg .u32 %x;
4945 		    mov.u32 %x,%tid.x;
4946 		    setp.ne.u32 %rnotvzero,%x,0;
4947 		 }
4948 
4949 		 @%rnotvzero bra Lskip;
4950 		 setp.<op>.<type> %rcond,op1,op2;
4951 		 Lskip:
4952 		 selp.u32 %rcondu32,1,0,%rcond;
4953 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4954 		 setp.ne.u32 %rcond,%rcondu32,0;
4955 
4956 	     There seems to be a bug in the ptx JIT compiler (observed at driver
4957 	     version 381.22, at -O1 and higher for sm_61), that drops the shfl
4958 	     unless %rcond is initialized to something before 'bra Lskip'.  The
4959 	     bug is not observed with ptxas from cuda 8.0.61.
4960 
4961 	     It is true that the code is non-trivial: at Lskip, %rcond is
4962 	     uninitialized in threads 1-31, and after the selp the same holds
4963 	     for %rcondu32.  But shfl propagates the defined value in thread 0
4964 	     to threads 1-31, so after the shfl %rcondu32 is defined in threads
4965 	     0-31, and after the setp.ne %rcond is defined in threads 0-31.
4966 
4967 	     There is nothing in the PTX spec to suggest that this is wrong, or
4968 	     to explain why the extra initialization is needed.  So, we classify
4969 	     it as a JIT bug, and the extra initialization as workaround:
4970 
4971 		{
4972 		    .reg .u32 %x;
4973 		    mov.u32 %x,%tid.x;
4974 		    setp.ne.u32 %rnotvzero,%x,0;
4975 		}
4976 
4977 		+.reg .pred %rcond2;
4978 		+setp.eq.u32 %rcond2, 1, 0;
4979 
4980 		 @%rnotvzero bra Lskip;
4981 		 setp.<op>.<type> %rcond,op1,op2;
4982 		+mov.pred %rcond2, %rcond;
4983 		 Lskip:
4984 		+mov.pred %rcond, %rcond2;
4985 		 selp.u32 %rcondu32,1,0,%rcond;
4986 		 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4987 		 setp.ne.u32 %rcond,%rcondu32,0;
4988 	  */
4989 	  rtx_insn *label = PREV_INSN (tail);
4990 	  if (label == warp_sync)
4991 	    label = PREV_INSN (label);
4992 	  gcc_assert (label && LABEL_P (label));
4993 	  rtx tmp = gen_reg_rtx (BImode);
4994 	  emit_insn_before (gen_movbi (tmp, const0_rtx),
4995 			    bb_first_real_insn (from));
4996 	  emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4997 	  emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4998 #endif
4999 	  emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
5000 	}
5001       else
5002 	{
5003 	  /* Includes worker mode, do spill & fill.  By construction
5004 	     we should never have worker mode only.  */
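	  /* Illustrative sketch only (buffer and register names are made
	     up); the spill & fill propagation emitted below amounts to:

		 st.shared.u32 [__oacc_bcast], %rcond; // writer stores
		 bar.sync 0;                           // make store visible
		 ld.shared.u32 %rcond, [__oacc_bcast]; // others re-load
		 bar.sync 0;                           // protect buffer reuse  */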
5005 	  broadcast_data_t data;
5006 	  unsigned size = GET_MODE_SIZE (SImode);
5007 	  bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
5008 	  bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
5009 	  rtx barrier = GEN_INT (0);
5010 	  int threads = 0;
5011 
5012 	  data.base = oacc_bcast_sym;
5013 	  data.ptr = 0;
5014 
5015 	  bool use_partitioning_p = (vector && !worker
5016 				     && nvptx_mach_max_workers () > 1
5017 				     && cfun->machine->bcast_partition);
5018 	  if (use_partitioning_p)
5019 	    {
5020 	      data.base = cfun->machine->bcast_partition;
5021 	      barrier = cfun->machine->sync_bar;
5022 	      threads = nvptx_mach_vector_length ();
5023 	    }
5024 	  gcc_assert (data.base != NULL);
5025 	  gcc_assert (barrier);
5026 
5027 	  unsigned int psize = ROUND_UP (size, oacc_bcast_align);
5028 	  unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
5029 			       ? nvptx_mach_max_workers () + 1
5030 			       : 1);
5031 
5032 	  oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
5033 	  oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
5034 
5035 	  data.offset = 0;
5036 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
5037 						    vector),
5038 			    before);
5039 
5040 	  /* Barrier so other workers can see the write.  */
5041 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
5042 	  data.offset = 0;
5043 	  emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
5044 						    vector),
5045 			    tail);
5046 	  /* This barrier is needed to avoid worker zero clobbering
5047 	     the broadcast buffer before all the other workers have
5048 	     had a chance to read this instance of it.  */
5049 	  emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
5050 	}
5051 
5052       extract_insn (tail);
5053       rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
5054 				 UNSPEC_BR_UNIFIED);
5055       validate_change (tail, recog_data.operand_loc[0], unsp, false);
5056     }
5057 
5058   bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
5059 					    vector_label, worker_label);
5060   if (!seen_label)
5061     verify_neutering_labels (to, vector_label, worker_label);
5062 }
5063 
5064 /* PAR is a parallel that is being skipped in its entirety according to
5065    MASK.  Treat this as skipping a superblock starting at forked
5066    and ending at joining.  */
5067 
5068 static void
5069 nvptx_skip_par (unsigned mask, parallel *par)
5070 {
5071   basic_block tail = par->join_block;
5072   gcc_assert (tail->preds->length () == 1);
5073 
5074   basic_block pre_tail = (*tail->preds)[0]->src;
5075   gcc_assert (pre_tail->succs->length () == 1);
5076 
5077   nvptx_single (mask, par->forked_block, pre_tail);
5078 }
5079 
5080 /* If PAR has a single inner parallel and PAR itself only contains
5081    empty entry and exit blocks, swallow the inner PAR.  */
5082 
5083 static void
5084 nvptx_optimize_inner (parallel *par)
5085 {
5086   parallel *inner = par->inner;
5087 
5088   /* We mustn't be the outer dummy par.  */
5089   if (!par->mask)
5090     return;
5091 
5092   /* We must have a single inner par.  */
5093   if (!inner || inner->next)
5094     return;
5095 
5096   /* We must only contain 2 blocks ourselves -- the head and tail of
5097      the inner par.  */
5098   if (par->blocks.length () != 2)
5099     return;
5100 
5101   /* Our partitioning must be disjoint from the inner par's.  As we
5102      only have vector and worker partitioning, this is sufficient to
5103      guarantee the pars have adjacent partitioning.  */
5104   if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
5105     /* This indicates malformed code generation.  */
5106     return;
5107 
5108   /* The outer forked insn should be immediately followed by the inner
5109      fork insn.  */
5110   rtx_insn *forked = par->forked_insn;
5111   rtx_insn *fork = BB_END (par->forked_block);
5112 
5113   if (NEXT_INSN (forked) != fork)
5114     return;
5115   gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
5116 
5117   /* The outer joining insn must immediately follow the inner join
5118      insn.  */
5119   rtx_insn *joining = par->joining_insn;
5120   rtx_insn *join = inner->join_insn;
5121   if (NEXT_INSN (join) != joining)
5122     return;
5123 
5124   /* Preconditions met.  Swallow the inner par.  */
5125   if (dump_file)
5126     fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
5127 	     inner->mask, inner->forked_block->index,
5128 	     inner->join_block->index,
5129 	     par->mask, par->forked_block->index, par->join_block->index);
5130 
5131   par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
5132 
5133   par->blocks.reserve (inner->blocks.length ());
5134   while (inner->blocks.length ())
5135     par->blocks.quick_push (inner->blocks.pop ());
5136 
5137   par->inner = inner->inner;
5138   inner->inner = NULL;
5139 
5140   delete inner;
5141 }
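
/* For illustration (assumed shapes, not verbatim RTL): an outer worker
   par whose two blocks merely bracket an inner vector par,

       forked.worker; fork.vector; ... join.vector; joining.worker;

   satisfies the checks above and is merged into one par with mask
   worker|vector, saving a level of neutering.  */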
5142 
5143 /* Process the parallel PAR and all its contained
5144    parallels.  We do everything but the neutering.  Return mask of
5145    partitioned modes used within this parallel.  */
5146 
5147 static unsigned
5148 nvptx_process_pars (parallel *par)
5149 {
5150   if (nvptx_optimize)
5151     nvptx_optimize_inner (par);
5152 
5153   unsigned inner_mask = par->mask;
5154 
5155   /* Do the inner parallels first.  */
5156   if (par->inner)
5157     {
5158       par->inner_mask = nvptx_process_pars (par->inner);
5159       inner_mask |= par->inner_mask;
5160     }
5161 
5162   bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
5163   bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
5164   bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5165 		      && nvptx_mach_vector_length () > PTX_WARP_SIZE);
5166 
5167   if (worker || large_vector)
5168     {
5169       nvptx_shared_propagate (false, is_call, par->forked_block,
5170 			      par->forked_insn, !worker);
5171       bool no_prop_p
5172 	= nvptx_shared_propagate (true, is_call, par->forked_block,
5173 				  par->fork_insn, !worker);
5174       bool empty_loop_p
5175 	= !is_call && (NEXT_INSN (par->forked_insn)
5176 		       && NEXT_INSN (par->forked_insn) == par->joining_insn);
5177       rtx barrier = GEN_INT (0);
5178       int threads = 0;
5179 
5180       if (!worker && cfun->machine->sync_bar)
5181 	{
5182 	  barrier = cfun->machine->sync_bar;
5183 	  threads = nvptx_mach_vector_length ();
5184 	}
5185 
5186       if (no_prop_p && empty_loop_p)
5187 	;
5188       else if (no_prop_p && is_call)
5189 	;
5190       else
5191 	{
5192 	  /* Insert begin and end synchronizations.  */
5193 	  emit_insn_before (nvptx_cta_sync (barrier, threads),
5194 			    par->forked_insn);
5195 	  emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
5196 	}
5197     }
5198   else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5199     nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
5200 
5201   /* Now do siblings.  */
5202   if (par->next)
5203     inner_mask |= nvptx_process_pars (par->next);
5204   return inner_mask;
5205 }
5206 
5207 /* Neuter the parallel described by PAR.  We recurse in depth-first
5208    order.  MODES are the partitioning of the execution and OUTER is
5209    the partitioning of the parallels we are contained in.  */
5210 
5211 static void
5212 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
5213 {
5214   unsigned me = (par->mask
5215 		 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
5216 		    | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5217   unsigned skip_mask = 0, neuter_mask = 0;
5218 
5219   if (par->inner)
5220     nvptx_neuter_pars (par->inner, modes, outer | me);
5221 
5222   for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
5223     {
5224       if ((outer | me) & GOMP_DIM_MASK (mode))
5225 	{} /* Mode is partitioned: no neutering.  */
5226       else if (!(modes & GOMP_DIM_MASK (mode)))
5227 	{} /* Mode is not used: nothing to do.  */
5228       else if (par->inner_mask & GOMP_DIM_MASK (mode)
5229 	       || !par->forked_insn)
5230 	/* Partitioned in inner parallels, or we're not partitioned
5231 	   at all: neuter individual blocks.  */
5232 	neuter_mask |= GOMP_DIM_MASK (mode);
5233       else if (!par->parent || !par->parent->forked_insn
5234 	       || par->parent->inner_mask & GOMP_DIM_MASK (mode))
5235 	/* Parent isn't a parallel, or already contains this
5236 	   partitioning: skip the parallel at this level.  */
5237 	skip_mask |= GOMP_DIM_MASK (mode);
5238       else
5239 	{} /* Parent will skip this parallel itself.  */
5240     }
5241 
5242   if (neuter_mask)
5243     {
5244       int ix, len;
5245 
5246       if (nvptx_optimize)
5247 	{
5248 	  /* Neuter whole SESE regions.  */
5249 	  bb_pair_vec_t regions;
5250 
5251 	  nvptx_find_sese (par->blocks, regions);
5252 	  len = regions.length ();
5253 	  for (ix = 0; ix != len; ix++)
5254 	    {
5255 	      basic_block from = regions[ix].first;
5256 	      basic_block to = regions[ix].second;
5257 
5258 	      if (from)
5259 		nvptx_single (neuter_mask, from, to);
5260 	      else
5261 		gcc_assert (!to);
5262 	    }
5263 	}
5264       else
5265 	{
5266 	  /* Neuter each BB individually.  */
5267 	  len = par->blocks.length ();
5268 	  for (ix = 0; ix != len; ix++)
5269 	    {
5270 	      basic_block block = par->blocks[ix];
5271 
5272 	      nvptx_single (neuter_mask, block, block);
5273 	    }
5274 	}
5275     }
5276 
5277   if (skip_mask)
5278     nvptx_skip_par (skip_mask, par);
5279 
5280   if (par->next)
5281     nvptx_neuter_pars (par->next, modes, outer);
5282 }
5283 
5284 static void
5285 populate_offload_attrs (offload_attrs *oa)
5286 {
5287   tree attr = oacc_get_fn_attrib (current_function_decl);
5288   tree dims = TREE_VALUE (attr);
5289   unsigned ix;
5290 
5291   oa->mask = 0;
5292 
5293   for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
5294     {
5295       tree t = TREE_VALUE (dims);
5296       int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
5297       tree allowed = TREE_PURPOSE (dims);
5298 
5299       if (size != 1 && !(allowed && integer_zerop (allowed)))
5300 	oa->mask |= GOMP_DIM_MASK (ix);
5301 
5302       switch (ix)
5303 	{
5304 	case GOMP_DIM_GANG:
5305 	  oa->num_gangs = size;
5306 	  break;
5307 
5308 	case GOMP_DIM_WORKER:
5309 	  oa->num_workers = size;
5310 	  break;
5311 
5312 	case GOMP_DIM_VECTOR:
5313 	  oa->vector_length = size;
5314 	  break;
5315 	}
5316     }
5317 }
5318 
5319 #if WORKAROUND_PTXJIT_BUG_2
5320 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT.  This variant
5321    is needed in the nvptx target because the branches generated for
5322    partitioning are NONJUMP_INSN_P, not JUMP_P.  */
5323 
5324 static rtx
5325 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
5326 {
5327   rtx pat;
5328   if ((strict && !JUMP_P (insn))
5329       || (!strict && !INSN_P (insn)))
5330     return NULL_RTX;
5331   pat = PATTERN (insn);
5332 
5333   /* The set is allowed to appear either as the insn pattern or
5334      the first set in a PARALLEL.  */
5335   if (GET_CODE (pat) == PARALLEL)
5336     pat = XVECEXP (pat, 0, 0);
5337   if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
5338     return pat;
5339 
5340   return NULL_RTX;
5341 }
5342 
5343 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT.  */
5344 
5345 static rtx
5346 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
5347 {
5348   rtx x = nvptx_pc_set (insn, strict);
5349 
5350   if (!x)
5351     return NULL_RTX;
5352   x = SET_SRC (x);
5353   if (GET_CODE (x) == LABEL_REF)
5354     return x;
5355   if (GET_CODE (x) != IF_THEN_ELSE)
5356     return NULL_RTX;
5357   if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
5358     return XEXP (x, 1);
5359   if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
5360     return XEXP (x, 2);
5361   return NULL_RTX;
5362 }
5363 
5364 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx
5365    insn in between the branch and the label.  This works around a JIT bug
5366    observed at driver version 384.111, at -O0 for sm_50.  */
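
/* A minimal sketch of the problematic shape (illustrative names):

       @%p0 bra $L42;
     $L42:

   The workaround below inserts a dummy insn (gen_fake_nop) between
   such a branch and its label.  */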
5367 
5368 static void
5369 prevent_branch_around_nothing (void)
5370 {
5371   rtx_insn *seen_label = NULL;
5372   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5373     {
5374       if (INSN_P (insn) && condjump_p (insn))
5375         {
5376           seen_label = label_ref_label (nvptx_condjump_label (insn, false));
5377           continue;
5378         }
5379 
5380       if (seen_label == NULL)
5381         continue;
5382 
5383       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5384         continue;
5385 
5386       if (INSN_P (insn))
5387         switch (recog_memoized (insn))
5388           {
5389           case CODE_FOR_nvptx_fork:
5390           case CODE_FOR_nvptx_forked:
5391           case CODE_FOR_nvptx_joining:
5392           case CODE_FOR_nvptx_join:
5393           case CODE_FOR_nop:
5394             continue;
5395           case -1:
5396             /* Handle asm ("") and similar.  */
5397             if (GET_CODE (PATTERN (insn)) == ASM_INPUT
5398                 || GET_CODE (PATTERN (insn)) == ASM_OPERANDS
5399                 || (GET_CODE (PATTERN (insn)) == PARALLEL
5400                     && asm_noperands (PATTERN (insn)) >= 0))
5401               continue;
5402             /* FALLTHROUGH.  */
5403           default:
5404             seen_label = NULL;
5405             continue;
5406           }
5407 
5408       if (LABEL_P (insn) && insn == seen_label)
5409         emit_insn_before (gen_fake_nop (), insn);
5410 
5411       seen_label = NULL;
5412     }
5413 }
5414 #endif
5415 
5416 #ifdef WORKAROUND_PTXJIT_BUG_3
5417 /* Insert two membar.cta insns in between two subsequent bar.sync insns.  This
5418    works around a hang observed at driver version 390.48 for sm_50.  */
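
/* Illustrative result (not verbatim PTX):

       bar.sync 0;
       membar.cta;
       membar.cta;
       bar.sync 0;

   i.e. two back-to-back bar.sync insns get separated by two membar.cta
   insns.  */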
5419 
5420 static void
5421 workaround_barsyncs (void)
5422 {
5423   bool seen_barsync = false;
5424   for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5425     {
5426       if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
5427 	{
5428 	  if (seen_barsync)
5429 	    {
5430 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
5431 	      emit_insn_before (gen_nvptx_membar_cta (), insn);
5432 	    }
5433 
5434 	  seen_barsync = true;
5435 	  continue;
5436 	}
5437 
5438       if (!seen_barsync)
5439 	continue;
5440 
5441       if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5442 	continue;
5443       else if (INSN_P (insn))
5444 	switch (recog_memoized (insn))
5445 	  {
5446 	  case CODE_FOR_nvptx_fork:
5447 	  case CODE_FOR_nvptx_forked:
5448 	  case CODE_FOR_nvptx_joining:
5449 	  case CODE_FOR_nvptx_join:
5450 	    continue;
5451 	  default:
5452 	    break;
5453 	  }
5454 
5455       seen_barsync = false;
5456     }
5457 }
5458 #endif
5459 
5460 static rtx
5461 gen_comment (const char *s)
5462 {
5463   const char *sep = " ";
5464   size_t len = strlen (ASM_COMMENT_START) + strlen (sep) + strlen (s) + 1;
5465   char *comment = (char *) alloca (len);
5466   snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s);
5467   return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment),
5468 				DECL_SOURCE_LOCATION (cfun->decl));
5469 }
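
/* E.g. (illustrative): with ASM_COMMENT_START "//", gen_comment ("foo")
   yields an ASM_INPUT holding "// foo", emitted verbatim into the PTX
   output at the function's source location.  */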
5470 
5471 /* Initialize all declared regs at function entry.
5472    Advantage   : Fool-proof.
5473    Disadvantage: Potentially creates a lot of long live ranges and adds a lot
5474 		 of insns.  */
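
/* Illustrative output (register names made up):

       // Start: Added by -minit-regs=1
       mov.u32 %r23, 0;
       mov.u64 %r24, 0;
       // End: Added by -minit-regs=1  */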
5475 
5476 static void
5477 workaround_uninit_method_1 (void)
5478 {
5479   rtx_insn *first = get_insns ();
5480   rtx_insn *insert_here = NULL;
5481 
5482   for (int ix = LAST_VIRTUAL_REGISTER + 1; ix < max_reg_num (); ix++)
5483     {
5484       rtx reg = regno_reg_rtx[ix];
5485 
5486       /* Skip undeclared registers.  */
5487       if (reg == const0_rtx)
5488 	continue;
5489 
5490       gcc_assert (CONST0_RTX (GET_MODE (reg)));
5491 
5492       start_sequence ();
5493       if (nvptx_comment && first != NULL)
5494 	emit_insn (gen_comment ("Start: Added by -minit-regs=1"));
5495       emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5496       rtx_insn *inits = get_insns ();
5497       end_sequence ();
5498 
5499       if (dump_file && (dump_flags & TDF_DETAILS))
5500 	for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
5501 	  fprintf (dump_file, "Default init of reg %u inserted: insn %u\n",
5502 		   ix, INSN_UID (init));
5503 
5504       if (first != NULL)
5505 	{
5506 	  insert_here = emit_insn_before (inits, first);
5507 	  first = NULL;
5508 	}
5509       else
5510 	insert_here = emit_insn_after (inits, insert_here);
5511     }
5512 
5513   if (nvptx_comment && insert_here != NULL)
5514     emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here);
5515 }
5516 
5517 /* Find uses of regs that are not defined on all incoming paths, and insert a
5518    corresponding def at function entry.
5519    Advantage   : Simple.
5520    Disadvantage: Potentially creates long live ranges.
5521 		 May not catch all cases.  F.i. a clobber cuts a live range in
5522 		 the compiler and may prevent entry_lr_in from being set for a
5523 		 reg, but the clobber does not translate to a ptx insn, so in
5524 		 ptx there still may be an uninitialized ptx reg.  See f.i.
5525 		 gcc.c-torture/compile/20020926-1.c.  */
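
/* Illustrative C-level shape (assumed example): in

       int x;
       if (c)
	 x = 1;
       use (x);

   the pseudo for 'x' is live-in at function entry without being defined
   on all paths, so an 'x = 0' init is inserted at entry.  */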
5526 
5527 static void
5528 workaround_uninit_method_2 (void)
5529 {
5530   auto_bitmap entry_pseudo_uninit;
5531   {
5532     auto_bitmap not_pseudo;
5533     bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);
5534 
5535     bitmap entry_lr_in = DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun));
5536     bitmap_and_compl (entry_pseudo_uninit, entry_lr_in, not_pseudo);
5537   }
5538 
5539   rtx_insn *first = get_insns ();
5540   rtx_insn *insert_here = NULL;
5541 
5542   bitmap_iterator iterator;
5543   unsigned ix;
5544   EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit, 0, ix, iterator)
5545     {
5546       rtx reg = regno_reg_rtx[ix];
5547       gcc_assert (CONST0_RTX (GET_MODE (reg)));
5548 
5549       start_sequence ();
5550       if (nvptx_comment && first != NULL)
5551 	emit_insn (gen_comment ("Start: Added by -minit-regs=2:"));
5552       emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5553       rtx_insn *inits = get_insns ();
5554       end_sequence ();
5555 
5556       if (dump_file && (dump_flags & TDF_DETAILS))
5557 	for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
5558 	  fprintf (dump_file, "Missing init of reg %u inserted: insn %u\n",
5559 		   ix, INSN_UID (init));
5560 
5561       if (first != NULL)
5562 	{
5563 	  insert_here = emit_insn_before (inits, first);
5564 	  first = NULL;
5565 	}
5566       else
5567 	insert_here = emit_insn_after (inits, insert_here);
5568     }
5569 
5570   if (nvptx_comment && insert_here != NULL)
5571     emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here);
5572 }
5573 
5574 /* Find uses of regs that are not defined on all incoming paths, and insert a
5575    corresponding def on those.
5576    Advantage   : Doesn't create long live ranges.
5577    Disadvantage: More complex, and potentially also more defs.  */
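
/* Illustrative (assumed CFG): if a join block has predecessors A (which
   defines reg r) and B (which does not), a 'mov r, 0' is inserted on
   the edge B->join only, rather than at function entry, keeping r's
   live range short.  */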
5578 
5579 static void
5580 workaround_uninit_method_3 (void)
5581 {
5582   auto_bitmap not_pseudo;
5583   bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);
5584 
5585   basic_block bb;
5586   FOR_EACH_BB_FN (bb, cfun)
5587     {
5588       if (single_pred_p (bb))
5589 	continue;
5590 
5591       auto_bitmap bb_pseudo_uninit;
5592       bitmap_and_compl (bb_pseudo_uninit, DF_LIVE_IN (bb), DF_MIR_IN (bb));
5593       bitmap_and_compl_into (bb_pseudo_uninit, not_pseudo);
5594 
5595       bitmap_iterator iterator;
5596       unsigned ix;
5597       EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit, 0, ix, iterator)
5598 	{
5599 	  bool have_false = false;
5600 	  bool have_true = false;
5601 
5602 	  edge e;
5603 	  edge_iterator ei;
5604 	  FOR_EACH_EDGE (e, ei, bb->preds)
5605 	    {
5606 	      if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
5607 		have_true = true;
5608 	      else
5609 		have_false = true;
5610 	    }
5611 	  if (have_false ^ have_true)
5612 	    continue;
5613 
5614 	  FOR_EACH_EDGE (e, ei, bb->preds)
5615 	    {
5616 	      if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
5617 		continue;
5618 
5619 	      rtx reg = regno_reg_rtx[ix];
5620 	      gcc_assert (CONST0_RTX (GET_MODE (reg)));
5621 
5622 	      start_sequence ();
5623 	      emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
5624 	      rtx_insn *inits = get_insns ();
5625 	      end_sequence ();
5626 
5627 	      if (dump_file && (dump_flags & TDF_DETAILS))
5628 		for (rtx_insn *init = inits; init != NULL;
5629 		     init = NEXT_INSN (init))
5630 		  fprintf (dump_file,
5631 			   "Missing init of reg %u inserted on edge: %d -> %d:"
5632 			   " insn %u\n", ix, e->src->index, e->dest->index,
5633 			   INSN_UID (init));
5634 
5635 	      insert_insn_on_edge (inits, e);
5636 	    }
5637 	}
5638     }
5639 
5640   if (nvptx_comment)
5641     FOR_EACH_BB_FN (bb, cfun)
5642       {
5643 	if (single_pred_p (bb))
5644 	  continue;
5645 
5646 	edge e;
5647 	edge_iterator ei;
5648 	FOR_EACH_EDGE (e, ei, bb->preds)
5649 	  {
5650 	    if (e->insns.r == NULL_RTX)
5651 	      continue;
5652 	    start_sequence ();
5653 	    emit_insn (gen_comment ("Start: Added by -minit-regs=3:"));
5654 	    emit_insn (e->insns.r);
5655 	    emit_insn (gen_comment ("End: Added by -minit-regs=3:"));
5656 	    e->insns.r = get_insns ();
5657 	    end_sequence ();
5658 	  }
5659       }
5660 
5661   commit_edge_insertions ();
5662 }
5663 
5664 static void
5665 workaround_uninit (void)
5666 {
5667   switch (nvptx_init_regs)
5668     {
5669     case 0:
5670       /* Skip.  */
5671       break;
5672     case 1:
5673       workaround_uninit_method_1 ();
5674       break;
5675     case 2:
5676       workaround_uninit_method_2 ();
5677       break;
5678     case 3:
5679       workaround_uninit_method_3 ();
5680       break;
5681     default:
5682       gcc_unreachable ();
5683     }
5684 }
5685 
5686 /* PTX-specific reorganization
5687    - Split blocks at fork and join instructions
5688    - Compute live registers
5689    - Mark now-unused registers, so the emitted function header
5690      doesn't declare unused registers.
5691    - Insert state propagation when entering partitioned mode
5692    - Insert neutering instructions when in single mode
5693    - Replace subregs with suitable sequences.
5694 */
5695 
5696 static void
5697 nvptx_reorg (void)
5698 {
5699   /* We are freeing block_for_insn in the toplev to keep compatibility
5700      with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
5701   compute_bb_for_insn ();
5702 
5703   thread_prologue_and_epilogue_insns ();
5704 
5705   /* Split blocks and record interesting unspecs.  */
5706   bb_insn_map_t bb_insn_map;
5707 
5708   nvptx_split_blocks (&bb_insn_map);
5709 
5710   /* Compute live regs */
5711   df_clear_flags (DF_LR_RUN_DCE);
5712   df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
5713   df_live_add_problem ();
5714   df_live_set_all_dirty ();
5715   if (nvptx_init_regs == 3)
5716     df_mir_add_problem ();
5717   df_analyze ();
5718   regstat_init_n_sets_and_refs ();
5719 
5720   if (dump_file)
5721     df_dump (dump_file);
5722 
5723   /* Mark unused regs as unused.  */
5724   int max_regs = max_reg_num ();
5725   for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
5726     if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
5727       regno_reg_rtx[i] = const0_rtx;
5728 
5729   workaround_uninit ();
5730 
5731   /* Determine launch dimensions of the function.  If it is not an
5732      offloaded function (i.e. this is a regular compilation), the
5733      function has no neutering.  */
5734   tree attr = oacc_get_fn_attrib (current_function_decl);
5735   if (attr)
5736     {
5737       /* If we determined this mask before RTL expansion, we could
5738 	 elide emission of some levels of forks and joins.  */
5739       offload_attrs oa;
5740 
5741       populate_offload_attrs (&oa);
5742 
5743       /* If there is worker neutering, there must be vector
5744 	 neutering.  Otherwise the hardware will fail.  */
5745       gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
5746 		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5747 
5748       /* Discover & process partitioned regions.  */
5749       parallel *pars = nvptx_discover_pars (&bb_insn_map);
5750       nvptx_process_pars (pars);
5751       nvptx_neuter_pars (pars, oa.mask, 0);
5752       delete pars;
5753     }
5754 
5755   /* Replace subregs.  */
5756   nvptx_reorg_subreg ();
5757 
5758   if (TARGET_UNIFORM_SIMT)
5759     nvptx_reorg_uniform_simt ();
5760 
5761 #if WORKAROUND_PTXJIT_BUG_2
5762   prevent_branch_around_nothing ();
5763 #endif
5764 
5765 #ifdef WORKAROUND_PTXJIT_BUG_3
5766   workaround_barsyncs ();
5767 #endif
5768 
5769   regstat_free_n_sets_and_refs ();
5770 
5771   df_finish_pass (true);
5772 }
5773 
5774 /* Handle a "kernel" attribute; arguments as in
5775    struct attribute_spec.handler.  */
5776 
5777 static tree
5778 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5779 			       int ARG_UNUSED (flags), bool *no_add_attrs)
5780 {
5781   tree decl = *node;
5782 
5783   if (TREE_CODE (decl) != FUNCTION_DECL)
5784     {
5785       error ("%qE attribute only applies to functions", name);
5786       *no_add_attrs = true;
5787     }
5788   else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
5789     {
5790       error ("%qE attribute requires a void return type", name);
5791       *no_add_attrs = true;
5792     }
5793 
5794   return NULL_TREE;
5795 }
5796 
5797 /* Handle a "shared" attribute; arguments as in
5798    struct attribute_spec.handler.  */
5799 
5800 static tree
5801 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
5802 			       int ARG_UNUSED (flags), bool *no_add_attrs)
5803 {
5804   tree decl = *node;
5805 
5806   if (TREE_CODE (decl) != VAR_DECL)
5807     {
5808       error ("%qE attribute only applies to variables", name);
5809       *no_add_attrs = true;
5810     }
5811   else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
5812     {
5813       error ("%qE attribute not allowed with auto storage class", name);
5814       *no_add_attrs = true;
5815     }
5816 
5817   return NULL_TREE;
5818 }
5819 
5820 /* Table of valid machine attributes.  */
5821 static const struct attribute_spec nvptx_attribute_table[] =
5822 {
5823   /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
5824        affects_type_identity, handler, exclude } */
5825   { "kernel", 0, 0, true, false,  false, false, nvptx_handle_kernel_attribute,
5826     NULL },
5827   { "shared", 0, 0, true, false,  false, false, nvptx_handle_shared_attribute,
5828     NULL },
5829   { NULL, 0, 0, false, false, false, false, NULL, NULL }
5830 };
5831 
5832 /* Limit vector alignments to BIGGEST_ALIGNMENT.  */
5833 
5834 static HOST_WIDE_INT
5835 nvptx_vector_alignment (const_tree type)
5836 {
5837   unsigned HOST_WIDE_INT align;
5838   tree size = TYPE_SIZE (type);
5839 
5840   /* Ensure align is not bigger than BIGGEST_ALIGNMENT.  */
5841   if (tree_fits_uhwi_p (size))
5842     {
5843       align = tree_to_uhwi (size);
5844       align = MIN (align, BIGGEST_ALIGNMENT);
5845     }
5846   else
5847     align = BIGGEST_ALIGNMENT;
5848 
5849   /* Ensure align is not smaller than mode alignment.  */
5850   align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));
5851 
5852   return align;
5853 }
5854 
5855 /* Indicate that INSN cannot be duplicated.   */
5856 
5857 static bool
5858 nvptx_cannot_copy_insn_p (rtx_insn *insn)
5859 {
5860   switch (recog_memoized (insn))
5861     {
5862     case CODE_FOR_nvptx_shufflesi:
5863     case CODE_FOR_nvptx_shufflesf:
5864     case CODE_FOR_nvptx_barsync:
5865     case CODE_FOR_nvptx_fork:
5866     case CODE_FOR_nvptx_forked:
5867     case CODE_FOR_nvptx_joining:
5868     case CODE_FOR_nvptx_join:
5869       return true;
5870     default:
5871       return false;
5872     }
5873 }
5874 
5875 /* Section anchors do not work.  Initialization for flag_section_anchor
5876    probes the existence of the anchoring target hooks and prevents
5877    anchoring if they don't exist.  However, we may be used with a
5878    host-side compiler that does support anchoring, and hence see the
5879    anchor flag set (as it's not recalculated).  So provide an
5880    implementation denying anchoring.  */
5881 
5882 static bool
5883 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
5884 {
5885   return false;
5886 }
5887 
5888 /* Record a symbol for mkoffload to enter into the mapping table.  */
5889 
5890 static void
5891 nvptx_record_offload_symbol (tree decl)
5892 {
5893   switch (TREE_CODE (decl))
5894     {
5895     case VAR_DECL:
5896       fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
5897 	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5898       break;
5899 
5900     case FUNCTION_DECL:
5901       {
5902 	tree attr = oacc_get_fn_attrib (decl);
5903 	/* OpenMP offloading does not set this attribute.  */
5904 	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;
5905 
5906 	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
5907 		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
5908 
5909 	for (; dims; dims = TREE_CHAIN (dims))
5910 	  {
5911 	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));
5912 
5913 	    gcc_assert (!TREE_PURPOSE (dims));
5914 	    fprintf (asm_out_file, ", %#x", size);
5915 	  }
5916 
5917 	fprintf (asm_out_file, "\n");
5918       }
5919       break;
5920 
5921     default:
5922       gcc_unreachable ();
5923     }
5924 }
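
/* Illustrative output (names and dims made up):

       //:VAR_MAP "var_decl"
       //:FUNC_MAP "foo$_omp_fn$0", 0x1, 0x20, 0x20

   one line per offloaded symbol, with per-axis launch dimensions
   appended for OpenACC functions.  */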
5925 
5926 /* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
5927    at the start of a file.  */
5928 
5929 static void
5930 nvptx_file_start (void)
5931 {
5932   fputs ("// BEGIN PREAMBLE\n", asm_out_file);
5933 
5934   fputs ("\t.version\t", asm_out_file);
5935   fputs (ptx_version_to_string ((enum ptx_version)ptx_version_option),
5936 	 asm_out_file);
5937   fputs ("\n", asm_out_file);
5938 
5939   fputs ("\t.target\tsm_", asm_out_file);
5940   fputs (sm_version_to_string ((enum ptx_isa)ptx_isa_option),
5941 	 asm_out_file);
5942   fputs ("\n", asm_out_file);
5943 
5944   fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));
5945 
5946   fputs ("// END PREAMBLE\n", asm_out_file);
5947 }
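
/* Illustrative preamble (version and target follow the -mptx and -misa
   options in effect):

       // BEGIN PREAMBLE
       .version 6.0
       .target sm_35
       .address_size 64
       // END PREAMBLE  */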
5948 
5949 /* Emit a declaration for a worker and vector-level buffer in .shared
5950    memory.  */
5951 
5952 static void
5953 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
5954 {
5955   const char *name = XSTR (sym, 0);
5956 
5957   write_var_marker (file, true, false, name);
5958   fprintf (file, ".shared .align %d .u8 %s[%d];\n",
5959 	   align, name, size);
5960 }
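
/* E.g. (illustrative): for a 256-byte broadcast buffer aligned to 8,
   write_shared_buffer emits

       .shared .align 8 .u8 __oacc_bcast[256];  */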
5961 
5962 /* Write out the function declarations we've collected and declare storage
5963    for the broadcast buffer.  */
5964 
5965 static void
5966 nvptx_file_end (void)
5967 {
5968   hash_table<tree_hasher>::iterator iter;
5969   tree decl;
5970   FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
5971     nvptx_record_fndecl (decl);
5972   fputs (func_decls.str().c_str(), asm_out_file);
5973 
5974   if (oacc_bcast_size)
5975     write_shared_buffer (asm_out_file, oacc_bcast_sym,
5976 			 oacc_bcast_align, oacc_bcast_size);
5977 
5978   if (worker_red_size)
5979     write_shared_buffer (asm_out_file, worker_red_sym,
5980 			 worker_red_align, worker_red_size);
5981 
5982   if (vector_red_size)
5983     write_shared_buffer (asm_out_file, vector_red_sym,
5984 			 vector_red_align, vector_red_size);
5985 
5986   if (gang_private_shared_size)
5987     write_shared_buffer (asm_out_file, gang_private_shared_sym,
5988 			 gang_private_shared_align, gang_private_shared_size);
5989 
5990   if (need_softstack_decl)
5991     {
5992       write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
5993       /* 32 is the maximum number of warps in a block.  Even though it's an
5994          external declaration, emit the array size explicitly; otherwise, it
5995          may fail at PTX JIT time if the definition is later in link order.  */
5996       fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
5997 	       POINTER_SIZE);
5998     }
5999   if (need_unisimt_decl)
6000     {
6001       write_var_marker (asm_out_file, false, true, "__nvptx_uni");
6002       fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
6003     }
6004 }
6005 
6006 /* Expander for the shuffle builtins.  */
6007 
6008 static rtx
6009 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
6010 {
6011   if (ignore)
6012     return target;
6013 
6014   rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
6015 			 NULL_RTX, mode, EXPAND_NORMAL);
6016   if (!REG_P (src))
6017     src = copy_to_mode_reg (mode, src);
6018 
6019   rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
6020 			 NULL_RTX, SImode, EXPAND_NORMAL);
6021   rtx op = expand_expr (CALL_EXPR_ARG  (exp, 2),
6022 			NULL_RTX, SImode, EXPAND_NORMAL);
6023 
6024   if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
6025     idx = copy_to_mode_reg (SImode, idx);
6026 
6027   rtx pat = nvptx_gen_shuffle (target, src, idx,
6028 			       (nvptx_shuffle_kind) INTVAL (op));
6029   if (pat)
6030     emit_insn (pat);
6031 
6032   return target;
6033 }
6034 
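/* Emit PTX computing the address of this function's vector reduction
   buffer partition in DST: a plain copy of the partition base register
   when OFFSET is zero, else base plus the constant OFFSET.  */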
6035 const char *
6036 nvptx_output_red_partition (rtx dst, rtx offset)
6037 {
6038   const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
6039   const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";
6040 
6041   if (offset == const0_rtx)
6042     fprintf (asm_out_file, zero_offset, REGNO (dst),
6043 	     REGNO (cfun->machine->red_partition));
6044   else
6045     fprintf (asm_out_file, with_offset, REGNO (dst),
6046 	     REGNO (cfun->machine->red_partition), UINTVAL (offset));
6047 
6048   return "";
6049 }
6050 
6051 /* Shared-memory reduction address expander.  */
6052 
6053 static rtx
6054 nvptx_expand_shared_addr (tree exp, rtx target,
6055 			  machine_mode ARG_UNUSED (mode), int ignore,
6056 			  int vector)
6057 {
6058   if (ignore)
6059     return target;
6060 
6061   unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
6062   unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
6063   unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
6064   rtx addr = worker_red_sym;
6065 
6066   if (vector)
6067     {
6068       offload_attrs oa;
6069 
6070       populate_offload_attrs (&oa);
6071 
6072       unsigned int psize = ROUND_UP (size + offset, align);
6073       unsigned int pnum = nvptx_mach_max_workers ();
6074       vector_red_partition = MAX (vector_red_partition, psize);
6075       vector_red_size = MAX (vector_red_size, psize * pnum);
6076       vector_red_align = MAX (vector_red_align, align);
6077 
6078       if (cfun->machine->red_partition == NULL)
6079 	cfun->machine->red_partition = gen_reg_rtx (Pmode);
6080 
6081       addr = gen_reg_rtx (Pmode);
6082       emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
6083     }
6084   else
6085     {
6086       worker_red_align = MAX (worker_red_align, align);
6087       worker_red_size = MAX (worker_red_size, size + offset);
6088 
6089       if (offset)
6090 	{
6091 	  addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
6092 	  addr = gen_rtx_CONST (Pmode, addr);
6093 	}
6094    }
6095 
6096   emit_move_insn (target, addr);
6097   return target;
6098 }
6099 
6100 /* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
6101    not require taking the address of any object, other than the memory
6102    cell being operated on.  */
6103 
6104 static rtx
6105 nvptx_expand_cmp_swap (tree exp, rtx target,
6106 		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
6107 {
6108   machine_mode mode = TYPE_MODE (TREE_TYPE (exp));
6109 
6110   if (!target)
6111     target = gen_reg_rtx (mode);
6112 
6113   rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
6114 			 NULL_RTX, Pmode, EXPAND_NORMAL);
6115   rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
6116 			 NULL_RTX, mode, EXPAND_NORMAL);
6117   rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
6118 			 NULL_RTX, mode, EXPAND_NORMAL);
6119   rtx pat;
6120 
6121   mem = gen_rtx_MEM (mode, mem);
6122   if (!REG_P (cmp))
6123     cmp = copy_to_mode_reg (mode, cmp);
6124   if (!REG_P (src))
6125     src = copy_to_mode_reg (mode, src);
6126 
6127   if (mode == SImode)
6128     pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
6129   else
6130     pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);
6131 
6132   emit_insn (pat);
6133 
6134   return target;
6135 }
6136 
6137 
6138 /* Codes for all the NVPTX builtins.  */
6139 enum nvptx_builtins
6140 {
6141   NVPTX_BUILTIN_SHUFFLE,
6142   NVPTX_BUILTIN_SHUFFLELL,
6143   NVPTX_BUILTIN_WORKER_ADDR,
6144   NVPTX_BUILTIN_VECTOR_ADDR,
6145   NVPTX_BUILTIN_CMP_SWAP,
6146   NVPTX_BUILTIN_CMP_SWAPLL,
6147   NVPTX_BUILTIN_MEMBAR_GL,
6148   NVPTX_BUILTIN_MEMBAR_CTA,
6149   NVPTX_BUILTIN_MAX
6150 };
6151 
6152 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];
6153 
6154 /* Return the NVPTX builtin for CODE.  */
6155 
6156 static tree
6157 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
6158 {
6159   if (code >= NVPTX_BUILTIN_MAX)
6160     return error_mark_node;
6161 
6162   return nvptx_builtin_decls[code];
6163 }
6164 
6165 /* Set up all builtin functions for this target.  */
6166 
6167 static void
6168 nvptx_init_builtins (void)
6169 {
6170 #define DEF(ID, NAME, T)						\
6171   (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
6172    = add_builtin_function ("__builtin_nvptx_" NAME,			\
6173 			   build_function_type_list T,			\
6174 			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
6175 #define ST sizetype
6176 #define UINT unsigned_type_node
6177 #define LLUINT long_long_unsigned_type_node
6178 #define PTRVOID ptr_type_node
6179 #define VOID void_type_node
6180 
6181   DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
6182   DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
6183   DEF (WORKER_ADDR, "worker_addr",
6184        (PTRVOID, ST, UINT, UINT, NULL_TREE));
6185   DEF (VECTOR_ADDR, "vector_addr",
6186        (PTRVOID, ST, UINT, UINT, NULL_TREE));
6187   DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
6188   DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
6189   DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE));
6190   DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE));
6191 
6192 #undef DEF
6193 #undef ST
6194 #undef UINT
6195 #undef LLUINT
6196 #undef PTRVOID
6197 }
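
/* With the above, offloaded code can use calls such as (illustrative)

       unsigned v = __builtin_nvptx_shuffle (x, 0, SHUFFLE_IDX);
       void *p = __builtin_nvptx_worker_addr (offset, size, align);

   which nvptx_expand_builtin below maps to the matching insns.  */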
6198 
6199 /* Expand an expression EXP that calls a built-in function,
6200    with result going to TARGET if that's convenient
6201    (and in mode MODE if that's convenient).
6202    SUBTARGET may be used as the target for computing one of EXP's operands.
6203    IGNORE is nonzero if the value is to be ignored.  */
6204 
6205 static rtx
6206 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
6207 		      machine_mode mode, int ignore)
6208 {
6209   tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
6210   switch (DECL_MD_FUNCTION_CODE (fndecl))
6211     {
6212     case NVPTX_BUILTIN_SHUFFLE:
6213     case NVPTX_BUILTIN_SHUFFLELL:
6214       return nvptx_expand_shuffle (exp, target, mode, ignore);
6215 
6216     case NVPTX_BUILTIN_WORKER_ADDR:
6217       return nvptx_expand_shared_addr (exp, target, mode, ignore, false);
6218 
6219     case NVPTX_BUILTIN_VECTOR_ADDR:
6220       return nvptx_expand_shared_addr (exp, target, mode, ignore, true);
6221 
6222     case NVPTX_BUILTIN_CMP_SWAP:
6223     case NVPTX_BUILTIN_CMP_SWAPLL:
6224       return nvptx_expand_cmp_swap (exp, target, mode, ignore);
6225 
6226     case NVPTX_BUILTIN_MEMBAR_GL:
6227       emit_insn (gen_nvptx_membar_gl ());
6228       return NULL_RTX;
6229 
6230     case NVPTX_BUILTIN_MEMBAR_CTA:
6231       emit_insn (gen_nvptx_membar_cta ());
6232       return NULL_RTX;
6233 
6234     default: gcc_unreachable ();
6235     }
6236 }
6237 
6238 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */
6239 
6240 static int
6241 nvptx_simt_vf ()
6242 {
6243   return PTX_WARP_SIZE;
6244 }
6245 
6246 /* Return 1 if TRAIT NAME is present in the OpenMP context's
6247    device trait set, return 0 if not present in any OpenMP context in the
6248    whole translation unit, or -1 if not present in the current OpenMP context
6249    but might be present in another OpenMP context in the same TU.  */
6250 
6251 int
6252 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
6253 				const char *name)
6254 {
6255   switch (trait)
6256     {
6257     case omp_device_kind:
6258       return strcmp (name, "gpu") == 0;
6259     case omp_device_arch:
6260       return strcmp (name, "nvptx") == 0;
6261     case omp_device_isa:
6262 #define NVPTX_SM(XX, SEP)				\
6263       {							\
6264 	if (strcmp (name, "sm_" #XX) == 0)		\
6265 	  return ptx_isa_option == PTX_ISA_SM ## XX;	\
6266       }
6267 #include "nvptx-sm.def"
6268 #undef NVPTX_SM
6269       return 0;
6270     default:
6271       gcc_unreachable ();
6272     }
6273 }
6274 
6275 static bool
6276 nvptx_welformed_vector_length_p (int l)
6277 {
6278   gcc_assert (l > 0);
6279   return l % PTX_WARP_SIZE == 0;
6280 }
6281 
6282 static void
6283 nvptx_apply_dim_limits (int dims[])
6284 {
6285   /* Check that the vector_length is not too large.  */
6286   if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
6287     dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;
6288 
6289   /* Check that the number of workers is not too large.  */
6290   if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
6291     dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;
6292 
6293   /* Ensure that num_workers * vector_length <= cta size.  */
6294   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
6295       && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
6296     dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6297 
6298   /* If we need a per-worker barrier...  */
6299   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
6300       && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
6301     /* Don't use more barriers than available.  */
6302     dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
6303 				 PTX_NUM_PER_WORKER_BARRIERS);
6304 }
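
/* Worked example (illustrative): for num_workers 32 and vector_length
   64, 32 * 64 > PTX_CTA_SIZE (1024), so vector_length is reduced to
   PTX_WARP_SIZE (32); the per-worker barrier clamp then no longer
   applies, since vector_length is not > PTX_WARP_SIZE.  */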
6305 
6306 /* Return true if FNDECL contains calls to vector-partitionable routines.  */
6307 
6308 static bool
6309 has_vector_partitionable_routine_calls_p (tree fndecl)
6310 {
6311   if (!fndecl)
6312     return false;
6313 
6314   basic_block bb;
6315   FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
6316     for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
6317 	 gsi_next_nondebug (&i))
6318       {
6319 	gimple *stmt = gsi_stmt (i);
6320 	if (gimple_code (stmt) != GIMPLE_CALL)
6321 	  continue;
6322 
6323 	tree callee = gimple_call_fndecl (stmt);
6324 	if (!callee)
6325 	  continue;
6326 
6327 	tree attrs  = oacc_get_fn_attrib (callee);
6328 	if (attrs == NULL_TREE)
6329 	  return false;
6330 
6331 	int partition_level = oacc_fn_attrib_level (attrs);
6332 	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
6333 	if (!seq_routine_p)
6334 	  return true;
6335       }
6336 
6337   return false;
6338 }
6339 
6340 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
6341    DIMS has changed.  */
6342 
6343 static void
6344 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
6345 {
6346   bool oacc_default_dims_p = false;
6347   bool oacc_min_dims_p = false;
6348   bool offload_region_p = false;
6349   bool routine_p = false;
6350   bool routine_seq_p = false;
6351   int default_vector_length = -1;
6352 
6353   if (decl == NULL_TREE)
6354     {
6355       if (fn_level == -1)
6356 	oacc_default_dims_p = true;
6357       else if (fn_level == -2)
6358 	oacc_min_dims_p = true;
6359       else
6360 	gcc_unreachable ();
6361     }
6362   else if (fn_level == -1)
6363     offload_region_p = true;
6364   else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
6365     {
6366       routine_p = true;
6367       routine_seq_p = fn_level == GOMP_DIM_MAX;
6368     }
6369   else
6370     gcc_unreachable ();
6371 
6372   if (oacc_min_dims_p)
6373     {
6374       gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
6375       gcc_assert (dims[GOMP_DIM_WORKER] == 1);
6376       gcc_assert (dims[GOMP_DIM_GANG] == 1);
6377 
6378       dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6379       return;
6380     }
6381 
6382   if (routine_p)
6383     {
6384       if (!routine_seq_p)
6385 	dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6386 
6387       return;
6388     }
6389 
6390   if (oacc_default_dims_p)
6391     {
6392       /* -1  : not set
6393 	  0  : set at runtime, f.i. -fopenacc-dims=-
6394          >= 1: set at compile time, f.i. -fopenacc-dims=1.  */
6395       gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
6396       gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
6397       gcc_assert (dims[GOMP_DIM_GANG] >= -1);
6398 
6399       /* But -fopenacc-dims=- is not yet supported on trunk.  */
6400       gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
6401       gcc_assert (dims[GOMP_DIM_WORKER] != 0);
6402       gcc_assert (dims[GOMP_DIM_GANG] != 0);
6403     }
6404 
6405   if (offload_region_p)
6406     {
6407       /* -1   : not set
6408 	  0   : set using variable, f.i. num_gangs (n)
6409 	  >= 1: set using constant, f.i. num_gangs (1).  */
6410       gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
6411       gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
6412       gcc_assert (dims[GOMP_DIM_GANG] >= -1);
6413     }
6414 
6415   if (offload_region_p)
6416     default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
6417   else
6418     /* oacc_default_dims_p.  */
6419     default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;
6420 
6421   int old_dims[GOMP_DIM_MAX];
6422   unsigned int i;
6423   for (i = 0; i < GOMP_DIM_MAX; ++i)
6424     old_dims[i] = dims[i];
6425 
6426   const char *vector_reason = NULL;
6427   if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
6428     {
6429       default_vector_length = PTX_WARP_SIZE;
6430 
6431       if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
6432 	{
6433 	  vector_reason = G_("using %<vector_length (%d)%> due to call to"
6434 			     " vector-partitionable routine, ignoring %d");
6435 	  dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
6436 	}
6437     }
6438 
6439   if (dims[GOMP_DIM_VECTOR] == 0)
6440     {
6441       vector_reason = G_("using %<vector_length (%d)%>, ignoring runtime setting");
6442       dims[GOMP_DIM_VECTOR] = default_vector_length;
6443     }
6444 
6445   if (dims[GOMP_DIM_VECTOR] > 0
6446       && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
6447     dims[GOMP_DIM_VECTOR] = default_vector_length;
6448 
6449   nvptx_apply_dim_limits (dims);
6450 
6451   if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
6452     warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6453 		vector_reason != NULL
6454 		? vector_reason
6455 		: G_("using %<vector_length (%d)%>, ignoring %d"),
6456 		dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);
6457 
6458   if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
6459     warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
6460 		G_("using %<num_workers (%d)%>, ignoring %d"),
6461 		dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);
6462 
6463   if (oacc_default_dims_p)
6464     {
6465       if (dims[GOMP_DIM_VECTOR] < 0)
6466 	dims[GOMP_DIM_VECTOR] = default_vector_length;
6467       if (dims[GOMP_DIM_WORKER] < 0)
6468 	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
6469       if (dims[GOMP_DIM_GANG] < 0)
6470 	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
6471       nvptx_apply_dim_limits (dims);
6472     }
6473 
6474   if (offload_region_p)
6475     {
6476       for (i = 0; i < GOMP_DIM_MAX; i++)
6477 	{
6478 	  if (!(dims[i] < 0))
6479 	    continue;
6480 
6481 	  if ((used & GOMP_DIM_MASK (i)) == 0)
6482 	    /* Function oacc_validate_dims will apply the minimal dimension.  */
6483 	    continue;
6484 
6485 	  dims[i] = (i == GOMP_DIM_VECTOR
6486 		     ? default_vector_length
6487 		     : oacc_get_default_dim (i));
6488 	}
6489 
6490       nvptx_apply_dim_limits (dims);
6491     }
6492 }
6493 
6494 /* Validate compute dimensions of an OpenACC offload or routine, fill
6495    in non-unity defaults.  FN_LEVEL indicates the level at which a
6496    routine might spawn a loop.  It is negative for non-routines.  If
6497    DECL is null, we are validating the default dimensions.  */
6498 
6499 static bool
6500 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
6501 {
6502   int old_dims[GOMP_DIM_MAX];
6503   unsigned int i;
6504 
6505   for (i = 0; i < GOMP_DIM_MAX; ++i)
6506     old_dims[i] = dims[i];
6507 
6508   nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);
6509 
6510   gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
6511   if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
6512     gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);
6513 
6514   for (i = 0; i < GOMP_DIM_MAX; ++i)
6515     if (old_dims[i] != dims[i])
6516       return true;
6517 
6518   return false;
6519 }
6520 
6521 /* Return maximum dimension size, or zero for unbounded.  */
6522 
6523 static int
6524 nvptx_dim_limit (int axis)
6525 {
6526   switch (axis)
6527     {
6528     case GOMP_DIM_VECTOR:
6529       return PTX_MAX_VECTOR_LENGTH;
6530 
6531     default:
6532       break;
6533     }
6534   return 0;
6535 }
6536 
6537 /* Determine whether fork & joins are needed.  */
6538 
6539 static bool
6540 nvptx_goacc_fork_join (gcall *call, const int dims[],
6541 		       bool ARG_UNUSED (is_fork))
6542 {
6543   tree arg = gimple_call_arg (call, 2);
6544   unsigned axis = TREE_INT_CST_LOW (arg);
6545 
6546   /* We only care about worker and vector partitioning.  */
6547   if (axis < GOMP_DIM_WORKER)
6548     return false;
6549 
6550   /* If the size is 1, there's no partitioning.  */
6551   if (dims[axis] == 1)
6552     return false;
6553 
6554   return true;
6555 }
6556 
6557 /* Generate a PTX builtin function call that returns the address in
6558    the worker reduction buffer at OFFSET.  TYPE is the type of the
6559    data at that location.  */
6560 
6561 static tree
6562 nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
6563 {
6564   enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
6565   if (vector)
6566     addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
6567   machine_mode mode = TYPE_MODE (type);
6568   tree fndecl = nvptx_builtin_decl (addr_dim, true);
6569   tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
6570   tree align = build_int_cst (unsigned_type_node,
6571 			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
6572   tree call = build_call_expr (fndecl, 3, offset, size, align);
6573 
6574   return fold_convert (build_pointer_type (type), call);
6575 }
6576 
6577 /* Emit a SHFL.DOWN of VAR by SHIFT lanes into DEST_VAR.  This function
6578    will cast the variable if necessary.  */
6579 
6580 static void
6581 nvptx_generate_vector_shuffle (location_t loc,
6582 			       tree dest_var, tree var, unsigned shift,
6583 			       gimple_seq *seq)
6584 {
6585   unsigned fn = NVPTX_BUILTIN_SHUFFLE;
6586   tree_code code = NOP_EXPR;
6587   tree arg_type = unsigned_type_node;
6588   tree var_type = TREE_TYPE (var);
6589   tree dest_type = var_type;
6590 
6591   if (TREE_CODE (var_type) == COMPLEX_TYPE)
6592     var_type = TREE_TYPE (var_type);
6593 
6594   if (TREE_CODE (var_type) == REAL_TYPE)
6595     code = VIEW_CONVERT_EXPR;
6596 
6597   if (TYPE_SIZE (var_type)
6598       == TYPE_SIZE (long_long_unsigned_type_node))
6599     {
6600       fn = NVPTX_BUILTIN_SHUFFLELL;
6601       arg_type = long_long_unsigned_type_node;
6602     }
6603 
6604   tree call = nvptx_builtin_decl (fn, true);
6605   tree bits = build_int_cst (unsigned_type_node, shift);
6606   tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
6607   tree expr;
6608 
6609   if (var_type != dest_type)
6610     {
6611       /* Do real and imaginary parts separately.  */
6612       tree real = fold_build1 (REALPART_EXPR, var_type, var);
6613       real = fold_build1 (code, arg_type, real);
6614       real = build_call_expr_loc (loc, call, 3, real, bits, kind);
6615       real = fold_build1 (code, var_type, real);
6616 
6617       tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
6618       imag = fold_build1 (code, arg_type, imag);
6619       imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
6620       imag = fold_build1 (code, var_type, imag);
6621 
6622       expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
6623     }
6624   else
6625     {
6626       expr = fold_build1 (code, arg_type, var);
6627       expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
6628       expr = fold_build1 (code, dest_type, expr);
6629     }
6630 
6631   gimplify_assign (dest_var, expr, seq);
6632 }
6633 
6634 /* Lazily generate the global lock var decl and return its address.  */
6635 
6636 static tree
6637 nvptx_global_lock_addr ()
6638 {
6639   tree v = global_lock_var;
6640 
6641   if (!v)
6642     {
6643       tree name = get_identifier ("__reduction_lock");
6644       tree type = build_qualified_type (unsigned_type_node,
6645 					TYPE_QUAL_VOLATILE);
6646       v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
6647       global_lock_var = v;
6648       DECL_ARTIFICIAL (v) = 1;
6649       DECL_EXTERNAL (v) = 1;
6650       TREE_STATIC (v) = 1;
6651       TREE_PUBLIC (v) = 1;
6652       TREE_USED (v) = 1;
6653       mark_addressable (v);
6654       mark_decl_referenced (v);
6655     }
6656 
6657   return build_fold_addr_expr (v);
6658 }
6659 
6660 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
6661    GSI.  We use a lockless scheme for nearly all cases, which looks
6662    like:
6663      actual = initval(OP);
6664      do {
6665        guess = actual;
6666        write = guess OP myval;
6667        actual = cmp&swap (ptr, guess, write)
6668      } while (actual bit-different-to guess);
6669    return write;
6670 
6671    This relies on a cmp&swap instruction, which is available for 32-
6672    and 64-bit types.  Larger types must use a locking scheme.  */
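
/* As an illustrative sketch (builtin and SSA names elided), for
   OP == PLUS_EXPR on a 32-bit float, where CODE below becomes
   VIEW_CONVERT_EXPR, the emitted GIMPLE is roughly:

     init = CODE (0.0f);
   loop:
     expect = PHI <init (pre), actual (loop)>
     write = CODE (CODE (expect) + var);
     actual = cmp_swap (ptr, expect, write);
     if (actual != expect) goto loop;  */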
6673 
6674 static tree
6675 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
6676 		       tree ptr, tree var, tree_code op)
6677 {
6678   unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
6679   tree_code code = NOP_EXPR;
6680   tree arg_type = unsigned_type_node;
6681   tree var_type = TREE_TYPE (var);
6682 
6683   if (TREE_CODE (var_type) == COMPLEX_TYPE
6684       || TREE_CODE (var_type) == REAL_TYPE)
6685     code = VIEW_CONVERT_EXPR;
6686 
6687   if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
6688     {
6689       arg_type = long_long_unsigned_type_node;
6690       fn = NVPTX_BUILTIN_CMP_SWAPLL;
6691     }
6692 
6693   tree swap_fn = nvptx_builtin_decl (fn, true);
6694 
6695   gimple_seq init_seq = NULL;
6696   tree init_var = make_ssa_name (arg_type);
6697   tree init_expr = omp_reduction_init_op (loc, op, var_type);
6698   init_expr = fold_build1 (code, arg_type, init_expr);
6699   gimplify_assign (init_var, init_expr, &init_seq);
6700   gimple *init_end = gimple_seq_last (init_seq);
6701 
6702   gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
6703 
6704   /* Split the block just after the init stmts.  */
6705   basic_block pre_bb = gsi_bb (*gsi);
6706   edge pre_edge = split_block (pre_bb, init_end);
6707   basic_block loop_bb = pre_edge->dest;
6708   pre_bb = pre_edge->src;
6709   /* Reset the iterator.  */
6710   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6711 
6712   tree expect_var = make_ssa_name (arg_type);
6713   tree actual_var = make_ssa_name (arg_type);
6714   tree write_var = make_ssa_name (arg_type);
6715 
6716   /* Build and insert the reduction calculation.  */
6717   gimple_seq red_seq = NULL;
6718   tree write_expr = fold_build1 (code, var_type, expect_var);
6719   write_expr = fold_build2 (op, var_type, write_expr, var);
6720   write_expr = fold_build1 (code, arg_type, write_expr);
6721   gimplify_assign (write_var, write_expr, &red_seq);
6722 
6723   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6724 
6725   /* Build & insert the cmp&swap sequence.  */
6726   gimple_seq latch_seq = NULL;
6727   tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
6728 					ptr, expect_var, write_var);
6729   gimplify_assign (actual_var, swap_expr, &latch_seq);
6730 
6731   gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
6732 				   NULL_TREE, NULL_TREE);
6733   gimple_seq_add_stmt (&latch_seq, cond);
6734 
6735   gimple *latch_end = gimple_seq_last (latch_seq);
6736   gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);
6737 
6738   /* Split the block just after the latch stmts.  */
6739   edge post_edge = split_block (loop_bb, latch_end);
6740   basic_block post_bb = post_edge->dest;
6741   loop_bb = post_edge->src;
6742   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6743 
6744   post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6745   post_edge->probability = profile_probability::even ();
6746   edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
6747   loop_edge->probability = profile_probability::even ();
6748   set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
6749   set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);
6750 
6751   gphi *phi = create_phi_node (expect_var, loop_bb);
6752   add_phi_arg (phi, init_var, pre_edge, loc);
6753   add_phi_arg (phi, actual_var, loop_edge, loc);
6754 
6755   loop *loop = alloc_loop ();
6756   loop->header = loop_bb;
6757   loop->latch = loop_bb;
6758   add_loop (loop, loop_bb->loop_father);
6759 
6760   return fold_build1 (code, var_type, write_var);
6761 }
6762 
6763 /* Insert code to lockfully update *PTR with *PTR OP VAR just before
6764    GSI.  This is necessary for types larger than 64 bits, where there
6765    is no cmp&swap instruction to implement a lockless scheme.  We use
6766    a lock variable in global memory.
6767 
6768    while (cmp&swap (&lock_var, 0, 1))
6769      continue;
6770    T accum = *ptr;
6771    accum = accum OP var;
6772    *ptr = accum;
6773    cmp&swap (&lock_var, 1, 0);
6774    return accum;
6775 
6776    A lock in global memory is necessary to force execution engine
6777    descheduling and avoid resource starvation that can occur if the
6778    lock is in .shared memory.  */
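
/* Note that the critical section is bracketed by memory-barrier
   builtins: membar.gl for gang-level reductions, membar.cta otherwise,
   ordering the locked read-modify-write with respect to the other
   participating threads.  */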
6779 
6780 static tree
6781 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
6782 		       tree ptr, tree var, tree_code op, int level)
6783 {
6784   tree var_type = TREE_TYPE (var);
6785   tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
6786   tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
6787   tree uns_locked = build_int_cst (unsigned_type_node, 1);
6788 
6789   /* Split the block just before the gsi.  Insert a gimple nop to make
6790      this easier.  */
6791   gimple *nop = gimple_build_nop ();
6792   gsi_insert_before (gsi, nop, GSI_SAME_STMT);
6793   basic_block entry_bb = gsi_bb (*gsi);
6794   edge entry_edge = split_block (entry_bb, nop);
6795   basic_block lock_bb = entry_edge->dest;
6796   /* Reset the iterator.  */
6797   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6798 
6799   /* Build and insert the locking sequence.  */
6800   gimple_seq lock_seq = NULL;
6801   tree lock_var = make_ssa_name (unsigned_type_node);
6802   tree lock_expr = nvptx_global_lock_addr ();
6803   lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6804 				   uns_unlocked, uns_locked);
6805   gimplify_assign (lock_var, lock_expr, &lock_seq);
6806   gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6807 				   NULL_TREE, NULL_TREE);
6808   gimple_seq_add_stmt (&lock_seq, cond);
6809   gimple *lock_end = gimple_seq_last (lock_seq);
6810   gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6811 
6812   /* Split the block just after the lock sequence.  */
6813   edge locked_edge = split_block (lock_bb, lock_end);
6814   basic_block update_bb = locked_edge->dest;
6815   lock_bb = locked_edge->src;
6816   *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6817 
6818   /* Create the lock loop ... */
6819   locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6820   locked_edge->probability = profile_probability::even ();
6821   edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6822   loop_edge->probability = profile_probability::even ();
6823   set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6824   set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6825 
6826   /* ... and the loop structure.  */
6827   loop *lock_loop = alloc_loop ();
6828   lock_loop->header = lock_bb;
6829   lock_loop->latch = lock_bb;
6830   lock_loop->nb_iterations_estimate = 1;
6831   lock_loop->any_estimate = true;
6832   add_loop (lock_loop, entry_bb->loop_father);
6833 
6834   /* Build the pre-barrier.  */
6835   gimple_seq red_seq = NULL;
6836   enum nvptx_builtins barrier_builtin
6837     = (level == GOMP_DIM_GANG
6838        ? NVPTX_BUILTIN_MEMBAR_GL
6839        : NVPTX_BUILTIN_MEMBAR_CTA);
6840   tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true);
6841   tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
6842   gimplify_stmt (&barrier_expr, &red_seq);
6843 
6844   /* Build the reduction calculation.  */
6845   tree acc_in = make_ssa_name (var_type);
6846   tree ref_in = build_simple_mem_ref (ptr);
6847   TREE_THIS_VOLATILE (ref_in) = 1;
6848   gimplify_assign (acc_in, ref_in, &red_seq);
6849 
6850   tree acc_out = make_ssa_name (var_type);
6851   tree update_expr = fold_build2 (op, var_type, ref_in, var);
6852   gimplify_assign (acc_out, update_expr, &red_seq);
6853 
6854   tree ref_out = build_simple_mem_ref (ptr);
6855   TREE_THIS_VOLATILE (ref_out) = 1;
6856   gimplify_assign (ref_out, acc_out, &red_seq);
6857 
6858   /* Build the post-barrier.  */
6859   barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
6860   gimplify_stmt (&barrier_expr, &red_seq);
6861 
6862   /* Insert the reduction calculation.  */
6863   gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6864 
6865   /* Build & insert the unlock sequence.  */
6866   gimple_seq unlock_seq = NULL;
6867   tree unlock_expr = nvptx_global_lock_addr ();
6868   unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6869 				     uns_locked, uns_unlocked);
6870   gimplify_and_add (unlock_expr, &unlock_seq);
6871   gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6872 
6873   return acc_out;
6874 }
6875 
6876 /* Emit a sequence to update a reduction accumulator at *PTR with the
6877    value held in VAR using operator OP.  Return the updated value.
6878 
6879    TODO: optimize for atomic ops and independent complex ops.  */
6880 
6881 static tree
6882 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6883 			tree ptr, tree var, tree_code op, int level)
6884 {
6885   tree type = TREE_TYPE (var);
6886   tree size = TYPE_SIZE (type);
6887 
6888   if (size == TYPE_SIZE (unsigned_type_node)
6889       || size == TYPE_SIZE (long_long_unsigned_type_node))
6890     return nvptx_lockless_update (loc, gsi, ptr, var, op);
6891   else
6892     return nvptx_lockfull_update (loc, gsi, ptr, var, op, level);
6893 }
6894 
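/* The GOACC_REDUCTION internal-function calls handled below share one
   argument layout: argument 0 is the reduction code (setup, init, fini
   or teardown), argument 1 a reference to the result object (or zero),
   argument 2 the local reduction variable, argument 3 the partitioning
   level (a GOMP_DIM_* value), argument 4 the reduction operator's tree
   code, and argument 5 the offset into the shared reduction buffer.  */
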
6895 /* NVPTX implementation of GOACC_REDUCTION_SETUP.  */
6896 
6897 static void
6898 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
6899 {
6900   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6901   tree lhs = gimple_call_lhs (call);
6902   tree var = gimple_call_arg (call, 2);
6903   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6904   gimple_seq seq = NULL;
6905 
6906   push_gimplify_context (true);
6907 
6908   if (level != GOMP_DIM_GANG)
6909     {
6910       /* Copy the receiver object.  */
6911       tree ref_to_res = gimple_call_arg (call, 1);
6912 
6913       if (!integer_zerop (ref_to_res))
6914 	var = build_simple_mem_ref (ref_to_res);
6915     }
6916 
6917   if (level == GOMP_DIM_WORKER
6918       || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
6919     {
6920       /* Store incoming value to worker reduction buffer.  */
6921       tree offset = gimple_call_arg (call, 5);
6922       tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
6923 					     level == GOMP_DIM_VECTOR);
6924       tree ptr = make_ssa_name (TREE_TYPE (call));
6925 
6926       gimplify_assign (ptr, call, &seq);
6927       tree ref = build_simple_mem_ref (ptr);
6928       TREE_THIS_VOLATILE (ref) = 1;
6929       gimplify_assign (ref, var, &seq);
6930     }
6931 
6932   if (lhs)
6933     gimplify_assign (lhs, var, &seq);
6934 
6935   pop_gimplify_context (NULL);
6936   gsi_replace_with_seq (&gsi, seq, true);
6937 }
6938 
6939 /* NVPTX implementation of GOACC_REDUCTION_INIT.  */
6940 
6941 static void
6942 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
6943 {
6944   gimple_stmt_iterator gsi = gsi_for_stmt (call);
6945   tree lhs = gimple_call_lhs (call);
6946   tree var = gimple_call_arg (call, 2);
6947   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
6948   enum tree_code rcode
6949     = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
6950   tree init = omp_reduction_init_op (gimple_location (call), rcode,
6951 				     TREE_TYPE (var));
6952   gimple_seq seq = NULL;
6953 
6954   push_gimplify_context (true);
6955 
6956   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
6957     {
6958       /* Initialize the non-zero vector lanes to INIT_VAL (OP).  */
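      /* Sketch of the control flow built below:

           call_bb: tid = GOACC_DIM_POS (<vector>);
                    if (tid != 0) goto init_bb; else goto dst_bb;
           init_bb: init_var = INIT;
           dst_bb:  lhs = PHI <init_var (init_bb), var (call_bb)>  */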
6959       tree tid = make_ssa_name (integer_type_node);
6960       tree dim_vector = gimple_call_arg (call, 3);
6961       gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
6962 						     dim_vector);
6963       gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
6964 					     NULL_TREE, NULL_TREE);
6965 
6966       gimple_call_set_lhs (tid_call, tid);
6967       gimple_seq_add_stmt (&seq, tid_call);
6968       gimple_seq_add_stmt (&seq, cond_stmt);
6969 
6970       /* Split the block just after the call.  */
6971       edge init_edge = split_block (gsi_bb (gsi), call);
6972       basic_block init_bb = init_edge->dest;
6973       basic_block call_bb = init_edge->src;
6974 
6975       /* Fixup flags from call_bb to init_bb.  */
6976       init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
6977       init_edge->probability = profile_probability::even ();
6978 
6979       /* Set the initialization stmts.  */
6980       gimple_seq init_seq = NULL;
6981       tree init_var = make_ssa_name (TREE_TYPE (var));
6982       gimplify_assign (init_var, init, &init_seq);
6983       gsi = gsi_start_bb (init_bb);
6984       gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);
6985 
6986       /* Split block just after the init stmt.  */
6987       gsi_prev (&gsi);
6988       edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
6989       basic_block dst_bb = inited_edge->dest;
6990 
6991       /* Create false edge from call_bb to dst_bb.  */
6992       edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
6993       nop_edge->probability = profile_probability::even ();
6994 
6995       /* Create phi node in dst block.  */
6996       gphi *phi = create_phi_node (lhs, dst_bb);
6997       add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
6998       add_phi_arg (phi, var, nop_edge, gimple_location (call));
6999 
7000       /* Reset dominator of dst bb.  */
7001       set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);
7002 
7003       /* Reset the gsi.  */
7004       gsi = gsi_for_stmt (call);
7005     }
7006   else
7007     {
7008       if (level == GOMP_DIM_GANG)
7009 	{
7010 	  /* If there's no receiver object, propagate the incoming VAR.  */
7011 	  tree ref_to_res = gimple_call_arg (call, 1);
7012 	  if (integer_zerop (ref_to_res))
7013 	    init = var;
7014 	}
7015 
7016       if (lhs != NULL_TREE)
7017 	gimplify_assign (lhs, init, &seq);
7018     }
7019 
7020   pop_gimplify_context (NULL);
7021   gsi_replace_with_seq (&gsi, seq, true);
7022 }
7023 
7024 /* NVPTX implementation of GOACC_REDUCTION_FINI.  */
7025 
7026 static void
7027 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
7028 {
7029   gimple_stmt_iterator gsi = gsi_for_stmt (call);
7030   tree lhs = gimple_call_lhs (call);
7031   tree ref_to_res = gimple_call_arg (call, 1);
7032   tree var = gimple_call_arg (call, 2);
7033   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7034   enum tree_code op
7035     = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4));
7036   gimple_seq seq = NULL;
7037   tree r = NULL_TREE;
7038 
7039   push_gimplify_context (true);
7040 
7041   if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
7042     {
7043       /* Emit binary shuffle tree.  TODO: Emit this as an actual loop,
7044 	 but that requires a method of emitting a unified jump at the
7045 	 gimple level.  */
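      /* With PTX_WARP_SIZE == 32 this emits shuffles with offsets 16, 8,
         4, 2 and 1; after the final combine, lane 0 holds the reduction
         of the whole warp.  */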
7046       for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
7047 	{
7048 	  tree other_var = make_ssa_name (TREE_TYPE (var));
7049 	  nvptx_generate_vector_shuffle (gimple_location (call),
7050 					 other_var, var, shfl, &seq);
7051 
7052 	  r = make_ssa_name (TREE_TYPE (var));
7053 	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
7054 					   var, other_var), &seq);
7055 	  var = r;
7056 	}
7057     }
7058   else
7059     {
7060       tree accum = NULL_TREE;
7061 
7062       if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
7063 	{
7064 	  /* Get reduction buffer address.  */
7065 	  tree offset = gimple_call_arg (call, 5);
7066 	  tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7067 						 level == GOMP_DIM_VECTOR);
7068 	  tree ptr = make_ssa_name (TREE_TYPE (call));
7069 
7070 	  gimplify_assign (ptr, call, &seq);
7071 	  accum = ptr;
7072 	}
7073       else if (integer_zerop (ref_to_res))
7074 	r = var;
7075       else
7076 	accum = ref_to_res;
7077 
7078       if (accum)
7079 	{
7080 	  /* UPDATE the accumulator.  */
7081 	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
7082 	  seq = NULL;
7083 	  r = nvptx_reduction_update (gimple_location (call), &gsi,
7084 				      accum, var, op, level);
7085 	}
7086     }
7087 
7088   if (lhs)
7089     gimplify_assign (lhs, r, &seq);
7090   pop_gimplify_context (NULL);
7091 
7092   gsi_replace_with_seq (&gsi, seq, true);
7093 }
7094 
7095 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */
7096 
7097 static void
7098 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
7099 {
7100   gimple_stmt_iterator gsi = gsi_for_stmt (call);
7101   tree lhs = gimple_call_lhs (call);
7102   tree var = gimple_call_arg (call, 2);
7103   int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
7104   gimple_seq seq = NULL;
7105 
7106   push_gimplify_context (true);
7107   if (level == GOMP_DIM_WORKER
7108       || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
7109     {
7110       /* Read the worker reduction buffer.  */
7111       tree offset = gimple_call_arg (call, 5);
7112       tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
7113 					     level == GOMP_DIM_VECTOR);
7114       tree ptr = make_ssa_name (TREE_TYPE (call));
7115 
7116       gimplify_assign (ptr, call, &seq);
7117       var = build_simple_mem_ref (ptr);
7118       TREE_THIS_VOLATILE (var) = 1;
7119     }
7120 
7121   if (level != GOMP_DIM_GANG)
7122     {
7123       /* Write to the receiver object.  */
7124       tree ref_to_res = gimple_call_arg (call, 1);
7125 
7126       if (!integer_zerop (ref_to_res))
7127 	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
7128     }
7129 
7130   if (lhs)
7131     gimplify_assign (lhs, var, &seq);
7132 
7133   pop_gimplify_context (NULL);
7134 
7135   gsi_replace_with_seq (&gsi, seq, true);
7136 }
7137 
7138 /* NVPTX reduction expander.  */
7139 
7140 static void
7141 nvptx_goacc_reduction (gcall *call)
7142 {
7143   unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0));
7144   offload_attrs oa;
7145 
7146   populate_offload_attrs (&oa);
7147 
7148   switch (code)
7149     {
7150     case IFN_GOACC_REDUCTION_SETUP:
7151       nvptx_goacc_reduction_setup (call, &oa);
7152       break;
7153 
7154     case IFN_GOACC_REDUCTION_INIT:
7155       nvptx_goacc_reduction_init (call, &oa);
7156       break;
7157 
7158     case IFN_GOACC_REDUCTION_FINI:
7159       nvptx_goacc_reduction_fini (call, &oa);
7160       break;
7161 
7162     case IFN_GOACC_REDUCTION_TEARDOWN:
7163       nvptx_goacc_reduction_teardown (call, &oa);
7164       break;
7165 
7166     default:
7167       gcc_unreachable ();
7168     }
7169 }
7170 
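/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  Returning true for every
   (MODE, X) pair keeps constants out of the constant pool entirely.  */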
7171 static bool
7172 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
7173 			      rtx x ATTRIBUTE_UNUSED)
7174 {
7175   return true;
7176 }
7177 
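/* Implement TARGET_SCALAR_MODE_SUPPORTED_P.  Beyond the default set,
   HFmode is supported in the experimental configuration on sm_53 and
   later.  */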
7178 static bool
7179 nvptx_scalar_mode_supported_p (scalar_mode mode)
7180 {
7181   if (nvptx_experimental && mode == HFmode && TARGET_SM53)
7182     return true;
7183 
7184   return default_scalar_mode_supported_p (mode);
7185 }
7186 
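/* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P.  As above, allow
   HFmode in the experimental sm_53 configuration.  */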
7187 static bool
7188 nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode)
7189 {
7190   if (nvptx_experimental && mode == HFmode && TARGET_SM53)
7191     return true;
7192 
7193   return default_libgcc_floating_mode_supported_p (mode);
7194 }
7195 
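/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  Only V2SI and V2DI
   vectors are supported.  */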
7196 static bool
7197 nvptx_vector_mode_supported (machine_mode mode)
7198 {
7199   return (mode == V2SImode
7200 	  || mode == V2DImode);
7201 }
7202 
7203 /* Return the preferred mode for vectorizing scalar MODE.  */
7204 
7205 static machine_mode
7206 nvptx_preferred_simd_mode (scalar_mode mode)
7207 {
7208   switch (mode)
7209     {
7210     case E_DImode:
7211       return V2DImode;
7212     case E_SImode:
7213       return V2SImode;
7214 
7215     default:
7216       return default_preferred_simd_mode (mode);
7217     }
7218 }
7219 
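/* Adjust the alignment of 128-bit integer data: align it to the mode
   TImode is split into, rather than to TImode itself.  */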
7220 unsigned int
7221 nvptx_data_alignment (const_tree type, unsigned int basic_align)
7222 {
7223   if (TREE_CODE (type) == INTEGER_TYPE)
7224     {
7225       unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
7226       if (size == GET_MODE_SIZE (TImode))
7227 	return GET_MODE_BITSIZE (maybe_split_mode (TImode));
7228     }
7229 
7230   return basic_align;
7231 }
7232 
7233 /* Implement TARGET_MODES_TIEABLE_P.  */
7234 
7235 static bool
7236 nvptx_modes_tieable_p (machine_mode, machine_mode)
7237 {
7238   return false;
7239 }
7240 
7241 /* Implement TARGET_HARD_REGNO_NREGS.  */
7242 
7243 static unsigned int
7244 nvptx_hard_regno_nregs (unsigned int, machine_mode)
7245 {
7246   return 1;
7247 }
7248 
7249 /* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */
7250 
7251 static bool
7252 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
7253 {
7254   return false;
7255 }
7256 
7257 /* Implement TARGET_TRULY_NOOP_TRUNCATION.  */
7258 
7259 static bool
7260 nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
7261 {
7262   return false;
7263 }
7264 
7265 /* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL.  */
7266 
7267 static tree
7268 nvptx_goacc_adjust_private_decl (location_t loc, tree decl, int level)
7269 {
7270   gcc_checking_assert (!lookup_attribute ("oacc gang-private",
7271 					  DECL_ATTRIBUTES (decl)));
7272 
7273   /* Set "oacc gang-private" attribute for gang-private variable
7274      declarations.  */
7275   if (level == GOMP_DIM_GANG)
7276     {
7277       tree id = get_identifier ("oacc gang-private");
7278       /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a
7279 	 TREE).  */
7280       tree loc_tree = build_empty_stmt (loc);
7281       DECL_ATTRIBUTES (decl)
7282 	= tree_cons (id, loc_tree, DECL_ATTRIBUTES (decl));
7283     }
7284 
7285   return decl;
7286 }
7287 
7288 /* Implement TARGET_GOACC_EXPAND_VAR_DECL.  */
7289 
7290 static rtx
7291 nvptx_goacc_expand_var_decl (tree var)
7292 {
7293   /* Place "oacc gang-private" variables in shared memory.  */
7294   if (tree attr = lookup_attribute ("oacc gang-private",
7295 				    DECL_ATTRIBUTES (var)))
7296     {
7297       gcc_checking_assert (VAR_P (var));
7298 
7299       unsigned int offset, *poffset;
7300       poffset = gang_private_shared_hmap.get (var);
7301       if (poffset)
7302 	offset = *poffset;
7303       else
7304 	{
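	  /* First use of this variable: round the running buffer size up
	     to the variable's alignment and use that as its offset.  */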
7305 	  unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
7306 	  gang_private_shared_size
7307 	    = (gang_private_shared_size + align - 1) & ~(align - 1);
7308 	  if (gang_private_shared_align < align)
7309 	    gang_private_shared_align = align;
7310 
7311 	  offset = gang_private_shared_size;
7312 	  bool existed = gang_private_shared_hmap.put (var, offset);
7313 	  gcc_checking_assert (!existed);
7314 	  gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));
7315 
7316 	  location_t loc = EXPR_LOCATION (TREE_VALUE (attr));
7317 #if 0 /* For some reason, this doesn't work.  */
7318 	  if (dump_enabled_p ())
7319 	    {
7320 	      dump_flags_t l_dump_flags
7321 		= get_openacc_privatization_dump_flags ();
7322 
7323 	      const dump_user_location_t d_u_loc
7324 		= dump_user_location_t::from_location_t (loc);
7325 /* PR100695 "Format decoder, quoting in 'dump_printf' etc." */
7326 #if __GNUC__ >= 10
7327 # pragma GCC diagnostic push
7328 # pragma GCC diagnostic ignored "-Wformat"
7329 #endif
7330 	      dump_printf_loc (l_dump_flags, d_u_loc,
7331 			       "variable %<%T%> adjusted for OpenACC"
7332 			       " privatization level: %qs\n",
7333 			       var, "gang");
7334 #if __GNUC__ >= 10
7335 # pragma GCC diagnostic pop
7336 #endif
7337 	    }
7338 #else /* ..., thus emulate that, good enough for testsuite usage.  */
7339 	  if (param_openacc_privatization != OPENACC_PRIVATIZATION_QUIET)
7340 	    inform (loc,
7341 		    "variable %qD adjusted for OpenACC privatization level:"
7342 		    " %qs",
7343 		    var, "gang");
7344 	  if (dump_file && (dump_flags & TDF_DETAILS))
7345 	    {
7346 	      /* 'dumpfile.cc:dump_loc' */
7347 	      fprintf (dump_file, "%s:%d:%d: ", LOCATION_FILE (loc),
7348 		       LOCATION_LINE (loc), LOCATION_COLUMN (loc));
7349 	      fprintf (dump_file, "%s: ", "note");
7350 
7351 	      fprintf (dump_file,
7352 		       "variable '");
7353 	      print_generic_expr (dump_file, var, TDF_SLIM);
7354 	      fprintf (dump_file,
7355 		       "' adjusted for OpenACC privatization level: '%s'\n",
7356 		       "gang");
7357 	    }
7358 #endif
7359 	}
7360       rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset);
7361       return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
7362     }
7363 
7364   return NULL_RTX;
7365 }
7366 
7367 static GTY(()) tree nvptx_previous_fndecl;
7368 
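/* Implement TARGET_SET_CURRENT_FUNCTION.  Reset the per-function
   state accumulated while compiling the previous function.  */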
7369 static void
7370 nvptx_set_current_function (tree fndecl)
7371 {
7372   if (!fndecl || fndecl == nvptx_previous_fndecl)
7373     return;
7374 
7375   gang_private_shared_hmap.empty ();
7376   nvptx_previous_fndecl = fndecl;
7377   vector_red_partition = 0;
7378   oacc_bcast_partition = 0;
7379 }
7380 
7381 /* Implement TARGET_LIBC_HAS_FUNCTION.  */
7382 
7383 bool
7384 nvptx_libc_has_function (enum function_class fn_class, tree type)
7385 {
7386   if (fn_class == function_sincos)
7387     {
7388       if (type != NULL_TREE)
7389 	/* Currently, newlib does not support sincosl.  */
7390 	return type == float_type_node || type == double_type_node;
7391       else
7392 	return true;
7393     }
7394 
7395   return default_libc_has_function (fn_class, type);
7396 }
7397 
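/* Return true if MEM is known to be a .local (per-thread frame)
   access; with the soft-stack ABI, frame-related accesses are not
   necessarily local.  */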
7398 bool
7399 nvptx_mem_local_p (rtx mem)
7400 {
7401   gcc_assert (GET_CODE (mem) == MEM);
7402 
7403   struct address_info info;
7404   decompose_mem_address (&info, mem);
7405 
7406   if (info.base != NULL && REG_P (*info.base)
7407       && REGNO_PTR_FRAME_P (REGNO (*info.base)))
7408     {
7409       if (TARGET_SOFT_STACK)
7410 	{
7411 	  /* Frame-related doesn't mean local.  */
7412 	}
7413       else
7414 	return true;
7415     }
7416 
7417   return false;
7418 }
7419 
7420 /* Define locally, for use in NVPTX_ASM_OUTPUT_DEF.  */
7421 #define SET_ASM_OP ".alias "
7422 
7423 /* Define locally, for use in nvptx_asm_output_def_from_decls.  Add NVPTX_
7424    prefix to avoid clash with ASM_OUTPUT_DEF from nvptx.h.
7425    Copy of ASM_OUTPUT_DEF from defaults.h, with added terminating
7426    semicolon.  */
7427 #define NVPTX_ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2)	\
7428   do							\
7429     {							\
7430       fprintf ((FILE), "%s", SET_ASM_OP);		\
7431       assemble_name (FILE, LABEL1);			\
7432       fprintf (FILE, ",");				\
7433       assemble_name (FILE, LABEL2);			\
7434       fprintf (FILE, ";\n");				\
7435     }							\
7436   while (0)
7437 
7438 void
7439 nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value)
7440 {
7441   if (nvptx_alias == 0 || !TARGET_PTX_6_3)
7442     {
7443       /* Copied from assemble_alias.  */
7444       error_at (DECL_SOURCE_LOCATION (name),
7445 		"alias definitions not supported in this configuration");
7446       TREE_ASM_WRITTEN (name) = 1;
7447       return;
7448     }
7449 
7450   if (lookup_attribute ("weak", DECL_ATTRIBUTES (name)))
7451     {
7452       /* Prevent execution FAILs for gcc.dg/globalalias.c and
7453 	 gcc.dg/pr77587.c.  */
7454       error_at (DECL_SOURCE_LOCATION (name),
7455 		"weak alias definitions not supported in this configuration");
7456       TREE_ASM_WRITTEN (name) = 1;
7457       return;
7458     }
7459 
7460   /* PTX also doesn't support VALUE having weak linkage, but we can't detect
7461      that here, so we'll end up with:
7462      "error: Function test with .weak scope cannot be aliased".
7463      See gcc.dg/localalias.c.  */
7464 
7465   if (TREE_CODE (name) != FUNCTION_DECL)
7466     {
7467       error_at (DECL_SOURCE_LOCATION (name),
7468 		"non-function alias definitions not supported"
7469 		" in this configuration");
7470       TREE_ASM_WRITTEN (name) = 1;
7471       return;
7472     }
7473 
7474   if (!cgraph_node::get (name)->referred_to_p ())
7475     /* Prevent "Internal error: reference to deleted section".  */
7476     return;
7477 
7478   std::stringstream s;
7479   write_fn_proto (s, false, get_fnname_from_decl (name), name);
7480   fputs (s.str ().c_str (), stream);
7481 
7482   tree id = DECL_ASSEMBLER_NAME (name);
7483   NVPTX_ASM_OUTPUT_DEF (stream, IDENTIFIER_POINTER (id),
7484 			IDENTIFIER_POINTER (value));
7485 }
7486 
7487 #undef NVPTX_ASM_OUTPUT_DEF
7488 #undef SET_ASM_OP
7489 
7490 #undef TARGET_OPTION_OVERRIDE
7491 #define TARGET_OPTION_OVERRIDE nvptx_option_override
7492 
7493 #undef TARGET_ATTRIBUTE_TABLE
7494 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table
7495 
7496 #undef TARGET_LRA_P
7497 #define TARGET_LRA_P hook_bool_void_false
7498 
7499 #undef TARGET_LEGITIMATE_ADDRESS_P
7500 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p
7501 
7502 #undef  TARGET_PROMOTE_FUNCTION_MODE
7503 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode
7504 
7505 #undef TARGET_FUNCTION_ARG
7506 #define TARGET_FUNCTION_ARG nvptx_function_arg
7507 #undef TARGET_FUNCTION_INCOMING_ARG
7508 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
7509 #undef TARGET_FUNCTION_ARG_ADVANCE
7510 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
7511 #undef TARGET_FUNCTION_ARG_BOUNDARY
7512 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
7513 #undef TARGET_PASS_BY_REFERENCE
7514 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
7515 #undef TARGET_FUNCTION_VALUE_REGNO_P
7516 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
7517 #undef TARGET_FUNCTION_VALUE
7518 #define TARGET_FUNCTION_VALUE nvptx_function_value
7519 #undef TARGET_LIBCALL_VALUE
7520 #define TARGET_LIBCALL_VALUE nvptx_libcall_value
7521 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
7522 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
7523 #undef TARGET_GET_DRAP_RTX
7524 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
7525 #undef TARGET_SPLIT_COMPLEX_ARG
7526 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
7527 #undef TARGET_RETURN_IN_MEMORY
7528 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
7529 #undef TARGET_OMIT_STRUCT_RETURN_REG
7530 #define TARGET_OMIT_STRUCT_RETURN_REG true
7531 #undef TARGET_STRICT_ARGUMENT_NAMING
7532 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
7533 #undef TARGET_CALL_ARGS
7534 #define TARGET_CALL_ARGS nvptx_call_args
7535 #undef TARGET_END_CALL_ARGS
7536 #define TARGET_END_CALL_ARGS nvptx_end_call_args
7537 
7538 #undef TARGET_ASM_FILE_START
7539 #define TARGET_ASM_FILE_START nvptx_file_start
7540 #undef TARGET_ASM_FILE_END
7541 #define TARGET_ASM_FILE_END nvptx_file_end
7542 #undef TARGET_ASM_GLOBALIZE_LABEL
7543 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
7544 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
7545 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
7546 #undef  TARGET_PRINT_OPERAND
7547 #define TARGET_PRINT_OPERAND nvptx_print_operand
7548 #undef  TARGET_PRINT_OPERAND_ADDRESS
7549 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
7550 #undef  TARGET_PRINT_OPERAND_PUNCT_VALID_P
7551 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
7552 #undef TARGET_ASM_INTEGER
7553 #define TARGET_ASM_INTEGER nvptx_assemble_integer
7554 #undef TARGET_ASM_DECL_END
7555 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end
7556 #undef TARGET_ASM_DECLARE_CONSTANT_NAME
7557 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
7558 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
7559 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
7560 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
7561 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true
7562 
7563 #undef TARGET_MACHINE_DEPENDENT_REORG
7564 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
7565 #undef TARGET_NO_REGISTER_ALLOCATION
7566 #define TARGET_NO_REGISTER_ALLOCATION true
7567 
7568 #undef TARGET_ENCODE_SECTION_INFO
7569 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
7570 #undef TARGET_RECORD_OFFLOAD_SYMBOL
7571 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol
7572 
7573 #undef TARGET_VECTOR_ALIGNMENT
7574 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment
7575 
7576 #undef TARGET_CANNOT_COPY_INSN_P
7577 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p
7578 
7579 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
7580 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p
7581 
7582 #undef TARGET_INIT_BUILTINS
7583 #define TARGET_INIT_BUILTINS nvptx_init_builtins
7584 #undef TARGET_EXPAND_BUILTIN
7585 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
7586 #undef  TARGET_BUILTIN_DECL
7587 #define TARGET_BUILTIN_DECL nvptx_builtin_decl
7588 
7589 #undef TARGET_SIMT_VF
7590 #define TARGET_SIMT_VF nvptx_simt_vf
7591 
7592 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
7593 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa
7594 
7595 #undef TARGET_GOACC_VALIDATE_DIMS
7596 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims
7597 
7598 #undef TARGET_GOACC_DIM_LIMIT
7599 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit
7600 
7601 #undef TARGET_GOACC_FORK_JOIN
7602 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join
7603 
7604 #undef TARGET_GOACC_REDUCTION
7605 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction
7606 
7607 #undef TARGET_CANNOT_FORCE_CONST_MEM
7608 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem
7609 
7610 #undef TARGET_SCALAR_MODE_SUPPORTED_P
7611 #define TARGET_SCALAR_MODE_SUPPORTED_P nvptx_scalar_mode_supported_p
7612 
7613 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
7614 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
7615   nvptx_libgcc_floating_mode_supported_p
7616 
7617 #undef TARGET_VECTOR_MODE_SUPPORTED_P
7618 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported
7619 
7620 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
7621 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
7622     nvptx_preferred_simd_mode
7623 
7624 #undef TARGET_MODES_TIEABLE_P
7625 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p
7626 
7627 #undef TARGET_HARD_REGNO_NREGS
7628 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs
7629 
7630 #undef TARGET_CAN_CHANGE_MODE_CLASS
7631 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class
7632 
7633 #undef TARGET_TRULY_NOOP_TRUNCATION
7634 #define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation
7635 
7636 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE
7637 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
7638 
7639 #undef TARGET_GOACC_ADJUST_PRIVATE_DECL
7640 #define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl
7641 
7642 #undef TARGET_GOACC_EXPAND_VAR_DECL
7643 #define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl
7644 
7645 #undef TARGET_SET_CURRENT_FUNCTION
7646 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function
7647 
7648 #undef TARGET_LIBC_HAS_FUNCTION
7649 #define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function
7650 
7651 struct gcc_target targetm = TARGET_INITIALIZER;
7652 
7653 #include "gt-nvptx.h"
7654