/* Target code for NVPTX.
   Copyright (C) 2014-2022 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-offload.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"
#include "opts.h"
#include "tree-pretty-print.h"
#include "rtl-iter.h"
#include "cgraph.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* The PTX concept CTA (Cooperative Thread Array) maps onto the CUDA
   concept of a thread block, which has had a maximum of 1024 threads
   since CUDA version 2.x.  */
#define PTX_CTA_SIZE 1024

#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32

#define PTX_PER_CTA_BARRIER 0
#define PTX_NUM_PER_CTA_BARRIERS 1
#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
#define PTX_NUM_PER_WORKER_BARRIERS \
  (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)

#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area) ((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
                     & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)
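
/* For illustration: SET_SYMBOL_DATA_AREA (sym, DATA_AREA_SHARED) stores
   the enum value (here 2) in the machine-dependent bits of the symbol's
   flags, and SYMBOL_DATA_AREA (sym) recovers it, so later output code
   can qualify accesses with ".shared".  */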

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Shared memory block for gang-private variables.  */
static unsigned gang_private_shared_size;
static unsigned gang_private_shared_align;
static GTY(()) rtx gang_private_shared_sym;
static hash_map<tree_decl_hash, unsigned int> gang_private_shared_hmap;

/* Global lock variable, needed for 128-bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

static int nvptx_mach_max_workers ();

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with %<-fopenacc%>", optname);
}

static enum ptx_version
first_ptx_version_supporting_sm (enum ptx_isa sm)
{
  switch (sm)
    {
    case PTX_ISA_SM30:
      return PTX_VERSION_3_0;
    case PTX_ISA_SM35:
      return PTX_VERSION_3_1;
    case PTX_ISA_SM53:
      return PTX_VERSION_4_2;
    case PTX_ISA_SM70:
      return PTX_VERSION_6_0;
    case PTX_ISA_SM75:
      return PTX_VERSION_6_3;
    case PTX_ISA_SM80:
      return PTX_VERSION_7_0;
    default:
      gcc_unreachable ();
    }
}

static enum ptx_version
default_ptx_version_option (void)
{
  enum ptx_version first
    = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);

  /* Pick a version that supports the sm.  */
  enum ptx_version res = first;

  /* Pick at least 3.1.  This has been the smallest version historically.  */
  res = MAX (res, PTX_VERSION_3_1);

  /* Pick at least 6.0, to enable using bar.warp.sync to have a way to force
     warp convergence.  */
  res = MAX (res, PTX_VERSION_6_0);

  /* Verify that we pick a version that supports the sm.  */
  gcc_assert (first <= res);
  return res;
}

static const char *
ptx_version_to_string (enum ptx_version v)
{
  switch (v)
    {
    case PTX_VERSION_3_0:
      return "3.0";
    case PTX_VERSION_3_1:
      return "3.1";
    case PTX_VERSION_4_2:
      return "4.2";
    case PTX_VERSION_6_0:
      return "6.0";
    case PTX_VERSION_6_3:
      return "6.3";
    case PTX_VERSION_7_0:
      return "7.0";
    default:
      gcc_unreachable ();
    }
}

unsigned int
ptx_version_to_number (enum ptx_version v, bool major_p)
{
  switch (v)
    {
    case PTX_VERSION_3_0:
      return major_p ? 3 : 0;
    case PTX_VERSION_3_1:
      return major_p ? 3 : 1;
    case PTX_VERSION_4_2:
      return major_p ? 4 : 2;
    case PTX_VERSION_6_0:
      return major_p ? 6 : 0;
    case PTX_VERSION_6_3:
      return major_p ? 6 : 3;
    case PTX_VERSION_7_0:
      return major_p ? 7 : 0;
    default:
      gcc_unreachable ();
    }
}

static const char *
sm_version_to_string (enum ptx_isa sm)
{
  switch (sm)
    {
#define NVPTX_SM(XX, SEP) \
    case PTX_ISA_SM ## XX: \
      return #XX;
#include "nvptx-sm.def"
#undef NVPTX_SM
    default:
      gcc_unreachable ();
    }
}

static void
handle_ptx_version_option (void)
{
  if (!OPTION_SET_P (ptx_version_option)
      || ptx_version_option == PTX_VERSION_default)
    {
      ptx_version_option = default_ptx_version_option ();
      return;
    }

  enum ptx_version first
    = first_ptx_version_supporting_sm ((enum ptx_isa) ptx_isa_option);

  if (ptx_version_option < first)
    error ("PTX version (%<-mptx%>) needs to be at least %s to support selected"
           " %<-misa%> (sm_%s)", ptx_version_to_string (first),
           sm_version_to_string ((enum ptx_isa) ptx_isa_option));
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  handle_ptx_version_option ();

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!OPTION_SET_P (flag_toplevel_reorder))
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!OPTION_SET_P (flag_no_common))
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  HOST_WIDE_INT patch_area_size, patch_area_entry;
  parse_and_check_patch_area (flag_patchable_function_entry, false,
                              &patch_area_size, &patch_area_entry);
  if (patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  gang_private_shared_sym = gen_rtx_SYMBOL_REF (Pmode, "__gang_private_shared");
  SET_SYMBOL_DATA_AREA (gang_private_shared_sym, DATA_AREA_SHARED);
  gang_private_shared_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a PTX type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with PTX idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
        return ".u32";
      else
        return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_HFmode:
      return ".f16";
    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
        area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
        {
          if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
            {
              area = DATA_AREA_SHARED;
              if (DECL_INITIAL (decl))
                error ("static initialization of variable %q+D in %<.shared%>"
                       " memory is not supported", decl);
            }
          else
            area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
        }

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* Return NULL if NAME contains no dot.  Otherwise return a copy of NAME
   with the dots replaced with dollar signs.  */

static char *
nvptx_replace_dot (const char *name)
{
  if (strchr (name, '.') == NULL)
    return NULL;

  char *p = xstrdup (name);
  for (size_t i = 0; i < strlen (p); ++i)
    if (p[i] == '.')
      p[i] = '$';
  return p;
}
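
/* For example, nvptx_replace_dot ("gomp.target.fn") yields a fresh copy
   "gomp$target$fn"; PTX identifiers cannot contain dots.  */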

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}
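
/* For illustration: maybe_split_mode (TImode) is DImode and
   maybe_split_mode (DCmode) is DFmode, so both are handled as register
   pairs, while maybe_split_mode (SImode) is VOIDmode: a single reg.  */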

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}

/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
            int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
        fprintf (file, "%s", reg_names[regno]);
      else
        fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
        fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
        fprintf (file, "}");
    }
}
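
/* For illustration: a TImode pseudo 23 is printed by
   output_reg (file, 23, DImode) as "{%r23$8,%r23$0}" (most significant
   half first), whereas output_reg (file, 23, VOIDmode) prints "%r23".  */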

/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
         it creates a block with a single successor before entering a
         partitioned region.  That is a good candidate for the end of
         an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
           | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
         predecessor for the block the join insn ends up in.  This is
         needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}


/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
        return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
        return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}
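
/* For illustration (assuming the 64-bit ABI): "int" and "double"
   arguments stay in registers, any aggregate or vector type goes to
   memory, and a "_Complex double" argument is split into two DFmode
   register args but is returned in memory, as DCmode exceeds
   UNITS_PER_WORD.  */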

/* A non-memory argument of mode MODE is being passed; determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
{
  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  return gen_reg_rtx (arg.mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v,
                             const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (arg.mode,
                         gen_rtvec (1, GEN_INT (cum->count)),
                         UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
                      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  return pass_in_memory (arg.mode, arg.type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
                             int *ARG_UNUSED (punsignedp),
                             const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg_type.  Emit a single PTX argument of MODE, either
   in a prototype, or as copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative,
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
                machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
        s << reg_names[for_reg];
      else
        s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
        {
          s << "\tld.param" << ptx_type << " ";
          if (for_reg)
            s << reg_names[for_reg];
          else
            s << "%ar" << argno;
          s << ", [%in_ar" << argno << "];\n";
        }
    }
  return argno + 1;
}
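
/* For illustration, for a promoted SImode argument 0: the prototype
   branch (FOR_REG < 0) appends " (.param.u32 %in_ar0", while the
   prologue branch with FOR_REG == 0 appends
        .reg.u32 %ar0;
        ld.param.u32 %ar0, [%in_ar0];  */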

/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true, if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
                tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
        {
          /* Complex types are sent as two separate args.  */
          type = TREE_TYPE (type);
          mode = TYPE_MODE (type);
          prototyped = true;
        }

      mode = promote_arg (mode, prototyped);
      if (split)
        argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}
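
/* For illustration, for an SImode return value this writes
   "(.param.u32 %value_out) " into a prototype and "\t.reg.u32 %value;\n"
   into a prologue (assuming "%value" is the register name of
   NVPTX_RETURN_REGNUM).  */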

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
        return return_in_mem;

      /* Named return values can cause us to return a pointer as well
         as expect an argument for the return location.  This is
         optimization-level specific, so no caller can make use of
         this data, but more importantly for us, we must ensure it
         doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
        return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
         retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
          || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
              && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
                 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
           globalize ? " GLOBAL" : "",
           is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Helper function for write_fn_proto.  */

static void
write_fn_proto_1 (std::stringstream &s, bool is_defn,
                  const char *name, const_tree decl)
{
  if (lookup_attribute ("alias", DECL_ATTRIBUTES (decl)) == NULL)
    write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
        /* These atomics skip the 'weak' parm in an actual library
           call.  We must skip it in the prototype too.  */
        not_atomic_weak_arg = 3;
        break;

      default:
        break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
       declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
        argno = write_arg_type (s, -1, argno, type, prototyped);
      else
        gcc_assert (TREE_CODE (type) == BOOLEAN_TYPE);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (argno < 2 && strcmp (name, "main") == 0)
    {
      if (argno == 0)
        argno = write_arg_type (s, -1, argno, integer_type_node, true);

      if (argno == 1)
        argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static void
write_fn_proto (std::stringstream &s, bool is_defn,
                const char *name, const_tree decl)
{
  const char *replacement = nvptx_name_replacement (name);
  char *replaced_dots = NULL;
  if (replacement != name)
    name = replacement;
  else
    {
      replaced_dots = nvptx_replace_dot (name);
      if (replaced_dots)
        name = replaced_dots;
    }
  if (name[0] == '*')
    name++;

  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    write_fn_proto_1 (s, false, name, decl);

  write_fn_proto_1 (s, is_defn, name, decl);

  if (replaced_dots)
    XDELETE (replaced_dots);
}
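
/* For illustration: for a public definition of "int square (int)" this
   emits a declaration followed by the definition header, roughly

   // BEGIN GLOBAL FUNCTION DECL: square
   .visible .func (.param.u32 %value_out) square (.param.u32 %in_ar0);
   // BEGIN GLOBAL FUNCTION DEF: square
   .visible .func (.param.u32 %value_out) square (.param.u32 %in_ar0)

   with the function body following.  */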

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
                          rtx result, rtx pat)
{
  char *replaced_dots = NULL;

  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      const char *replacement = nvptx_name_replacement (name);
      if (replacement != name)
        name = replacement;
      else
        {
          replaced_dots = nvptx_replace_dot (name);
          if (replaced_dots)
            name = replaced_dots;
        }
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;
  if (replaced_dots)
    XDELETE (replaced_dots);

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
         as that was already done when generating the call
         sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a PTX prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a PTX
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
        *slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.  */

static void
nvptx_maybe_record_fnsym (rtx sym)
{
  tree decl = SYMBOL_REF_DECL (sym);

  if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl))
    nvptx_record_needed_fndecl (decl);
}

/* Emit a local array to hold some part of a conventional stack frame
   and initialize REGNO to point to it.  If the size is zero, it'll
   never be valid to dereference, so we can simply initialize to
   zero.  */

static void
init_frame (FILE *file, int regno, unsigned align, unsigned size)
{
  if (size)
    fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n",
             align, reg_names[regno], size);
  fprintf (file, "\t.reg.u%d %s;\n",
           POINTER_SIZE, reg_names[regno]);
  fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n"
                  : "\tmov.u%d %s, 0;\n"),
           POINTER_SIZE, reg_names[regno], reg_names[regno]);
}
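
/* For illustration: with the 64-bit ABI, a 16-byte frame aligned to 8
   gives

        .local .align 8 .b8 %frame_ar[16];
        .reg.u64 %frame;
        cvta.local.u64 %frame, %frame_ar;

   (assuming "%frame" is the register name of FRAME_POINTER_REGNUM),
   while a zero-size frame gets "mov.u64 %frame, 0;" instead.  */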
1246
1247 /* Emit soft stack frame setup sequence. */
1248
1249 static void
init_softstack_frame(FILE * file,unsigned alignment,HOST_WIDE_INT size)1250 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size)
1251 {
1252 /* Maintain 64-bit stack alignment. */
1253 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT;
1254 size = ROUND_UP (size, keep_align);
1255 int bits = POINTER_SIZE;
1256 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1257 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM];
1258 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM];
1259 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM];
1260 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack);
1261 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame);
1262 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot);
1263 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev);
1264 fprintf (file, "\t{\n");
1265 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n");
1266 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits);
1267 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits);
1268 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n");
1269 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n",
1270 bits == 64 ? ".wide" : ".lo", bits / 8);
1271 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits);
1272
1273 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */
1274 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot);
1275
1276 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */
1277 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n",
1278 bits, reg_sspprev, reg_sspslot);
1279
1280 /* Initialize %frame = %sspprev - size. */
1281 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1282 bits, reg_frame, reg_sspprev, size);
1283
1284 /* Apply alignment, if larger than 64. */
1285 if (alignment > keep_align)
1286 fprintf (file, "\t\tand.b%d %s, %s, %d;\n",
1287 bits, reg_frame, reg_frame, -alignment);
1288
1289 size = crtl->outgoing_args_size;
1290 gcc_assert (size % keep_align == 0);
1291
1292 /* Initialize %stack. */
1293 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n",
1294 bits, reg_stack, reg_frame, size);
1295
1296 if (!crtl->is_leaf)
1297 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n",
1298 bits, reg_sspslot, reg_stack);
1299 fprintf (file, "\t}\n");
1300 cfun->machine->has_softstack = true;
1301 need_softstack_decl = true;
1302 }
1303
1304 /* Emit code to initialize the REGNO predicate register to indicate
1305 whether we are not lane zero on the NAME axis. */
1306
1307 static void
nvptx_init_axis_predicate(FILE * file,int regno,const char * name)1308 nvptx_init_axis_predicate (FILE *file, int regno, const char *name)
1309 {
1310 fprintf (file, "\t{\n");
1311 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name);
1312 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1313 {
1314 fprintf (file, "\t\t.reg.u64\t%%t_red;\n");
1315 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1316 }
1317 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name);
1318 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name);
1319 if (strcmp (name, "x") == 0 && cfun->machine->red_partition)
1320 {
1321 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n");
1322 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n");
1323 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; "
1324 "// vector reduction buffer\n",
1325 REGNO (cfun->machine->red_partition),
1326 vector_red_partition);
1327 }
1328 /* Verify vector_red_size. */
1329 gcc_assert (vector_red_partition * nvptx_mach_max_workers ()
1330 <= vector_red_size);
1331 fprintf (file, "\t}\n");
1332 }
1333
1334 /* Emit code to initialize OpenACC worker broadcast and synchronization
1335 registers. */
1336
1337 static void
nvptx_init_oacc_workers(FILE * file)1338 nvptx_init_oacc_workers (FILE *file)
1339 {
1340 fprintf (file, "\t{\n");
1341 fprintf (file, "\t\t.reg.u32\t%%tidy;\n");
1342 if (cfun->machine->bcast_partition)
1343 {
1344 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n");
1345 fprintf (file, "\t\t.reg.u64\t%%y64;\n");
1346 }
1347 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n");
1348 if (cfun->machine->bcast_partition)
1349 {
1350 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n");
1351 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n");
1352 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n");
1353 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; "
1354 "// vector broadcast offset\n",
1355 REGNO (cfun->machine->bcast_partition),
1356 oacc_bcast_partition);
1357 }
1358 /* Verify oacc_bcast_size. */
1359 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1)
1360 <= oacc_bcast_size);
1361 if (cfun->machine->sync_bar)
1362 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; "
1363 "// vector synchronization barrier\n",
1364 REGNO (cfun->machine->sync_bar));
1365 fprintf (file, "\t}\n");
1366 }
1367
1368 /* Emit code to initialize predicate and master lane index registers for
1369 -muniform-simt code generation variant. */
1370
1371 static void
nvptx_init_unisimt_predicate(FILE * file)1372 nvptx_init_unisimt_predicate (FILE *file)
1373 {
1374 cfun->machine->unisimt_location = gen_reg_rtx (Pmode);
1375 int loc = REGNO (cfun->machine->unisimt_location);
1376 int bits = POINTER_SIZE;
1377 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc);
1378 fprintf (file, "\t{\n");
1379 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n");
1380 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits);
1381 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n");
1382 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n",
1383 bits == 64 ? ".wide" : ".lo");
1384 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc);
1385 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc);
1386 if (cfun->machine->unisimt_predicate)
1387 {
1388 int master = REGNO (cfun->machine->unisimt_master);
1389 int pred = REGNO (cfun->machine->unisimt_predicate);
1390 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc);
1391 if (cfun->machine->unisimt_outside_simt_predicate)
1392 {
1393 int pred_outside_simt
1394 = REGNO (cfun->machine->unisimt_outside_simt_predicate);
1395 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, 0;\n",
1396 pred_outside_simt, master);
1397 }
1398 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n");
1399 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. */
1400 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master);
1401 /* Compute predicate as 'tid.x == master'. */
1402 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master);
1403 }
1404 fprintf (file, "\t}\n");
1405 need_unisimt_decl = true;
1406 }
1407
1408 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region:
1409
1410 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg);
1411 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize)
1412 {
1413 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1);
1414 __nvptx_uni[tid.y] = 0;
1415 gomp_nvptx_main (ORIG, arg);
1416 }
1417 ORIG itself should not be emitted as a PTX .entry function. */
1418
1419 static void
write_omp_entry(FILE * file,const char * name,const char * orig)1420 write_omp_entry (FILE *file, const char *name, const char *orig)
1421 {
1422 static bool gomp_nvptx_main_declared;
1423 if (!gomp_nvptx_main_declared)
1424 {
1425 gomp_nvptx_main_declared = true;
1426 write_fn_marker (func_decls, false, true, "gomp_nvptx_main");
1427 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE
1428 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n";
1429 }
1430 /* PR79332. Single out this string; it confuses gcc.pot generation. */
1431 #define NTID_Y "%ntid.y"
1432 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\
1433 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\
1434 {\n\
1435 .reg.u32 %r<3>;\n\
1436 .reg.u" PS " %R<4>;\n\
1437 mov.u32 %r0, %tid.y;\n\
1438 mov.u32 %r1, " NTID_Y ";\n\
1439 mov.u32 %r2, %ctaid.x;\n\
1440 cvt.u" PS ".u32 %R1, %r0;\n\
1441 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\
1442 mov.u" PS " %R0, __nvptx_stacks;\n\
1443 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\
1444 ld.param.u" PS " %R2, [%stack];\n\
1445 ld.param.u" PS " %R3, [%sz];\n\
1446 add.u" PS " %R2, %R2, %R3;\n\
1447 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\
1448 st.shared.u" PS " [%R0], %R2;\n\
1449 mov.u" PS " %R0, __nvptx_uni;\n\
1450 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\
1451 mov.u32 %r0, 0;\n\
1452 st.shared.u32 [%R0], %r0;\n\
1453 mov.u" PS " %R0, \0;\n\
1454 ld.param.u" PS " %R1, [%arg];\n\
1455 {\n\
1456 .param.u" PS " %P<2>;\n\
1457 st.param.u" PS " [%P0], %R0;\n\
1458 st.param.u" PS " [%P1], %R1;\n\
1459 call.uni gomp_nvptx_main, (%P0, %P1);\n\
1460 }\n\
1461 ret.uni;\n\
1462 }\n"
1463 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32");
1464 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 ");
1465 #undef ENTRY_TEMPLATE
1466 #undef NTID_Y
1467 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32;
1468 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */
1469 const char *entry_2 = entry_1 + strlen (entry64) + 1;
1470 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2);
1471 need_softstack_decl = need_unisimt_decl = true;
1472 }
1473
1474 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx
1475 function, including local var decls and copies from the arguments to
1476 local regs. */
1477
1478 void
nvptx_declare_function_name(FILE * file,const char * name,const_tree decl)1479 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl)
1480 {
1481 tree fntype = TREE_TYPE (decl);
1482 tree result_type = TREE_TYPE (fntype);
1483 int argno = 0;
1484
1485 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl))
1486 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl)))
1487 {
1488 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl"));
1489 sprintf (buf, "%s$impl", name);
1490 write_omp_entry (file, name, buf);
1491 name = buf;
1492 }
1493 /* We construct the initial part of the function into a string
1494 stream, in order to share the prototype writing code. */
1495 std::stringstream s;
1496 write_fn_proto (s, true, name, decl);
1497 s << "{\n";
1498
1499 bool return_in_mem = write_return_type (s, false, result_type);
1500 if (return_in_mem)
1501 argno = write_arg_type (s, 0, argno, ptr_type_node, true);
1502
1503 /* Declare and initialize incoming arguments. */
1504 tree args = TYPE_ARG_TYPES (fntype);
1505 bool prototyped = true;
1506 if (!args)
1507 {
1508 args = DECL_ARGUMENTS (decl);
1509 prototyped = false;
1510 }
1511
1512 for (; args != NULL_TREE; args = TREE_CHAIN (args))
1513 {
1514 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);
1515
1516 argno = write_arg_type (s, 0, argno, type, prototyped);
1517 }
1518
1519 if (stdarg_p (fntype))
1520 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node,
1521 true);
1522
1523 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain)
1524 write_arg_type (s, STATIC_CHAIN_REGNUM,
1525 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node,
1526 true);
1527
1528 fprintf (file, "%s", s.str().c_str());
1529
1530 /* Usually 'crtl->is_leaf' is computed during register allocator
1531 initialization (which is not done on NVPTX) or for pressure-sensitive
1532 optimizations. Initialize it here, except if already set. */
1533 if (!crtl->is_leaf)
1534 crtl->is_leaf = leaf_function_p ();
1535
1536 HOST_WIDE_INT sz = get_frame_size ();
1537 bool need_frameptr = sz || cfun->machine->has_chain;
1538 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT;
1539 if (!TARGET_SOFT_STACK)
1540 {
1541 /* Declare a local var for outgoing varargs. */
1542 if (cfun->machine->has_varadic)
1543 init_frame (file, STACK_POINTER_REGNUM,
1544 UNITS_PER_WORD, crtl->outgoing_args_size);
1545
1546 /* Declare a local variable for the frame. Force its size to be
1547 DImode-compatible. */
1548 if (need_frameptr)
1549 init_frame (file, FRAME_POINTER_REGNUM, alignment,
1550 ROUND_UP (sz, GET_MODE_SIZE (DImode)));
1551 }
1552 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca
1553 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1554 init_softstack_frame (file, alignment, sz);
1555
1556 if (cfun->machine->has_simtreg)
1557 {
1558 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size;
1559 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align;
1560 align = MAX (align, GET_MODE_SIZE (DImode));
1561 if (!crtl->is_leaf || cfun->calls_alloca)
1562 simtsz = HOST_WIDE_INT_M1U;
1563 if (simtsz == HOST_WIDE_INT_M1U)
1564 simtsz = nvptx_softstack_size;
1565 if (cfun->machine->has_softstack)
1566 simtsz += POINTER_SIZE / 8;
1567 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode));
1568 if (align > GET_MODE_SIZE (DImode))
1569 simtsz += align - GET_MODE_SIZE (DImode);
1570 if (simtsz)
1571 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar["
1572 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz);
1573 }
1574
1575 /* Restore the vector reduction partition register, if necessary.
1576 FIXME: Find out when and why this is necessary, and fix it. */
1577 if (cfun->machine->red_partition)
1578 regno_reg_rtx[REGNO (cfun->machine->red_partition)]
1579 = cfun->machine->red_partition;
1580
1581 /* Declare the pseudos we have as ptx registers. */
1582 int maxregs = max_reg_num ();
1583 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++)
1584 {
1585 if (regno_reg_rtx[i] != const0_rtx)
1586 {
1587 machine_mode mode = PSEUDO_REGNO_MODE (i);
1588 machine_mode split = maybe_split_mode (mode);
1589
1590 if (split_mode_p (mode))
1591 mode = split;
1592 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true));
1593 output_reg (file, i, split, -2);
1594 fprintf (file, ";\n");
1595 }
1596 }
1597
1598 /* Emit axis predicates. */
1599 if (cfun->machine->axis_predicate[0])
1600 nvptx_init_axis_predicate (file,
1601 REGNO (cfun->machine->axis_predicate[0]), "y");
1602 if (cfun->machine->axis_predicate[1])
1603 nvptx_init_axis_predicate (file,
1604 REGNO (cfun->machine->axis_predicate[1]), "x");
1605 if (cfun->machine->unisimt_predicate
1606 || (cfun->machine->has_simtreg && !crtl->is_leaf))
1607 nvptx_init_unisimt_predicate (file);
1608 if (cfun->machine->bcast_partition || cfun->machine->sync_bar)
1609 nvptx_init_oacc_workers (file);
1610 }
1611
1612 /* Output code for switching uniform-simt state. ENTERING indicates whether
1613 we are entering or leaving non-uniform execution region. */
1614
1615 static void
nvptx_output_unisimt_switch(FILE * file,bool entering)1616 nvptx_output_unisimt_switch (FILE *file, bool entering)
1617 {
1618 if (crtl->is_leaf && !cfun->machine->unisimt_predicate)
1619 return;
1620 fprintf (file, "\t{\n");
1621 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n");
1622 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0);
1623 if (cfun->machine->unisimt_outside_simt_predicate)
1624 {
1625 int pred_outside_simt
1626 = REGNO (cfun->machine->unisimt_outside_simt_predicate);
1627 fprintf (file, "\t\tmov.pred %%r%d, %d;\n", pred_outside_simt,
1628 entering ? 0 : 1);
1629 }
1630 if (!crtl->is_leaf)
1631 {
1632 int loc = REGNO (cfun->machine->unisimt_location);
1633 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc);
1634 }
1635 if (cfun->machine->unisimt_predicate)
1636 {
1637 int master = REGNO (cfun->machine->unisimt_master);
1638 int pred = REGNO (cfun->machine->unisimt_predicate);
1639 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n");
1640 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n",
1641 master, entering ? "%ustmp2" : "0");
1642 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master);
1643 }
1644 fprintf (file, "\t}\n");
1645 }
1646
1647 /* Output code for allocating per-lane storage and switching soft-stack pointer.
1648 ENTERING indicates whether we are entering or leaving non-uniform execution.
1649 PTR is the register pointing to allocated storage, it is assigned to on
1650 entering and used to restore state on leaving. SIZE and ALIGN are used only
1651 on entering. */
1652
1653 static void
nvptx_output_softstack_switch(FILE * file,bool entering,rtx ptr,rtx size,rtx align)1654 nvptx_output_softstack_switch (FILE *file, bool entering,
1655 rtx ptr, rtx size, rtx align)
1656 {
1657 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr));
1658 if (crtl->is_leaf && !cfun->machine->simt_stack_size)
1659 return;
1660 int bits = POINTER_SIZE, regno = REGNO (ptr);
1661 fprintf (file, "\t{\n");
1662 if (entering)
1663 {
1664 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + "
1665 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno,
1666 cfun->machine->simt_stack_size);
1667 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno);
1668 if (CONST_INT_P (size))
1669 fprintf (file, HOST_WIDE_INT_PRINT_DEC,
1670 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode)));
1671 else
1672 output_reg (file, REGNO (size), VOIDmode);
1673 fputs (";\n", file);
1674 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode))
1675 fprintf (file,
1676 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n",
1677 bits, regno, regno, UINTVAL (align));
1678 }
1679 if (cfun->machine->has_softstack)
1680 {
1681 const char *reg_stack = reg_names[STACK_POINTER_REGNUM];
1682 if (entering)
1683 {
1684 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n",
1685 bits, regno, bits / 8, reg_stack);
1686 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n",
1687 bits, reg_stack, regno, bits / 8);
1688 }
1689 else
1690 {
1691 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n",
1692 bits, reg_stack, regno, bits / 8);
1693 }
1694 nvptx_output_set_softstack (REGNO (stack_pointer_rtx));
1695 }
1696 fprintf (file, "\t}\n");
1697 }
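
/* A hand-written sketch of the entering case, assuming a 64-bit target,
   a constant SIZE of 16 and placeholder register numbers:

	{
		cvta.local.u64 %r25, %simtstack_ar + 128;
		sub.u64 %r25, %r25, 16;
		st.u64 [%r25 + -8], %stack;
		sub.u64 %stack, %r25, 8;
	}

   The old soft-stack pointer is saved just below the fresh per-lane
   allocation so the leaving case can reload it from there.  */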
1698
1699 /* Output code to enter non-uniform execution region. DEST is a register
1700 to hold a per-lane allocation given by SIZE and ALIGN. */
1701
1702 const char *
1703 nvptx_output_simt_enter (rtx dest, rtx size, rtx align)
1704 {
1705 nvptx_output_unisimt_switch (asm_out_file, true);
1706 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align);
1707 return "";
1708 }
1709
1710 /* Output code to leave non-uniform execution region. SRC is the register
1711 holding per-lane storage previously allocated by omp_simt_enter insn. */
1712
1713 const char *
1714 nvptx_output_simt_exit (rtx src)
1715 {
1716 nvptx_output_unisimt_switch (asm_out_file, false);
1717 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX);
1718 return "";
1719 }
1720
1721 /* Output instruction that sets soft stack pointer in shared memory to the
1722 value in register given by SRC_REGNO. */
1723
1724 const char *
1725 nvptx_output_set_softstack (unsigned src_regno)
1726 {
1727 if (cfun->machine->has_softstack && !crtl->is_leaf)
1728 {
1729 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ",
1730 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]);
1731 output_reg (asm_out_file, src_regno, VOIDmode);
1732 fprintf (asm_out_file, ";\n");
1733 }
1734 return "";
1735 }
1736 /* Output a return instruction. Also copy the return value to its outgoing
1737 location. */
1738
1739 const char *
1740 nvptx_output_return (void)
1741 {
1742 machine_mode mode = (machine_mode)cfun->machine->return_mode;
1743
1744 if (mode != VOIDmode)
1745 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n",
1746 nvptx_ptx_type_from_mode (mode, false),
1747 reg_names[NVPTX_RETURN_REGNUM],
1748 reg_names[NVPTX_RETURN_REGNUM]);
1749
1750 return "ret;";
1751 }
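
/* For an SImode return value this prints (the %value name comes from
   reg_names[NVPTX_RETURN_REGNUM]):

	st.param.u32	[%value_out], %value;
	ret;
*/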
1752
1753 /* Terminate a function by writing a closing brace to FILE. */
1754
1755 void
1756 nvptx_function_end (FILE *file)
1757 {
1758 fprintf (file, "}\n");
1759 }
1760
1761 /* Decide whether we can make a sibling call to a function. For ptx, we
1762 can't. */
1763
1764 static bool
1765 nvptx_function_ok_for_sibcall (tree, tree)
1766 {
1767 return false;
1768 }
1769
1770 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */
1771
1772 static rtx
1773 nvptx_get_drap_rtx (void)
1774 {
1775 if (TARGET_SOFT_STACK && stack_realign_drap)
1776 return arg_pointer_rtx;
1777 return NULL_RTX;
1778 }
1779
1780 /* Implement the TARGET_CALL_ARGS hook. Record information about one
1781 argument to the next call. */
1782
1783 static void
1784 nvptx_call_args (rtx arg, tree fntype)
1785 {
1786 if (!cfun->machine->doing_call)
1787 {
1788 cfun->machine->doing_call = true;
1789 cfun->machine->is_varadic = false;
1790 cfun->machine->num_args = 0;
1791
1792 if (fntype && stdarg_p (fntype))
1793 {
1794 cfun->machine->is_varadic = true;
1795 cfun->machine->has_varadic = true;
1796 cfun->machine->num_args++;
1797 }
1798 }
1799
1800 if (REG_P (arg) && arg != pc_rtx)
1801 {
1802 cfun->machine->num_args++;
1803 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg,
1804 cfun->machine->call_args);
1805 }
1806 }
1807
1808 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the
1809 information we recorded. */
1810
1811 static void
1812 nvptx_end_call_args (void)
1813 {
1814 cfun->machine->doing_call = false;
1815 free_EXPR_LIST_list (&cfun->machine->call_args);
1816 }
1817
1818 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep
1819 track of whether calls involving static chains or varargs were seen
1820 in the current function.
1821 For libcalls, maintain a hash table of decls we have seen, and
1822 record a function decl for later when encountering a new one. */
1823
1824 void
1825 nvptx_expand_call (rtx retval, rtx address)
1826 {
1827 rtx callee = XEXP (address, 0);
1828 rtx varargs = NULL_RTX;
1829 unsigned parallel = 0;
1830
1831 if (!call_insn_operand (callee, Pmode))
1832 {
1833 callee = force_reg (Pmode, callee);
1834 address = change_address (address, QImode, callee);
1835 }
1836
1837 if (GET_CODE (callee) == SYMBOL_REF)
1838 {
1839 tree decl = SYMBOL_REF_DECL (callee);
1840 if (decl != NULL_TREE)
1841 {
1842 if (DECL_STATIC_CHAIN (decl))
1843 cfun->machine->has_chain = true;
1844
1845 tree attr = oacc_get_fn_attrib (decl);
1846 if (attr)
1847 {
1848 tree dims = TREE_VALUE (attr);
1849
1850 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
1851 for (int ix = 0; ix != GOMP_DIM_MAX; ix++)
1852 {
1853 if (TREE_PURPOSE (dims)
1854 && !integer_zerop (TREE_PURPOSE (dims)))
1855 break;
1856 /* Not on this axis. */
1857 parallel ^= GOMP_DIM_MASK (ix);
1858 dims = TREE_CHAIN (dims);
1859 }
1860 }
1861 }
1862 }
1863
1864 unsigned nargs = cfun->machine->num_args;
1865 if (cfun->machine->is_varadic)
1866 {
1867 varargs = gen_reg_rtx (Pmode);
1868 emit_move_insn (varargs, stack_pointer_rtx);
1869 }
1870
1871 rtvec vec = rtvec_alloc (nargs + 1);
1872 rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
1873 int vec_pos = 0;
1874
1875 rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
1876 rtx tmp_retval = retval;
1877 if (retval)
1878 {
1879 if (!nvptx_register_operand (retval, GET_MODE (retval)))
1880 tmp_retval = gen_reg_rtx (GET_MODE (retval));
1881 call = gen_rtx_SET (tmp_retval, call);
1882 }
1883 XVECEXP (pat, 0, vec_pos++) = call;
1884
1885 /* Construct the call insn, including a USE for each argument pseudo
1886 register. These will be used when printing the insn. */
1887 for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
1888 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));
1889
1890 if (varargs)
1891 XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);
1892
1893 gcc_assert (vec_pos == XVECLEN (pat, 0));
1894
1895 nvptx_emit_forking (parallel, true);
1896 emit_call_insn (pat);
1897 nvptx_emit_joining (parallel, true);
1898
1899 if (tmp_retval != retval)
1900 emit_move_insn (retval, tmp_retval);
1901 }
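
/* A sketch of the resulting pattern (hand-written, not a compiler dump)
   for a call with two register arguments and a return value:

	(parallel [(set (reg:SI 28)
			(call (mem:QI (symbol_ref "foo")) (const_int 0)))
		   (use (reg:SI 26))
		   (use (reg:SI 27))])

   The USEs keep the argument pseudos live until the insn is output.  */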
1902
1903 /* Emit a comparison COMPARE, and return the new test to be used in the
1904 jump. */
1905
1906 rtx
1907 nvptx_expand_compare (rtx compare)
1908 {
1909 rtx pred = gen_reg_rtx (BImode);
1910 rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
1911 XEXP (compare, 0), XEXP (compare, 1));
1912 emit_insn (gen_rtx_SET (pred, cmp));
1913 return gen_rtx_NE (BImode, pred, const0_rtx);
1914 }
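
/* E.g. a (gt ...) branch condition is rewritten into a BImode predicate
   set plus a test of that predicate, schematically:

	(set (reg:BI 30) (gt:BI (reg:SI 26) (reg:SI 27)))

   with the jump then made on (ne (reg:BI 30) (const_int 0)).  */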
1915
1916 /* Expand the oacc fork & join primitive into ptx-required unspecs. */
1917
1918 void
1919 nvptx_expand_oacc_fork (unsigned mode)
1920 {
1921 nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
1922 }
1923
1924 void
1925 nvptx_expand_oacc_join (unsigned mode)
1926 {
1927 nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
1928 }
1929
1930 /* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
1931 objects. */
1932
1933 static rtx
1934 nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
1935 {
1936 rtx res;
1937
1938 switch (GET_MODE (src))
1939 {
1940 case E_DImode:
1941 res = gen_unpackdisi2 (dst0, dst1, src);
1942 break;
1943 case E_DFmode:
1944 res = gen_unpackdfsi2 (dst0, dst1, src);
1945 break;
1946 default: gcc_unreachable ();
1947 }
1948 return res;
1949 }
1950
1951 /* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
1952 object. */
1953
1954 static rtx
1955 nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
1956 {
1957 rtx res;
1958
1959 switch (GET_MODE (dst))
1960 {
1961 case E_DImode:
1962 res = gen_packsidi2 (dst, src0, src1);
1963 break;
1964 case E_DFmode:
1965 res = gen_packsidf2 (dst, src0, src1);
1966 break;
1967 default: gcc_unreachable ();
1968 }
1969 return res;
1970 }
1971
1972 /* Generate an instruction or sequence to shuffle register SRC into DST
1973 across the lanes of a single warp, using shuffle kind KIND and index IDX. */
1974
1975 rtx
1976 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind)
1977 {
1978 rtx res;
1979
1980 switch (GET_MODE (dst))
1981 {
1982 case E_DCmode:
1983 case E_CDImode:
1984 {
1985 gcc_assert (GET_CODE (dst) == CONCAT);
1986 gcc_assert (GET_CODE (src) == CONCAT);
1987 rtx dst_real = XEXP (dst, 0);
1988 rtx dst_imag = XEXP (dst, 1);
1989 rtx src_real = XEXP (src, 0);
1990 rtx src_imag = XEXP (src, 1);
1991
1992 start_sequence ();
1993 emit_insn (nvptx_gen_shuffle (dst_real, src_real, idx, kind));
1994 emit_insn (nvptx_gen_shuffle (dst_imag, src_imag, idx, kind));
1995 res = get_insns ();
1996 end_sequence ();
1997 }
1998 break;
1999 case E_SImode:
2000 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind));
2001 break;
2002 case E_SFmode:
2003 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind));
2004 break;
2005 case E_DImode:
2006 case E_DFmode:
2007 {
2008 rtx tmp0 = gen_reg_rtx (SImode);
2009 rtx tmp1 = gen_reg_rtx (SImode);
2010
2011 start_sequence ();
2012 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src));
2013 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2014 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2015 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1));
2016 res = get_insns ();
2017 end_sequence ();
2018 }
2019 break;
2020 case E_V2SImode:
2021 {
2022 rtx src0 = gen_rtx_SUBREG (SImode, src, 0);
2023 rtx src1 = gen_rtx_SUBREG (SImode, src, 4);
2024 rtx dst0 = gen_rtx_SUBREG (SImode, dst, 0);
2025 rtx dst1 = gen_rtx_SUBREG (SImode, dst, 4);
2026 rtx tmp0 = gen_reg_rtx (SImode);
2027 rtx tmp1 = gen_reg_rtx (SImode);
2028 start_sequence ();
2029 emit_insn (gen_movsi (tmp0, src0));
2030 emit_insn (gen_movsi (tmp1, src1));
2031 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2032 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2033 emit_insn (gen_movsi (dst0, tmp0));
2034 emit_insn (gen_movsi (dst1, tmp1));
2035 res = get_insns ();
2036 end_sequence ();
2037 }
2038 break;
2039 case E_V2DImode:
2040 {
2041 rtx src0 = gen_rtx_SUBREG (DImode, src, 0);
2042 rtx src1 = gen_rtx_SUBREG (DImode, src, 8);
2043 rtx dst0 = gen_rtx_SUBREG (DImode, dst, 0);
2044 rtx dst1 = gen_rtx_SUBREG (DImode, dst, 8);
2045 rtx tmp0 = gen_reg_rtx (DImode);
2046 rtx tmp1 = gen_reg_rtx (DImode);
2047 start_sequence ();
2048 emit_insn (gen_movdi (tmp0, src0));
2049 emit_insn (gen_movdi (tmp1, src1));
2050 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind));
2051 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind));
2052 emit_insn (gen_movdi (dst0, tmp0));
2053 emit_insn (gen_movdi (dst1, tmp1));
2054 res = get_insns ();
2055 end_sequence ();
2056 }
2057 break;
2058 case E_BImode:
2059 {
2060 rtx tmp = gen_reg_rtx (SImode);
2061
2062 start_sequence ();
2063 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx));
2064 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
2065 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx)));
2066 res = get_insns ();
2067 end_sequence ();
2068 }
2069 break;
2070 case E_QImode:
2071 case E_HImode:
2072 {
2073 rtx tmp = gen_reg_rtx (SImode);
2074
2075 start_sequence ();
2076 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src)));
2077 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind));
2078 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst),
2079 tmp)));
2080 res = get_insns ();
2081 end_sequence ();
2082 }
2083 break;
2084
2085 default:
2086 gcc_unreachable ();
2087 }
2088 return res;
2089 }
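
/* For example, a DImode value cannot be shuffled in one PTX instruction,
   so the sequence built above is, schematically:

	unpack	tmp0, tmp1, src		// split 64 bits into 2 x 32
	shfl	tmp0, tmp0, idx
	shfl	tmp1, tmp1, idx
	pack	dst, tmp0, tmp1		// recombine the halves
*/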
2090
2091 /* Generate an instruction or sequence to broadcast register REG
2092 across the vectors of a single warp. */
2093
2094 static rtx
2095 nvptx_gen_warp_bcast (rtx reg)
2096 {
2097 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX);
2098 }
2099
2100 /* Structure used when generating a worker-level spill or fill. */
2101
2102 struct broadcast_data_t
2103 {
2104 rtx base; /* Register holding base addr of buffer. */
2105 rtx ptr; /* Iteration var, if needed. */
2106 unsigned offset; /* Offset into worker buffer. */
2107 };
2108
2109 /* Direction of the spill/fill and looping setup/teardown indicator. */
2110
2111 enum propagate_mask
2112 {
2113 PM_read = 1 << 0,
2114 PM_write = 1 << 1,
2115 PM_loop_begin = 1 << 2,
2116 PM_loop_end = 1 << 3,
2117
2118 PM_read_write = PM_read | PM_write
2119 };
2120
2121 /* Generate instruction(s) to spill or fill register REG to/from the
2122 worker broadcast array. PM indicates what is to be done, REP
2123 how many loop iterations will be executed (0 for not a loop). */
2124
2125 static rtx
2126 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep,
2127 broadcast_data_t *data, bool vector)
2128 {
2129 rtx res;
2130 machine_mode mode = GET_MODE (reg);
2131
2132 switch (mode)
2133 {
2134 case E_BImode:
2135 {
2136 rtx tmp = gen_reg_rtx (SImode);
2137
2138 start_sequence ();
2139 if (pm & PM_read)
2140 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx));
2141 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector));
2142 if (pm & PM_write)
2143 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx)));
2144 res = get_insns ();
2145 end_sequence ();
2146 }
2147 break;
2148
2149 default:
2150 {
2151 rtx addr = data->ptr;
2152
2153 if (!addr)
2154 {
2155 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT;
2156
2157 oacc_bcast_align = MAX (oacc_bcast_align, align);
2158 data->offset = ROUND_UP (data->offset, align);
2159 addr = data->base;
2160 gcc_assert (data->base != NULL);
2161 if (data->offset)
2162 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset));
2163 }
2164
2165 addr = gen_rtx_MEM (mode, addr);
2166 if (pm == PM_read)
2167 res = gen_rtx_SET (addr, reg);
2168 else if (pm == PM_write)
2169 res = gen_rtx_SET (reg, addr);
2170 else
2171 gcc_unreachable ();
2172
2173 if (data->ptr)
2174 {
2175 /* We're using a ptr, increment it. */
2176 start_sequence ();
2177
2178 emit_insn (res);
2179 emit_insn (gen_adddi3 (data->ptr, data->ptr,
2180 GEN_INT (GET_MODE_SIZE (GET_MODE (reg)))));
2181 res = get_insns ();
2182 end_sequence ();
2183 }
2184 else
2185 rep = 1;
2186 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg));
2187 }
2188 break;
2189 }
2190 return res;
2191 }
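
/* In the non-looping case this boils down to a single spill store
   (PM_read, i.e. read the register) or fill load (PM_write),
   schematically:

	(set (mem:SI (plus:DI (reg base) (const_int offset))) (reg:SI r))

   after which DATA->offset has been advanced by the mode size.  */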
2192
2193 /* Returns true if X is a valid address for use in a memory reference. */
2194
2195 static bool
2196 nvptx_legitimate_address_p (machine_mode, rtx x, bool)
2197 {
2198 enum rtx_code code = GET_CODE (x);
2199
2200 switch (code)
2201 {
2202 case REG:
2203 return true;
2204
2205 case PLUS:
2206 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1)))
2207 return true;
2208 return false;
2209
2210 case CONST:
2211 case SYMBOL_REF:
2212 case LABEL_REF:
2213 return true;
2214
2215 default:
2216 return false;
2217 }
2218 }
2219
2220 /* Machinery to output constant initializers. When beginning an
2221 initializer, we decide on a fragment size (which is visible in ptx
2222 in the type used), and then all initializer data is buffered until
2223 a fragment is filled and ready to be written out. */
2224
2225 static struct
2226 {
2227 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */
2228 unsigned HOST_WIDE_INT val; /* Current fragment value. */
2229 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written
2230 out. */
2231 unsigned size; /* Fragment size to accumulate. */
2232 unsigned offset; /* Offset within current fragment. */
2233 bool started; /* Whether we've output any initializer. */
2234 } init_frag;
2235
2236 /* The current fragment is full, write it out. SYM may provide a
2237 symbolic reference we should output, in which case the fragment
2238 value is the addend. */
2239
2240 static void
2241 output_init_frag (rtx sym)
2242 {
2243 fprintf (asm_out_file, init_frag.started ? ", " : " = { ");
2244 unsigned HOST_WIDE_INT val = init_frag.val;
2245
2246 init_frag.started = true;
2247 init_frag.val = 0;
2248 init_frag.offset = 0;
2249 init_frag.remaining--;
2250
2251 if (sym)
2252 {
2253 bool function = (SYMBOL_REF_DECL (sym)
2254 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL));
2255 if (!function)
2256 fprintf (asm_out_file, "generic(");
2257 output_address (VOIDmode, sym);
2258 if (!function)
2259 fprintf (asm_out_file, ")");
2260 if (val)
2261 fprintf (asm_out_file, " + ");
2262 }
2263
2264 if (!sym || val)
2265 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val);
2266 }
2267
2268 /* Add value VAL of size SIZE to the data we're emitting, and keep
2269 writing out chunks as they fill up. */
2270
2271 static void
2272 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size)
2273 {
2274 bool negative_p
2275 = val & (HOST_WIDE_INT_1U << (HOST_BITS_PER_WIDE_INT - 1));
2276
2277 /* Avoid undefined behaviour. */
2278 if (size * BITS_PER_UNIT < HOST_BITS_PER_WIDE_INT)
2279 val &= (HOST_WIDE_INT_1U << (size * BITS_PER_UNIT)) - 1;
2280
2281 for (unsigned part = 0; size; size -= part)
2282 {
2283 if (part * BITS_PER_UNIT == HOST_BITS_PER_WIDE_INT)
2284 /* Avoid undefined behaviour. */
2285 val = negative_p ? -1 : 0;
2286 else
2287 val >>= (part * BITS_PER_UNIT);
2288 part = init_frag.size - init_frag.offset;
2289 part = MIN (part, size);
2290
2291 unsigned HOST_WIDE_INT partial
2292 = val << (init_frag.offset * BITS_PER_UNIT);
2293 init_frag.val |= partial & init_frag.mask;
2294 init_frag.offset += part;
2295
2296 if (init_frag.offset == init_frag.size)
2297 output_init_frag (NULL);
2298 }
2299 }
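
/* A worked example, assuming a fragment size of 4: emitting the HImode
   values 0x1234 then 0x5678 first leaves val = 0x1234 at offset 2, then
   ORs in 0x5678 << 16 giving 0x56781234; the now-full fragment is flushed
   and printed as the decimal initializer element 1450709556.  */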
2300
2301 /* Target hook for assembling integer object X of size SIZE. */
2302
2303 static bool
2304 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p))
2305 {
2306 HOST_WIDE_INT val = 0;
2307
2308 switch (GET_CODE (x))
2309 {
2310 default:
2311 /* Let the generic machinery figure it out, usually for a
2312 CONST_WIDE_INT. */
2313 return false;
2314
2315 case CONST_INT:
2316 nvptx_assemble_value (INTVAL (x), size);
2317 break;
2318
2319 case CONST:
2320 x = XEXP (x, 0);
2321 gcc_assert (GET_CODE (x) == PLUS);
2322 val = INTVAL (XEXP (x, 1));
2323 x = XEXP (x, 0);
2324 gcc_assert (GET_CODE (x) == SYMBOL_REF);
2325 gcc_fallthrough (); /* FALLTHROUGH */
2326
2327 case SYMBOL_REF:
2328 gcc_assert (size == init_frag.size);
2329 if (init_frag.offset)
2330 sorry ("cannot emit unaligned pointers in ptx assembly");
2331
2332 nvptx_maybe_record_fnsym (x);
2333 init_frag.val = val;
2334 output_init_frag (x);
2335 break;
2336 }
2337
2338 return true;
2339 }
2340
2341 /* Output SIZE zero bytes. We ignore the FILE argument since the
2342 functions we're calling to perform the output just use
2343 asm_out_file. */
2344
2345 void
2346 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size)
2347 {
2348 /* Finish the current fragment, if it's started. */
2349 if (init_frag.offset)
2350 {
2351 unsigned part = init_frag.size - init_frag.offset;
2352 part = MIN (part, (unsigned)size);
2353 size -= part;
2354 nvptx_assemble_value (0, part);
2355 }
2356
2357 /* If this skip doesn't terminate the initializer, write as many
2358 remaining pieces as possible directly. */
2359 if (size < init_frag.remaining * init_frag.size)
2360 {
2361 while (size >= init_frag.size)
2362 {
2363 size -= init_frag.size;
2364 output_init_frag (NULL_RTX);
2365 }
2366 if (size)
2367 nvptx_assemble_value (0, size);
2368 }
2369 }
2370
2371 /* Output a string STR with length SIZE. As in nvptx_output_skip we
2372 ignore the FILE arg. */
2373
2374 void
2375 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size)
2376 {
2377 for (unsigned HOST_WIDE_INT i = 0; i < size; i++)
2378 nvptx_assemble_value (str[i], 1);
2379 }
2380
2381 /* Return true if TYPE is a record type where the last field is an array without
2382 a given dimension, i.e. a flexible array member. */
2383
2384 static bool
2385 flexible_array_member_type_p (const_tree type)
2386 {
2387 if (TREE_CODE (type) != RECORD_TYPE)
2388 return false;
2389
2390 const_tree last_field = NULL_TREE;
2391 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f))
2392 last_field = f;
2393
2394 if (!last_field)
2395 return false;
2396
2397 const_tree last_field_type = TREE_TYPE (last_field);
2398 if (TREE_CODE (last_field_type) != ARRAY_TYPE)
2399 return false;
2400
2401 return (! TYPE_DOMAIN (last_field_type)
2402 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type)));
2403 }
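
/* For example, this matches the C type

	struct s { int len; char data[]; };

   whose last field is an array type without an upper bound.  */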
2404
2405 /* Emit a PTX variable decl and prepare for emission of its
2406 initializer. NAME is the symbol name and SECTION the PTX data
2407 area. The type is TYPE, object size SIZE and alignment is ALIGN.
2408 The caller has already emitted any indentation and linkage
2409 specifier. It is responsible for any initializer, terminating ;
2410 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly
2411 this is the opposite way round from how PTX wants them! */
2412
2413 static void
2414 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section,
2415 const_tree type, HOST_WIDE_INT size, unsigned align,
2416 bool undefined = false)
2417 {
2418 bool atype = (TREE_CODE (type) == ARRAY_TYPE)
2419 && (TYPE_DOMAIN (type) == NULL_TREE);
2420
2421 if (undefined && flexible_array_member_type_p (type))
2422 {
2423 size = 0;
2424 atype = true;
2425 }
2426
2427 while (TREE_CODE (type) == ARRAY_TYPE)
2428 type = TREE_TYPE (type);
2429
2430 if (TREE_CODE (type) == VECTOR_TYPE
2431 || TREE_CODE (type) == COMPLEX_TYPE)
2432 /* Neither vector nor complex types can contain the other. */
2433 type = TREE_TYPE (type);
2434
2435 unsigned HOST_WIDE_INT elt_size = int_size_in_bytes (type);
2436
2437 /* Largest mode we're prepared to accept. For BLKmode types we
2438 don't know if it'll contain pointer constants, so have to choose
2439 pointer size, otherwise we can choose DImode. */
2440 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode;
2441
2442 elt_size |= GET_MODE_SIZE (elt_mode);
2443 elt_size &= -elt_size; /* Extract LSB set. */
2444
2445 init_frag.size = elt_size;
2446 /* Avoid undefined shift behavior by using '2'. */
2447 init_frag.mask = ((unsigned HOST_WIDE_INT)2
2448 << (elt_size * BITS_PER_UNIT - 1)) - 1;
2449 init_frag.val = 0;
2450 init_frag.offset = 0;
2451 init_frag.started = false;
2452 /* Size might not be a multiple of elt size, if there's an
2453 initialized trailing struct array with smaller type than
2454 elt_size. */
2455 init_frag.remaining = (size + elt_size - 1) / elt_size;
2456
2457 fprintf (file, "%s .align %d .u" HOST_WIDE_INT_PRINT_UNSIGNED " ",
2458 section, align / BITS_PER_UNIT,
2459 elt_size * BITS_PER_UNIT);
2460 assemble_name (file, name);
2461
2462 if (size)
2463 /* We make everything an array, to simplify any initialization
2464 emission. */
2465 fprintf (file, "[" HOST_WIDE_INT_PRINT_UNSIGNED "]", init_frag.remaining);
2466 else if (atype)
2467 fprintf (file, "[]");
2468 }
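
/* E.g. for "int arr[5]" in the global data area this starts the line

	.global .align 4 .u32 arr[5]

   (a sketch; the caller supplies linkage, the initializer and the final
   ";").  Here elt_size is 4, so the initializer will be written as five
   .u32 fragments.  */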
2469
2470 /* Called when the initializer for a decl has been completely output through
2471 combinations of the three functions above. */
2472
2473 static void
2474 nvptx_assemble_decl_end (void)
2475 {
2476 if (init_frag.offset)
2477 /* This can happen with a packed struct with trailing array member. */
2478 nvptx_assemble_value (0, init_frag.size - init_frag.offset);
2479 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n");
2480 }
2481
2482 /* Output an uninitialized common or file-scope variable. */
2483
2484 void
2485 nvptx_output_aligned_decl (FILE *file, const char *name,
2486 const_tree decl, HOST_WIDE_INT size, unsigned align)
2487 {
2488 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2489
2490 /* If this is public, it is common. The nearest thing we have to
2491 common is weak. */
2492 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : "");
2493
2494 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2495 TREE_TYPE (decl), size, align);
2496 nvptx_assemble_decl_end ();
2497 }
2498
2499 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of
2500 writing a constant variable EXP with NAME and SIZE and its
2501 initializer to FILE. */
2502
2503 static void
2504 nvptx_asm_declare_constant_name (FILE *file, const char *name,
2505 const_tree exp, HOST_WIDE_INT obj_size)
2506 {
2507 write_var_marker (file, true, false, name);
2508
2509 fprintf (file, "\t");
2510
2511 tree type = TREE_TYPE (exp);
2512 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size,
2513 TYPE_ALIGN (type));
2514 }
2515
2516 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing
2517 a variable DECL with NAME to FILE. */
2518
2519 void
2520 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl)
2521 {
2522 write_var_marker (file, true, TREE_PUBLIC (decl), name);
2523
2524 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? ""
2525 : DECL_WEAK (decl) ? ".weak " : ".visible "));
2526
2527 tree type = TREE_TYPE (decl);
2528 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl));
2529 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2530 type, obj_size, DECL_ALIGN (decl));
2531 }
2532
2533 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */
2534
2535 static void
2536 nvptx_globalize_label (FILE *, const char *)
2537 {
2538 }
2539
2540 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern
2541 declaration only for variable DECL with NAME to FILE. */
2542
2543 static void
2544 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl)
2545 {
2546 /* The middle end can place constant pool decls into the varpool as
2547 undefined. Until that is fixed, catch the problem here. */
2548 if (DECL_IN_CONSTANT_POOL (decl))
2549 return;
2550
2551 /* We support weak definitions, and hence have the right
2552 ASM_WEAKEN_DECL definition. Diagnose the problem here. */
2553 if (DECL_WEAK (decl))
2554 error_at (DECL_SOURCE_LOCATION (decl),
2555 "PTX does not support weak declarations"
2556 " (only weak definitions)");
2557 write_var_marker (file, false, TREE_PUBLIC (decl), name);
2558
2559 fprintf (file, "\t.extern ");
2560 tree size = DECL_SIZE_UNIT (decl);
2561 nvptx_assemble_decl_begin (file, name, section_for_decl (decl),
2562 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0,
2563 DECL_ALIGN (decl), true);
2564 nvptx_assemble_decl_end ();
2565 }
2566
2567 /* Output a pattern for a move instruction. */
2568
2569 const char *
2570 nvptx_output_mov_insn (rtx dst, rtx src)
2571 {
2572 machine_mode dst_mode = GET_MODE (dst);
2573 machine_mode src_mode = GET_MODE (src);
2574 machine_mode dst_inner = (GET_CODE (dst) == SUBREG
2575 ? GET_MODE (XEXP (dst, 0)) : dst_mode);
2576 machine_mode src_inner = (GET_CODE (src) == SUBREG
2577 ? GET_MODE (XEXP (src, 0)) : src_mode);
2578
2579 rtx sym = src;
2580 if (GET_CODE (sym) == CONST)
2581 sym = XEXP (XEXP (sym, 0), 0);
2582 if (SYMBOL_REF_P (sym))
2583 {
2584 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC)
2585 return "%.\tcvta%D1%t0\t%0, %1;";
2586 nvptx_maybe_record_fnsym (sym);
2587 }
2588
2589 if (src_inner == dst_inner)
2590 return "%.\tmov%t0\t%0, %1;";
2591
2592 if (CONSTANT_P (src))
2593 return (GET_MODE_CLASS (dst_inner) == MODE_INT
2594 && GET_MODE_CLASS (src_inner) != MODE_FLOAT
2595 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;");
2596
2597 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner))
2598 {
2599 if (GET_MODE_BITSIZE (dst_mode) == 128
2600 && GET_MODE_BITSIZE (src_mode) == 128)
2601 {
2602 /* mov.b128 is not supported. */
2603 if (dst_inner == V2DImode && src_inner == TImode)
2604 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;";
2605 else if (dst_inner == TImode && src_inner == V2DImode)
2606 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;";
2607
2608 gcc_unreachable ();
2609 }
2610 return "%.\tmov.b%T0\t%0, %1;";
2611 }
2612
2613 if (GET_MODE_BITSIZE (src_inner) == 128
2614 && GET_MODE_BITSIZE (src_mode) == 64)
2615 return "%.\tmov.b%T0\t%0, %1;";
2616
2617 return "%.\tcvt%t0%t1\t%0, %1;";
2618 }
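
/* Some illustrative expansions of the templates above (placeholder
   register numbers):

	mov.u32	%r23, %r24;		// same inner mode
	mov.b32	%r23, %r24;		// same size, different class
	cvt.u64.u32	%r25, %r23;	// size-changing move
*/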
2619
2620 /* Output a pre/post barrier for MEM_OPERAND according to MEMMODEL. */
2621
2622 static void
2623 nvptx_output_barrier (rtx *mem_operand, int memmodel, bool pre_p)
2624 {
2625 bool post_p = !pre_p;
2626
2627 switch (memmodel)
2628 {
2629 case MEMMODEL_RELAXED:
2630 return;
2631 case MEMMODEL_CONSUME:
2632 case MEMMODEL_ACQUIRE:
2633 case MEMMODEL_SYNC_ACQUIRE:
2634 if (post_p)
2635 break;
2636 return;
2637 case MEMMODEL_RELEASE:
2638 case MEMMODEL_SYNC_RELEASE:
2639 if (pre_p)
2640 break;
2641 return;
2642 case MEMMODEL_ACQ_REL:
2643 case MEMMODEL_SEQ_CST:
2644 case MEMMODEL_SYNC_SEQ_CST:
2645 if (pre_p || post_p)
2646 break;
2647 return;
2648 default:
2649 gcc_unreachable ();
2650 }
2651
2652 output_asm_insn ("%.\tmembar%B0;", mem_operand);
2653 }
2654
2655 const char *
2656 nvptx_output_atomic_insn (const char *asm_template, rtx *operands, int mem_pos,
2657 int memmodel_pos)
2658 {
2659 nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2660 true);
2661 output_asm_insn (asm_template, operands);
2662 nvptx_output_barrier (&operands[mem_pos], INTVAL (operands[memmodel_pos]),
2663 false);
2664 return "";
2665 }
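
/* So for a SEQ_CST atomic add the output is bracketed by barriers,
   schematically:

	membar.sys;
	atom.add.u32	%r26, [%r25], %r27;
	membar.sys;

   while a RELAXED one emits just the atom instruction.  */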
2666
2667 static void nvptx_print_operand (FILE *, rtx, int);
2668
2669 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this
2670 involves writing .param declarations and in/out copies into them. For
2671 indirect calls, also write the .callprototype. */
2672
2673 const char *
2674 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee)
2675 {
2676 char buf[16];
2677 static int labelno;
2678 bool needs_tgt = register_operand (callee, Pmode);
2679 rtx pat = PATTERN (insn);
2680 if (GET_CODE (pat) == COND_EXEC)
2681 pat = COND_EXEC_CODE (pat);
2682 int arg_end = XVECLEN (pat, 0);
2683 tree decl = NULL_TREE;
2684
2685 fprintf (asm_out_file, "\t{\n");
2686 if (result != NULL)
2687 fprintf (asm_out_file, "\t\t.param%s %s_in;\n",
2688 nvptx_ptx_type_from_mode (GET_MODE (result), false),
2689 reg_names[NVPTX_RETURN_REGNUM]);
2690
2691 /* Ensure we have a ptx declaration in the output if necessary. */
2692 if (GET_CODE (callee) == SYMBOL_REF)
2693 {
2694 decl = SYMBOL_REF_DECL (callee);
2695 if (!decl
2696 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl))))
2697 nvptx_record_libfunc (callee, result, pat);
2698 else if (DECL_EXTERNAL (decl))
2699 nvptx_record_fndecl (decl);
2700 }
2701
2702 if (needs_tgt)
2703 {
2704 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno);
2705 labelno++;
2706 ASM_OUTPUT_LABEL (asm_out_file, buf);
2707 std::stringstream s;
2708 write_fn_proto_from_insn (s, NULL, result, pat);
2709 fputs (s.str().c_str(), asm_out_file);
2710 }
2711
2712 for (int argno = 1; argno < arg_end; argno++)
2713 {
2714 rtx t = XEXP (XVECEXP (pat, 0, argno), 0);
2715 machine_mode mode = GET_MODE (t);
2716 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
2717
2718 /* Mode splitting has already been done. */
2719 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n"
2720 "\t\tst.param%s [%%out_arg%d], ",
2721 ptx_type, argno, ptx_type, argno);
2722 output_reg (asm_out_file, REGNO (t), VOIDmode);
2723 fprintf (asm_out_file, ";\n");
2724 }
2725
2726 /* The '.' stands for the call's predicate, if any. */
2727 nvptx_print_operand (asm_out_file, NULL_RTX, '.');
2728 fprintf (asm_out_file, "\t\tcall ");
2729 if (result != NULL_RTX)
2730 fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);
2731
2732 if (decl)
2733 {
2734 char *replaced_dots = NULL;
2735 const char *name = get_fnname_from_decl (decl);
2736 const char *replacement = nvptx_name_replacement (name);
2737 if (replacement != name)
2738 name = replacement;
2739 else
2740 {
2741 replaced_dots = nvptx_replace_dot (name);
2742 if (replaced_dots)
2743 name = replaced_dots;
2744 }
2745 assemble_name (asm_out_file, name);
2746 if (replaced_dots)
2747 XDELETE (replaced_dots);
2748 }
2749 else
2750 output_address (VOIDmode, callee);
2751
2752 const char *open = "(";
2753 for (int argno = 1; argno < arg_end; argno++)
2754 {
2755 fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
2756 open = "";
2757 }
2758 if (decl && DECL_STATIC_CHAIN (decl))
2759 {
2760 fprintf (asm_out_file, ", %s%s", open, reg_names [STATIC_CHAIN_REGNUM]);
2761 open = "";
2762 }
2763 if (!open[0])
2764 fprintf (asm_out_file, ")");
2765
2766 if (needs_tgt)
2767 {
2768 fprintf (asm_out_file, ", ");
2769 assemble_name (asm_out_file, buf);
2770 }
2771 fprintf (asm_out_file, ";\n");
2772
2773 if (find_reg_note (insn, REG_NORETURN, NULL))
2774 {
2775 /* Noreturn functions confuse the PTX JIT, as it doesn't realize
2776 the flow control barrier they imply. It can seg fault if it
2777 encounters what looks like an unexitable loop. Emit a trailing
2778 trap and exit, which it does grok. */
2779 fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
2780 fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
2781 }
2782
2783 if (result)
2784 {
2785 static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];
2786
2787 if (!rval[0])
2788 /* We must escape the '%' that starts RETURN_REGNUM. */
2789 sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
2790 reg_names[NVPTX_RETURN_REGNUM]);
2791 return rval;
2792 }
2793
2794 return "}";
2795 }
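
/* Putting it together, a direct call with one argument and a return
   value comes out roughly as (placeholder register numbers):

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r26;
		call (%value_in), foo, (%out_arg1);
		ld.param.u32	%r27, [%value_in];
	}
*/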
2796
2797 /* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P. */
2798
2799 static bool
2800 nvptx_print_operand_punct_valid_p (unsigned char c)
2801 {
2802 return c == '.' || c == '#';
2803 }
2804
2805 /* Subroutine of nvptx_print_operand; used to print a memory reference X to FILE. */
2806
2807 static void
2808 nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
2809 {
2810 rtx off;
2811 if (GET_CODE (x) == CONST)
2812 x = XEXP (x, 0);
2813 switch (GET_CODE (x))
2814 {
2815 case PLUS:
2816 off = XEXP (x, 1);
2817 output_address (VOIDmode, XEXP (x, 0));
2818 fprintf (file, "+");
2819 output_address (VOIDmode, off);
2820 break;
2821
2822 case SYMBOL_REF:
2823 case LABEL_REF:
2824 output_addr_const (file, x);
2825 break;
2826
2827 default:
2828 gcc_assert (GET_CODE (x) != MEM);
2829 nvptx_print_operand (file, x, 0);
2830 break;
2831 }
2832 }
2833
2834 /* Write assembly language output for the address ADDR to FILE. */
2835
2836 static void
2837 nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
2838 {
2839 nvptx_print_address_operand (file, addr, mode);
2840 }
2841
2842 static nvptx_data_area
2843 nvptx_mem_data_area (const_rtx x)
2844 {
2845 gcc_assert (GET_CODE (x) == MEM);
2846
2847 const_rtx addr = XEXP (x, 0);
2848 subrtx_iterator::array_type array;
2849 FOR_EACH_SUBRTX (iter, array, addr, ALL)
2850 if (SYMBOL_REF_P (*iter))
2851 return SYMBOL_DATA_AREA (*iter);
2852
2853 return DATA_AREA_GENERIC;
2854 }
2855
2856 bool
2857 nvptx_mem_maybe_shared_p (const_rtx x)
2858 {
2859 nvptx_data_area area = nvptx_mem_data_area (x);
2860 return area == DATA_AREA_SHARED || area == DATA_AREA_GENERIC;
2861 }
2862
2863 /* Print an operand, X, to FILE, with an optional modifier in CODE.
2864
2865 Meaning of CODE:
2866 . -- print the predicate for the instruction or an empty string for an
2867 unconditional one.
2868 # -- print a rounding mode for the instruction
2869
2870 A -- print a data area for a MEM
2871 c -- print an opcode suffix for a comparison operator, including a type code
2872 D -- print a data area for a MEM operand
2873 S -- print a shuffle kind specified by CONST_INT
2874 t -- print a type opcode suffix, promoting QImode to 32 bits
2875 T -- print a type size in bits
2876 u -- print a type opcode suffix without promotions.
2877 x -- print a destination operand that may also be a bit bucket. */
2878
2879 static void
2880 nvptx_print_operand (FILE *file, rtx x, int code)
2881 {
2882 if (code == '.')
2883 {
2884 x = current_insn_predicate;
2885 if (x)
2886 {
2887 fputs ("@", file);
2888 if (GET_CODE (x) == EQ)
2889 fputs ("!", file);
2890 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode);
2891 }
2892 return;
2893 }
2894 else if (code == '#')
2895 {
2896 fputs (".rn", file);
2897 return;
2898 }
2899
2900 enum rtx_code x_code = GET_CODE (x);
2901 machine_mode mode = GET_MODE (x);
2902
2903 switch (code)
2904 {
2905 case 'x':
2906 if (current_output_insn != NULL
2907 && find_reg_note (current_output_insn, REG_UNUSED, x) != NULL_RTX)
2908 {
2909 fputs ("_", file);
2910 return;
2911 }
2912 goto common;
2913 case 'B':
2914 if (SYMBOL_REF_P (XEXP (x, 0)))
2915 switch (SYMBOL_DATA_AREA (XEXP (x, 0)))
2916 {
2917 case DATA_AREA_GENERIC:
2918 /* Assume worst-case: global. */
2919 gcc_fallthrough (); /* FALLTHROUGH. */
2920 case DATA_AREA_GLOBAL:
2921 break;
2922 case DATA_AREA_SHARED:
2923 fputs (".cta", file);
2924 return;
2925 case DATA_AREA_LOCAL:
2926 case DATA_AREA_CONST:
2927 case DATA_AREA_PARAM:
2928 default:
2929 gcc_unreachable ();
2930 }
2931
2932 /* There are 2 cases where membar.sys differs from membar.gl:
2933 - host accesses global memory (f.i. systemwide atomics)
2934 - 2 or more devices are setup in peer-to-peer mode, and one
2935 peer can access global memory of other peer.
2936 Neither is currently supported by OpenMP/OpenACC on nvptx, but
2937 that could change, so we default to membar.sys. We could support
2938 this more optimally by adding DATA_AREA_SYS and then emitting
2939 .gl for DATA_AREA_GLOBAL and .sys for DATA_AREA_SYS. */
2940 fputs (".sys", file);
2941 return;
2942
2943 case 'A':
2944 x = XEXP (x, 0);
2945 gcc_fallthrough (); /* FALLTHROUGH. */
2946
2947 case 'D':
2948 if (GET_CODE (x) == CONST)
2949 x = XEXP (x, 0);
2950 if (GET_CODE (x) == PLUS)
2951 x = XEXP (x, 0);
2952
2953 if (GET_CODE (x) == SYMBOL_REF)
2954 fputs (section_for_sym (x), file);
2955 break;
2956
2957 case 't':
2958 case 'u':
2959 if (x_code == SUBREG)
2960 {
2961 machine_mode inner_mode = GET_MODE (SUBREG_REG (x));
2962 if (VECTOR_MODE_P (inner_mode)
2963 && (GET_MODE_SIZE (mode)
2964 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
2965 mode = GET_MODE_INNER (inner_mode);
2966 else if (split_mode_p (inner_mode))
2967 mode = maybe_split_mode (inner_mode);
2968 else
2969 mode = inner_mode;
2970 }
2971 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't'));
2972 break;
2973
2974 case 'H':
2975 case 'L':
2976 {
2977 rtx inner_x = SUBREG_REG (x);
2978 machine_mode inner_mode = GET_MODE (inner_x);
2979 machine_mode split = maybe_split_mode (inner_mode);
2980
2981 output_reg (file, REGNO (inner_x), split,
2982 (code == 'H'
2983 ? GET_MODE_SIZE (inner_mode) / 2
2984 : 0));
2985 }
2986 break;
2987
2988 case 'S':
2989 {
2990 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x);
2991 /* Same order as nvptx_shuffle_kind. */
2992 static const char *const kinds[] =
2993 {".up", ".down", ".bfly", ".idx"};
2994 fputs (kinds[kind], file);
2995 }
2996 break;
2997
2998 case 'T':
2999 fprintf (file, "%d", GET_MODE_BITSIZE (mode));
3000 break;
3001
3002 case 'j':
3003 fprintf (file, "@");
3004 goto common;
3005
3006 case 'J':
3007 fprintf (file, "@!");
3008 goto common;
3009
3010 case 'c':
3011 mode = GET_MODE (XEXP (x, 0));
3012 switch (x_code)
3013 {
3014 case EQ:
3015 fputs (".eq", file);
3016 break;
3017 case NE:
3018 if (FLOAT_MODE_P (mode))
3019 fputs (".neu", file);
3020 else
3021 fputs (".ne", file);
3022 break;
3023 case LE:
3024 case LEU:
3025 fputs (".le", file);
3026 break;
3027 case GE:
3028 case GEU:
3029 fputs (".ge", file);
3030 break;
3031 case LT:
3032 case LTU:
3033 fputs (".lt", file);
3034 break;
3035 case GT:
3036 case GTU:
3037 fputs (".gt", file);
3038 break;
3039 case LTGT:
3040 fputs (".ne", file);
3041 break;
3042 case UNEQ:
3043 fputs (".equ", file);
3044 break;
3045 case UNLE:
3046 fputs (".leu", file);
3047 break;
3048 case UNGE:
3049 fputs (".geu", file);
3050 break;
3051 case UNLT:
3052 fputs (".ltu", file);
3053 break;
3054 case UNGT:
3055 fputs (".gtu", file);
3056 break;
3057 case UNORDERED:
3058 fputs (".nan", file);
3059 break;
3060 case ORDERED:
3061 fputs (".num", file);
3062 break;
3063 default:
3064 gcc_unreachable ();
3065 }
3066 if (FLOAT_MODE_P (mode)
3067 || x_code == EQ || x_code == NE
3068 || x_code == GEU || x_code == GTU
3069 || x_code == LEU || x_code == LTU)
3070 fputs (nvptx_ptx_type_from_mode (mode, true), file);
3071 else
3072 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode));
3073 break;
3074 default:
3075 common:
3076 switch (x_code)
3077 {
3078 case SUBREG:
3079 {
3080 rtx inner_x = SUBREG_REG (x);
3081 machine_mode inner_mode = GET_MODE (inner_x);
3082 machine_mode split = maybe_split_mode (inner_mode);
3083
3084 if (VECTOR_MODE_P (inner_mode)
3085 && (GET_MODE_SIZE (mode)
3086 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode))))
3087 {
3088 output_reg (file, REGNO (inner_x), VOIDmode);
3089 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y");
3090 }
3091 else if (split_mode_p (inner_mode)
3092 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode)))
3093 output_reg (file, REGNO (inner_x), split);
3094 else
3095 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x));
3096 }
3097 break;
3098
3099 case REG:
3100 output_reg (file, REGNO (x), maybe_split_mode (mode));
3101 break;
3102
3103 case MEM:
3104 fputc ('[', file);
3105 nvptx_print_address_operand (file, XEXP (x, 0), mode);
3106 fputc (']', file);
3107 break;
3108
3109 case CONST_INT:
3110 output_addr_const (file, x);
3111 break;
3112
3113 case CONST:
3114 case SYMBOL_REF:
3115 case LABEL_REF:
3116 /* We could use output_addr_const, but that can print things like
3117 "x-8", which breaks ptxas. Need to ensure it is output as
3118 "x+-8". */
3119 nvptx_print_address_operand (file, x, VOIDmode);
3120 break;
3121
3122 case CONST_DOUBLE:
3123 long vals[2];
3124 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode);
3125 vals[0] &= 0xffffffff;
3126 vals[1] &= 0xffffffff;
3127 if (mode == SFmode)
3128 fprintf (file, "0f%08lx", vals[0]);
3129 else
3130 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]);
3131 break;
3132
3133 case CONST_VECTOR:
3134 {
3135 unsigned n = CONST_VECTOR_NUNITS (x);
3136 fprintf (file, "{ ");
3137 for (unsigned i = 0; i < n; ++i)
3138 {
3139 if (i != 0)
3140 fprintf (file, ", ");
3141
3142 rtx elem = CONST_VECTOR_ELT (x, i);
3143 output_addr_const (file, elem);
3144 }
3145 fprintf (file, " }");
3146 }
3147 break;
3148
3149 default:
3150 output_addr_const (file, x);
3151 }
3152 }
3153 }
3154
3155 /* Record replacement regs used to deal with subreg operands. */
3156 struct reg_replace
3157 {
3158 rtx replacement[MAX_RECOG_OPERANDS];
3159 machine_mode mode;
3160 int n_allocated;
3161 int n_in_use;
3162 };
3163
3164 /* Allocate or reuse a replacement in R and return the rtx. */
3165
3166 static rtx
3167 get_replacement (struct reg_replace *r)
3168 {
3169 if (r->n_allocated == r->n_in_use)
3170 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode);
3171 return r->replacement[r->n_in_use++];
3172 }
3173
3174 /* Clean up subreg operands. In ptx assembly, everything is typed, and
3175 the presence of subregs would break the rules for most instructions.
3176 Replace them with a suitable new register of the right size, plus
3177 conversion copyin/copyout instructions. */
3178
3179 static void
3180 nvptx_reorg_subreg (void)
3181 {
3182 struct reg_replace qiregs, hiregs, siregs, diregs;
3183 rtx_insn *insn, *next;
3184
3185 qiregs.n_allocated = 0;
3186 hiregs.n_allocated = 0;
3187 siregs.n_allocated = 0;
3188 diregs.n_allocated = 0;
3189 qiregs.mode = QImode;
3190 hiregs.mode = HImode;
3191 siregs.mode = SImode;
3192 diregs.mode = DImode;
3193
3194 for (insn = get_insns (); insn; insn = next)
3195 {
3196 next = NEXT_INSN (insn);
3197 if (!NONDEBUG_INSN_P (insn)
3198 || asm_noperands (PATTERN (insn)) >= 0
3199 || GET_CODE (PATTERN (insn)) == USE
3200 || GET_CODE (PATTERN (insn)) == CLOBBER)
3201 continue;
3202
3203 qiregs.n_in_use = 0;
3204 hiregs.n_in_use = 0;
3205 siregs.n_in_use = 0;
3206 diregs.n_in_use = 0;
3207 extract_insn (insn);
3208 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn);
3209
3210 for (int i = 0; i < recog_data.n_operands; i++)
3211 {
3212 rtx op = recog_data.operand[i];
3213 if (GET_CODE (op) != SUBREG)
3214 continue;
3215
3216 rtx inner = SUBREG_REG (op);
3217
3218 machine_mode outer_mode = GET_MODE (op);
3219 machine_mode inner_mode = GET_MODE (inner);
3220 gcc_assert (s_ok);
3221 if (s_ok
3222 && (GET_MODE_PRECISION (inner_mode)
3223 >= GET_MODE_PRECISION (outer_mode)))
3224 continue;
3225 gcc_assert (SCALAR_INT_MODE_P (outer_mode));
3226 struct reg_replace *r = (outer_mode == QImode ? &qiregs
3227 : outer_mode == HImode ? &hiregs
3228 : outer_mode == SImode ? &siregs
3229 : &diregs);
3230 rtx new_reg = get_replacement (r);
3231
3232 if (recog_data.operand_type[i] != OP_OUT)
3233 {
3234 enum rtx_code code;
3235 if (GET_MODE_PRECISION (inner_mode)
3236 < GET_MODE_PRECISION (outer_mode))
3237 code = ZERO_EXTEND;
3238 else
3239 code = TRUNCATE;
3240
3241 rtx pat = gen_rtx_SET (new_reg,
3242 gen_rtx_fmt_e (code, outer_mode, inner));
3243 emit_insn_before (pat, insn);
3244 }
3245
3246 if (recog_data.operand_type[i] != OP_IN)
3247 {
3248 enum rtx_code code;
3249 if (GET_MODE_PRECISION (inner_mode)
3250 < GET_MODE_PRECISION (outer_mode))
3251 code = TRUNCATE;
3252 else
3253 code = ZERO_EXTEND;
3254
3255 rtx pat = gen_rtx_SET (inner,
3256 gen_rtx_fmt_e (code, inner_mode, new_reg));
3257 emit_insn_after (pat, insn);
3258 }
3259 validate_change (insn, recog_data.operand_loc[i], new_reg, false);
3260 }
3261 }
3262 }
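
/* E.g. a paradoxical input operand (subreg:SI (reg:QI 23) 0) is replaced
   by a fresh SImode pseudo, with the copy-in

	(set (reg:SI 99) (zero_extend:SI (reg:QI 23)))

   emitted before the insn and (reg:SI 99) substituted into the pattern
   (a hand-written sketch, not compiler output).  */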
3263
3264 /* Return a SImode "master lane index" register for uniform-simt, allocating on
3265 first use. */
3266
3267 static rtx
3268 nvptx_get_unisimt_master ()
3269 {
3270 rtx &master = cfun->machine->unisimt_master;
3271 return master ? master : master = gen_reg_rtx (SImode);
3272 }
3273
3274 /* Return a BImode "predicate" register for uniform-simt, similar to above. */
3275
3276 static rtx
3277 nvptx_get_unisimt_predicate ()
3278 {
3279 rtx &pred = cfun->machine->unisimt_predicate;
3280 return pred ? pred : pred = gen_reg_rtx (BImode);
3281 }
3282
3283 static rtx
3284 nvptx_get_unisimt_outside_simt_predicate ()
3285 {
3286 rtx &pred = cfun->machine->unisimt_outside_simt_predicate;
3287 return pred ? pred : pred = gen_reg_rtx (BImode);
3288 }
3289
3290 /* Return true if given call insn references one of the functions provided by
3291 the CUDA runtime: malloc, free, vprintf. */
3292
3293 static bool
3294 nvptx_call_insn_is_syscall_p (rtx_insn *insn)
3295 {
3296 rtx pat = PATTERN (insn);
3297 gcc_checking_assert (GET_CODE (pat) == PARALLEL);
3298 pat = XVECEXP (pat, 0, 0);
3299 if (GET_CODE (pat) == SET)
3300 pat = SET_SRC (pat);
3301 gcc_checking_assert (GET_CODE (pat) == CALL
3302 && GET_CODE (XEXP (pat, 0)) == MEM);
3303 rtx addr = XEXP (XEXP (pat, 0), 0);
3304 if (GET_CODE (addr) != SYMBOL_REF)
3305 return false;
3306 const char *name = XSTR (addr, 0);
3307 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free}, so only the
3308 references with forced assembler name refer to PTX syscalls. For vprintf,
3309 accept both normal and forced-assembler-name references. */
3310 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf")
3311 || !strcmp (name, "*malloc")
3312 || !strcmp (name, "*free"));
3313 }
3314
3315 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to
3316 propagate its value from lane MASTER to current lane. */
3317
3318 static bool
3319 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master)
3320 {
3321 rtx reg;
3322 if (GET_CODE (set) == SET
3323 && REG_P (reg = SET_DEST (set))
3324 && find_reg_note (insn, REG_UNUSED, reg) == NULL_RTX)
3325 {
3326 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX),
3327 insn);
3328 return true;
3329 }
3330
3331 return false;
3332 }
3333
3334 static void
3335 predicate_insn (rtx_insn *insn, rtx pred)
3336 {
3337 rtx pat = PATTERN (insn);
3338 pred = gen_rtx_NE (BImode, pred, const0_rtx);
3339 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat);
3340 bool changed_p = validate_change (insn, &PATTERN (insn), pat, false);
3341 gcc_assert (changed_p);
3342 }
3343
3344 /* Adjust code for uniform-simt code generation variant by making atomics and
3345 "syscalls" conditionally executed, and inserting shuffle-based propagation
3346 for registers being set. */
3347
3348 static void
3349 nvptx_reorg_uniform_simt ()
3350 {
3351 rtx_insn *insn, *next;
3352
3353 for (insn = get_insns (); insn; insn = next)
3354 {
3355 next = NEXT_INSN (insn);
3356
3357 /* Skip NOTE, USE, etc. */
3358 if (!INSN_P (insn) || recog_memoized (insn) == -1)
3359 continue;
3360
3361 if (CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn))
3362 {
3363 /* Handle syscall. */
3364 }
3365 else if (get_attr_atomic (insn))
3366 {
3367 /* Handle atomic insn. */
3368 }
3369 else
3370 continue;
3371
3372 rtx pat = PATTERN (insn);
3373 rtx master = nvptx_get_unisimt_master ();
3374 bool shuffle_p = false;
3375 switch (GET_CODE (pat))
3376 {
3377 case PARALLEL:
3378 for (int i = 0; i < XVECLEN (pat, 0); i++)
3379 shuffle_p
3380 |= nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master);
3381 break;
3382 case SET:
3383 shuffle_p |= nvptx_unisimt_handle_set (pat, insn, master);
3384 break;
3385 default:
3386 gcc_unreachable ();
3387 }
3388
3389 if (shuffle_p && TARGET_PTX_6_0)
3390 {
3391 /* The shuffle is a sync, so uniformity is guaranteed. */
3392 }
3393 else
3394 {
3395 if (TARGET_PTX_6_0)
3396 {
3397 gcc_assert (!shuffle_p);
3398 /* Emit after the insn, to guarantee uniformity. */
3399 emit_insn_after (gen_nvptx_warpsync (), insn);
3400 }
3401 else
3402 {
3403 /* Emit after the insn (and before the shuffle, if there are any)
3404 to check uniformity. */
3405 emit_insn_after (gen_nvptx_uniform_warp_check (), insn);
3406 }
3407 }
3408
3409 rtx pred = nvptx_get_unisimt_predicate ();
3410 predicate_insn (insn, pred);
3411
3412 pred = NULL_RTX;
3413 for (rtx_insn *post = NEXT_INSN (insn); post != next;
3414 post = NEXT_INSN (post))
3415 {
3416 if (pred == NULL_RTX)
3417 pred = nvptx_get_unisimt_outside_simt_predicate ();
3418 predicate_insn (post, pred);
3419 }
3420 }
3421 }
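
/* Schematically (pre-ptx-6.0 variant, hand-written): an atomic such as

	atom.global.add.u32	%r28, [%r27], 1;

   becomes the same atom predicated so that only the master lane executes
   it, followed by a warp-uniformity check and a shuffle that copies the
   master lane's result to all other lanes.  */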
3422
3423 /* Offloading function attributes. */
3424
3425 struct offload_attrs
3426 {
3427 unsigned mask;
3428 int num_gangs;
3429 int num_workers;
3430 int vector_length;
3431 };
3432
3433 /* Define entries for cfun->machine->axis_dim. */
3434
3435 #define MACH_VECTOR_LENGTH 0
3436 #define MACH_MAX_WORKERS 1
3437
3438 static void populate_offload_attrs (offload_attrs *oa);
3439
3440 static void
3441 init_axis_dim (void)
3442 {
3443 offload_attrs oa;
3444 int max_workers;
3445
3446 populate_offload_attrs (&oa);
3447
3448 if (oa.num_workers == 0)
3449 max_workers = PTX_CTA_SIZE / oa.vector_length;
3450 else
3451 max_workers = oa.num_workers;
3452
3453 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length;
3454 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers;
3455 cfun->machine->axis_dim_init_p = true;
3456 }
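
/* E.g. with vector_length 32 and num_workers left at 0 (deferred to the
   runtime), a 1024-thread CTA gives axis_dim = { 32, 1024 / 32 = 32 }.  */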
3457
3458 static int ATTRIBUTE_UNUSED
3459 nvptx_mach_max_workers ()
3460 {
3461 if (!cfun->machine->axis_dim_init_p)
3462 init_axis_dim ();
3463 return cfun->machine->axis_dim[MACH_MAX_WORKERS];
3464 }
3465
3466 static int ATTRIBUTE_UNUSED
3467 nvptx_mach_vector_length ()
3468 {
3469 if (!cfun->machine->axis_dim_init_p)
3470 init_axis_dim ();
3471 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH];
3472 }
3473
3474 /* Loop structure of the function. The entire function is described as
3475 a NULL loop. */
3476 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:struct parallel_g'. */
3477
3478 struct parallel
3479 {
3480 /* Parent parallel. */
3481 parallel *parent;
3482
3483 /* Next sibling parallel. */
3484 parallel *next;
3485
3486 /* First child parallel. */
3487 parallel *inner;
3488
3489 /* Partitioning mask of the parallel. */
3490 unsigned mask;
3491
3492 /* Partitioning used within inner parallels. */
3493 unsigned inner_mask;
3494
3495 /* Location of parallel forked and join. The forked is the first
3496 block in the parallel and the join is the first block after the
3497 partition. */
3498 basic_block forked_block;
3499 basic_block join_block;
3500
3501 rtx_insn *forked_insn;
3502 rtx_insn *join_insn;
3503
3504 rtx_insn *fork_insn;
3505 rtx_insn *joining_insn;
3506
3507 /* Basic blocks in this parallel, but not in child parallels. The
3508 FORKED and JOINING blocks are in the partition. The FORK and JOIN
3509 blocks are not. */
3510 auto_vec<basic_block> blocks;
3511
3512 public:
3513 parallel (parallel *parent, unsigned mode);
3514 ~parallel ();
3515 };
3516
3517 /* Constructor links the new parallel into its parent's chain of
3518 children. */
3519
3520 parallel::parallel (parallel *parent_, unsigned mask_)
3521 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
3522 {
3523 forked_block = join_block = 0;
3524 forked_insn = join_insn = 0;
3525 fork_insn = joining_insn = 0;
3526
3527 if (parent)
3528 {
3529 next = parent->inner;
3530 parent->inner = this;
3531 }
3532 }
3533
3534 parallel::~parallel ()
3535 {
3536 delete inner;
3537 delete next;
3538 }
3539
3540 /* Map of basic blocks to insns */
3541 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t;
3542
3543 /* A tuple of an insn of interest and the BB in which it resides. */
3544 typedef std::pair<rtx_insn *, basic_block> insn_bb_t;
3545 typedef auto_vec<insn_bb_t> insn_bb_vec_t;
3546
3547 /* Split basic blocks such that the forked and join unspecs are at
3548 the start of their basic blocks. Thus afterwards each block will
3549 have a single partitioning mode. We also do the same for return
3550 insns, as they are executed by every thread. Return the
3551 partitioning mode of the function as a whole. Populate MAP with
3552 head and tail blocks. We also clear the BB visited flag, which is
3553 used when finding partitions. */
3554 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_split_blocks'. */
3555
3556 static void
3557 nvptx_split_blocks (bb_insn_map_t *map)
3558 {
3559 insn_bb_vec_t worklist;
3560 basic_block block;
3561 rtx_insn *insn;
3562
3563 /* Locate all the reorg instructions of interest. */
3564 FOR_ALL_BB_FN (block, cfun)
3565 {
3566 bool seen_insn = false;
3567
3568 /* Clear visited flag, for use by parallel locator */
3569 block->flags &= ~BB_VISITED;
3570
3571 FOR_BB_INSNS (block, insn)
3572 {
3573 if (!INSN_P (insn))
3574 continue;
3575 switch (recog_memoized (insn))
3576 {
3577 default:
3578 seen_insn = true;
3579 continue;
3580 case CODE_FOR_nvptx_forked:
3581 case CODE_FOR_nvptx_join:
3582 break;
3583
3584 case CODE_FOR_return:
3585 /* We also need to split just before return insns, as
3586 that insn needs executing by all threads, but the
3587 block it is in probably does not. */
3588 break;
3589 }
3590
3591 if (seen_insn)
3592 /* We've found an instruction that must be at the start of
3593 a block, but isn't. Add it to the worklist. */
3594 worklist.safe_push (insn_bb_t (insn, block));
3595 else
3596 /* It was already the first instruction. Just add it to
3597 the map. */
3598 map->get_or_insert (block) = insn;
3599 seen_insn = true;
3600 }
3601 }
3602
3603 /* Split blocks on the worklist. */
3604 unsigned ix;
3605 insn_bb_t *elt;
3606 basic_block remap = 0;
3607 for (ix = 0; worklist.iterate (ix, &elt); ix++)
3608 {
3609 if (remap != elt->second)
3610 {
3611 block = elt->second;
3612 remap = block;
3613 }
3614
3615 /* Split block before insn. The insn is in the new block. */
3616 edge e = split_block (block, PREV_INSN (elt->first));
3617
3618 block = e->dest;
3619 map->get_or_insert (block) = elt->first;
3620 }
3621 }
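
/* Illustrative example (not part of the original source): a block

     insn_a
     nvptx_forked (worker)
     insn_b

   is split just before the forked marker, giving

     BB1: insn_a
     BB2: nvptx_forked (worker); insn_b

   and MAP gains the entry BB2 -> forked insn, so every block now has
   a single partitioning mode.  */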
3622
3623 /* Return true if MASK contains parallelism that requires shared
3624 memory to broadcast. */
3625
3626 static bool
3627 nvptx_needs_shared_bcast (unsigned mask)
3628 {
3629 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER);
3630 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
3631 && nvptx_mach_vector_length () != PTX_WARP_SIZE;
3632
3633 return worker || large_vector;
3634 }
3635
3636 /* BLOCK is a basic block containing a head or tail instruction.
3637 Locate the associated prehead or pretail instruction, which must be
3638 in the single predecessor block. */
3639
3640 static rtx_insn *
3641 nvptx_discover_pre (basic_block block, int expected)
3642 {
3643 gcc_assert (block->preds->length () == 1);
3644 basic_block pre_block = (*block->preds)[0]->src;
3645 rtx_insn *pre_insn;
3646
3647 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn);
3648 pre_insn = PREV_INSN (pre_insn))
3649 gcc_assert (pre_insn != BB_HEAD (pre_block));
3650
3651 gcc_assert (recog_memoized (pre_insn) == expected);
3652 return pre_insn;
3653 }
3654
3655 /* Dump this parallel and all its inner parallels. */
3656 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_dump_pars'. */
3657
3658 static void
3659 nvptx_dump_pars (parallel *par, unsigned depth)
3660 {
3661 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
3662 depth, par->mask,
3663 par->forked_block ? par->forked_block->index : -1,
3664 par->join_block ? par->join_block->index : -1);
3665
3666 fprintf (dump_file, " blocks:");
3667
3668 basic_block block;
3669 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++)
3670 fprintf (dump_file, " %d", block->index);
3671 fprintf (dump_file, "\n");
3672 if (par->inner)
3673 nvptx_dump_pars (par->inner, depth + 1);
3674
3675 if (par->next)
3676 nvptx_dump_pars (par->next, depth);
3677 }
3678
3679 /* If BLOCK contains a fork/join marker, process it to create or
3680 terminate a loop structure. Add this block to the current loop,
3681 and then walk successor blocks. */
3682 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_find_par'. */
3683
3684 static parallel *
3685 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block)
3686 {
3687 if (block->flags & BB_VISITED)
3688 return par;
3689 block->flags |= BB_VISITED;
3690
3691 if (rtx_insn **endp = map->get (block))
3692 {
3693 rtx_insn *end = *endp;
3694
3695 /* This is a block head or tail, or return instruction. */
3696 switch (recog_memoized (end))
3697 {
3698 case CODE_FOR_return:
3699 /* Return instructions are in their own block, and we
3700 don't need to do anything more. */
3701 return par;
3702
3703 case CODE_FOR_nvptx_forked:
3704 /* Loop head, create a new inner loop and add it into
3705 our parent's child list. */
3706 {
3707 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3708
3709 gcc_assert (mask);
3710 par = new parallel (par, mask);
3711 par->forked_block = block;
3712 par->forked_insn = end;
3713 if (nvptx_needs_shared_bcast (mask))
3714 par->fork_insn
3715 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
3716 }
3717 break;
3718
3719 case CODE_FOR_nvptx_join:
3720 /* A loop tail. Finish the current loop and return to
3721 parent. */
3722 {
3723 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
3724
3725 gcc_assert (par->mask == mask);
3726 gcc_assert (par->join_block == NULL);
3727 par->join_block = block;
3728 par->join_insn = end;
3729 if (nvptx_needs_shared_bcast (mask))
3730 par->joining_insn
3731 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
3732 par = par->parent;
3733 }
3734 break;
3735
3736 default:
3737 gcc_unreachable ();
3738 }
3739 }
3740
3741 if (par)
3742 /* Add this block onto the current loop's list of blocks. */
3743 par->blocks.safe_push (block);
3744 else
3745 /* This must be the entry block. Create a NULL parallel. */
3746 par = new parallel (0, 0);
3747
3748 /* Walk successor blocks. */
3749 edge e;
3750 edge_iterator ei;
3751
3752 FOR_EACH_EDGE (e, ei, block->succs)
3753 nvptx_find_par (map, par, e->dest);
3754
3755 return par;
3756 }
3757
3758 /* DFS walk the CFG looking for fork & join markers. Construct
3759 loop structures as we go. MAP is a mapping of basic blocks
3760 to head & tail markers, discovered when splitting blocks. This
3761 speeds up the discovery. We rely on the BB visited flag having
3762 been cleared when splitting blocks. */
3763 /* See also 'gcc/omp-oacc-neuter-broadcast.cc:omp_sese_discover_pars'. */
3764
3765 static parallel *
3766 nvptx_discover_pars (bb_insn_map_t *map)
3767 {
3768 basic_block block;
3769
3770 /* Mark exit blocks as visited. */
3771 block = EXIT_BLOCK_PTR_FOR_FN (cfun);
3772 block->flags |= BB_VISITED;
3773
3774 /* And entry block as not. */
3775 block = ENTRY_BLOCK_PTR_FOR_FN (cfun);
3776 block->flags &= ~BB_VISITED;
3777
3778 parallel *par = nvptx_find_par (map, 0, block);
3779
3780 if (dump_file)
3781 {
3782 fprintf (dump_file, "\nLoops\n");
3783 nvptx_dump_pars (par, 0);
3784 fprintf (dump_file, "\n");
3785 }
3786
3787 return par;
3788 }
3789
3790 /* Analyse a group of BBs within a partitioned region and create N
3791 Single-Entry-Single-Exit regions. Some of those regions will be
3792 trivial ones consisting of a single BB. The blocks of a
3793 partitioned region might form a set of disjoint graphs -- because
3794 the region encloses a differently partitioned subregion.
3795
3796 We use the linear time algorithm described in 'Finding Regions Fast:
3797 Single Entry Single Exit and Control Regions in Linear Time'
3798 Johnson, Pearson & Pingali. That algorithm deals with complete
3799 CFGs, where a back edge is inserted from END to START, and thus the
3800 problem becomes one of finding equivalent loops.
3801
3802 In this case we have a partial CFG. We complete it by redirecting
3803 any incoming edge to the graph to be from an arbitrary external BB,
3804 and similarly redirecting any outgoing edge to be to that BB.
3805 Thus we end up with a closed graph.
3806
3807 The algorithm works by building a spanning tree of an undirected
3808 graph and keeping track of back edges from nodes further from the
3809 root in the tree to nodes nearer to the root in the tree. In the
3810 description below, the root is up and the tree grows downwards.
3811
3812 We avoid having to deal with degenerate back-edges to the same
3813 block, by splitting each BB into 3 -- one for input edges, one for
3814 the node itself and one for the output edges. Such back edges are
3815 referred to as 'Brackets'. Cycle equivalent nodes will have the
3816 same set of brackets.
3817
3818 Determining bracket equivalency is done by maintaining a list of
3819 brackets in such a manner that the list length and final bracket
3820 uniquely identify the set.
3821
3822 We use coloring to mark all BBs with cycle equivalency with the
3823 same color. This is the output of the 'Finding Regions Fast'
3824 algorithm. Notice it doesn't actually find the set of nodes within
3825 a particular region, just unordered sets of nodes that are the
3826 entries and exits of SESE regions.
3827
3828 After determining cycle equivalency, we need to find the minimal
3829 set of SESE regions. Do this with a DFS coloring walk of the
3830 complete graph. We're either 'looking' or 'coloring'. When
3831 looking, and we're in the subgraph, we start coloring the color of
3832 the current node, and remember that node as the start of the
3833 current color's SESE region. Every time we go to a new node, we
3834 decrement the count of nodes with that color. If it reaches zero,
3835 we remember that node as the end of the current color's SESE region
3836 and return to 'looking'. Otherwise we color the node the current
3837 color.
3838
3839 This way we end up with coloring the inside of non-trivial SESE
3840 regions with the color of that region. */
3841
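/* Worked example (illustrative, not part of the original source): in
   the diamond CFG

        A
       / \
      B   C
       \ /
        D

   with the implicit back edge D->A added, every cycle through A also
   passes through D, so A and D are cycle equivalent and get the same
   color, while B and C each get their own color.  The coloring walk
   then reports {A,D} as the one non-trivial SESE region, with B and C
   as trivial single-block regions.  */
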
3842 /* A pair of BBs. We use this to represent SESE regions. */
3843 typedef std::pair<basic_block, basic_block> bb_pair_t;
3844 typedef auto_vec<bb_pair_t> bb_pair_vec_t;
3845
3846 /* A node in the undirected CFG. The discriminator SECOND indicates just
3847 above or just below the BB indicated by FIRST. */
3848 typedef std::pair<basic_block, int> pseudo_node_t;
3849
3850 /* A bracket indicates an edge towards the root of the spanning tree of the
3851 undirected graph. Each bracket has a color, determined
3852 from the current set of brackets. */
3853 struct bracket
3854 {
3855 pseudo_node_t back; /* Back target */
3856
3857 /* Current color and size of set. */
3858 unsigned color;
3859 unsigned size;
3860
3861 bracket (pseudo_node_t back_)
3862 : back (back_), color (~0u), size (~0u)
3863 {
3864 }
3865
3866 unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
3867 {
3868 if (length != size)
3869 {
3870 size = length;
3871 color = color_counts.length ();
3872 color_counts.quick_push (0);
3873 }
3874 color_counts[color]++;
3875 return color;
3876 }
3877 };
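
/* Illustrative sketch (not part of the original source): the pair
   (list length, final bracket) identifies a bracket set, so querying
   the same bracket at the same list length yields the same color.
   Compiled out; for exposition only.  */
#if 0
auto_vec<unsigned> counts;
counts.reserve (1);
bracket b (pseudo_node_t (nullptr, 0));
unsigned c1 = b.get_color (counts, 3);  /* Allocates a fresh color.  */
unsigned c2 = b.get_color (counts, 3);  /* Same length: same color.  */
gcc_checking_assert (c1 == c2 && counts[c1] == 2);
#endif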
3878
3879 typedef auto_vec<bracket> bracket_vec_t;
3880
3881 /* Basic block info for finding SESE regions. */
3882
3883 struct bb_sese
3884 {
3885 int node; /* Node number in spanning tree. */
3886 int parent; /* Parent node number. */
3887
3888 /* The algorithm splits each node A into Ai, A', Ao. The incoming
3889 edges arrive at pseudo-node Ai and the outgoing edges leave at
3890 pseudo-node Ao. We have to remember which way we arrived at a
3891 particular node when generating the spanning tree. dir > 0 means
3892 we arrived at Ai, dir < 0 means we arrived at Ao. */
3893 int dir;
3894
3895 /* Lowest numbered pseudo-node reached via a backedge from this
3896 node, or any descendant. */
3897 pseudo_node_t high;
3898
3899 int color; /* Cycle-equivalence color */
3900
3901 /* Stack of brackets for this node. */
3902 bracket_vec_t brackets;
3903
3904 bb_sese (unsigned node_, unsigned p, int dir_)
3905 :node (node_), parent (p), dir (dir_)
3906 {
3907 }
3908 ~bb_sese ();
3909
3910 /* Push a bracket ending at BACK. */
3911 void push (const pseudo_node_t &back)
3912 {
3913 if (dump_file)
3914 fprintf (dump_file, "Pushing backedge %d:%+d\n",
3915 back.first ? back.first->index : 0, back.second);
3916 brackets.safe_push (bracket (back));
3917 }
3918
3919 void append (bb_sese *child);
3920 void remove (const pseudo_node_t &);
3921
3922 /* Set node's color. */
3923 void set_color (auto_vec<unsigned> &color_counts)
3924 {
3925 color = brackets.last ().get_color (color_counts, brackets.length ());
3926 }
3927 };
3928
3929 bb_sese::~bb_sese ()
3930 {
3931 }
3932
3933 /* Destructively append CHILD's brackets. */
3934
3935 void
3936 bb_sese::append (bb_sese *child)
3937 {
3938 if (int len = child->brackets.length ())
3939 {
3940 int ix;
3941
3942 if (dump_file)
3943 {
3944 for (ix = 0; ix < len; ix++)
3945 {
3946 const pseudo_node_t &pseudo = child->brackets[ix].back;
3947 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n",
3948 child->node, pseudo.first ? pseudo.first->index : 0,
3949 pseudo.second);
3950 }
3951 }
3952 if (!brackets.length ())
3953 std::swap (brackets, child->brackets);
3954 else
3955 {
3956 brackets.reserve (len);
3957 for (ix = 0; ix < len; ix++)
3958 brackets.quick_push (child->brackets[ix]);
3959 }
3960 }
3961 }
3962
3963 /* Remove brackets that terminate at PSEUDO. */
3964
3965 void
3966 bb_sese::remove (const pseudo_node_t &pseudo)
3967 {
3968 unsigned removed = 0;
3969 int len = brackets.length ();
3970
3971 for (int ix = 0; ix < len; ix++)
3972 {
3973 if (brackets[ix].back == pseudo)
3974 {
3975 if (dump_file)
3976 fprintf (dump_file, "Removing backedge %d:%+d\n",
3977 pseudo.first ? pseudo.first->index : 0, pseudo.second);
3978 removed++;
3979 }
3980 else if (removed)
3981 brackets[ix-removed] = brackets[ix];
3982 }
3983 while (removed--)
3984 brackets.pop ();
3985 }
3986
3987 /* Accessors for BB's aux pointer. */
3988 #define BB_SET_SESE(B, S) ((B)->aux = (S))
3989 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux)
3990
3991 /* DFS walk creating SESE data structures. Only cover nodes with
3992 BB_VISITED set. Append discovered blocks to LIST. We number in
3993 increments of 3 so that the above and below pseudo nodes can be
3994 implicitly numbered too. */
3995
3996 static int
3997 nvptx_sese_number (int n, int p, int dir, basic_block b,
3998 auto_vec<basic_block> *list)
3999 {
4000 if (BB_GET_SESE (b))
4001 return n;
4002
4003 if (dump_file)
4004 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n",
4005 b->index, n, p, dir);
4006
4007 BB_SET_SESE (b, new bb_sese (n, p, dir));
4008 p = n;
4009
4010 n += 3;
4011 list->quick_push (b);
4012
4013 /* First walk the nodes on the 'other side' of this node, then walk
4014 the nodes on the same side. */
4015 for (unsigned ix = 2; ix; ix--)
4016 {
4017 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds;
4018 size_t offset = (dir > 0 ? offsetof (edge_def, dest)
4019 : offsetof (edge_def, src));
4020 edge e;
4021 edge_iterator ei;
4022
4023 FOR_EACH_EDGE (e, ei, edges)
4024 {
4025 basic_block target = *(basic_block *)((char *)e + offset);
4026
4027 if (target->flags & BB_VISITED)
4028 n = nvptx_sese_number (n, p, dir, target, list);
4029 }
4030 dir = -dir;
4031 }
4032 return n;
4033 }
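
/* Illustrative note (not part of the original source): the first block
   is numbered 2, the next 5, and so on.  A block numbered N implicitly
   owns N-1 and N+1 for its incoming and outgoing pseudo-nodes, so the
   pseudo-nodes of distinct blocks can never collide.  */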
4034
4035 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME.
4036 EDGES are the outgoing edges and OFFSET is the offset to the src
4037 or dst block on the edges. */
4038
4039 static void
4040 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir,
4041 vec<edge, va_gc> *edges, size_t offset)
4042 {
4043 edge e;
4044 edge_iterator ei;
4045 int hi_back = depth;
4046 pseudo_node_t node_back (nullptr, depth);
4047 int hi_child = depth;
4048 pseudo_node_t node_child (nullptr, depth);
4049 basic_block child = NULL;
4050 unsigned num_children = 0;
4051 int usd = -dir * sese->dir;
4052
4053 if (dump_file)
4054 fprintf (dump_file, "\nProcessing %d(%d) %+d\n",
4055 me->index, sese->node, dir);
4056
4057 if (dir < 0)
4058 {
4059 /* This is the above pseudo-child. It has the BB itself as an
4060 additional child node. */
4061 node_child = sese->high;
4062 hi_child = node_child.second;
4063 if (node_child.first)
4064 hi_child += BB_GET_SESE (node_child.first)->node;
4065 num_children++;
4066 }
4067
4068 /* Examine each edge.
4069 - if it is a child (a) append its bracket list and (b) record
4070 whether it is the child with the highest reaching bracket.
4071 - if it is an edge to ancestor, record whether it's the highest
4072 reaching backlink. */
4073 FOR_EACH_EDGE (e, ei, edges)
4074 {
4075 basic_block target = *(basic_block *)((char *)e + offset);
4076
4077 if (bb_sese *t_sese = BB_GET_SESE (target))
4078 {
4079 if (t_sese->parent == sese->node && !(t_sese->dir + usd))
4080 {
4081 /* Child node. Append its bracket list. */
4082 num_children++;
4083 sese->append (t_sese);
4084
4085 /* Compare its hi value. */
4086 int t_hi = t_sese->high.second;
4087
4088 if (basic_block child_hi_block = t_sese->high.first)
4089 t_hi += BB_GET_SESE (child_hi_block)->node;
4090
4091 if (hi_child > t_hi)
4092 {
4093 hi_child = t_hi;
4094 node_child = t_sese->high;
4095 child = target;
4096 }
4097 }
4098 else if (t_sese->node < sese->node + dir
4099 && !(dir < 0 && sese->parent == t_sese->node))
4100 {
4101 /* Non-parental ancestor node -- a backlink. */
4102 int d = usd * t_sese->dir;
4103 int back = t_sese->node + d;
4104
4105 if (hi_back > back)
4106 {
4107 hi_back = back;
4108 node_back = pseudo_node_t (target, d);
4109 }
4110 }
4111 }
4112 else
4113 { /* Fallen off graph, backlink to entry node. */
4114 hi_back = 0;
4115 node_back = pseudo_node_t (nullptr, 0);
4116 }
4117 }
4118
4119 /* Remove any brackets that terminate at this pseudo node. */
4120 sese->remove (pseudo_node_t (me, dir));
4121
4122 /* Now push any backlinks from this pseudo node. */
4123 FOR_EACH_EDGE (e, ei, edges)
4124 {
4125 basic_block target = *(basic_block *)((char *)e + offset);
4126 if (bb_sese *t_sese = BB_GET_SESE (target))
4127 {
4128 if (t_sese->node < sese->node + dir
4129 && !(dir < 0 && sese->parent == t_sese->node))
4130 /* Non-parental ancestor node - backedge from me. */
4131 sese->push (pseudo_node_t (target, usd * t_sese->dir));
4132 }
4133 else
4134 {
4135 /* back edge to entry node */
4136 sese->push (pseudo_node_t (nullptr, 0));
4137 }
4138 }
4139
4140 /* If this node leads directly or indirectly to a no-return region of
4141 the graph, then fake a backedge to entry node. */
4142 if (!sese->brackets.length () || !edges || !edges->length ())
4143 {
4144 hi_back = 0;
4145 node_back = pseudo_node_t (nullptr, 0);
4146 sese->push (node_back);
4147 }
4148
4149 /* Record the highest reaching backedge from us or a descendant. */
4150 sese->high = hi_back < hi_child ? node_back : node_child;
4151
4152 if (num_children > 1)
4153 {
4154 /* There is more than one child -- this is a Y shaped piece of
4155 spanning tree. We have to insert a fake backedge from this
4156 node to the highest ancestor reached by not-the-highest
4157 reaching child. Note that there may be multiple children
4158 with backedges to the same highest node. That's ok and we
4159 insert the edge to that highest node. */
4160 hi_child = depth;
4161 if (dir < 0 && child)
4162 {
4163 node_child = sese->high;
4164 hi_child = node_child.second;
4165 if (node_child.first)
4166 hi_child += BB_GET_SESE (node_child.first)->node;
4167 }
4168
4169 FOR_EACH_EDGE (e, ei, edges)
4170 {
4171 basic_block target = *(basic_block *)((char *)e + offset);
4172
4173 if (target == child)
4174 /* Ignore the highest child. */
4175 continue;
4176
4177 bb_sese *t_sese = BB_GET_SESE (target);
4178 if (!t_sese)
4179 continue;
4180 if (t_sese->parent != sese->node)
4181 /* Not a child. */
4182 continue;
4183
4184 /* Compare its hi value. */
4185 int t_hi = t_sese->high.second;
4186
4187 if (basic_block child_hi_block = t_sese->high.first)
4188 t_hi += BB_GET_SESE (child_hi_block)->node;
4189
4190 if (hi_child > t_hi)
4191 {
4192 hi_child = t_hi;
4193 node_child = t_sese->high;
4194 }
4195 }
4196
4197 sese->push (node_child);
4198 }
4199 }
4200
4201
4202 /* DFS walk of BB graph. Color node BLOCK according to COLORING then
4203 proceed to successors. Set SESE entry and exit nodes of
4204 REGIONS. */
4205
4206 static void
4207 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t &regions,
4208 basic_block block, int coloring)
4209 {
4210 bb_sese *sese = BB_GET_SESE (block);
4211
4212 if (block->flags & BB_VISITED)
4213 {
4214 /* If we've already encountered this block, either we must not
4215 be coloring, or it must have been colored the current color. */
4216 gcc_assert (coloring < 0 || (sese && coloring == sese->color));
4217 return;
4218 }
4219
4220 block->flags |= BB_VISITED;
4221
4222 if (sese)
4223 {
4224 if (coloring < 0)
4225 {
4226 /* Start coloring a region. */
4227 regions[sese->color].first = block;
4228 coloring = sese->color;
4229 }
4230
4231 if (!--color_counts[sese->color] && sese->color == coloring)
4232 {
4233 /* Found final block of SESE region. */
4234 regions[sese->color].second = block;
4235 coloring = -1;
4236 }
4237 else
4238 /* Color the node, so we can assert on revisiting the node
4239 that the graph is indeed SESE. */
4240 sese->color = coloring;
4241 }
4242 else
4243 /* Fallen off the subgraph, we cannot be coloring. */
4244 gcc_assert (coloring < 0);
4245
4246 /* Walk each successor block. */
4247 if (block->succs && block->succs->length ())
4248 {
4249 edge e;
4250 edge_iterator ei;
4251
4252 FOR_EACH_EDGE (e, ei, block->succs)
4253 nvptx_sese_color (color_counts, regions, e->dest, coloring);
4254 }
4255 else
4256 gcc_assert (coloring < 0);
4257 }
4258
4259 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might
4260 end up with NULL entries in it. */
4261
4262 static void
4263 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t &regions)
4264 {
4265 basic_block block;
4266 int ix;
4267
4268 /* First clear each BB of the whole function. */
4269 FOR_ALL_BB_FN (block, cfun)
4270 {
4271 block->flags &= ~BB_VISITED;
4272 BB_SET_SESE (block, 0);
4273 }
4274
4275 /* Mark blocks in the function that are in this graph. */
4276 for (ix = 0; blocks.iterate (ix, &block); ix++)
4277 block->flags |= BB_VISITED;
4278
4279 /* Counts of nodes assigned to each color. There cannot be more
4280 colors than blocks (and hopefully there will be fewer). */
4281 auto_vec<unsigned> color_counts;
4282 color_counts.reserve (blocks.length ());
4283
4284 /* Worklist of nodes in the spanning tree. Again, there cannot be
4285 more nodes in the tree than blocks (there will be fewer if the
4286 CFG of blocks is disjoint). */
4287 auto_vec<basic_block> spanlist;
4288 spanlist.reserve (blocks.length ());
4289
4290 /* Make sure every block has its cycle class determined. */
4291 for (ix = 0; blocks.iterate (ix, &block); ix++)
4292 {
4293 if (BB_GET_SESE (block))
4294 /* We already met this block in an earlier graph solve. */
4295 continue;
4296
4297 if (dump_file)
4298 fprintf (dump_file, "Searching graph starting at %d\n", block->index);
4299
4300 /* Number the nodes reachable from BLOCK in initial DFS order. */
4301 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist);
4302
4303 /* Now walk in reverse DFS order to find cycle equivalents. */
4304 while (spanlist.length ())
4305 {
4306 block = spanlist.pop ();
4307 bb_sese *sese = BB_GET_SESE (block);
4308
4309 /* Do the pseudo node below. */
4310 nvptx_sese_pseudo (block, sese, depth, +1,
4311 sese->dir > 0 ? block->succs : block->preds,
4312 (sese->dir > 0 ? offsetof (edge_def, dest)
4313 : offsetof (edge_def, src)));
4314 sese->set_color (color_counts);
4315 /* Do the pseudo node above. */
4316 nvptx_sese_pseudo (block, sese, depth, -1,
4317 sese->dir < 0 ? block->succs : block->preds,
4318 (sese->dir < 0 ? offsetof (edge_def, dest)
4319 : offsetof (edge_def, src)));
4320 }
4321 if (dump_file)
4322 fprintf (dump_file, "\n");
4323 }
4324
4325 if (dump_file)
4326 {
4327 unsigned count;
4328 const char *comma = "";
4329
4330 fprintf (dump_file, "Found %d cycle equivalents\n",
4331 color_counts.length ());
4332 for (ix = 0; color_counts.iterate (ix, &count); ix++)
4333 {
4334 fprintf (dump_file, "%s%d[%d]={", comma, ix, count);
4335
4336 comma = "";
4337 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++)
4338 if (BB_GET_SESE (block)->color == ix)
4339 {
4340 block->flags |= BB_VISITED;
4341 fprintf (dump_file, "%s%d", comma, block->index);
4342 comma=",";
4343 }
4344 fprintf (dump_file, "}");
4345 comma = ", ";
4346 }
4347 fprintf (dump_file, "\n");
4348 }
4349
4350 /* Now we've colored every block in the subgraph. We now need to
4351 determine the minimal set of SESE regions that cover that
4352 subgraph. Do this with a DFS walk of the complete function.
4353 During the walk we're either 'looking' or 'coloring'. When we
4354 reach the last node of a particular color, we stop coloring and
4355 return to looking. */
4356
4357 /* There cannot be more SESE regions than colors. */
4358 regions.reserve (color_counts.length ());
4359 for (ix = color_counts.length (); ix--;)
4360 regions.quick_push (bb_pair_t (0, 0));
4361
4362 for (ix = 0; blocks.iterate (ix, &block); ix++)
4363 block->flags &= ~BB_VISITED;
4364
4365 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1);
4366
4367 if (dump_file)
4368 {
4369 const char *comma = "";
4370 int len = regions.length ();
4371
4372 fprintf (dump_file, "SESE regions:");
4373 for (ix = 0; ix != len; ix++)
4374 {
4375 basic_block from = regions[ix].first;
4376 basic_block to = regions[ix].second;
4377
4378 if (from)
4379 {
4380 fprintf (dump_file, "%s %d{%d", comma, ix, from->index);
4381 if (to != from)
4382 fprintf (dump_file, "->%d", to->index);
4383
4384 int color = BB_GET_SESE (from)->color;
4385
4386 /* Print the blocks within the region (excluding ends). */
4387 FOR_EACH_BB_FN (block, cfun)
4388 {
4389 bb_sese *sese = BB_GET_SESE (block);
4390
4391 if (sese && sese->color == color
4392 && block != from && block != to)
4393 fprintf (dump_file, ".%d", block->index);
4394 }
4395 fprintf (dump_file, "}");
4396 }
4397 comma = ",";
4398 }
4399 fprintf (dump_file, "\n\n");
4400 }
4401
4402 for (ix = 0; blocks.iterate (ix, &block); ix++)
4403 delete BB_GET_SESE (block);
4404 }
4405
4406 #undef BB_SET_SESE
4407 #undef BB_GET_SESE
4408
4409 /* Propagate live state at the start of a partitioned region. IS_CALL
4410 indicates whether the propagation is for a (partitioned) call
4411 instruction. BLOCK provides the live register information, and
4412 might not contain INSN. Propagation is inserted just after INSN. RW
4413 indicates whether we are reading and/or writing state. This
4414 separation is needed for worker-level propagation where we
4415 essentially do a spill & fill. FN is the underlying worker
4416 function to generate the propagation instructions for single
4417 register. DATA is user data.
4418
4419 Returns true if we didn't emit any instructions.
4420
4421 We propagate the live register set for non-calls and the entire
4422 frame for calls and non-calls. We could do better by (a)
4423 propagating just the live set that is used within the partitioned
4424 regions and (b) only propagating stack entries that are used. The
4425 latter might be quite hard to determine. */
4426
4427 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool);
4428
4429 static bool
4430 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn,
4431 propagate_mask rw, propagator_fn fn, void *data, bool vector)
4432 {
4433 bitmap live = DF_LIVE_IN (block);
4434 bitmap_iterator iterator;
4435 unsigned ix;
4436 bool empty = true;
4437
4438 /* Copy the frame array. */
4439 HOST_WIDE_INT fs = get_frame_size ();
4440 if (fs)
4441 {
4442 rtx tmp = gen_reg_rtx (DImode);
4443 rtx idx = NULL_RTX;
4444 rtx ptr = gen_reg_rtx (Pmode);
4445 rtx pred = NULL_RTX;
4446 rtx_code_label *label = NULL;
4447
4448 empty = false;
4449 /* The frame size might not be DImode compatible, but the frame
4450 array's declaration will be. So it's ok to round up here. */
4451 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode);
4452 /* Detect single iteration loop. */
4453 if (fs == 1)
4454 fs = 0;
4455
4456 start_sequence ();
4457 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx));
4458 if (fs)
4459 {
4460 idx = gen_reg_rtx (SImode);
4461 pred = gen_reg_rtx (BImode);
4462 label = gen_label_rtx ();
4463
4464 emit_insn (gen_rtx_SET (idx, GEN_INT (fs)));
4465 /* Allow worker function to initialize anything needed. */
4466 rtx init = fn (tmp, PM_loop_begin, fs, data, vector);
4467 if (init)
4468 emit_insn (init);
4469 emit_label (label);
4470 LABEL_NUSES (label)++;
4471 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1)));
4472 }
4473 if (rw & PM_read)
4474 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr)));
4475 emit_insn (fn (tmp, rw, fs, data, vector));
4476 if (rw & PM_write)
4477 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp));
4478 if (fs)
4479 {
4480 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx)));
4481 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode))));
4482 emit_insn (gen_br_true_uni (pred, label));
4483 rtx fini = fn (tmp, PM_loop_end, fs, data, vector);
4484 if (fini)
4485 emit_insn (fini);
4486 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx));
4487 }
4488 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp));
4489 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr));
4490 rtx cpy = get_insns ();
4491 end_sequence ();
4492 insn = emit_insn_after (cpy, insn);
4493 }
4494
4495 if (!is_call)
4496 /* Copy live registers. */
4497 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator)
4498 {
4499 rtx reg = regno_reg_rtx[ix];
4500
4501 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER)
4502 {
4503 rtx bcast = fn (reg, rw, 0, data, vector);
4504
4505 insn = emit_insn_after (bcast, insn);
4506 empty = false;
4507 }
4508 }
4509 return empty;
4510 }
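
/* Schematic form of the frame-copy loop emitted above for a frame of
   FS DImode words, FS > 1 (illustrative pseudo-code, not literal
   RTL):

     ptr = frame_pointer
     idx = FS
   loop:
     idx -= 1
     tmp = *ptr        // if RW & PM_read
     fn (tmp, ...)     // spill/fill/shuffle one word
     *ptr = tmp        // if RW & PM_write
     pred = idx != 0
     ptr += 8
     if (pred) goto loop

   For FS == 1 the loop control is elided and a single copy is
   emitted.  */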
4511
4512 /* Worker for nvptx_warp_propagate. */
4513
4514 static rtx
4515 warp_prop_gen (rtx reg, propagate_mask pm,
4516 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data),
4517 bool ARG_UNUSED (vector))
4518 {
4519 if (!(pm & PM_read_write))
4520 return 0;
4521
4522 return nvptx_gen_warp_bcast (reg);
4523 }
4524
4525 /* Propagate state that is live at start of BLOCK across the vectors
4526 of a single warp. Propagation is inserted just after INSN.
4527 IS_CALL and return as for nvptx_propagate. */
4528
4529 static bool
4530 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn)
4531 {
4532 return nvptx_propagate (is_call, block, insn, PM_read_write,
4533 warp_prop_gen, 0, false);
4534 }
4535
4536 /* Worker for nvptx_shared_propagate. */
4537
4538 static rtx
4539 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_,
4540 bool vector)
4541 {
4542 broadcast_data_t *data = (broadcast_data_t *)data_;
4543
4544 if (pm & PM_loop_begin)
4545 {
4546 /* Starting a loop, initialize pointer. */
4547 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT;
4548
4549 oacc_bcast_align = MAX (oacc_bcast_align, align);
4550 data->offset = ROUND_UP (data->offset, align);
4551
4552 data->ptr = gen_reg_rtx (Pmode);
4553
4554 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset));
4555 }
4556 else if (pm & PM_loop_end)
4557 {
4558 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr);
4559 data->ptr = NULL_RTX;
4560 return clobber;
4561 }
4562 else
4563 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector);
4564 }
4565
4566 /* Spill or fill live state that is live at start of BLOCK. PRE_P
4567 indicates if this is just before partitioned mode (do spill), or
4568 just after it starts (do fill). Sequence is inserted just after
4569 INSN. IS_CALL and return as for nvptx_propagate. */
4570
4571 static bool
4572 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block,
4573 rtx_insn *insn, bool vector)
4574 {
4575 broadcast_data_t data;
4576
4577 data.base = gen_reg_rtx (Pmode);
4578 data.offset = 0;
4579 data.ptr = NULL_RTX;
4580
4581 bool empty = nvptx_propagate (is_call, block, insn,
4582 pre_p ? PM_read : PM_write, shared_prop_gen,
4583 &data, vector);
4584 gcc_assert (empty == !data.offset);
4585 if (data.offset)
4586 {
4587 rtx bcast_sym = oacc_bcast_sym;
4588
4589 /* Stuff was emitted, initialize the base pointer now. */
4590 if (vector && nvptx_mach_max_workers () > 1)
4591 {
4592 if (!cfun->machine->bcast_partition)
4593 {
4594 /* It would be nice to place this register in
4595 DATA_AREA_SHARED. */
4596 cfun->machine->bcast_partition = gen_reg_rtx (DImode);
4597 }
4598 if (!cfun->machine->sync_bar)
4599 cfun->machine->sync_bar = gen_reg_rtx (SImode);
4600
4601 bcast_sym = cfun->machine->bcast_partition;
4602 }
4603
4604 rtx init = gen_rtx_SET (data.base, bcast_sym);
4605 emit_insn_after (init, insn);
4606
4607 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align);
4608 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
4609 ? nvptx_mach_max_workers () + 1
4610 : 1);
4611
4612 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
4613 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
4614 }
4615 return empty;
4616 }
4617
4618 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number,
4619 which is an integer or a register. THREADS is the number of threads
4620 controlled by the barrier. */
4621
4622 static rtx
4623 nvptx_cta_sync (rtx lock, int threads)
4624 {
4625 return gen_nvptx_barsync (lock, GEN_INT (threads));
4626 }
4627
4628 #if WORKAROUND_PTXJIT_BUG
4629 /* Return first real insn in BB, or return NULL_RTX if BB does not contain
4630 real insns. */
4631
4632 static rtx_insn *
4633 bb_first_real_insn (basic_block bb)
4634 {
4635 rtx_insn *insn;
4636
4637 /* Find the first real insn of BB. */
4638 FOR_BB_INSNS (bb, insn)
4639 if (INSN_P (insn))
4640 return insn;
4641
4642 return 0;
4643 }
4644 #endif
4645
4646 /* Return true if INSN needs neutering. */
4647
4648 static bool
4649 needs_neutering_p (rtx_insn *insn)
4650 {
4651 if (!INSN_P (insn))
4652 return false;
4653
4654 switch (recog_memoized (insn))
4655 {
4656 case CODE_FOR_nvptx_fork:
4657 case CODE_FOR_nvptx_forked:
4658 case CODE_FOR_nvptx_joining:
4659 case CODE_FOR_nvptx_join:
4660 case CODE_FOR_nvptx_barsync:
4661 return false;
4662 default:
4663 return true;
4664 }
4665 }
4666
4667 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. */
4668
4669 static bool
4670 verify_neutering_jumps (basic_block from,
4671 rtx_insn *vector_jump, rtx_insn *worker_jump,
4672 rtx_insn *vector_label, rtx_insn *worker_label)
4673 {
4674 basic_block bb = from;
4675 rtx_insn *insn = BB_HEAD (bb);
4676 bool seen_worker_jump = false;
4677 bool seen_vector_jump = false;
4678 bool seen_worker_label = false;
4679 bool seen_vector_label = false;
4680 bool worker_neutered = false;
4681 bool vector_neutered = false;
4682 while (true)
4683 {
4684 if (insn == worker_jump)
4685 {
4686 seen_worker_jump = true;
4687 worker_neutered = true;
4688 gcc_assert (!vector_neutered);
4689 }
4690 else if (insn == vector_jump)
4691 {
4692 seen_vector_jump = true;
4693 vector_neutered = true;
4694 }
4695 else if (insn == worker_label)
4696 {
4697 seen_worker_label = true;
4698 gcc_assert (worker_neutered);
4699 worker_neutered = false;
4700 }
4701 else if (insn == vector_label)
4702 {
4703 seen_vector_label = true;
4704 gcc_assert (vector_neutered);
4705 vector_neutered = false;
4706 }
4707 else if (INSN_P (insn))
4708 switch (recog_memoized (insn))
4709 {
4710 case CODE_FOR_nvptx_barsync:
4711 gcc_assert (!vector_neutered && !worker_neutered);
4712 break;
4713 default:
4714 break;
4715 }
4716
4717 if (insn != BB_END (bb))
4718 insn = NEXT_INSN (insn);
4719 else if (JUMP_P (insn) && single_succ_p (bb)
4720 && !seen_vector_jump && !seen_worker_jump)
4721 {
4722 bb = single_succ (bb);
4723 insn = BB_HEAD (bb);
4724 }
4725 else
4726 break;
4727 }
4728
4729 gcc_assert (!(vector_jump && !seen_vector_jump));
4730 gcc_assert (!(worker_jump && !seen_worker_jump));
4731
4732 if (seen_vector_label || seen_worker_label)
4733 {
4734 gcc_assert (!(vector_label && !seen_vector_label));
4735 gcc_assert (!(worker_label && !seen_worker_label));
4736
4737 return true;
4738 }
4739
4740 return false;
4741 }
4742
4743 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */
4744
4745 static void
4746 verify_neutering_labels (basic_block to, rtx_insn *vector_label,
4747 rtx_insn *worker_label)
4748 {
4749 basic_block bb = to;
4750 rtx_insn *insn = BB_END (bb);
4751 bool seen_worker_label = false;
4752 bool seen_vector_label = false;
4753 while (true)
4754 {
4755 if (insn == worker_label)
4756 {
4757 seen_worker_label = true;
4758 gcc_assert (!seen_vector_label);
4759 }
4760 else if (insn == vector_label)
4761 seen_vector_label = true;
4762 else if (INSN_P (insn))
4763 switch (recog_memoized (insn))
4764 {
4765 case CODE_FOR_nvptx_barsync:
4766 gcc_assert (!seen_vector_label && !seen_worker_label);
4767 break;
4768 }
4769
4770 if (insn != BB_HEAD (bb))
4771 insn = PREV_INSN (insn);
4772 else
4773 break;
4774 }
4775
4776 gcc_assert (!(vector_label && !seen_vector_label));
4777 gcc_assert (!(worker_label && !seen_worker_label));
4778 }
4779
4780 /* Single neutering according to MASK. FROM is the incoming block and
4781 TO is the outgoing block. These may be the same block. Insert at
4782 start of FROM:
4783
4784 if (tid.<axis>) goto end.
4785
4786 and insert before ending branch of TO (if there is such an insn):
4787
4788 end:
4789 <possibly-broadcast-cond>
4790 <branch>
4791
4792 We currently only use different FROM and TO when skipping an entire
4793 loop. We could do more if we detected superblocks. */
4794
4795 static void
4796 nvptx_single (unsigned mask, basic_block from, basic_block to)
4797 {
4798 rtx_insn *head = BB_HEAD (from);
4799 rtx_insn *tail = BB_END (to);
4800 unsigned skip_mask = mask;
4801
4802 while (true)
4803 {
4804 /* Find first insn of FROM block. */
4805 while (head != BB_END (from) && !needs_neutering_p (head))
4806 head = NEXT_INSN (head);
4807
4808 if (from == to)
4809 break;
4810
4811 if (!(JUMP_P (head) && single_succ_p (from)))
4812 break;
4813
4814 basic_block jump_target = single_succ (from);
4815 if (!single_pred_p (jump_target))
4816 break;
4817
4818 from = jump_target;
4819 head = BB_HEAD (from);
4820 }
4821
4822 /* Find last insn of TO block. */
4823 rtx_insn *limit = from == to ? head : BB_HEAD (to);
4824 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail))
4825 tail = PREV_INSN (tail);
4826
4827 /* Detect if tail is a branch. */
4828 rtx tail_branch = NULL_RTX;
4829 rtx cond_branch = NULL_RTX;
4830 if (tail && INSN_P (tail))
4831 {
4832 tail_branch = PATTERN (tail);
4833 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx)
4834 tail_branch = NULL_RTX;
4835 else
4836 {
4837 cond_branch = SET_SRC (tail_branch);
4838 if (GET_CODE (cond_branch) != IF_THEN_ELSE)
4839 cond_branch = NULL_RTX;
4840 }
4841 }
4842
4843 if (tail == head)
4844 {
4845 /* If this is empty, do nothing. */
4846 if (!head || !needs_neutering_p (head))
4847 return;
4848
4849 if (cond_branch)
4850 {
4851 /* If we're only doing vector single, there's no need to
4852 emit skip code because we'll not insert anything. */
4853 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)))
4854 skip_mask = 0;
4855 }
4856 else if (tail_branch)
4857 /* Block with only unconditional branch. Nothing to do. */
4858 return;
4859 }
4860
4861 /* Insert the vector test inside the worker test. */
4862 unsigned mode;
4863 rtx_insn *before = tail;
4864 rtx_insn *neuter_start = NULL;
4865 rtx_insn *worker_label = NULL, *vector_label = NULL;
4866 rtx_insn *worker_jump = NULL, *vector_jump = NULL;
4867 rtx_insn *warp_sync = NULL;
4868 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
4869 if (GOMP_DIM_MASK (mode) & skip_mask)
4870 {
4871 rtx_code_label *label = gen_label_rtx ();
4872 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER];
4873 rtx_insn **mode_jump
4874 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump;
4875 rtx_insn **mode_label
4876 = mode == GOMP_DIM_VECTOR ? &vector_label : &worker_label;
4877
4878 if (!pred)
4879 {
4880 pred = gen_reg_rtx (BImode);
4881 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred;
4882 }
4883
4884 rtx br;
4885 if (mode == GOMP_DIM_VECTOR)
4886 br = gen_br_true (pred, label);
4887 else
4888 br = gen_br_true_uni (pred, label);
4889 if (neuter_start)
4890 neuter_start = emit_insn_after (br, neuter_start);
4891 else
4892 neuter_start = emit_insn_before (br, head);
4893 *mode_jump = neuter_start;
4894
4895 LABEL_NUSES (label)++;
4896 rtx_insn *label_insn;
4897 if (tail_branch)
4898 {
4899 label_insn = emit_label_before (label, before);
4900 if (mode == GOMP_DIM_VECTOR)
4901 {
4902 if (TARGET_PTX_6_0)
4903 warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4904 label_insn);
4905 else
4906 warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4907 label_insn);
4908 }
4909 before = label_insn;
4910 }
4911 else
4912 {
4913 label_insn = emit_label_after (label, tail);
4914 if (mode == GOMP_DIM_VECTOR)
4915 {
4916 if (TARGET_PTX_6_0)
4917 warp_sync = emit_insn_after (gen_nvptx_warpsync (),
4918 label_insn);
4919 else
4920 warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (),
4921 label_insn);
4922 }
4923 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER)
4924 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL))
4925 emit_insn_after (gen_exit (), label_insn);
4926 }
4927
4928 *mode_label = label_insn;
4929 }
4930
4931 /* Now deal with propagating the branch condition. */
4932 if (cond_branch)
4933 {
4934 rtx pvar = XEXP (XEXP (cond_branch, 0), 0);
4935
4936 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask
4937 && nvptx_mach_vector_length () == PTX_WARP_SIZE)
4938 {
4939 /* Vector mode only, do a shuffle. */
4940 #if WORKAROUND_PTXJIT_BUG
4941 /* The branch condition %rcond is propagated like this:
4942
4943 {
4944 .reg .u32 %x;
4945 mov.u32 %x,%tid.x;
4946 setp.ne.u32 %rnotvzero,%x,0;
4947 }
4948
4949 @%rnotvzero bra Lskip;
4950 setp.<op>.<type> %rcond,op1,op2;
4951 Lskip:
4952 selp.u32 %rcondu32,1,0,%rcond;
4953 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4954 setp.ne.u32 %rcond,%rcondu32,0;
4955
4956 There seems to be a bug in the ptx JIT compiler (observed at driver
4957 version 381.22, at -O1 and higher for sm_61), that drops the shfl
4958 unless %rcond is initialized to something before 'bra Lskip'. The
4959 bug is not observed with ptxas from cuda 8.0.61.
4960
4961 It is true that the code is non-trivial: at Lskip, %rcond is
4962 uninitialized in threads 1-31, and after the selp the same holds
4963 for %rcondu32. But shfl propagates the defined value in thread 0
4964 to threads 1-31, so after the shfl %rcondu32 is defined in threads
4965 0-31, and after the setp.ne %rcond is defined in threads 0-31.
4966
4967 There is nothing in the PTX spec to suggest that this is wrong, or
4968 to explain why the extra initialization is needed. So, we classify
4969 it as a JIT bug, and the extra initialization as workaround:
4970
4971 {
4972 .reg .u32 %x;
4973 mov.u32 %x,%tid.x;
4974 setp.ne.u32 %rnotvzero,%x,0;
4975 }
4976
4977 +.reg .pred %rcond2;
4978 +setp.eq.u32 %rcond2, 1, 0;
4979
4980 @%rnotvzero bra Lskip;
4981 setp.<op>.<type> %rcond,op1,op2;
4982 +mov.pred %rcond2, %rcond;
4983 Lskip:
4984 +mov.pred %rcond, %rcond2;
4985 selp.u32 %rcondu32,1,0,%rcond;
4986 shfl.idx.b32 %rcondu32,%rcondu32,0,31;
4987 setp.ne.u32 %rcond,%rcondu32,0;
4988 */
4989 rtx_insn *label = PREV_INSN (tail);
4990 if (label == warp_sync)
4991 label = PREV_INSN (label);
4992 gcc_assert (label && LABEL_P (label));
4993 rtx tmp = gen_reg_rtx (BImode);
4994 emit_insn_before (gen_movbi (tmp, const0_rtx),
4995 bb_first_real_insn (from));
4996 emit_insn_before (gen_rtx_SET (tmp, pvar), label);
4997 emit_insn_before (gen_rtx_SET (pvar, tmp), tail);
4998 #endif
4999 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail);
5000 }
5001 else
5002 {
5003 /* Includes worker mode, do spill & fill. By construction
5004 we should never have worker mode only. */
5005 broadcast_data_t data;
5006 unsigned size = GET_MODE_SIZE (SImode);
5007 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0;
5008 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0;
5009 rtx barrier = GEN_INT (0);
5010 int threads = 0;
5011
5012 data.base = oacc_bcast_sym;
5013 data.ptr = 0;
5014
5015 bool use_partitioning_p = (vector && !worker
5016 && nvptx_mach_max_workers () > 1
5017 && cfun->machine->bcast_partition);
5018 if (use_partitioning_p)
5019 {
5020 data.base = cfun->machine->bcast_partition;
5021 barrier = cfun->machine->sync_bar;
5022 threads = nvptx_mach_vector_length ();
5023 }
5024 gcc_assert (data.base != NULL);
5025 gcc_assert (barrier);
5026
5027 unsigned int psize = ROUND_UP (size, oacc_bcast_align);
5028 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE
5029 ? nvptx_mach_max_workers () + 1
5030 : 1);
5031
5032 oacc_bcast_partition = MAX (oacc_bcast_partition, psize);
5033 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum);
5034
5035 data.offset = 0;
5036 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data,
5037 vector),
5038 before);
5039
5040 /* Barrier so other workers can see the write. */
5041 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
5042 data.offset = 0;
5043 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data,
5044 vector),
5045 tail);
5046 /* This barrier is needed to avoid worker zero clobbering
5047 the broadcast buffer before all the other workers have
5048 had a chance to read this instance of it. */
5049 emit_insn_before (nvptx_cta_sync (barrier, threads), tail);
5050 }
5051
5052 extract_insn (tail);
5053 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar),
5054 UNSPEC_BR_UNIFIED);
5055 validate_change (tail, recog_data.operand_loc[0], unsp, false);
5056 }
5057
5058 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump,
5059 vector_label, worker_label);
5060 if (!seen_label)
5061 verify_neutering_labels (to, vector_label, worker_label);
5062 }
5063
5064 /* PAR is a parallel that is being skipped in its entirety according to
5065 MASK. Treat this as skipping a superblock starting at forked
5066 and ending at joining. */
5067
5068 static void
5069 nvptx_skip_par (unsigned mask, parallel *par)
5070 {
5071 basic_block tail = par->join_block;
5072 gcc_assert (tail->preds->length () == 1);
5073
5074 basic_block pre_tail = (*tail->preds)[0]->src;
5075 gcc_assert (pre_tail->succs->length () == 1);
5076
5077 nvptx_single (mask, par->forked_block, pre_tail);
5078 }
5079
5080 /* If PAR has a single inner parallel and PAR itself only contains
5081 empty entry and exit blocks, swallow the inner PAR. */
5082
5083 static void
5084 nvptx_optimize_inner (parallel *par)
5085 {
5086 parallel *inner = par->inner;
5087
5088 /* We mustn't be the outer dummy par. */
5089 if (!par->mask)
5090 return;
5091
5092 /* We must have a single inner par. */
5093 if (!inner || inner->next)
5094 return;
5095
5096 /* We must only contain 2 blocks ourselves -- the head and tail of
5097 the inner par. */
5098 if (par->blocks.length () != 2)
5099 return;
5100
5101 /* The two pars must use disjoint partitioning. As we only have
5102 vector and worker partitioning, this is sufficient to guarantee
5103 they have adjacent partitioning. */
5104 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1))
5105 /* This indicates malformed code generation. */
5106 return;
5107
5108 /* The outer forked insn should be immediately followed by the inner
5109 fork insn. */
5110 rtx_insn *forked = par->forked_insn;
5111 rtx_insn *fork = BB_END (par->forked_block);
5112
5113 if (NEXT_INSN (forked) != fork)
5114 return;
5115 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork);
5116
5117 /* The outer joining insn must immediately follow the inner join
5118 insn. */
5119 rtx_insn *joining = par->joining_insn;
5120 rtx_insn *join = inner->join_insn;
5121 if (NEXT_INSN (join) != joining)
5122 return;
5123
5124 /* Preconditions met. Swallow the inner par. */
5125 if (dump_file)
5126 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n",
5127 inner->mask, inner->forked_block->index,
5128 inner->join_block->index,
5129 par->mask, par->forked_block->index, par->join_block->index);
5130
5131 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1);
5132
5133 par->blocks.reserve (inner->blocks.length ());
5134 while (inner->blocks.length ())
5135 par->blocks.quick_push (inner->blocks.pop ());
5136
5137 par->inner = inner->inner;
5138 inner->inner = NULL;
5139
5140 delete inner;
5141 }
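
/* Illustrative example (not part of the original source): a worker par
   whose only content is a single vector par,

     forked (worker)   ; outer head
     fork (vector)     ; immediately follows
       ... body ...
     join (vector)     ; inner tail
     joining (worker)  ; immediately follows

   collapses into a single par with mask worker|vector, saving one
   level of neutering.  */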
5142
5143 /* Process the parallel PAR and all its contained
5144 parallels. We do everything but the neutering. Return mask of
5145 partitioned modes used within this parallel. */
5146
5147 static unsigned
5148 nvptx_process_pars (parallel *par)
5149 {
5150 if (nvptx_optimize)
5151 nvptx_optimize_inner (par);
5152
5153 unsigned inner_mask = par->mask;
5154
5155 /* Do the inner parallels first. */
5156 if (par->inner)
5157 {
5158 par->inner_mask = nvptx_process_pars (par->inner);
5159 inner_mask |= par->inner_mask;
5160 }
5161
5162 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0;
5163 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER));
5164 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5165 && nvptx_mach_vector_length () > PTX_WARP_SIZE);
5166
5167 if (worker || large_vector)
5168 {
5169 nvptx_shared_propagate (false, is_call, par->forked_block,
5170 par->forked_insn, !worker);
5171 bool no_prop_p
5172 = nvptx_shared_propagate (true, is_call, par->forked_block,
5173 par->fork_insn, !worker);
5174 bool empty_loop_p
5175 = !is_call && (NEXT_INSN (par->forked_insn)
5176 && NEXT_INSN (par->forked_insn) == par->joining_insn);
5177 rtx barrier = GEN_INT (0);
5178 int threads = 0;
5179
5180 if (!worker && cfun->machine->sync_bar)
5181 {
5182 barrier = cfun->machine->sync_bar;
5183 threads = nvptx_mach_vector_length ();
5184 }
5185
5186 if (no_prop_p && empty_loop_p)
5187 ;
5188 else if (no_prop_p && is_call)
5189 ;
5190 else
5191 {
5192 /* Insert begin and end synchronizations. */
5193 emit_insn_before (nvptx_cta_sync (barrier, threads),
5194 par->forked_insn);
5195 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn);
5196 }
5197 }
5198 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))
5199 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn);
5200
5201 /* Now do siblings. */
5202 if (par->next)
5203 inner_mask |= nvptx_process_pars (par->next);
5204 return inner_mask;
5205 }
5206
5207 /* Neuter the parallel described by PAR. We recurse in depth-first
5208 order. MODES are the partitioning of the execution and OUTER is
5209 the partitioning of the parallels we are contained in. */
5210
5211 static void
5212 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
5213 {
5214 unsigned me = (par->mask
5215 & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
5216 | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
5217 unsigned skip_mask = 0, neuter_mask = 0;
5218
5219 if (par->inner)
5220 nvptx_neuter_pars (par->inner, modes, outer | me);
5221
5222 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++)
5223 {
5224 if ((outer | me) & GOMP_DIM_MASK (mode))
5225 {} /* Mode is partitioned: no neutering. */
5226 else if (!(modes & GOMP_DIM_MASK (mode)))
5227 {} /* Mode is not used: nothing to do. */
5228 else if (par->inner_mask & GOMP_DIM_MASK (mode)
5229 || !par->forked_insn)
5230 /* Partitioned in inner parallels, or we're not partitioned
5231 at all: neuter individual blocks. */
5232 neuter_mask |= GOMP_DIM_MASK (mode);
5233 else if (!par->parent || !par->parent->forked_insn
5234 || par->parent->inner_mask & GOMP_DIM_MASK (mode))
5235 /* Parent isn't a parallel, or already partitions this mode:
5236 skip the parallel at this level. */
5237 skip_mask |= GOMP_DIM_MASK (mode);
5238 else
5239 {} /* Parent will skip this parallel itself. */
5240 }
5241
5242 if (neuter_mask)
5243 {
5244 int ix, len;
5245
5246 if (nvptx_optimize)
5247 {
5248 /* Neuter whole SESE regions. */
5249 bb_pair_vec_t regions;
5250
5251 nvptx_find_sese (par->blocks, regions);
5252 len = regions.length ();
5253 for (ix = 0; ix != len; ix++)
5254 {
5255 basic_block from = regions[ix].first;
5256 basic_block to = regions[ix].second;
5257
5258 if (from)
5259 nvptx_single (neuter_mask, from, to);
5260 else
5261 gcc_assert (!to);
5262 }
5263 }
5264 else
5265 {
5266 /* Neuter each BB individually. */
5267 len = par->blocks.length ();
5268 for (ix = 0; ix != len; ix++)
5269 {
5270 basic_block block = par->blocks[ix];
5271
5272 nvptx_single (neuter_mask, block, block);
5273 }
5274 }
5275 }
5276
5277 if (skip_mask)
5278 nvptx_skip_par (skip_mask, par);
5279
5280 if (par->next)
5281 nvptx_neuter_pars (par->next, modes, outer);
5282 }
5283
5284 static void
5285 populate_offload_attrs (offload_attrs *oa)
5286 {
5287 tree attr = oacc_get_fn_attrib (current_function_decl);
5288 tree dims = TREE_VALUE (attr);
5289 unsigned ix;
5290
5291 oa->mask = 0;
5292
5293 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims))
5294 {
5295 tree t = TREE_VALUE (dims);
5296 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t);
5297 tree allowed = TREE_PURPOSE (dims);
5298
5299 if (size != 1 && !(allowed && integer_zerop (allowed)))
5300 oa->mask |= GOMP_DIM_MASK (ix);
5301
5302 switch (ix)
5303 {
5304 case GOMP_DIM_GANG:
5305 oa->num_gangs = size;
5306 break;
5307
5308 case GOMP_DIM_WORKER:
5309 oa->num_workers = size;
5310 break;
5311
5312 case GOMP_DIM_VECTOR:
5313 oa->vector_length = size;
5314 break;
5315 }
5316 }
5317 }
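
/* Illustrative example (not part of the original source): for an
   OpenACC function with dimensions (gang=32, worker=1, vector=128),
   the loop above sets OA->num_gangs = 32, OA->num_workers = 1 and
   OA->vector_length = 128, and OA->mask gets bits for the gang and
   vector axes only, since the worker axis has size 1.  */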
5318
5319 #if WORKAROUND_PTXJIT_BUG_2
5320 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant
5321 is needed in the nvptx target because the branches generated for
5322 partitioning are NONJUMP_INSN_P, not JUMP_P. */
5323
5324 static rtx
5325 nvptx_pc_set (const rtx_insn *insn, bool strict = true)
5326 {
5327 rtx pat;
5328 if ((strict && !JUMP_P (insn))
5329 || (!strict && !INSN_P (insn)))
5330 return NULL_RTX;
5331 pat = PATTERN (insn);
5332
5333 /* The set is allowed to appear either as the insn pattern or
5334 the first set in a PARALLEL. */
5335 if (GET_CODE (pat) == PARALLEL)
5336 pat = XVECEXP (pat, 0, 0);
5337 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC)
5338 return pat;
5339
5340 return NULL_RTX;
5341 }
5342
5343 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */
5344
5345 static rtx
5346 nvptx_condjump_label (const rtx_insn *insn, bool strict = true)
5347 {
5348 rtx x = nvptx_pc_set (insn, strict);
5349
5350 if (!x)
5351 return NULL_RTX;
5352 x = SET_SRC (x);
5353 if (GET_CODE (x) == LABEL_REF)
5354 return x;
5355 if (GET_CODE (x) != IF_THEN_ELSE)
5356 return NULL_RTX;
5357 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF)
5358 return XEXP (x, 1);
5359 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF)
5360 return XEXP (x, 2);
5361 return NULL_RTX;
5362 }
5363
5364 /* Insert a dummy PTX insn when encountering a branch to a label with no PTX
5365 insn in between the branch and the label. This works around a JIT bug
5366 observed at driver version 384.111, at -O0 for sm_50. */
5367
5368 static void
5369 prevent_branch_around_nothing (void)
5370 {
5371 rtx_insn *seen_label = NULL;
5372 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
5373 {
5374 if (INSN_P (insn) && condjump_p (insn))
5375 {
5376 seen_label = label_ref_label (nvptx_condjump_label (insn, false));
5377 continue;
5378 }
5379
5380 if (seen_label == NULL)
5381 continue;
5382
5383 if (NOTE_P (insn) || DEBUG_INSN_P (insn))
5384 continue;
5385
5386 if (INSN_P (insn))
5387 switch (recog_memoized (insn))
5388 {
5389 case CODE_FOR_nvptx_fork:
5390 case CODE_FOR_nvptx_forked:
5391 case CODE_FOR_nvptx_joining:
5392 case CODE_FOR_nvptx_join:
5393 case CODE_FOR_nop:
5394 continue;
5395 case -1:
5396 /* Handle asm ("") and similar. */
5397 if (GET_CODE (PATTERN (insn)) == ASM_INPUT
5398 || GET_CODE (PATTERN (insn)) == ASM_OPERANDS
5399 || (GET_CODE (PATTERN (insn)) == PARALLEL
5400 && asm_noperands (PATTERN (insn)) >= 0))
5401 continue;
5402 /* FALLTHROUGH. */
5403 default:
5404 seen_label = NULL;
5405 continue;
5406 }
5407
5408 if (LABEL_P (insn) && insn == seen_label)
5409 emit_insn_before (gen_fake_nop (), insn);
5410
5411 seen_label = NULL;
5412 }
5413 }
5414 #endif

#ifdef WORKAROUND_PTXJIT_BUG_3
/* Insert two membar.cta insns in between two subsequent bar.sync insns.  This
   works around a hang observed at driver version 390.48, for sm_50.  */

static void
workaround_barsyncs (void)
{
  bool seen_barsync = false;
  for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn))
    {
      if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync)
	{
	  if (seen_barsync)
	    {
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	      emit_insn_before (gen_nvptx_membar_cta (), insn);
	    }

	  seen_barsync = true;
	  continue;
	}

      if (!seen_barsync)
	continue;

      if (NOTE_P (insn) || DEBUG_INSN_P (insn))
	continue;
      else if (INSN_P (insn))
	switch (recog_memoized (insn))
	  {
	  case CODE_FOR_nvptx_fork:
	  case CODE_FOR_nvptx_forked:
	  case CODE_FOR_nvptx_joining:
	  case CODE_FOR_nvptx_join:
	    continue;
	  default:
	    break;
	  }

      seen_barsync = false;
    }
}
#endif

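/* Build an ASM_INPUT rtx holding string S formatted as an assembler comment,
   located at the current function's source location.  */
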
static rtx
gen_comment (const char *s)
{
  const char *sep = " ";
  size_t len = strlen (ASM_COMMENT_START) + strlen (sep) + strlen (s) + 1;
  char *comment = (char *) alloca (len);
  snprintf (comment, len, "%s%s%s", ASM_COMMENT_START, sep, s);
  return gen_rtx_ASM_INPUT_loc (VOIDmode, ggc_strdup (comment),
				DECL_SOURCE_LOCATION (cfun->decl));
}

/* Initialize all declared regs at function entry.
   Advantage   : Fool-proof.
   Disadvantage: Potentially creates a lot of long live ranges and adds a lot
		 of insns.  */

static void
workaround_uninit_method_1 (void)
{
  rtx_insn *first = get_insns ();
  rtx_insn *insert_here = NULL;

  for (int ix = LAST_VIRTUAL_REGISTER + 1; ix < max_reg_num (); ix++)
    {
      rtx reg = regno_reg_rtx[ix];

      /* Skip undeclared registers.  */
      if (reg == const0_rtx)
	continue;

      gcc_assert (CONST0_RTX (GET_MODE (reg)));

      start_sequence ();
      if (nvptx_comment && first != NULL)
	emit_insn (gen_comment ("Start: Added by -minit-regs=1"));
      emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
      rtx_insn *inits = get_insns ();
      end_sequence ();

      if (dump_file && (dump_flags & TDF_DETAILS))
	for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
	  fprintf (dump_file, "Default init of reg %u inserted: insn %u\n",
		   ix, INSN_UID (init));

      if (first != NULL)
	{
	  insert_here = emit_insn_before (inits, first);
	  first = NULL;
	}
      else
	insert_here = emit_insn_after (inits, insert_here);
    }

  if (nvptx_comment && insert_here != NULL)
    emit_insn_after (gen_comment ("End: Added by -minit-regs=1"), insert_here);
}

/* Find uses of regs that are not defined on all incoming paths, and insert a
   corresponding def at function entry.
   Advantage   : Simple.
   Disadvantage: Potentially creates long live ranges.
		 May not catch all cases.  F.i. a clobber cuts a live range in
		 the compiler and may prevent entry_lr_in from being set for a
		 reg, but the clobber does not translate to a ptx insn, so in
		 ptx there still may be an uninitialized ptx reg.  See f.i.
		 gcc.c-torture/compile/20020926-1.c.  */

static void
workaround_uninit_method_2 (void)
{
  auto_bitmap entry_pseudo_uninit;
  {
    auto_bitmap not_pseudo;
    bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);

    bitmap entry_lr_in = DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun));
    bitmap_and_compl (entry_pseudo_uninit, entry_lr_in, not_pseudo);
  }

  rtx_insn *first = get_insns ();
  rtx_insn *insert_here = NULL;

  bitmap_iterator iterator;
  unsigned ix;
  EXECUTE_IF_SET_IN_BITMAP (entry_pseudo_uninit, 0, ix, iterator)
    {
      rtx reg = regno_reg_rtx[ix];
      gcc_assert (CONST0_RTX (GET_MODE (reg)));

      start_sequence ();
      if (nvptx_comment && first != NULL)
	emit_insn (gen_comment ("Start: Added by -minit-regs=2:"));
      emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
      rtx_insn *inits = get_insns ();
      end_sequence ();

      if (dump_file && (dump_flags & TDF_DETAILS))
	for (rtx_insn *init = inits; init != NULL; init = NEXT_INSN (init))
	  fprintf (dump_file, "Missing init of reg %u inserted: insn %u\n",
		   ix, INSN_UID (init));

      if (first != NULL)
	{
	  insert_here = emit_insn_before (inits, first);
	  first = NULL;
	}
      else
	insert_here = emit_insn_after (inits, insert_here);
    }

  if (nvptx_comment && insert_here != NULL)
    emit_insn_after (gen_comment ("End: Added by -minit-regs=2"), insert_here);
}

/* Find uses of regs that are not defined on all incoming paths, and insert a
   corresponding def on those.
   Advantage   : Doesn't create long live ranges.
   Disadvantage: More complex, and potentially also more defs.  */

static void
workaround_uninit_method_3 (void)
{
  auto_bitmap not_pseudo;
  bitmap_set_range (not_pseudo, 0, LAST_VIRTUAL_REGISTER);

  basic_block bb;
  FOR_EACH_BB_FN (bb, cfun)
    {
      if (single_pred_p (bb))
	continue;

      auto_bitmap bb_pseudo_uninit;
      bitmap_and_compl (bb_pseudo_uninit, DF_LIVE_IN (bb), DF_MIR_IN (bb));
      bitmap_and_compl_into (bb_pseudo_uninit, not_pseudo);

      bitmap_iterator iterator;
      unsigned ix;
      EXECUTE_IF_SET_IN_BITMAP (bb_pseudo_uninit, 0, ix, iterator)
	{
	  bool have_false = false;
	  bool have_true = false;

	  edge e;
	  edge_iterator ei;
	  FOR_EACH_EDGE (e, ei, bb->preds)
	    {
	      if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
		have_true = true;
	      else
		have_false = true;
	    }
	  if (have_false ^ have_true)
	    continue;

	  FOR_EACH_EDGE (e, ei, bb->preds)
	    {
	      if (bitmap_bit_p (DF_LIVE_OUT (e->src), ix))
		continue;

	      rtx reg = regno_reg_rtx[ix];
	      gcc_assert (CONST0_RTX (GET_MODE (reg)));

	      start_sequence ();
	      emit_move_insn (reg, CONST0_RTX (GET_MODE (reg)));
	      rtx_insn *inits = get_insns ();
	      end_sequence ();

	      if (dump_file && (dump_flags & TDF_DETAILS))
		for (rtx_insn *init = inits; init != NULL;
		     init = NEXT_INSN (init))
		  fprintf (dump_file,
			   "Missing init of reg %u inserted on edge: %d -> %d:"
			   " insn %u\n", ix, e->src->index, e->dest->index,
			   INSN_UID (init));

	      insert_insn_on_edge (inits, e);
	    }
	}
    }

  if (nvptx_comment)
    FOR_EACH_BB_FN (bb, cfun)
      {
	if (single_pred_p (bb))
	  continue;

	edge e;
	edge_iterator ei;
	FOR_EACH_EDGE (e, ei, bb->preds)
	  {
	    if (e->insns.r == NULL_RTX)
	      continue;
	    start_sequence ();
	    emit_insn (gen_comment ("Start: Added by -minit-regs=3:"));
	    emit_insn (e->insns.r);
	    emit_insn (gen_comment ("End: Added by -minit-regs=3:"));
	    e->insns.r = get_insns ();
	    end_sequence ();
	  }
      }

  commit_edge_insertions ();
}

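/* Dispatch to the register-initialization workaround selected with
   -minit-regs=N.  */
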
static void
workaround_uninit (void)
{
  switch (nvptx_init_regs)
    {
    case 0:
      /* Skip.  */
      break;
    case 1:
      workaround_uninit_method_1 ();
      break;
    case 2:
      workaround_uninit_method_2 ();
      break;
    case 3:
      workaround_uninit_method_3 ();
      break;
    default:
      gcc_unreachable ();
    }
}

/* PTX-specific reorganization
   - Split blocks at fork and join instructions
   - Compute live registers
   - Mark now-unused registers, so function begin doesn't declare
     unused registers.
   - Insert state propagation when entering partitioned mode
   - Insert neutering instructions when in single mode
   - Replace subregs with suitable sequences.
*/

static void
nvptx_reorg (void)
{
  /* We are freeing block_for_insn in the toplev to keep compatibility
     with old MDEP_REORGS that are not CFG based.  Recompute it now.  */
  compute_bb_for_insn ();

  thread_prologue_and_epilogue_insns ();

  /* Split blocks and record interesting unspecs.  */
  bb_insn_map_t bb_insn_map;

  nvptx_split_blocks (&bb_insn_map);

  /* Compute live regs.  */
  df_clear_flags (DF_LR_RUN_DCE);
  df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS);
  df_live_add_problem ();
  df_live_set_all_dirty ();
  if (nvptx_init_regs == 3)
    df_mir_add_problem ();
  df_analyze ();
  regstat_init_n_sets_and_refs ();

  if (dump_file)
    df_dump (dump_file);

  /* Mark unused regs as unused.  */
  int max_regs = max_reg_num ();
  for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++)
    if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0)
      regno_reg_rtx[i] = const0_rtx;

  workaround_uninit ();

  /* Determine launch dimensions of the function.  If it is not an
     offloaded function (i.e. this is a regular compiler), the
     function has no neutering.  */
  tree attr = oacc_get_fn_attrib (current_function_decl);
  if (attr)
    {
      /* If we determined this mask before RTL expansion, we could
	 elide emission of some levels of forks and joins.  */
      offload_attrs oa;

      populate_offload_attrs (&oa);

      /* If there is worker neutering, there must be vector
	 neutering.  Otherwise the hardware will fail.  */
      gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
		  || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)));

      /* Discover & process partitioned regions.  */
      parallel *pars = nvptx_discover_pars (&bb_insn_map);
      nvptx_process_pars (pars);
      nvptx_neuter_pars (pars, oa.mask, 0);
      delete pars;
    }

  /* Replace subregs.  */
  nvptx_reorg_subreg ();

  if (TARGET_UNIFORM_SIMT)
    nvptx_reorg_uniform_simt ();

#if WORKAROUND_PTXJIT_BUG_2
  prevent_branch_around_nothing ();
#endif

#ifdef WORKAROUND_PTXJIT_BUG_3
  workaround_barsyncs ();
#endif

  regstat_free_n_sets_and_refs ();

  df_finish_pass (true);
}

/* Handle a "kernel" attribute; arguments as in
   struct attribute_spec.handler.  */

static tree
nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args),
			       int ARG_UNUSED (flags), bool *no_add_attrs)
{
  tree decl = *node;

  if (TREE_CODE (decl) != FUNCTION_DECL)
    {
      error ("%qE attribute only applies to functions", name);
      *no_add_attrs = true;
    }
  else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl))))
    {
      error ("%qE attribute requires a void return type", name);
      *no_add_attrs = true;
    }

  return NULL_TREE;
}

/* Handle a "shared" attribute; arguments as in
   struct attribute_spec.handler.  */

static tree
nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args),
			       int ARG_UNUSED (flags), bool *no_add_attrs)
{
  tree decl = *node;

  if (TREE_CODE (decl) != VAR_DECL)
    {
      error ("%qE attribute only applies to variables", name);
      *no_add_attrs = true;
    }
  else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl)))
    {
      error ("%qE attribute not allowed with auto storage class", name);
      *no_add_attrs = true;
    }

  return NULL_TREE;
}

/* Table of valid machine attributes.  */
static const struct attribute_spec nvptx_attribute_table[] =
{
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute,
    NULL },
  { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute,
    NULL },
  { NULL, 0, 0, false, false, false, false, NULL, NULL }
};

/* Limit vector alignments to BIGGEST_ALIGNMENT.  */

static HOST_WIDE_INT
nvptx_vector_alignment (const_tree type)
{
  unsigned HOST_WIDE_INT align;
  tree size = TYPE_SIZE (type);

  /* Ensure align is not bigger than BIGGEST_ALIGNMENT.  */
  if (tree_fits_uhwi_p (size))
    {
      align = tree_to_uhwi (size);
      align = MIN (align, BIGGEST_ALIGNMENT);
    }
  else
    align = BIGGEST_ALIGNMENT;

  /* Ensure align is not smaller than mode alignment.  */
  align = MAX (align, GET_MODE_ALIGNMENT (TYPE_MODE (type)));

  return align;
}

/* Indicate that INSN cannot be duplicated.  */

static bool
nvptx_cannot_copy_insn_p (rtx_insn *insn)
{
  switch (recog_memoized (insn))
    {
    case CODE_FOR_nvptx_shufflesi:
    case CODE_FOR_nvptx_shufflesf:
    case CODE_FOR_nvptx_barsync:
    case CODE_FOR_nvptx_fork:
    case CODE_FOR_nvptx_forked:
    case CODE_FOR_nvptx_joining:
    case CODE_FOR_nvptx_join:
      return true;
    default:
      return false;
    }
}

/* Section anchors do not work.  Initialization for flag_section_anchor
   probes the existence of the anchoring target hooks and prevents
   anchoring if they don't exist.  However, we may be used with a
   host-side compiler that does support anchoring, and hence see
   the anchor flag set (as it's not recalculated).  So provide an
   implementation denying anchoring.  */

static bool
nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a))
{
  return false;
}

/* Record a symbol for mkoffload to enter into the mapping table.  */

static void
nvptx_record_offload_symbol (tree decl)
{
  switch (TREE_CODE (decl))
    {
    case VAR_DECL:
      fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n",
	       IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));
      break;

    case FUNCTION_DECL:
      {
	tree attr = oacc_get_fn_attrib (decl);
	/* OpenMP offloading does not set this attribute.  */
	tree dims = attr ? TREE_VALUE (attr) : NULL_TREE;

	fprintf (asm_out_file, "//:FUNC_MAP \"%s\"",
		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)));

	for (; dims; dims = TREE_CHAIN (dims))
	  {
	    int size = TREE_INT_CST_LOW (TREE_VALUE (dims));

	    gcc_assert (!TREE_PURPOSE (dims));
	    fprintf (asm_out_file, ", %#x", size);
	  }

	fprintf (asm_out_file, "\n");
      }
      break;

    default:
      gcc_unreachable ();
    }
}

/* Implement TARGET_ASM_FILE_START.  Write the kinds of things ptxas expects
   at the start of a file.  */

static void
nvptx_file_start (void)
{
  fputs ("// BEGIN PREAMBLE\n", asm_out_file);

  fputs ("\t.version\t", asm_out_file);
  fputs (ptx_version_to_string ((enum ptx_version) ptx_version_option),
	 asm_out_file);
  fputs ("\n", asm_out_file);

  fputs ("\t.target\tsm_", asm_out_file);
  fputs (sm_version_to_string ((enum ptx_isa) ptx_isa_option),
	 asm_out_file);
  fputs ("\n", asm_out_file);

  fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode));

  fputs ("// END PREAMBLE\n", asm_out_file);
}

/* Emit a declaration for a worker and vector-level buffer in .shared
   memory.  */

static void
write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size)
{
  const char *name = XSTR (sym, 0);

  write_var_marker (file, true, false, name);
  fprintf (file, ".shared .align %d .u8 %s[%d];\n",
	   align, name, size);
}

/* Write out the function declarations we've collected and declare storage
   for the broadcast buffer.  */

static void
nvptx_file_end (void)
{
  hash_table<tree_hasher>::iterator iter;
  tree decl;
  FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter)
    nvptx_record_fndecl (decl);
  fputs (func_decls.str ().c_str (), asm_out_file);

  if (oacc_bcast_size)
    write_shared_buffer (asm_out_file, oacc_bcast_sym,
			 oacc_bcast_align, oacc_bcast_size);

  if (worker_red_size)
    write_shared_buffer (asm_out_file, worker_red_sym,
			 worker_red_align, worker_red_size);

  if (vector_red_size)
    write_shared_buffer (asm_out_file, vector_red_sym,
			 vector_red_align, vector_red_size);

  if (gang_private_shared_size)
    write_shared_buffer (asm_out_file, gang_private_shared_sym,
			 gang_private_shared_align, gang_private_shared_size);

  if (need_softstack_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_stacks");
      /* 32 is the maximum number of warps in a block.  Even though it's an
	 external declaration, emit the array size explicitly; otherwise, it
	 may fail at PTX JIT time if the definition is later in link order.  */
      fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n",
	       POINTER_SIZE);
    }
  if (need_unisimt_decl)
    {
      write_var_marker (asm_out_file, false, true, "__nvptx_uni");
      fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n");
    }
}

/* Expander for the shuffle builtins.  */

static rtx
nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore)
{
  if (ignore)
    return target;

  rtx src = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, mode, EXPAND_NORMAL);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, SImode, EXPAND_NORMAL);
  rtx op = expand_expr (CALL_EXPR_ARG (exp, 2),
			NULL_RTX, SImode, EXPAND_NORMAL);

  if (!REG_P (idx) && GET_CODE (idx) != CONST_INT)
    idx = copy_to_mode_reg (SImode, idx);

  rtx pat = nvptx_gen_shuffle (target, src, idx,
			       (nvptx_shuffle_kind) INTVAL (op));
  if (pat)
    emit_insn (pat);

  return target;
}

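/* Output the PTX assembly that computes in DST the address of this
   function's slice of the vector reduction buffer, OFFSET bytes from the
   buffer's base register.  */
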
const char *
nvptx_output_red_partition (rtx dst, rtx offset)
{
  const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n";
  const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n";

  if (offset == const0_rtx)
    fprintf (asm_out_file, zero_offset, REGNO (dst),
	     REGNO (cfun->machine->red_partition));
  else
    fprintf (asm_out_file, with_offset, REGNO (dst),
	     REGNO (cfun->machine->red_partition), UINTVAL (offset));

  return "";
}

/* Shared-memory reduction address expander.  */

static rtx
nvptx_expand_shared_addr (tree exp, rtx target,
			  machine_mode ARG_UNUSED (mode), int ignore,
			  int vector)
{
  if (ignore)
    return target;

  unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2));
  unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0));
  unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1));
  rtx addr = worker_red_sym;

  if (vector)
    {
      offload_attrs oa;

      populate_offload_attrs (&oa);

      unsigned int psize = ROUND_UP (size + offset, align);
      unsigned int pnum = nvptx_mach_max_workers ();
      vector_red_partition = MAX (vector_red_partition, psize);
      vector_red_size = MAX (vector_red_size, psize * pnum);
      vector_red_align = MAX (vector_red_align, align);

      if (cfun->machine->red_partition == NULL)
	cfun->machine->red_partition = gen_reg_rtx (Pmode);

      addr = gen_reg_rtx (Pmode);
      emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset)));
    }
  else
    {
      worker_red_align = MAX (worker_red_align, align);
      worker_red_size = MAX (worker_red_size, size + offset);

      if (offset)
	{
	  addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset));
	  addr = gen_rtx_CONST (Pmode, addr);
	}
    }

  emit_move_insn (target, addr);
  return target;
}

/* Expand the CMP_SWAP PTX builtins.  We have our own versions that do
   not require taking the address of any object, other than the memory
   cell being operated on.  */

static rtx
nvptx_expand_cmp_swap (tree exp, rtx target,
		       machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore))
{
  machine_mode mode = TYPE_MODE (TREE_TYPE (exp));

  if (!target)
    target = gen_reg_rtx (mode);

  rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0),
			 NULL_RTX, Pmode, EXPAND_NORMAL);
  rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx src = expand_expr (CALL_EXPR_ARG (exp, 2),
			 NULL_RTX, mode, EXPAND_NORMAL);
  rtx pat;

  mem = gen_rtx_MEM (mode, mem);
  if (!REG_P (cmp))
    cmp = copy_to_mode_reg (mode, cmp);
  if (!REG_P (src))
    src = copy_to_mode_reg (mode, src);

  if (mode == SImode)
    pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx);
  else
    pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx);

  emit_insn (pat);

  return target;
}


/* Codes for all the NVPTX builtins.  */
enum nvptx_builtins
{
  NVPTX_BUILTIN_SHUFFLE,
  NVPTX_BUILTIN_SHUFFLELL,
  NVPTX_BUILTIN_WORKER_ADDR,
  NVPTX_BUILTIN_VECTOR_ADDR,
  NVPTX_BUILTIN_CMP_SWAP,
  NVPTX_BUILTIN_CMP_SWAPLL,
  NVPTX_BUILTIN_MEMBAR_GL,
  NVPTX_BUILTIN_MEMBAR_CTA,
  NVPTX_BUILTIN_MAX
};

static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX];

/* Return the NVPTX builtin for CODE.  */

static tree
nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p))
{
  if (code >= NVPTX_BUILTIN_MAX)
    return error_mark_node;

  return nvptx_builtin_decls[code];
}

/* Set up all builtin functions for this target.  */

static void
nvptx_init_builtins (void)
{
#define DEF(ID, NAME, T)						\
  (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID]				\
   = add_builtin_function ("__builtin_nvptx_" NAME,			\
			   build_function_type_list T,			\
			   NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL))
#define ST sizetype
#define UINT unsigned_type_node
#define LLUINT long_long_unsigned_type_node
#define PTRVOID ptr_type_node
#define VOID void_type_node

  DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE));
  DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE));
  DEF (WORKER_ADDR, "worker_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (VECTOR_ADDR, "vector_addr",
       (PTRVOID, ST, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE));
  DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE));
  DEF (MEMBAR_GL, "membar_gl", (VOID, VOID, NULL_TREE));
  DEF (MEMBAR_CTA, "membar_cta", (VOID, VOID, NULL_TREE));

#undef DEF
#undef ST
#undef UINT
#undef LLUINT
#undef PTRVOID
}

/* Expand an expression EXP that calls a built-in function,
   with result going to TARGET if that's convenient
   (and in mode MODE if that's convenient).
   SUBTARGET may be used as the target for computing one of EXP's operands.
   IGNORE is nonzero if the value is to be ignored.  */

static rtx
nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget),
		      machine_mode mode, int ignore)
{
  tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
  switch (DECL_MD_FUNCTION_CODE (fndecl))
    {
    case NVPTX_BUILTIN_SHUFFLE:
    case NVPTX_BUILTIN_SHUFFLELL:
      return nvptx_expand_shuffle (exp, target, mode, ignore);

    case NVPTX_BUILTIN_WORKER_ADDR:
      return nvptx_expand_shared_addr (exp, target, mode, ignore, false);

    case NVPTX_BUILTIN_VECTOR_ADDR:
      return nvptx_expand_shared_addr (exp, target, mode, ignore, true);

    case NVPTX_BUILTIN_CMP_SWAP:
    case NVPTX_BUILTIN_CMP_SWAPLL:
      return nvptx_expand_cmp_swap (exp, target, mode, ignore);

    case NVPTX_BUILTIN_MEMBAR_GL:
      emit_insn (gen_nvptx_membar_gl ());
      return NULL_RTX;

    case NVPTX_BUILTIN_MEMBAR_CTA:
      emit_insn (gen_nvptx_membar_cta ());
      return NULL_RTX;

    default: gcc_unreachable ();
    }
}

/* Implement TARGET_SIMT_VF target hook: number of threads in a warp.  */

static int
nvptx_simt_vf ()
{
  return PTX_WARP_SIZE;
}

/* Return 1 if TRAIT NAME is present in the OpenMP context's
   device trait set, return 0 if not present in any OpenMP context in the
   whole translation unit, or -1 if not present in the current OpenMP context
   but might be present in another OpenMP context in the same TU.  */

int
nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait,
				const char *name)
{
  switch (trait)
    {
    case omp_device_kind:
      return strcmp (name, "gpu") == 0;
    case omp_device_arch:
      return strcmp (name, "nvptx") == 0;
    case omp_device_isa:
#define NVPTX_SM(XX, SEP)				\
      {							\
	if (strcmp (name, "sm_" #XX) == 0)		\
	  return ptx_isa_option == PTX_ISA_SM ## XX;	\
      }
#include "nvptx-sm.def"
#undef NVPTX_SM
      return 0;
    default:
      gcc_unreachable ();
    }
}

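/* Return true if vector length L is well-formed for PTX, i.e. a positive
   multiple of the warp size.  */
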
static bool
nvptx_welformed_vector_length_p (int l)
{
  gcc_assert (l > 0);
  return l % PTX_WARP_SIZE == 0;
}

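/* Clamp the launch dimensions in DIMS to the limits of the PTX execution
   model.  */
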
static void
nvptx_apply_dim_limits (int dims[])
{
  /* Check that the vector_length is not too large.  */
  if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH)
    dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH;

  /* Check that the number of workers is not too large.  */
  if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH)
    dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH;

  /* Ensure that num_worker * vector_length <= cta size.  */
  if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
      && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE)
    dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;

  /* If we need a per-worker barrier ...  */
  if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0
      && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
    /* ... don't use more barriers than available.  */
    dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER],
				 PTX_NUM_PER_WORKER_BARRIERS);
}

/* Return true if FNDECL contains calls to vector-partitionable routines.  */

static bool
has_vector_partitionable_routine_calls_p (tree fndecl)
{
  if (!fndecl)
    return false;

  basic_block bb;
  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl))
    for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i);
	 gsi_next_nondebug (&i))
      {
	gimple *stmt = gsi_stmt (i);
	if (gimple_code (stmt) != GIMPLE_CALL)
	  continue;

	tree callee = gimple_call_fndecl (stmt);
	if (!callee)
	  continue;

	tree attrs = oacc_get_fn_attrib (callee);
	if (attrs == NULL_TREE)
	  return false;

	int partition_level = oacc_fn_attrib_level (attrs);
	bool seq_routine_p = partition_level == GOMP_DIM_MAX;
	if (!seq_routine_p)
	  return true;
      }

  return false;
}

/* As nvptx_goacc_validate_dims, but does not return bool to indicate whether
   DIMS has changed.  */

static void
nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used)
{
  bool oacc_default_dims_p = false;
  bool oacc_min_dims_p = false;
  bool offload_region_p = false;
  bool routine_p = false;
  bool routine_seq_p = false;
  int default_vector_length = -1;

  if (decl == NULL_TREE)
    {
      if (fn_level == -1)
	oacc_default_dims_p = true;
      else if (fn_level == -2)
	oacc_min_dims_p = true;
      else
	gcc_unreachable ();
    }
  else if (fn_level == -1)
    offload_region_p = true;
  else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX)
    {
      routine_p = true;
      routine_seq_p = fn_level == GOMP_DIM_MAX;
    }
  else
    gcc_unreachable ();

  if (oacc_min_dims_p)
    {
      gcc_assert (dims[GOMP_DIM_VECTOR] == 1);
      gcc_assert (dims[GOMP_DIM_WORKER] == 1);
      gcc_assert (dims[GOMP_DIM_GANG] == 1);

      dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
      return;
    }

  if (routine_p)
    {
      if (!routine_seq_p)
	dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;

      return;
    }

  if (oacc_default_dims_p)
    {
      /* -1  : not set
	 0   : set at runtime, f.i. -fopenacc-dims=-
	 >= 1: set at compile time, f.i. -fopenacc-dims=1.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);

      /* But -fopenacc-dims=- is not yet supported on trunk.  */
      gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
      gcc_assert (dims[GOMP_DIM_WORKER] != 0);
      gcc_assert (dims[GOMP_DIM_GANG] != 0);
    }

  if (offload_region_p)
    {
      /* -1  : not set
	 0   : set using variable, f.i. num_gangs (n)
	 >= 1: set using constant, f.i. num_gangs (1).  */
      gcc_assert (dims[GOMP_DIM_VECTOR] >= -1);
      gcc_assert (dims[GOMP_DIM_WORKER] >= -1);
      gcc_assert (dims[GOMP_DIM_GANG] >= -1);
    }

  if (offload_region_p)
    default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR);
  else
    /* oacc_default_dims_p.  */
    default_vector_length = PTX_DEFAULT_VECTOR_LENGTH;

  int old_dims[GOMP_DIM_MAX];
  unsigned int i;
  for (i = 0; i < GOMP_DIM_MAX; ++i)
    old_dims[i] = dims[i];

  const char *vector_reason = NULL;
  if (offload_region_p && has_vector_partitionable_routine_calls_p (decl))
    {
      default_vector_length = PTX_WARP_SIZE;

      if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE)
	{
	  vector_reason = G_("using %<vector_length (%d)%> due to call to"
			     " vector-partitionable routine, ignoring %d");
	  dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE;
	}
    }

  if (dims[GOMP_DIM_VECTOR] == 0)
    {
      vector_reason = G_("using %<vector_length (%d)%>, ignoring runtime setting");
      dims[GOMP_DIM_VECTOR] = default_vector_length;
    }

  if (dims[GOMP_DIM_VECTOR] > 0
      && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR]))
    dims[GOMP_DIM_VECTOR] = default_vector_length;

  nvptx_apply_dim_limits (dims);

  if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		vector_reason != NULL
		? vector_reason
		: G_("using %<vector_length (%d)%>, ignoring %d"),
		dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]);

  if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER])
    warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0,
		G_("using %<num_workers (%d)%>, ignoring %d"),
		dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]);

  if (oacc_default_dims_p)
    {
      if (dims[GOMP_DIM_VECTOR] < 0)
	dims[GOMP_DIM_VECTOR] = default_vector_length;
      if (dims[GOMP_DIM_WORKER] < 0)
	dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM;
      if (dims[GOMP_DIM_GANG] < 0)
	dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM;
      nvptx_apply_dim_limits (dims);
    }

  if (offload_region_p)
    {
      for (i = 0; i < GOMP_DIM_MAX; i++)
	{
	  if (!(dims[i] < 0))
	    continue;

	  if ((used & GOMP_DIM_MASK (i)) == 0)
	    /* Function oacc_validate_dims will apply the minimal dimension.  */
	    continue;

	  dims[i] = (i == GOMP_DIM_VECTOR
		     ? default_vector_length
		     : oacc_get_default_dim (i));
	}

      nvptx_apply_dim_limits (dims);
    }
}

/* Validate compute dimensions of an OpenACC offload or routine, fill
   in non-unity defaults.  FN_LEVEL indicates the level at which a
   routine might spawn a loop.  It is negative for non-routines.  If
   DECL is null, we are validating the default dimensions.  */

static bool
nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used)
{
  int old_dims[GOMP_DIM_MAX];
  unsigned int i;

  for (i = 0; i < GOMP_DIM_MAX; ++i)
    old_dims[i] = dims[i];

  nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used);

  gcc_assert (dims[GOMP_DIM_VECTOR] != 0);
  if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0)
    gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE);

  for (i = 0; i < GOMP_DIM_MAX; ++i)
    if (old_dims[i] != dims[i])
      return true;

  return false;
}

/* Return maximum dimension size, or zero for unbounded.  */

static int
nvptx_dim_limit (int axis)
{
  switch (axis)
    {
    case GOMP_DIM_VECTOR:
      return PTX_MAX_VECTOR_LENGTH;

    default:
      break;
    }
  return 0;
}

/* Determine whether fork & joins are needed.  */

static bool
nvptx_goacc_fork_join (gcall *call, const int dims[],
		       bool ARG_UNUSED (is_fork))
{
  tree arg = gimple_call_arg (call, 2);
  unsigned axis = TREE_INT_CST_LOW (arg);

  /* We only care about worker and vector partitioning.  */
  if (axis < GOMP_DIM_WORKER)
    return false;

  /* If the size is 1, there's no partitioning.  */
  if (dims[axis] == 1)
    return false;

  return true;
}

/* Generate a PTX builtin function call that returns the address in
   the worker reduction buffer at OFFSET.  TYPE is the type of the
   data at that location.  */

static tree
nvptx_get_shared_red_addr (tree type, tree offset, bool vector)
{
  enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR;
  if (vector)
    addr_dim = NVPTX_BUILTIN_VECTOR_ADDR;
  machine_mode mode = TYPE_MODE (type);
  tree fndecl = nvptx_builtin_decl (addr_dim, true);
  tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode));
  tree align = build_int_cst (unsigned_type_node,
			      GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT);
  tree call = build_call_expr (fndecl, 3, offset, size, align);

  return fold_convert (build_pointer_type (type), call);
}

/* Emit a SHFL.DOWN using index SHIFT of VAR into DEST_VAR.  This function
   will cast the variable if necessary.  */

static void
nvptx_generate_vector_shuffle (location_t loc,
			       tree dest_var, tree var, unsigned shift,
			       gimple_seq *seq)
{
  unsigned fn = NVPTX_BUILTIN_SHUFFLE;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);
  tree dest_type = var_type;

  if (TREE_CODE (var_type) == COMPLEX_TYPE)
    var_type = TREE_TYPE (var_type);

  if (TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type)
      == TYPE_SIZE (long_long_unsigned_type_node))
    {
      fn = NVPTX_BUILTIN_SHUFFLELL;
      arg_type = long_long_unsigned_type_node;
    }

  tree call = nvptx_builtin_decl (fn, true);
  tree bits = build_int_cst (unsigned_type_node, shift);
  tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN);
  tree expr;

  if (var_type != dest_type)
    {
      /* Do real and imaginary parts separately.  */
      tree real = fold_build1 (REALPART_EXPR, var_type, var);
      real = fold_build1 (code, arg_type, real);
      real = build_call_expr_loc (loc, call, 3, real, bits, kind);
      real = fold_build1 (code, var_type, real);

      tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
      imag = fold_build1 (code, arg_type, imag);
      imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
      imag = fold_build1 (code, var_type, imag);

      expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
    }
  else
    {
      expr = fold_build1 (code, arg_type, var);
      expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
      expr = fold_build1 (code, dest_type, expr);
    }

  gimplify_assign (dest_var, expr, seq);
}

/* Lazily generate the global lock var decl and return its address.  */

static tree
nvptx_global_lock_addr ()
{
  tree v = global_lock_var;

  if (!v)
    {
      tree name = get_identifier ("__reduction_lock");
      tree type = build_qualified_type (unsigned_type_node,
					TYPE_QUAL_VOLATILE);
      v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
      global_lock_var = v;
      DECL_ARTIFICIAL (v) = 1;
      DECL_EXTERNAL (v) = 1;
      TREE_STATIC (v) = 1;
      TREE_PUBLIC (v) = 1;
      TREE_USED (v) = 1;
      mark_addressable (v);
      mark_decl_referenced (v);
    }

  return build_fold_addr_expr (v);
}

/* Insert code to locklessly update *PTR with *PTR OP VAR just before
   GSI.  We use a lockless scheme for nearly all cases, which looks
   like:
     actual = initval (OP);
     do {
       guess = actual;
       write = guess OP myval;
       actual = cmp&swap (ptr, guess, write)
     } while (actual bit-different-to guess);
     return write;

   This relies on a cmp&swap instruction, which is available for 32-
   and 64-bit types.  Larger types must use a locking scheme.  */

static tree
nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op)
{
  unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
  tree_code code = NOP_EXPR;
  tree arg_type = unsigned_type_node;
  tree var_type = TREE_TYPE (var);

  if (TREE_CODE (var_type) == COMPLEX_TYPE
      || TREE_CODE (var_type) == REAL_TYPE)
    code = VIEW_CONVERT_EXPR;

  if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
    {
      arg_type = long_long_unsigned_type_node;
      fn = NVPTX_BUILTIN_CMP_SWAPLL;
    }

  tree swap_fn = nvptx_builtin_decl (fn, true);

  gimple_seq init_seq = NULL;
  tree init_var = make_ssa_name (arg_type);
  tree init_expr = omp_reduction_init_op (loc, op, var_type);
  init_expr = fold_build1 (code, arg_type, init_expr);
  gimplify_assign (init_var, init_expr, &init_seq);
  gimple *init_end = gimple_seq_last (init_seq);

  gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);

  /* Split the block just after the init stmts.  */
  basic_block pre_bb = gsi_bb (*gsi);
  edge pre_edge = split_block (pre_bb, init_end);
  basic_block loop_bb = pre_edge->dest;
  pre_bb = pre_edge->src;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  tree expect_var = make_ssa_name (arg_type);
  tree actual_var = make_ssa_name (arg_type);
  tree write_var = make_ssa_name (arg_type);

  /* Build and insert the reduction calculation.  */
  gimple_seq red_seq = NULL;
  tree write_expr = fold_build1 (code, var_type, expect_var);
  write_expr = fold_build2 (op, var_type, write_expr, var);
  write_expr = fold_build1 (code, arg_type, write_expr);
  gimplify_assign (write_var, write_expr, &red_seq);

  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the cmp&swap sequence.  */
  gimple_seq latch_seq = NULL;
  tree swap_expr = build_call_expr_loc (loc, swap_fn, 3,
					ptr, expect_var, write_var);
  gimplify_assign (actual_var, swap_expr, &latch_seq);

  gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&latch_seq, cond);

  gimple *latch_end = gimple_seq_last (latch_seq);
  gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT);

  /* Split the block just after the latch stmts.  */
  edge post_edge = split_block (loop_bb, latch_end);
  basic_block post_bb = post_edge->dest;
  loop_bb = post_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  post_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb);
  set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb);

  gphi *phi = create_phi_node (expect_var, loop_bb);
  add_phi_arg (phi, init_var, pre_edge, loc);
  add_phi_arg (phi, actual_var, loop_edge, loc);

  loop *loop = alloc_loop ();
  loop->header = loop_bb;
  loop->latch = loop_bb;
  add_loop (loop, loop_bb->loop_father);

  return fold_build1 (code, var_type, write_var);
}

/* Insert code to lockfully update *PTR with *PTR OP VAR just before
   GSI.  This is necessary for types larger than 64 bits, where there
   is no cmp&swap instruction to implement a lockless scheme.  We use
   a lock variable in global memory.

   while (cmp&swap (&lock_var, 0, 1))
     continue;
   T accum = *ptr;
   accum = accum OP var;
   *ptr = accum;
   cmp&swap (&lock_var, 1, 0);
   return accum;

   A lock in global memory is necessary to force execution engine
   descheduling and avoid resource starvation that can occur if the
   lock is in .shared memory.  */

static tree
nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi,
		       tree ptr, tree var, tree_code op, int level)
{
  tree var_type = TREE_TYPE (var);
  tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true);
  tree uns_unlocked = build_int_cst (unsigned_type_node, 0);
  tree uns_locked = build_int_cst (unsigned_type_node, 1);

  /* Split the block just before the gsi.  Insert a gimple nop to make
     this easier.  */
  gimple *nop = gimple_build_nop ();
  gsi_insert_before (gsi, nop, GSI_SAME_STMT);
  basic_block entry_bb = gsi_bb (*gsi);
  edge entry_edge = split_block (entry_bb, nop);
  basic_block lock_bb = entry_edge->dest;
  /* Reset the iterator.  */
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Build and insert the locking sequence.  */
  gimple_seq lock_seq = NULL;
  tree lock_var = make_ssa_name (unsigned_type_node);
  tree lock_expr = nvptx_global_lock_addr ();
  lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
				   uns_unlocked, uns_locked);
  gimplify_assign (lock_var, lock_expr, &lock_seq);
  gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
				   NULL_TREE, NULL_TREE);
  gimple_seq_add_stmt (&lock_seq, cond);
  gimple *lock_end = gimple_seq_last (lock_seq);
  gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);

  /* Split the block just after the lock sequence.  */
  edge locked_edge = split_block (lock_bb, lock_end);
  basic_block update_bb = locked_edge->dest;
  lock_bb = locked_edge->src;
  *gsi = gsi_for_stmt (gsi_stmt (*gsi));

  /* Create the lock loop ...  */
  locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
  locked_edge->probability = profile_probability::even ();
  edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
  loop_edge->probability = profile_probability::even ();
  set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
  set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);

  /* ... and the loop structure.  */
  loop *lock_loop = alloc_loop ();
  lock_loop->header = lock_bb;
  lock_loop->latch = lock_bb;
  lock_loop->nb_iterations_estimate = 1;
  lock_loop->any_estimate = true;
  add_loop (lock_loop, entry_bb->loop_father);

  /* Build the pre-barrier.  */
  gimple_seq red_seq = NULL;
  enum nvptx_builtins barrier_builtin
    = (level == GOMP_DIM_GANG
       ? NVPTX_BUILTIN_MEMBAR_GL
       : NVPTX_BUILTIN_MEMBAR_CTA);
  tree barrier_fn = nvptx_builtin_decl (barrier_builtin, true);
  tree barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
  gimplify_stmt (&barrier_expr, &red_seq);

  /* Build the reduction calculation.  */
  tree acc_in = make_ssa_name (var_type);
  tree ref_in = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_in) = 1;
  gimplify_assign (acc_in, ref_in, &red_seq);

  tree acc_out = make_ssa_name (var_type);
  tree update_expr = fold_build2 (op, var_type, ref_in, var);
  gimplify_assign (acc_out, update_expr, &red_seq);

  tree ref_out = build_simple_mem_ref (ptr);
  TREE_THIS_VOLATILE (ref_out) = 1;
  gimplify_assign (ref_out, acc_out, &red_seq);

  /* Build the post-barrier.  */
  barrier_expr = build_call_expr_loc (loc, barrier_fn, 0);
  gimplify_stmt (&barrier_expr, &red_seq);

  /* Insert the reduction calculation.  */
  gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);

  /* Build & insert the unlock sequence.  */
  gimple_seq unlock_seq = NULL;
  tree unlock_expr = nvptx_global_lock_addr ();
  unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
				     uns_locked, uns_unlocked);
  gimplify_and_add (unlock_expr, &unlock_seq);
  gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);

  return acc_out;
}

/* Emit a sequence to update a reduction accumulator at *PTR with the
   value held in VAR using operator OP.  Return the updated value.

   TODO: optimize for atomic ops and independent complex ops.  */

static tree
nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
			tree ptr, tree var, tree_code op, int level)
{
  tree type = TREE_TYPE (var);
  tree size = TYPE_SIZE (type);

  if (size == TYPE_SIZE (unsigned_type_node)
      || size == TYPE_SIZE (long_long_unsigned_type_node))
    return nvptx_lockless_update (loc, gsi, ptr, var, op);
  else
    return nvptx_lockfull_update (loc, gsi, ptr, var, op, level);
}

/* NVPTX implementation of GOACC_REDUCTION_SETUP.  */

static void
nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level != GOMP_DIM_GANG)
    {
      /* Copy the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	var = build_simple_mem_ref (ref_to_res);
    }

  if (level == GOMP_DIM_WORKER
      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
    {
      /* Store incoming value to worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
					     level == GOMP_DIM_VECTOR);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      tree ref = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (ref) = 1;
      gimplify_assign (ref, var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_INIT.  */

static void
nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code rcode
    = (enum tree_code) TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  tree init = omp_reduction_init_op (gimple_location (call), rcode,
				     TREE_TYPE (var));
  gimple_seq seq = NULL;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
    {
      /* Initialize vector-non-zeroes to INIT_VAL (OP).  */
      tree tid = make_ssa_name (integer_type_node);
      tree dim_vector = gimple_call_arg (call, 3);
      gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1,
						     dim_vector);
      gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node,
					     NULL_TREE, NULL_TREE);

      gimple_call_set_lhs (tid_call, tid);
      gimple_seq_add_stmt (&seq, tid_call);
      gimple_seq_add_stmt (&seq, cond_stmt);

      /* Split the block just after the call.  */
      edge init_edge = split_block (gsi_bb (gsi), call);
      basic_block init_bb = init_edge->dest;
      basic_block call_bb = init_edge->src;

      /* Fixup flags from call_bb to init_bb.  */
      init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE;
      init_edge->probability = profile_probability::even ();

      /* Set the initialization stmts.  */
      gimple_seq init_seq = NULL;
      tree init_var = make_ssa_name (TREE_TYPE (var));
      gimplify_assign (init_var, init, &init_seq);
      gsi = gsi_start_bb (init_bb);
      gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT);

      /* Split block just after the init stmt.  */
      gsi_prev (&gsi);
      edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi));
      basic_block dst_bb = inited_edge->dest;

      /* Create false edge from call_bb to dst_bb.  */
      edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE);
      nop_edge->probability = profile_probability::even ();

      /* Create phi node in dst block.  */
      gphi *phi = create_phi_node (lhs, dst_bb);
      add_phi_arg (phi, init_var, inited_edge, gimple_location (call));
      add_phi_arg (phi, var, nop_edge, gimple_location (call));

      /* Reset dominator of dst bb.  */
      set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb);

      /* Reset the gsi.  */
      gsi = gsi_for_stmt (call);
    }
  else
    {
      if (level == GOMP_DIM_GANG)
	{
	  /* If there's no receiver object, propagate the incoming VAR.  */
	  tree ref_to_res = gimple_call_arg (call, 1);
	  if (integer_zerop (ref_to_res))
	    init = var;
	}

      if (lhs != NULL_TREE)
	gimplify_assign (lhs, init, &seq);
    }

  pop_gimplify_context (NULL);
  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_FINI.  */

static void
nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree ref_to_res = gimple_call_arg (call, 1);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  enum tree_code op
    = (enum tree_code) TREE_INT_CST_LOW (gimple_call_arg (call, 4));
  gimple_seq seq = NULL;
  tree r = NULL_TREE;

  push_gimplify_context (true);

  if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE)
    {
      /* Emit binary shuffle tree.  TODO: emit this as an actual loop,
	 but that requires a method of emitting a unified jump at the
	 gimple level.  */
      for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1)
	{
	  tree other_var = make_ssa_name (TREE_TYPE (var));
	  nvptx_generate_vector_shuffle (gimple_location (call),
					 other_var, var, shfl, &seq);

	  r = make_ssa_name (TREE_TYPE (var));
	  gimplify_assign (r, fold_build2 (op, TREE_TYPE (var),
					   var, other_var), &seq);
	  var = r;
	}
    }
  else
    {
      tree accum = NULL_TREE;

      if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR)
	{
	  /* Get reduction buffer address.  */
	  tree offset = gimple_call_arg (call, 5);
	  tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
						 level == GOMP_DIM_VECTOR);
	  tree ptr = make_ssa_name (TREE_TYPE (call));

	  gimplify_assign (ptr, call, &seq);
	  accum = ptr;
	}
      else if (integer_zerop (ref_to_res))
	r = var;
      else
	accum = ref_to_res;

      if (accum)
	{
	  /* UPDATE the accumulator.  */
	  gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
	  seq = NULL;
	  r = nvptx_reduction_update (gimple_location (call), &gsi,
				      accum, var, op, level);
	}
    }

  if (lhs)
    gimplify_assign (lhs, r, &seq);
  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}

/* NVPTX implementation of GOACC_REDUCTION_TEARDOWN.  */

static void
nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa)
{
  gimple_stmt_iterator gsi = gsi_for_stmt (call);
  tree lhs = gimple_call_lhs (call);
  tree var = gimple_call_arg (call, 2);
  int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3));
  gimple_seq seq = NULL;

  push_gimplify_context (true);
  if (level == GOMP_DIM_WORKER
      || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE))
    {
      /* Read the worker reduction buffer.  */
      tree offset = gimple_call_arg (call, 5);
      tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset,
					     level == GOMP_DIM_VECTOR);
      tree ptr = make_ssa_name (TREE_TYPE (call));

      gimplify_assign (ptr, call, &seq);
      var = build_simple_mem_ref (ptr);
      TREE_THIS_VOLATILE (var) = 1;
    }

  if (level != GOMP_DIM_GANG)
    {
      /* Write to the receiver object.  */
      tree ref_to_res = gimple_call_arg (call, 1);

      if (!integer_zerop (ref_to_res))
	gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq);
    }

  if (lhs)
    gimplify_assign (lhs, var, &seq);

  pop_gimplify_context (NULL);

  gsi_replace_with_seq (&gsi, seq, true);
}
7137
/* NVPTX reduction expander.  */
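/* Argument 0 of the GOACC_REDUCTION call encodes which of the SETUP,
   INIT, FINI and TEARDOWN phases is being expanded; dispatch to the
   corresponding helper above.  */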
static void
nvptx_goacc_reduction (gcall *call)
{
  unsigned code = (unsigned) TREE_INT_CST_LOW (gimple_call_arg (call, 0));
  offload_attrs oa;

  populate_offload_attrs (&oa);

  switch (code)
    {
    case IFN_GOACC_REDUCTION_SETUP:
      nvptx_goacc_reduction_setup (call, &oa);
      break;

    case IFN_GOACC_REDUCTION_INIT:
      nvptx_goacc_reduction_init (call, &oa);
      break;

    case IFN_GOACC_REDUCTION_FINI:
      nvptx_goacc_reduction_fini (call, &oa);
      break;

    case IFN_GOACC_REDUCTION_TEARDOWN:
      nvptx_goacc_reduction_teardown (call, &oa);
      break;

    default:
      gcc_unreachable ();
    }
}

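/* Implement TARGET_CANNOT_FORCE_CONST_MEM.  Return true unconditionally:
   constants are never forced into the constant pool on this target.  */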
static bool
nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED,
			      rtx x ATTRIBUTE_UNUSED)
{
  return true;
}

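/* Implement TARGET_SCALAR_MODE_SUPPORTED_P.  Beyond the default modes,
   accept HFmode when the nvptx_experimental flag is set and the target
   supports sm_53 (TARGET_SM53).  */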
static bool
nvptx_scalar_mode_supported_p (scalar_mode mode)
{
  if (nvptx_experimental && mode == HFmode && TARGET_SM53)
    return true;

  return default_scalar_mode_supported_p (mode);
}

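/* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P, mirroring
   nvptx_scalar_mode_supported_p for HFmode.  */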
static bool
nvptx_libgcc_floating_mode_supported_p (scalar_float_mode mode)
{
  if (nvptx_experimental && mode == HFmode && TARGET_SM53)
    return true;

  return default_libgcc_floating_mode_supported_p (mode);
}

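/* Implement TARGET_VECTOR_MODE_SUPPORTED_P.  Only V2SImode and V2DImode
   vectors are supported.  */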
static bool
nvptx_vector_mode_supported (machine_mode mode)
{
  return (mode == V2SImode
	  || mode == V2DImode);
}

/* Return the preferred mode for vectorizing scalar MODE.  */

static machine_mode
nvptx_preferred_simd_mode (scalar_mode mode)
{
  switch (mode)
    {
    case E_DImode:
      return V2DImode;
    case E_SImode:
      return V2SImode;

    default:
      return default_preferred_simd_mode (mode);
    }
}

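/* Adjust the alignment requested for TImode-sized integer data: return the
   bit alignment of the mode such an access will be split into, rather than
   the full 128 bits.  */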
unsigned int
nvptx_data_alignment (const_tree type, unsigned int basic_align)
{
  if (TREE_CODE (type) == INTEGER_TYPE)
    {
      unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type));
      if (size == GET_MODE_SIZE (TImode))
	return GET_MODE_BITSIZE (maybe_split_mode (TImode));
    }

  return basic_align;
}

/* Implement TARGET_MODES_TIEABLE_P.  */

static bool
nvptx_modes_tieable_p (machine_mode, machine_mode)
{
  return false;
}

/* Implement TARGET_HARD_REGNO_NREGS.  */

static unsigned int
nvptx_hard_regno_nregs (unsigned int, machine_mode)
{
  return 1;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t)
{
  return false;
}

/* Implement TARGET_TRULY_NOOP_TRUNCATION.  */

static bool
nvptx_truly_noop_truncation (poly_uint64, poly_uint64)
{
  return false;
}

/* Implement TARGET_GOACC_ADJUST_PRIVATE_DECL.  */

static tree
nvptx_goacc_adjust_private_decl (location_t loc, tree decl, int level)
{
  gcc_checking_assert (!lookup_attribute ("oacc gang-private",
					  DECL_ATTRIBUTES (decl)));

  /* Set "oacc gang-private" attribute for gang-private variable
     declarations.  */
  if (level == GOMP_DIM_GANG)
    {
      tree id = get_identifier ("oacc gang-private");
      /* For later diagnostic purposes, pass LOC as VALUE (wrapped as a
	 TREE).  */
      tree loc_tree = build_empty_stmt (loc);
      DECL_ATTRIBUTES (decl)
	= tree_cons (id, loc_tree, DECL_ATTRIBUTES (decl));
    }

  return decl;
}

/* Implement TARGET_GOACC_EXPAND_VAR_DECL.  */

static rtx
nvptx_goacc_expand_var_decl (tree var)
{
  /* Place "oacc gang-private" variables in shared memory.  */
  if (tree attr = lookup_attribute ("oacc gang-private",
				    DECL_ATTRIBUTES (var)))
    {
      gcc_checking_assert (VAR_P (var));

      unsigned int offset, *poffset;
      poffset = gang_private_shared_hmap.get (var);
      if (poffset)
	offset = *poffset;
      else
	{
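	  /* First use of VAR: carve out a naturally aligned slot at the
	     current end of the gang-private shared-memory region and grow
	     the region past it.  */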
	  unsigned HOST_WIDE_INT align = DECL_ALIGN (var);
	  gang_private_shared_size
	    = (gang_private_shared_size + align - 1) & ~(align - 1);
	  if (gang_private_shared_align < align)
	    gang_private_shared_align = align;

	  offset = gang_private_shared_size;
	  bool existed = gang_private_shared_hmap.put (var, offset);
	  gcc_checking_assert (!existed);
	  gang_private_shared_size += tree_to_uhwi (DECL_SIZE_UNIT (var));

	  location_t loc = EXPR_LOCATION (TREE_VALUE (attr));
#if 0 /* For some reason, this doesn't work.  */
	  if (dump_enabled_p ())
	    {
	      dump_flags_t l_dump_flags
		= get_openacc_privatization_dump_flags ();

	      const dump_user_location_t d_u_loc
		= dump_user_location_t::from_location_t (loc);
	      /* PR100695 "Format decoder, quoting in 'dump_printf' etc."  */
#if __GNUC__ >= 10
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wformat"
#endif
	      dump_printf_loc (l_dump_flags, d_u_loc,
			       "variable %<%T%> adjusted for OpenACC"
			       " privatization level: %qs\n",
			       var, "gang");
#if __GNUC__ >= 10
# pragma GCC diagnostic pop
#endif
	    }
#else /* ..., thus emulate that, good enough for testsuite usage.  */
	  if (param_openacc_privatization != OPENACC_PRIVATIZATION_QUIET)
	    inform (loc,
		    "variable %qD adjusted for OpenACC privatization level:"
		    " %qs",
		    var, "gang");
	  if (dump_file && (dump_flags & TDF_DETAILS))
	    {
	      /* 'dumpfile.cc:dump_loc' */
	      fprintf (dump_file, "%s:%d:%d: ", LOCATION_FILE (loc),
		       LOCATION_LINE (loc), LOCATION_COLUMN (loc));
	      fprintf (dump_file, "%s: ", "note");

	      fprintf (dump_file,
		       "variable '");
	      print_generic_expr (dump_file, var, TDF_SLIM);
	      fprintf (dump_file,
		       "' adjusted for OpenACC privatization level: '%s'\n",
		       "gang");
	    }
#endif
	}
      rtx addr = plus_constant (Pmode, gang_private_shared_sym, offset);
      return gen_rtx_MEM (TYPE_MODE (TREE_TYPE (var)), addr);
    }

  return NULL_RTX;
}

static GTY(()) tree nvptx_previous_fndecl;

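/* Implement TARGET_SET_CURRENT_FUNCTION.  On switching to a new function,
   reset the per-function bookkeeping for gang-private shared-memory slots
   and the vector-reduction and broadcast partition sizes.  */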
static void
nvptx_set_current_function (tree fndecl)
{
  if (!fndecl || fndecl == nvptx_previous_fndecl)
    return;

  gang_private_shared_hmap.empty ();
  nvptx_previous_fndecl = fndecl;
  vector_red_partition = 0;
  oacc_bcast_partition = 0;
}

/* Implement TARGET_LIBC_HAS_FUNCTION.  */

bool
nvptx_libc_has_function (enum function_class fn_class, tree type)
{
  if (fn_class == function_sincos)
    {
      if (type != NULL_TREE)
	/* Currently, newlib does not support sincosl.  */
	return type == float_type_node || type == double_type_node;
      else
	return true;
    }

  return default_libc_has_function (fn_class, type);
}

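/* Return true if MEM is known to live in the .local memory area, i.e. if
   it is a frame access.  Under the soft-stack ABI frame accesses may
   instead land on the soft stack in global memory, so be conservative
   there.  */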
bool
nvptx_mem_local_p (rtx mem)
{
  gcc_assert (GET_CODE (mem) == MEM);

  struct address_info info;
  decompose_mem_address (&info, mem);

  if (info.base != NULL && REG_P (*info.base)
      && REGNO_PTR_FRAME_P (REGNO (*info.base)))
    {
      if (TARGET_SOFT_STACK)
	{
	  /* Frame-related doesn't mean local.  */
	}
      else
	return true;
    }

  return false;
}

/* Define locally, for use in NVPTX_ASM_OUTPUT_DEF.  */
#define SET_ASM_OP ".alias "

/* Define locally, for use in nvptx_asm_output_def_from_decls.  Add NVPTX_
   prefix to avoid clash with ASM_OUTPUT_DEF from nvptx.h.
   Copy of ASM_OUTPUT_DEF from defaults.h, with added terminating
   semicolon.  */
#define NVPTX_ASM_OUTPUT_DEF(FILE,LABEL1,LABEL2)	\
  do							\
    {							\
      fprintf ((FILE), "%s", SET_ASM_OP);		\
      assemble_name (FILE, LABEL1);			\
      fprintf (FILE, ",");				\
      assemble_name (FILE, LABEL2);			\
      fprintf (FILE, ";\n");				\
    }							\
  while (0)

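/* Schematically, NVPTX_ASM_OUTPUT_DEF (file, "foo", "bar") emits

     .alias foo,bar;

   the PTX directive declaring "foo" an alias of "bar" (hence the
   TARGET_PTX_6_3 check below).  */
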
void
nvptx_asm_output_def_from_decls (FILE *stream, tree name, tree value)
{
  if (nvptx_alias == 0 || !TARGET_PTX_6_3)
    {
      /* Copied from assemble_alias.  */
      error_at (DECL_SOURCE_LOCATION (name),
		"alias definitions not supported in this configuration");
      TREE_ASM_WRITTEN (name) = 1;
      return;
    }

  if (lookup_attribute ("weak", DECL_ATTRIBUTES (name)))
    {
      /* Prevent execution FAILs for gcc.dg/globalalias.c and
	 gcc.dg/pr77587.c.  */
      error_at (DECL_SOURCE_LOCATION (name),
		"weak alias definitions not supported in this configuration");
      TREE_ASM_WRITTEN (name) = 1;
      return;
    }

  /* PTX also doesn't support VALUE having weak linkage, but we can't detect
     that here, so we'll end up with:
     "error: Function test with .weak scope cannot be aliased".
     See gcc.dg/localalias.c.  */

  if (TREE_CODE (name) != FUNCTION_DECL)
    {
      error_at (DECL_SOURCE_LOCATION (name),
		"non-function alias definitions not supported"
		" in this configuration");
      TREE_ASM_WRITTEN (name) = 1;
      return;
    }

  if (!cgraph_node::get (name)->referred_to_p ())
    /* Prevent "Internal error: reference to deleted section".  */
    return;

  std::stringstream s;
  write_fn_proto (s, false, get_fnname_from_decl (name), name);
  fputs (s.str ().c_str (), stream);

  tree id = DECL_ASSEMBLER_NAME (name);
  NVPTX_ASM_OUTPUT_DEF (stream, IDENTIFIER_POINTER (id),
			IDENTIFIER_POINTER (value));
}

#undef NVPTX_ASM_OUTPUT_DEF
#undef SET_ASM_OP

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE nvptx_option_override

#undef TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_false

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p

#undef TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG nvptx_function_arg
#undef TARGET_FUNCTION_INCOMING_ARG
#define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg
#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance
#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary
#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference
#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p
#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE nvptx_function_value
#undef TARGET_LIBCALL_VALUE
#define TARGET_LIBCALL_VALUE nvptx_libcall_value
#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall
#undef TARGET_GET_DRAP_RTX
#define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx
#undef TARGET_SPLIT_COMPLEX_ARG
#define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true
#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory
#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true
#undef TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming
#undef TARGET_CALL_ARGS
#define TARGET_CALL_ARGS nvptx_call_args
#undef TARGET_END_CALL_ARGS
#define TARGET_END_CALL_ARGS nvptx_end_call_args

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START nvptx_file_start
#undef TARGET_ASM_FILE_END
#define TARGET_ASM_FILE_END nvptx_file_end
#undef TARGET_ASM_GLOBALIZE_LABEL
#define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label
#undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL
#define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl
#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND nvptx_print_operand
#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address
#undef TARGET_PRINT_OPERAND_PUNCT_VALID_P
#define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p
#undef TARGET_ASM_INTEGER
#define TARGET_ASM_INTEGER nvptx_assemble_integer
#undef TARGET_ASM_DECL_END
#define TARGET_ASM_DECL_END nvptx_assemble_decl_end
#undef TARGET_ASM_DECLARE_CONSTANT_NAME
#define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name
#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true
#undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE
#define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true

#undef TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg
#undef TARGET_NO_REGISTER_ALLOCATION
#define TARGET_NO_REGISTER_ALLOCATION true

#undef TARGET_ENCODE_SECTION_INFO
#define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info
#undef TARGET_RECORD_OFFLOAD_SYMBOL
#define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment

#undef TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p

#undef TARGET_USE_ANCHORS_FOR_SYMBOL_P
#define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS nvptx_init_builtins
#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN nvptx_expand_builtin
#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL nvptx_builtin_decl

#undef TARGET_SIMT_VF
#define TARGET_SIMT_VF nvptx_simt_vf

#undef TARGET_OMP_DEVICE_KIND_ARCH_ISA
#define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa

#undef TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims

#undef TARGET_GOACC_DIM_LIMIT
#define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit

#undef TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join

#undef TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION nvptx_goacc_reduction

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P nvptx_scalar_mode_supported_p

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  nvptx_libgcc_floating_mode_supported_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \
  nvptx_preferred_simd_mode

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class

#undef TARGET_TRULY_NOOP_TRUNCATION
#define TARGET_TRULY_NOOP_TRUNCATION nvptx_truly_noop_truncation

#undef TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed

#undef TARGET_GOACC_ADJUST_PRIVATE_DECL
#define TARGET_GOACC_ADJUST_PRIVATE_DECL nvptx_goacc_adjust_private_decl

#undef TARGET_GOACC_EXPAND_VAR_DECL
#define TARGET_GOACC_EXPAND_VAR_DECL nvptx_goacc_expand_var_decl

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function

#undef TARGET_LIBC_HAS_FUNCTION
#define TARGET_LIBC_HAS_FUNCTION nvptx_libc_has_function

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-nvptx.h"