/* Target code for NVPTX.
   Copyright (C) 2014-2020 Free Software Foundation, Inc.
   Contributed by Bernd Schmidt <bernds@codesourcery.com>

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published
   by the Free Software Foundation; either version 3, or (at your
   option) any later version.

   GCC is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
   License for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

#define IN_TARGET_CODE 1

#include "config.h"
#include <sstream>
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "cfghooks.h"
#include "df.h"
#include "memmodel.h"
#include "tm_p.h"
#include "expmed.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic.h"
#include "alias.h"
#include "insn-flags.h"
#include "output.h"
#include "insn-attr.h"
#include "flags.h"
#include "dojump.h"
#include "explow.h"
#include "calls.h"
#include "varasm.h"
#include "stmt.h"
#include "expr.h"
#include "tm-preds.h"
#include "tm-constrs.h"
#include "langhooks.h"
#include "dbxout.h"
#include "cfgrtl.h"
#include "gimple.h"
#include "stor-layout.h"
#include "builtins.h"
#include "omp-general.h"
#include "omp-low.h"
#include "omp-offload.h"
#include "gomp-constants.h"
#include "dumpfile.h"
#include "internal-fn.h"
#include "gimple-iterator.h"
#include "stringpool.h"
#include "attribs.h"
#include "tree-vrp.h"
#include "tree-ssa-operands.h"
#include "tree-ssanames.h"
#include "gimplify.h"
#include "tree-phinodes.h"
#include "cfgloop.h"
#include "fold-const.h"
#include "intl.h"

/* This file should be included last.  */
#include "target-def.h"

#define WORKAROUND_PTXJIT_BUG 1
#define WORKAROUND_PTXJIT_BUG_2 1
#define WORKAROUND_PTXJIT_BUG_3 1

/* The PTX concept CTA (Concurrent Thread Array) maps onto the CUDA concept
   of a thread block, which has had a maximum of 1024 threads since CUDA
   version 2.x.  */
#define PTX_CTA_SIZE 1024

#define PTX_CTA_NUM_BARRIERS 16
#define PTX_WARP_SIZE 32

#define PTX_PER_CTA_BARRIER 0
#define PTX_NUM_PER_CTA_BARRIERS 1
#define PTX_FIRST_PER_WORKER_BARRIER (PTX_NUM_PER_CTA_BARRIERS)
#define PTX_NUM_PER_WORKER_BARRIERS \
  (PTX_CTA_NUM_BARRIERS - PTX_NUM_PER_CTA_BARRIERS)

#define PTX_DEFAULT_VECTOR_LENGTH PTX_WARP_SIZE
#define PTX_MAX_VECTOR_LENGTH PTX_CTA_SIZE
#define PTX_WORKER_LENGTH 32
#define PTX_DEFAULT_RUNTIME_DIM 0 /* Defer to runtime.  */

/* The various PTX memory areas an object might reside in.  */
enum nvptx_data_area
{
  DATA_AREA_GENERIC,
  DATA_AREA_GLOBAL,
  DATA_AREA_SHARED,
  DATA_AREA_LOCAL,
  DATA_AREA_CONST,
  DATA_AREA_PARAM,
  DATA_AREA_MAX
};

/* We record the data area in the target symbol flags.  */
#define SYMBOL_DATA_AREA(SYM) \
  (nvptx_data_area)((SYMBOL_REF_FLAGS (SYM) >> SYMBOL_FLAG_MACH_DEP_SHIFT) \
		    & 7)
#define SET_SYMBOL_DATA_AREA(SYM,AREA) \
  (SYMBOL_REF_FLAGS (SYM) |= (AREA) << SYMBOL_FLAG_MACH_DEP_SHIFT)

/* Record the function decls we've written, and the libfuncs and function
   decls corresponding to them.  */
static std::stringstream func_decls;

struct declared_libfunc_hasher : ggc_cache_ptr_hash<rtx_def>
{
  static hashval_t hash (rtx x) { return htab_hash_pointer (x); }
  static bool equal (rtx a, rtx b) { return a == b; }
};

static GTY((cache))
  hash_table<declared_libfunc_hasher> *declared_libfuncs_htab;

struct tree_hasher : ggc_cache_ptr_hash<tree_node>
{
  static hashval_t hash (tree t) { return htab_hash_pointer (t); }
  static bool equal (tree a, tree b) { return a == b; }
};

static GTY((cache)) hash_table<tree_hasher> *declared_fndecls_htab;
static GTY((cache)) hash_table<tree_hasher> *needed_fndecls_htab;

/* Buffer needed to broadcast across workers and vectors.  This is
   used for both worker-neutering and worker broadcasting, and
   vector-neutering and broadcasting when vector_length > 32.  It is
   shared by all functions emitted.  The buffer is placed in shared
   memory.  It'd be nice if PTX supported common blocks, because then
   this could be shared across TUs (taking the largest size).  */
static unsigned oacc_bcast_size;
static unsigned oacc_bcast_partition;
static unsigned oacc_bcast_align;
static GTY(()) rtx oacc_bcast_sym;

/* Buffer needed for worker reductions.  This has to be distinct from
   the worker broadcast array, as both may be live concurrently.  */
static unsigned worker_red_size;
static unsigned worker_red_align;
static GTY(()) rtx worker_red_sym;

/* Buffer needed for vector reductions, when vector_length >
   PTX_WARP_SIZE.  This has to be distinct from the worker broadcast
   array, as both may be live concurrently.  */
static unsigned vector_red_size;
static unsigned vector_red_align;
static unsigned vector_red_partition;
static GTY(()) rtx vector_red_sym;

/* Global lock variable, needed for 128bit worker & gang reductions.  */
static GTY(()) tree global_lock_var;

/* True if any function references __nvptx_stacks.  */
static bool need_softstack_decl;

/* True if any function references __nvptx_uni.  */
static bool need_unisimt_decl;

static int nvptx_mach_max_workers ();

/* Allocate a new, cleared machine_function structure.  */

static struct machine_function *
nvptx_init_machine_status (void)
{
  struct machine_function *p = ggc_cleared_alloc<machine_function> ();
  p->return_mode = VOIDmode;
  return p;
}

/* Issue a diagnostic when option OPTNAME is enabled (as indicated by OPTVAL)
   and -fopenacc is also enabled.  */

static void
diagnose_openacc_conflict (bool optval, const char *optname)
{
  if (flag_openacc && optval)
    error ("option %s is not supported together with %<-fopenacc%>", optname);
}

/* Implement TARGET_OPTION_OVERRIDE.  */

static void
nvptx_option_override (void)
{
  init_machine_status = nvptx_init_machine_status;

  /* Set toplevel_reorder, unless explicitly disabled.  We need
     reordering so that we emit necessary assembler decls of
     undeclared variables.  */
  if (!global_options_set.x_flag_toplevel_reorder)
    flag_toplevel_reorder = 1;

  debug_nonbind_markers_p = 0;

  /* Set flag_no_common, unless explicitly disabled.  We fake common
     using .weak, and that's not entirely accurate, so avoid it
     unless forced.  */
  if (!global_options_set.x_flag_no_common)
    flag_no_common = 1;

  /* The patch area requires nops, which we don't have.  */
  if (function_entry_patch_area_size > 0)
    sorry ("not generating patch area, nops not supported");

  /* Assumes that it will see only hard registers.  */
  flag_var_tracking = 0;

  if (nvptx_optimize < 0)
    nvptx_optimize = optimize > 0;

  declared_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  needed_fndecls_htab = hash_table<tree_hasher>::create_ggc (17);
  declared_libfuncs_htab
    = hash_table<declared_libfunc_hasher>::create_ggc (17);

  oacc_bcast_sym = gen_rtx_SYMBOL_REF (Pmode, "__oacc_bcast");
  SET_SYMBOL_DATA_AREA (oacc_bcast_sym, DATA_AREA_SHARED);
  oacc_bcast_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  oacc_bcast_partition = 0;

  worker_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__worker_red");
  SET_SYMBOL_DATA_AREA (worker_red_sym, DATA_AREA_SHARED);
  worker_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;

  vector_red_sym = gen_rtx_SYMBOL_REF (Pmode, "__vector_red");
  SET_SYMBOL_DATA_AREA (vector_red_sym, DATA_AREA_SHARED);
  vector_red_align = GET_MODE_ALIGNMENT (SImode) / BITS_PER_UNIT;
  vector_red_partition = 0;

  diagnose_openacc_conflict (TARGET_GOMP, "-mgomp");
  diagnose_openacc_conflict (TARGET_SOFT_STACK, "-msoft-stack");
  diagnose_openacc_conflict (TARGET_UNIFORM_SIMT, "-muniform-simt");

  if (TARGET_GOMP)
    target_flags |= MASK_SOFT_STACK | MASK_UNIFORM_SIMT;
}

/* Return a ptx type for MODE.  If PROMOTE, then use .u32 for QImode to
   deal with ptx idiosyncrasies.  */

const char *
nvptx_ptx_type_from_mode (machine_mode mode, bool promote)
{
  switch (mode)
    {
    case E_BLKmode:
      return ".b8";
    case E_BImode:
      return ".pred";
    case E_QImode:
      if (promote)
	return ".u32";
      else
	return ".u8";
    case E_HImode:
      return ".u16";
    case E_SImode:
      return ".u32";
    case E_DImode:
      return ".u64";

    case E_SFmode:
      return ".f32";
    case E_DFmode:
      return ".f64";

    case E_V2SImode:
      return ".v2.u32";
    case E_V2DImode:
      return ".v2.u64";

    default:
      gcc_unreachable ();
    }
}

/* Encode the PTX data area that DECL (which might not actually be a
   _DECL) should reside in.  */

static void
nvptx_encode_section_info (tree decl, rtx rtl, int first)
{
  default_encode_section_info (decl, rtl, first);
  if (first && MEM_P (rtl))
    {
      nvptx_data_area area = DATA_AREA_GENERIC;

      if (TREE_CONSTANT (decl))
	area = DATA_AREA_CONST;
      else if (TREE_CODE (decl) == VAR_DECL)
	{
	  if (lookup_attribute ("shared", DECL_ATTRIBUTES (decl)))
	    {
	      area = DATA_AREA_SHARED;
	      if (DECL_INITIAL (decl))
		error ("static initialization of variable %q+D in %<.shared%>"
		       " memory is not supported", decl);
	    }
	  else
	    area = TREE_READONLY (decl) ? DATA_AREA_CONST : DATA_AREA_GLOBAL;
	}

      SET_SYMBOL_DATA_AREA (XEXP (rtl, 0), area);
    }
}
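
/* Illustrative note (editorial addition, not from the original sources):
   with the encoding above, a file-scope variable such as

     int counter __attribute__((shared));

   gets DATA_AREA_SHARED stashed in its SYMBOL_REF flags, so the output
   machinery later places it in the PTX .shared state space (roughly
   ".shared .align 4 .u32 counter;"), while a TREE_READONLY variable goes
   to .const and other file-scope variables to .global.  The exact
   directive spelling is produced by the declaration-output routines,
   which lie outside this excerpt, so treat the PTX line as a sketch.  */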

/* Return the PTX name of the data area in which SYM should be
   placed.  The symbol must have already been processed by
   nvptx_encode_section_info, or equivalent.  */

static const char *
section_for_sym (rtx sym)
{
  nvptx_data_area area = SYMBOL_DATA_AREA (sym);
  /* Same order as nvptx_data_area enum.  */
  static char const *const areas[] =
    {"", ".global", ".shared", ".local", ".const", ".param"};

  return areas[area];
}

/* Similarly for a decl.  */

static const char *
section_for_decl (const_tree decl)
{
  return section_for_sym (XEXP (DECL_RTL (CONST_CAST (tree, decl)), 0));
}

/* Check NAME for special function names and redirect them by returning a
   replacement.  This applies to malloc, free and realloc, for which we
   want to use libgcc wrappers, and call, which triggers a bug in
   ptxas.  We can't use TARGET_MANGLE_DECL_ASSEMBLER_NAME, as that's
   not active in an offload compiler -- the names are all set by the
   host-side compiler.  */

static const char *
nvptx_name_replacement (const char *name)
{
  if (strcmp (name, "call") == 0)
    return "__nvptx_call";
  if (strcmp (name, "malloc") == 0)
    return "__nvptx_malloc";
  if (strcmp (name, "free") == 0)
    return "__nvptx_free";
  if (strcmp (name, "realloc") == 0)
    return "__nvptx_realloc";
  return name;
}

/* If MODE should be treated as two registers of an inner mode, return
   that inner mode.  Otherwise return VOIDmode.  */

static machine_mode
maybe_split_mode (machine_mode mode)
{
  if (COMPLEX_MODE_P (mode))
    return GET_MODE_INNER (mode);

  if (mode == TImode)
    return DImode;

  return VOIDmode;
}

/* Return true if mode should be treated as two registers.  */

static bool
split_mode_p (machine_mode mode)
{
  return maybe_split_mode (mode) != VOIDmode;
}

/* Output a register, subreg, or register pair (with optional
   enclosing braces).  */

static void
output_reg (FILE *file, unsigned regno, machine_mode inner_mode,
	    int subreg_offset = -1)
{
  if (inner_mode == VOIDmode)
    {
      if (HARD_REGISTER_NUM_P (regno))
	fprintf (file, "%s", reg_names[regno]);
      else
	fprintf (file, "%%r%d", regno);
    }
  else if (subreg_offset >= 0)
    {
      output_reg (file, regno, VOIDmode);
      fprintf (file, "$%d", subreg_offset);
    }
  else
    {
      if (subreg_offset == -1)
	fprintf (file, "{");
      output_reg (file, regno, inner_mode, GET_MODE_SIZE (inner_mode));
      fprintf (file, ",");
      output_reg (file, regno, inner_mode, 0);
      if (subreg_offset == -1)
	fprintf (file, "}");
    }
}

/* Emit forking instructions for MASK.  */

static void
nvptx_emit_forking (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit fork at all levels.  This helps form SESE regions, as
	 it creates a block with a single successor before entering a
	 partitioned region.  That is a good candidate for the end of
	 an SESE region.  */
      emit_insn (gen_nvptx_fork (op));
      emit_insn (gen_nvptx_forked (op));
    }
}

/* Emit joining instructions for MASK.  */

static void
nvptx_emit_joining (unsigned mask, bool is_call)
{
  mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
	   | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
  if (mask)
    {
      rtx op = GEN_INT (mask | (is_call << GOMP_DIM_MAX));

      /* Emit joining for all non-call pars to ensure there's a single
	 predecessor for the block the join insn ends up in.  This is
	 needed for skipping entire loops.  */
      emit_insn (gen_nvptx_joining (op));
      emit_insn (gen_nvptx_join (op));
    }
}


/* Determine whether MODE and TYPE (possibly NULL) should be passed or
   returned in memory.  Integer and floating types supported by the
   machine are passed in registers, everything else is passed in
   memory.  Complex types are split.  */

static bool
pass_in_memory (machine_mode mode, const_tree type, bool for_return)
{
  if (type)
    {
      if (AGGREGATE_TYPE_P (type))
	return true;
      if (TREE_CODE (type) == VECTOR_TYPE)
	return true;
    }

  if (!for_return && COMPLEX_MODE_P (mode))
    /* Complex types are passed as two underlying args.  */
    mode = GET_MODE_INNER (mode);

  if (GET_MODE_CLASS (mode) != MODE_INT
      && GET_MODE_CLASS (mode) != MODE_FLOAT)
    return true;

  if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
    return true;

  return false;
}

/* A non-memory argument of mode MODE is being passed; determine the mode it
   should be promoted to.  This is also used for determining return
   type promotion.  */

static machine_mode
promote_arg (machine_mode mode, bool prototyped)
{
  if (!prototyped && mode == SFmode)
    /* K&R float promotion for unprototyped functions.  */
    mode = DFmode;
  else if (GET_MODE_SIZE (mode) < GET_MODE_SIZE (SImode))
    mode = SImode;

  return mode;
}

/* A non-memory return type of MODE is being returned.  Determine the
   mode it should be promoted to.  */

static machine_mode
promote_return (machine_mode mode)
{
  return promote_arg (mode, true);
}

/* Implement TARGET_FUNCTION_ARG.  */

static rtx
nvptx_function_arg (cumulative_args_t, const function_arg_info &arg)
{
  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  return gen_reg_rtx (arg.mode);
}

/* Implement TARGET_FUNCTION_INCOMING_ARG.  */

static rtx
nvptx_function_incoming_arg (cumulative_args_t cum_v,
			     const function_arg_info &arg)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  if (arg.end_marker_p () || !arg.named)
    return NULL_RTX;

  /* No need to deal with split modes here, the only case that can
     happen is complex modes and those are dealt with by
     TARGET_SPLIT_COMPLEX_ARG.  */
  return gen_rtx_UNSPEC (arg.mode,
			 gen_rtvec (1, GEN_INT (cum->count)),
			 UNSPEC_ARG_REG);
}

/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */

static void
nvptx_function_arg_advance (cumulative_args_t cum_v, const function_arg_info &)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  cum->count++;
}

/* Implement TARGET_FUNCTION_ARG_BOUNDARY.

   For nvptx, this is only used for variadic args.  The type has already
   been promoted and/or converted to invisible reference.  */

static unsigned
nvptx_function_arg_boundary (machine_mode mode, const_tree ARG_UNUSED (type))
{
  return GET_MODE_ALIGNMENT (mode);
}
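
/* Worked example (editorial addition, not from the original sources):
   for a function  int f (int a, long long b)  the hooks above hand the
   incoming arguments to the middle end as indexed unspecs,

     (unspec:SI [(const_int 0)] UNSPEC_ARG_REG)    -- a
     (unspec:DI [(const_int 1)] UNSPEC_ARG_REG)    -- b

   and the prologue writer (write_arg_mode, below) later turns each index
   into a .param load along the lines of

     ld.param.u32 %ar0, [%in_ar0];
     ld.param.u64 %ar1, [%in_ar1];

   The %ar / %in_ar names come from write_arg_mode; the PTX shown here is
   only a sketch of the expected shape.  */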

/* Handle the TARGET_STRICT_ARGUMENT_NAMING target hook.

   For nvptx, we know how to handle functions declared as stdarg: by
   passing an extra pointer to the unnamed arguments.  However, the
   Fortran frontend can produce a different situation, where a
   function pointer is declared with no arguments, but the actual
   function and calls to it take more arguments.  In that case, we
   want to ensure the call matches the definition of the function.  */

static bool
nvptx_strict_argument_naming (cumulative_args_t cum_v)
{
  CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);

  return cum->fntype == NULL_TREE || stdarg_p (cum->fntype);
}

/* Implement TARGET_LIBCALL_VALUE.  */

static rtx
nvptx_libcall_value (machine_mode mode, const_rtx)
{
  if (!cfun || !cfun->machine->doing_call)
    /* Pretend to return in a hard reg for early uses before pseudos can be
       generated.  */
    return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);

  return gen_reg_rtx (mode);
}

/* TARGET_FUNCTION_VALUE implementation.  Returns an RTX representing the place
   where function FUNC returns or receives a value of data type TYPE.  */

static rtx
nvptx_function_value (const_tree type, const_tree ARG_UNUSED (func),
		      bool outgoing)
{
  machine_mode mode = promote_return (TYPE_MODE (type));

  if (outgoing)
    {
      gcc_assert (cfun);
      cfun->machine->return_mode = mode;
      return gen_rtx_REG (mode, NVPTX_RETURN_REGNUM);
    }

  return nvptx_libcall_value (mode, NULL_RTX);
}

/* Implement TARGET_FUNCTION_VALUE_REGNO_P.  */

static bool
nvptx_function_value_regno_p (const unsigned int regno)
{
  return regno == NVPTX_RETURN_REGNUM;
}

/* Types with a mode other than those supported by the machine are passed by
   reference in memory.  */

static bool
nvptx_pass_by_reference (cumulative_args_t, const function_arg_info &arg)
{
  return pass_in_memory (arg.mode, arg.type, false);
}

/* Implement TARGET_RETURN_IN_MEMORY.  */

static bool
nvptx_return_in_memory (const_tree type, const_tree)
{
  return pass_in_memory (TYPE_MODE (type), type, true);
}

/* Implement TARGET_PROMOTE_FUNCTION_MODE.  */

static machine_mode
nvptx_promote_function_mode (const_tree type, machine_mode mode,
			     int *ARG_UNUSED (punsignedp),
			     const_tree funtype, int for_return)
{
  return promote_arg (mode, for_return || !type || TYPE_ARG_TYPES (funtype));
}

/* Helper for write_arg_type.  Emit a single PTX argument of MODE, either
   in a prototype, or as a copy in a function prologue.  ARGNO is the
   index of this argument in the PTX function.  FOR_REG is negative
   if we're emitting the PTX prototype.  It is zero if we're copying
   to an argument register and it is greater than zero if we're
   copying to a specific hard register.  */

static int
write_arg_mode (std::stringstream &s, int for_reg, int argno,
		machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);

  if (for_reg < 0)
    {
      /* Writing PTX prototype.  */
      s << (argno ? ", " : " (");
      s << ".param" << ptx_type << " %in_ar" << argno;
    }
  else
    {
      s << "\t.reg" << ptx_type << " ";
      if (for_reg)
	s << reg_names[for_reg];
      else
	s << "%ar" << argno;
      s << ";\n";
      if (argno >= 0)
	{
	  s << "\tld.param" << ptx_type << " ";
	  if (for_reg)
	    s << reg_names[for_reg];
	  else
	    s << "%ar" << argno;
	  s << ", [%in_ar" << argno << "];\n";
	}
    }
  return argno + 1;
}

/* Process function parameter TYPE to emit one or more PTX
   arguments.  S, FOR_REG and ARGNO as for write_arg_mode.  PROTOTYPED
   is true if this is a prototyped function, rather than an old-style
   C declaration.  Returns the next argument number to use.

   The promotion behavior here must match the regular GCC function
   parameter marshalling machinery.  */

static int
write_arg_type (std::stringstream &s, int for_reg, int argno,
		tree type, bool prototyped)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return argno;

  if (pass_in_memory (mode, type, false))
    mode = Pmode;
  else
    {
      bool split = TREE_CODE (type) == COMPLEX_TYPE;

      if (split)
	{
	  /* Complex types are sent as two separate args.  */
	  type = TREE_TYPE (type);
	  mode = TYPE_MODE (type);
	  prototyped = true;
	}

      mode = promote_arg (mode, prototyped);
      if (split)
	argno = write_arg_mode (s, for_reg, argno, mode);
    }

  return write_arg_mode (s, for_reg, argno, mode);
}

/* Emit a PTX return as a prototype or function prologue declaration
   for MODE.  */

static void
write_return_mode (std::stringstream &s, bool for_proto, machine_mode mode)
{
  const char *ptx_type = nvptx_ptx_type_from_mode (mode, false);
  const char *pfx = "\t.reg";
  const char *sfx = ";\n";

  if (for_proto)
    pfx = "(.param", sfx = "_out) ";

  s << pfx << ptx_type << " " << reg_names[NVPTX_RETURN_REGNUM] << sfx;
}

/* Process a function return TYPE to emit a PTX return as a prototype
   or function prologue declaration.  Returns true if return is via an
   additional pointer parameter.  The promotion behavior here must
   match the regular GCC function return marshalling.  */

static bool
write_return_type (std::stringstream &s, bool for_proto, tree type)
{
  machine_mode mode = TYPE_MODE (type);

  if (mode == VOIDmode)
    return false;

  bool return_in_mem = pass_in_memory (mode, type, true);

  if (return_in_mem)
    {
      if (for_proto)
	return return_in_mem;

      /* Named return values can cause us to return a pointer as well
	 as expect an argument for the return location.  This is
	 optimization-level specific, so no caller can make use of
	 this data, but more importantly for us, we must ensure it
	 doesn't change the PTX prototype.  */
      mode = (machine_mode) cfun->machine->return_mode;

      if (mode == VOIDmode)
	return return_in_mem;

      /* Clear return_mode to inhibit copy of retval to non-existent
	 retval parameter.  */
      cfun->machine->return_mode = VOIDmode;
    }
  else
    mode = promote_return (mode);

  write_return_mode (s, for_proto, mode);

  return return_in_mem;
}

/* Look for attributes in ATTRS that would indicate we must write a function
   as a .entry kernel rather than a .func.  Return true if one is found.  */

static bool
write_as_kernel (tree attrs)
{
  return (lookup_attribute ("kernel", attrs) != NULL_TREE
	  || (lookup_attribute ("omp target entrypoint", attrs) != NULL_TREE
	      && lookup_attribute ("oacc function", attrs) != NULL_TREE));
  /* For OpenMP target regions, the corresponding kernel entry is emitted from
     write_omp_entry as a separate function.  */
}

/* Emit a linker marker for a function decl or defn.  */

static void
write_fn_marker (std::stringstream &s, bool is_defn, bool globalize,
		 const char *name)
{
  s << "\n// BEGIN";
  if (globalize)
    s << " GLOBAL";
  s << " FUNCTION " << (is_defn ? "DEF: " : "DECL: ");
  s << name << "\n";
}

/* Emit a linker marker for a variable decl or defn.  */

static void
write_var_marker (FILE *file, bool is_defn, bool globalize, const char *name)
{
  fprintf (file, "\n// BEGIN%s VAR %s: ",
	   globalize ? " GLOBAL" : "",
	   is_defn ? "DEF" : "DECL");
  assemble_name_raw (file, name);
  fputs ("\n", file);
}

/* Write a .func or .kernel declaration or definition along with
   a helper comment for use by ld.  S is the stream to write to, DECL
   the decl for the function with name NAME.  For definitions, emit
   a declaration too.  */

static const char *
write_fn_proto (std::stringstream &s, bool is_defn,
		const char *name, const_tree decl)
{
  if (is_defn)
    /* Emit a declaration.  The PTX assembler gets upset without it.  */
    name = write_fn_proto (s, false, name, decl);
  else
    {
      /* Avoid repeating the name replacement.  */
      name = nvptx_name_replacement (name);
      if (name[0] == '*')
	name++;
    }

  write_fn_marker (s, is_defn, TREE_PUBLIC (decl), name);

  /* PTX declaration.  */
  if (DECL_EXTERNAL (decl))
    s << ".extern ";
  else if (TREE_PUBLIC (decl))
    s << (DECL_WEAK (decl) ? ".weak " : ".visible ");
  s << (write_as_kernel (DECL_ATTRIBUTES (decl)) ? ".entry " : ".func ");

  tree fntype = TREE_TYPE (decl);
  tree result_type = TREE_TYPE (fntype);

  /* atomic_compare_exchange_$n builtins have an exceptional calling
     convention.  */
  int not_atomic_weak_arg = -1;
  if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL)
    switch (DECL_FUNCTION_CODE (decl))
      {
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_1:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_2:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_4:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_8:
      case BUILT_IN_ATOMIC_COMPARE_EXCHANGE_16:
	/* These atomics skip the 'weak' parm in an actual library
	   call.  We must skip it in the prototype too.  */
	not_atomic_weak_arg = 3;
	break;

      default:
	break;
      }

  /* Declare the result.  */
  bool return_in_mem = write_return_type (s, true, result_type);

  s << name;

  int argno = 0;

  /* Emit argument list.  */
  if (return_in_mem)
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  /* We get:
     NULL in TYPE_ARG_TYPES, for old-style functions
     NULL in DECL_ARGUMENTS, for builtin functions without another
     declaration.
     So we have to pick the best one we have.  */
  tree args = TYPE_ARG_TYPES (fntype);
  bool prototyped = true;
  if (!args)
    {
      args = DECL_ARGUMENTS (decl);
      prototyped = false;
    }

  for (; args; args = TREE_CHAIN (args), not_atomic_weak_arg--)
    {
      tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args);

      if (not_atomic_weak_arg)
	argno = write_arg_type (s, -1, argno, type, prototyped);
      else
	gcc_assert (type == boolean_type_node);
    }

  if (stdarg_p (fntype))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (DECL_STATIC_CHAIN (decl))
    argno = write_arg_type (s, -1, argno, ptr_type_node, true);

  if (!argno && strcmp (name, "main") == 0)
    {
      argno = write_arg_type (s, -1, argno, integer_type_node, true);
      argno = write_arg_type (s, -1, argno, ptr_type_node, true);
    }

  if (argno)
    s << ")";

  s << (is_defn ? "\n" : ";\n");

  return name;
}

/* Construct a function declaration from a call insn.  This can be
   necessary for two reasons - either we have an indirect call which
   requires a .callprototype declaration, or we have a libcall
   generated by emit_library_call for which no decl exists.  */

static void
write_fn_proto_from_insn (std::stringstream &s, const char *name,
			  rtx result, rtx pat)
{
  if (!name)
    {
      s << "\t.callprototype ";
      name = "_";
    }
  else
    {
      name = nvptx_name_replacement (name);
      write_fn_marker (s, false, true, name);
      s << "\t.extern .func ";
    }

  if (result != NULL_RTX)
    write_return_mode (s, true, GET_MODE (result));

  s << name;

  int arg_end = XVECLEN (pat, 0);
  for (int i = 1; i < arg_end; i++)
    {
      /* We don't have to deal with mode splitting & promotion here,
	 as that was already done when generating the call
	 sequence.  */
      machine_mode mode = GET_MODE (XEXP (XVECEXP (pat, 0, i), 0));

      write_arg_mode (s, -1, i - 1, mode);
    }
  if (arg_end != 1)
    s << ")";
  s << ";\n";
}

/* DECL is an external FUNCTION_DECL; make sure it's in the fndecl hash
   table and write a ptx prototype.  These are emitted at end of
   compilation.  */

static void
nvptx_record_fndecl (tree decl)
{
  tree *slot = declared_fndecls_htab->find_slot (decl, INSERT);
  if (*slot == NULL)
    {
      *slot = decl;
      const char *name = get_fnname_from_decl (decl);
      write_fn_proto (func_decls, false, name, decl);
    }
}

/* Record a libcall or unprototyped external function.  CALLEE is the
   SYMBOL_REF.  Insert into the libfunc hash table and emit a ptx
   declaration for it.  */

static void
nvptx_record_libfunc (rtx callee, rtx retval, rtx pat)
{
  rtx *slot = declared_libfuncs_htab->find_slot (callee, INSERT);
  if (*slot == NULL)
    {
      *slot = callee;

      const char *name = XSTR (callee, 0);
      write_fn_proto_from_insn (func_decls, name, retval, pat);
    }
}

/* DECL is an external FUNCTION_DECL that we're referencing.  If it
   is prototyped, record it now.  Otherwise record it as needed at end
   of compilation, when we might have more information about it.  */

void
nvptx_record_needed_fndecl (tree decl)
{
  if (TYPE_ARG_TYPES (TREE_TYPE (decl)) == NULL_TREE)
    {
      tree *slot = needed_fndecls_htab->find_slot (decl, INSERT);
      if (*slot == NULL)
	*slot = decl;
    }
  else
    nvptx_record_fndecl (decl);
}

/* SYM is a SYMBOL_REF.  If it refers to an external function, record
   it as needed.
*/ 1029 1030 static void 1031 nvptx_maybe_record_fnsym (rtx sym) 1032 { 1033 tree decl = SYMBOL_REF_DECL (sym); 1034 1035 if (decl && TREE_CODE (decl) == FUNCTION_DECL && DECL_EXTERNAL (decl)) 1036 nvptx_record_needed_fndecl (decl); 1037 } 1038 1039 /* Emit a local array to hold some part of a conventional stack frame 1040 and initialize REGNO to point to it. If the size is zero, it'll 1041 never be valid to dereference, so we can simply initialize to 1042 zero. */ 1043 1044 static void 1045 init_frame (FILE *file, int regno, unsigned align, unsigned size) 1046 { 1047 if (size) 1048 fprintf (file, "\t.local .align %d .b8 %s_ar[%u];\n", 1049 align, reg_names[regno], size); 1050 fprintf (file, "\t.reg.u%d %s;\n", 1051 POINTER_SIZE, reg_names[regno]); 1052 fprintf (file, (size ? "\tcvta.local.u%d %s, %s_ar;\n" 1053 : "\tmov.u%d %s, 0;\n"), 1054 POINTER_SIZE, reg_names[regno], reg_names[regno]); 1055 } 1056 1057 /* Emit soft stack frame setup sequence. */ 1058 1059 static void 1060 init_softstack_frame (FILE *file, unsigned alignment, HOST_WIDE_INT size) 1061 { 1062 /* Maintain 64-bit stack alignment. */ 1063 unsigned keep_align = BIGGEST_ALIGNMENT / BITS_PER_UNIT; 1064 size = ROUND_UP (size, keep_align); 1065 int bits = POINTER_SIZE; 1066 const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; 1067 const char *reg_frame = reg_names[FRAME_POINTER_REGNUM]; 1068 const char *reg_sspslot = reg_names[SOFTSTACK_SLOT_REGNUM]; 1069 const char *reg_sspprev = reg_names[SOFTSTACK_PREV_REGNUM]; 1070 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_stack); 1071 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_frame); 1072 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspslot); 1073 fprintf (file, "\t.reg.u%d %s;\n", bits, reg_sspprev); 1074 fprintf (file, "\t{\n"); 1075 fprintf (file, "\t\t.reg.u32 %%fstmp0;\n"); 1076 fprintf (file, "\t\t.reg.u%d %%fstmp1;\n", bits); 1077 fprintf (file, "\t\t.reg.u%d %%fstmp2;\n", bits); 1078 fprintf (file, "\t\tmov.u32 %%fstmp0, %%tid.y;\n"); 1079 fprintf (file, "\t\tmul%s.u32 %%fstmp1, %%fstmp0, %d;\n", 1080 bits == 64 ? ".wide" : ".lo", bits / 8); 1081 fprintf (file, "\t\tmov.u%d %%fstmp2, __nvptx_stacks;\n", bits); 1082 1083 /* Initialize %sspslot = &__nvptx_stacks[tid.y]. */ 1084 fprintf (file, "\t\tadd.u%d %s, %%fstmp2, %%fstmp1;\n", bits, reg_sspslot); 1085 1086 /* Initialize %sspprev = __nvptx_stacks[tid.y]. */ 1087 fprintf (file, "\t\tld.shared.u%d %s, [%s];\n", 1088 bits, reg_sspprev, reg_sspslot); 1089 1090 /* Initialize %frame = %sspprev - size. */ 1091 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", 1092 bits, reg_frame, reg_sspprev, size); 1093 1094 /* Apply alignment, if larger than 64. */ 1095 if (alignment > keep_align) 1096 fprintf (file, "\t\tand.b%d %s, %s, %d;\n", 1097 bits, reg_frame, reg_frame, -alignment); 1098 1099 size = crtl->outgoing_args_size; 1100 gcc_assert (size % keep_align == 0); 1101 1102 /* Initialize %stack. */ 1103 fprintf (file, "\t\tsub.u%d %s, %s, " HOST_WIDE_INT_PRINT_DEC ";\n", 1104 bits, reg_stack, reg_frame, size); 1105 1106 if (!crtl->is_leaf) 1107 fprintf (file, "\t\tst.shared.u%d [%s], %s;\n", 1108 bits, reg_sspslot, reg_stack); 1109 fprintf (file, "\t}\n"); 1110 cfun->machine->has_softstack = true; 1111 need_softstack_decl = true; 1112 } 1113 1114 /* Emit code to initialize the REGNO predicate register to indicate 1115 whether we are not lane zero on the NAME axis. 
*/ 1116 1117 static void 1118 nvptx_init_axis_predicate (FILE *file, int regno, const char *name) 1119 { 1120 fprintf (file, "\t{\n"); 1121 fprintf (file, "\t\t.reg.u32\t%%%s;\n", name); 1122 if (strcmp (name, "x") == 0 && cfun->machine->red_partition) 1123 { 1124 fprintf (file, "\t\t.reg.u64\t%%t_red;\n"); 1125 fprintf (file, "\t\t.reg.u64\t%%y64;\n"); 1126 } 1127 fprintf (file, "\t\tmov.u32\t%%%s, %%tid.%s;\n", name, name); 1128 fprintf (file, "\t\tsetp.ne.u32\t%%r%d, %%%s, 0;\n", regno, name); 1129 if (strcmp (name, "x") == 0 && cfun->machine->red_partition) 1130 { 1131 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tid.y;\n"); 1132 fprintf (file, "\t\tcvta.shared.u64\t%%t_red, __vector_red;\n"); 1133 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_red; " 1134 "// vector reduction buffer\n", 1135 REGNO (cfun->machine->red_partition), 1136 vector_red_partition); 1137 } 1138 /* Verify vector_red_size. */ 1139 gcc_assert (vector_red_partition * nvptx_mach_max_workers () 1140 <= vector_red_size); 1141 fprintf (file, "\t}\n"); 1142 } 1143 1144 /* Emit code to initialize OpenACC worker broadcast and synchronization 1145 registers. */ 1146 1147 static void 1148 nvptx_init_oacc_workers (FILE *file) 1149 { 1150 fprintf (file, "\t{\n"); 1151 fprintf (file, "\t\t.reg.u32\t%%tidy;\n"); 1152 if (cfun->machine->bcast_partition) 1153 { 1154 fprintf (file, "\t\t.reg.u64\t%%t_bcast;\n"); 1155 fprintf (file, "\t\t.reg.u64\t%%y64;\n"); 1156 } 1157 fprintf (file, "\t\tmov.u32\t\t%%tidy, %%tid.y;\n"); 1158 if (cfun->machine->bcast_partition) 1159 { 1160 fprintf (file, "\t\tcvt.u64.u32\t%%y64, %%tidy;\n"); 1161 fprintf (file, "\t\tadd.u64\t\t%%y64, %%y64, 1; // vector ID\n"); 1162 fprintf (file, "\t\tcvta.shared.u64\t%%t_bcast, __oacc_bcast;\n"); 1163 fprintf (file, "\t\tmad.lo.u64\t%%r%d, %%y64, %d, %%t_bcast; " 1164 "// vector broadcast offset\n", 1165 REGNO (cfun->machine->bcast_partition), 1166 oacc_bcast_partition); 1167 } 1168 /* Verify oacc_bcast_size. */ 1169 gcc_assert (oacc_bcast_partition * (nvptx_mach_max_workers () + 1) 1170 <= oacc_bcast_size); 1171 if (cfun->machine->sync_bar) 1172 fprintf (file, "\t\tadd.u32\t\t%%r%d, %%tidy, 1; " 1173 "// vector synchronization barrier\n", 1174 REGNO (cfun->machine->sync_bar)); 1175 fprintf (file, "\t}\n"); 1176 } 1177 1178 /* Emit code to initialize predicate and master lane index registers for 1179 -muniform-simt code generation variant. */ 1180 1181 static void 1182 nvptx_init_unisimt_predicate (FILE *file) 1183 { 1184 cfun->machine->unisimt_location = gen_reg_rtx (Pmode); 1185 int loc = REGNO (cfun->machine->unisimt_location); 1186 int bits = POINTER_SIZE; 1187 fprintf (file, "\t.reg.u%d %%r%d;\n", bits, loc); 1188 fprintf (file, "\t{\n"); 1189 fprintf (file, "\t\t.reg.u32 %%ustmp0;\n"); 1190 fprintf (file, "\t\t.reg.u%d %%ustmp1;\n", bits); 1191 fprintf (file, "\t\tmov.u32 %%ustmp0, %%tid.y;\n"); 1192 fprintf (file, "\t\tmul%s.u32 %%ustmp1, %%ustmp0, 4;\n", 1193 bits == 64 ? ".wide" : ".lo"); 1194 fprintf (file, "\t\tmov.u%d %%r%d, __nvptx_uni;\n", bits, loc); 1195 fprintf (file, "\t\tadd.u%d %%r%d, %%r%d, %%ustmp1;\n", bits, loc, loc); 1196 if (cfun->machine->unisimt_predicate) 1197 { 1198 int master = REGNO (cfun->machine->unisimt_master); 1199 int pred = REGNO (cfun->machine->unisimt_predicate); 1200 fprintf (file, "\t\tld.shared.u32 %%r%d, [%%r%d];\n", master, loc); 1201 fprintf (file, "\t\tmov.u32 %%ustmp0, %%laneid;\n"); 1202 /* Compute 'master lane index' as 'laneid & __nvptx_uni[tid.y]'. 
*/ 1203 fprintf (file, "\t\tand.b32 %%r%d, %%r%d, %%ustmp0;\n", master, master); 1204 /* Compute predicate as 'tid.x == master'. */ 1205 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp0;\n", pred, master); 1206 } 1207 fprintf (file, "\t}\n"); 1208 need_unisimt_decl = true; 1209 } 1210 1211 /* Emit kernel NAME for function ORIG outlined for an OpenMP 'target' region: 1212 1213 extern void gomp_nvptx_main (void (*fn)(void*), void *fnarg); 1214 void __attribute__((kernel)) NAME (void *arg, char *stack, size_t stacksize) 1215 { 1216 __nvptx_stacks[tid.y] = stack + stacksize * (ctaid.x * ntid.y + tid.y + 1); 1217 __nvptx_uni[tid.y] = 0; 1218 gomp_nvptx_main (ORIG, arg); 1219 } 1220 ORIG itself should not be emitted as a PTX .entry function. */ 1221 1222 static void 1223 write_omp_entry (FILE *file, const char *name, const char *orig) 1224 { 1225 static bool gomp_nvptx_main_declared; 1226 if (!gomp_nvptx_main_declared) 1227 { 1228 gomp_nvptx_main_declared = true; 1229 write_fn_marker (func_decls, false, true, "gomp_nvptx_main"); 1230 func_decls << ".extern .func gomp_nvptx_main (.param.u" << POINTER_SIZE 1231 << " %in_ar1, .param.u" << POINTER_SIZE << " %in_ar2);\n"; 1232 } 1233 /* PR79332. Single out this string; it confuses gcc.pot generation. */ 1234 #define NTID_Y "%ntid.y" 1235 #define ENTRY_TEMPLATE(PS, PS_BYTES, MAD_PS_32) "\ 1236 (.param.u" PS " %arg, .param.u" PS " %stack, .param.u" PS " %sz)\n\ 1237 {\n\ 1238 .reg.u32 %r<3>;\n\ 1239 .reg.u" PS " %R<4>;\n\ 1240 mov.u32 %r0, %tid.y;\n\ 1241 mov.u32 %r1, " NTID_Y ";\n\ 1242 mov.u32 %r2, %ctaid.x;\n\ 1243 cvt.u" PS ".u32 %R1, %r0;\n\ 1244 " MAD_PS_32 " %R1, %r1, %r2, %R1;\n\ 1245 mov.u" PS " %R0, __nvptx_stacks;\n\ 1246 " MAD_PS_32 " %R0, %r0, " PS_BYTES ", %R0;\n\ 1247 ld.param.u" PS " %R2, [%stack];\n\ 1248 ld.param.u" PS " %R3, [%sz];\n\ 1249 add.u" PS " %R2, %R2, %R3;\n\ 1250 mad.lo.u" PS " %R2, %R1, %R3, %R2;\n\ 1251 st.shared.u" PS " [%R0], %R2;\n\ 1252 mov.u" PS " %R0, __nvptx_uni;\n\ 1253 " MAD_PS_32 " %R0, %r0, 4, %R0;\n\ 1254 mov.u32 %r0, 0;\n\ 1255 st.shared.u32 [%R0], %r0;\n\ 1256 mov.u" PS " %R0, \0;\n\ 1257 ld.param.u" PS " %R1, [%arg];\n\ 1258 {\n\ 1259 .param.u" PS " %P<2>;\n\ 1260 st.param.u" PS " [%P0], %R0;\n\ 1261 st.param.u" PS " [%P1], %R1;\n\ 1262 call.uni gomp_nvptx_main, (%P0, %P1);\n\ 1263 }\n\ 1264 ret.uni;\n\ 1265 }\n" 1266 static const char entry64[] = ENTRY_TEMPLATE ("64", "8", "mad.wide.u32"); 1267 static const char entry32[] = ENTRY_TEMPLATE ("32", "4", "mad.lo.u32 "); 1268 #undef ENTRY_TEMPLATE 1269 #undef NTID_Y 1270 const char *entry_1 = TARGET_ABI64 ? entry64 : entry32; 1271 /* Position ENTRY_2 after the embedded nul using strlen of the prefix. */ 1272 const char *entry_2 = entry_1 + strlen (entry64) + 1; 1273 fprintf (file, ".visible .entry %s%s%s%s", name, entry_1, orig, entry_2); 1274 need_softstack_decl = need_unisimt_decl = true; 1275 } 1276 1277 /* Implement ASM_DECLARE_FUNCTION_NAME. Writes the start of a ptx 1278 function, including local var decls and copies from the arguments to 1279 local regs. 
*/ 1280 1281 void 1282 nvptx_declare_function_name (FILE *file, const char *name, const_tree decl) 1283 { 1284 tree fntype = TREE_TYPE (decl); 1285 tree result_type = TREE_TYPE (fntype); 1286 int argno = 0; 1287 1288 if (lookup_attribute ("omp target entrypoint", DECL_ATTRIBUTES (decl)) 1289 && !lookup_attribute ("oacc function", DECL_ATTRIBUTES (decl))) 1290 { 1291 char *buf = (char *) alloca (strlen (name) + sizeof ("$impl")); 1292 sprintf (buf, "%s$impl", name); 1293 write_omp_entry (file, name, buf); 1294 name = buf; 1295 } 1296 /* We construct the initial part of the function into a string 1297 stream, in order to share the prototype writing code. */ 1298 std::stringstream s; 1299 write_fn_proto (s, true, name, decl); 1300 s << "{\n"; 1301 1302 bool return_in_mem = write_return_type (s, false, result_type); 1303 if (return_in_mem) 1304 argno = write_arg_type (s, 0, argno, ptr_type_node, true); 1305 1306 /* Declare and initialize incoming arguments. */ 1307 tree args = TYPE_ARG_TYPES (fntype); 1308 bool prototyped = true; 1309 if (!args) 1310 { 1311 args = DECL_ARGUMENTS (decl); 1312 prototyped = false; 1313 } 1314 1315 for (; args != NULL_TREE; args = TREE_CHAIN (args)) 1316 { 1317 tree type = prototyped ? TREE_VALUE (args) : TREE_TYPE (args); 1318 1319 argno = write_arg_type (s, 0, argno, type, prototyped); 1320 } 1321 1322 if (stdarg_p (fntype)) 1323 argno = write_arg_type (s, ARG_POINTER_REGNUM, argno, ptr_type_node, 1324 true); 1325 1326 if (DECL_STATIC_CHAIN (decl) || cfun->machine->has_chain) 1327 write_arg_type (s, STATIC_CHAIN_REGNUM, 1328 DECL_STATIC_CHAIN (decl) ? argno : -1, ptr_type_node, 1329 true); 1330 1331 fprintf (file, "%s", s.str().c_str()); 1332 1333 /* Usually 'crtl->is_leaf' is computed during register allocator 1334 initialization (which is not done on NVPTX) or for pressure-sensitive 1335 optimizations. Initialize it here, except if already set. */ 1336 if (!crtl->is_leaf) 1337 crtl->is_leaf = leaf_function_p (); 1338 1339 HOST_WIDE_INT sz = get_frame_size (); 1340 bool need_frameptr = sz || cfun->machine->has_chain; 1341 int alignment = crtl->stack_alignment_needed / BITS_PER_UNIT; 1342 if (!TARGET_SOFT_STACK) 1343 { 1344 /* Declare a local var for outgoing varargs. */ 1345 if (cfun->machine->has_varadic) 1346 init_frame (file, STACK_POINTER_REGNUM, 1347 UNITS_PER_WORD, crtl->outgoing_args_size); 1348 1349 /* Declare a local variable for the frame. Force its size to be 1350 DImode-compatible. 
*/ 1351 if (need_frameptr) 1352 init_frame (file, FRAME_POINTER_REGNUM, alignment, 1353 ROUND_UP (sz, GET_MODE_SIZE (DImode))); 1354 } 1355 else if (need_frameptr || cfun->machine->has_varadic || cfun->calls_alloca 1356 || (cfun->machine->has_simtreg && !crtl->is_leaf)) 1357 init_softstack_frame (file, alignment, sz); 1358 1359 if (cfun->machine->has_simtreg) 1360 { 1361 unsigned HOST_WIDE_INT &simtsz = cfun->machine->simt_stack_size; 1362 unsigned HOST_WIDE_INT &align = cfun->machine->simt_stack_align; 1363 align = MAX (align, GET_MODE_SIZE (DImode)); 1364 if (!crtl->is_leaf || cfun->calls_alloca) 1365 simtsz = HOST_WIDE_INT_M1U; 1366 if (simtsz == HOST_WIDE_INT_M1U) 1367 simtsz = nvptx_softstack_size; 1368 if (cfun->machine->has_softstack) 1369 simtsz += POINTER_SIZE / 8; 1370 simtsz = ROUND_UP (simtsz, GET_MODE_SIZE (DImode)); 1371 if (align > GET_MODE_SIZE (DImode)) 1372 simtsz += align - GET_MODE_SIZE (DImode); 1373 if (simtsz) 1374 fprintf (file, "\t.local.align 8 .b8 %%simtstack_ar[" 1375 HOST_WIDE_INT_PRINT_DEC "];\n", simtsz); 1376 } 1377 1378 /* Restore the vector reduction partition register, if necessary. 1379 FIXME: Find out when and why this is necessary, and fix it. */ 1380 if (cfun->machine->red_partition) 1381 regno_reg_rtx[REGNO (cfun->machine->red_partition)] 1382 = cfun->machine->red_partition; 1383 1384 /* Declare the pseudos we have as ptx registers. */ 1385 int maxregs = max_reg_num (); 1386 for (int i = LAST_VIRTUAL_REGISTER + 1; i < maxregs; i++) 1387 { 1388 if (regno_reg_rtx[i] != const0_rtx) 1389 { 1390 machine_mode mode = PSEUDO_REGNO_MODE (i); 1391 machine_mode split = maybe_split_mode (mode); 1392 1393 if (split_mode_p (mode)) 1394 mode = split; 1395 fprintf (file, "\t.reg%s ", nvptx_ptx_type_from_mode (mode, true)); 1396 output_reg (file, i, split, -2); 1397 fprintf (file, ";\n"); 1398 } 1399 } 1400 1401 /* Emit axis predicates. */ 1402 if (cfun->machine->axis_predicate[0]) 1403 nvptx_init_axis_predicate (file, 1404 REGNO (cfun->machine->axis_predicate[0]), "y"); 1405 if (cfun->machine->axis_predicate[1]) 1406 nvptx_init_axis_predicate (file, 1407 REGNO (cfun->machine->axis_predicate[1]), "x"); 1408 if (cfun->machine->unisimt_predicate 1409 || (cfun->machine->has_simtreg && !crtl->is_leaf)) 1410 nvptx_init_unisimt_predicate (file); 1411 if (cfun->machine->bcast_partition || cfun->machine->sync_bar) 1412 nvptx_init_oacc_workers (file); 1413 } 1414 1415 /* Output code for switching uniform-simt state. ENTERING indicates whether 1416 we are entering or leaving non-uniform execution region. */ 1417 1418 static void 1419 nvptx_output_unisimt_switch (FILE *file, bool entering) 1420 { 1421 if (crtl->is_leaf && !cfun->machine->unisimt_predicate) 1422 return; 1423 fprintf (file, "\t{\n"); 1424 fprintf (file, "\t\t.reg.u32 %%ustmp2;\n"); 1425 fprintf (file, "\t\tmov.u32 %%ustmp2, %d;\n", entering ? -1 : 0); 1426 if (!crtl->is_leaf) 1427 { 1428 int loc = REGNO (cfun->machine->unisimt_location); 1429 fprintf (file, "\t\tst.shared.u32 [%%r%d], %%ustmp2;\n", loc); 1430 } 1431 if (cfun->machine->unisimt_predicate) 1432 { 1433 int master = REGNO (cfun->machine->unisimt_master); 1434 int pred = REGNO (cfun->machine->unisimt_predicate); 1435 fprintf (file, "\t\tmov.u32 %%ustmp2, %%laneid;\n"); 1436 fprintf (file, "\t\tmov.u32 %%r%d, %s;\n", 1437 master, entering ? 
"%ustmp2" : "0"); 1438 fprintf (file, "\t\tsetp.eq.u32 %%r%d, %%r%d, %%ustmp2;\n", pred, master); 1439 } 1440 fprintf (file, "\t}\n"); 1441 } 1442 1443 /* Output code for allocating per-lane storage and switching soft-stack pointer. 1444 ENTERING indicates whether we are entering or leaving non-uniform execution. 1445 PTR is the register pointing to allocated storage, it is assigned to on 1446 entering and used to restore state on leaving. SIZE and ALIGN are used only 1447 on entering. */ 1448 1449 static void 1450 nvptx_output_softstack_switch (FILE *file, bool entering, 1451 rtx ptr, rtx size, rtx align) 1452 { 1453 gcc_assert (REG_P (ptr) && !HARD_REGISTER_P (ptr)); 1454 if (crtl->is_leaf && !cfun->machine->simt_stack_size) 1455 return; 1456 int bits = POINTER_SIZE, regno = REGNO (ptr); 1457 fprintf (file, "\t{\n"); 1458 if (entering) 1459 { 1460 fprintf (file, "\t\tcvta.local.u%d %%r%d, %%simtstack_ar + " 1461 HOST_WIDE_INT_PRINT_DEC ";\n", bits, regno, 1462 cfun->machine->simt_stack_size); 1463 fprintf (file, "\t\tsub.u%d %%r%d, %%r%d, ", bits, regno, regno); 1464 if (CONST_INT_P (size)) 1465 fprintf (file, HOST_WIDE_INT_PRINT_DEC, 1466 ROUND_UP (UINTVAL (size), GET_MODE_SIZE (DImode))); 1467 else 1468 output_reg (file, REGNO (size), VOIDmode); 1469 fputs (";\n", file); 1470 if (!CONST_INT_P (size) || UINTVAL (align) > GET_MODE_SIZE (DImode)) 1471 fprintf (file, 1472 "\t\tand.b%d %%r%d, %%r%d, -" HOST_WIDE_INT_PRINT_DEC ";\n", 1473 bits, regno, regno, UINTVAL (align)); 1474 } 1475 if (cfun->machine->has_softstack) 1476 { 1477 const char *reg_stack = reg_names[STACK_POINTER_REGNUM]; 1478 if (entering) 1479 { 1480 fprintf (file, "\t\tst.u%d [%%r%d + -%d], %s;\n", 1481 bits, regno, bits / 8, reg_stack); 1482 fprintf (file, "\t\tsub.u%d %s, %%r%d, %d;\n", 1483 bits, reg_stack, regno, bits / 8); 1484 } 1485 else 1486 { 1487 fprintf (file, "\t\tld.u%d %s, [%%r%d + -%d];\n", 1488 bits, reg_stack, regno, bits / 8); 1489 } 1490 nvptx_output_set_softstack (REGNO (stack_pointer_rtx)); 1491 } 1492 fprintf (file, "\t}\n"); 1493 } 1494 1495 /* Output code to enter non-uniform execution region. DEST is a register 1496 to hold a per-lane allocation given by SIZE and ALIGN. */ 1497 1498 const char * 1499 nvptx_output_simt_enter (rtx dest, rtx size, rtx align) 1500 { 1501 nvptx_output_unisimt_switch (asm_out_file, true); 1502 nvptx_output_softstack_switch (asm_out_file, true, dest, size, align); 1503 return ""; 1504 } 1505 1506 /* Output code to leave non-uniform execution region. SRC is the register 1507 holding per-lane storage previously allocated by omp_simt_enter insn. */ 1508 1509 const char * 1510 nvptx_output_simt_exit (rtx src) 1511 { 1512 nvptx_output_unisimt_switch (asm_out_file, false); 1513 nvptx_output_softstack_switch (asm_out_file, false, src, NULL_RTX, NULL_RTX); 1514 return ""; 1515 } 1516 1517 /* Output instruction that sets soft stack pointer in shared memory to the 1518 value in register given by SRC_REGNO. */ 1519 1520 const char * 1521 nvptx_output_set_softstack (unsigned src_regno) 1522 { 1523 if (cfun->machine->has_softstack && !crtl->is_leaf) 1524 { 1525 fprintf (asm_out_file, "\tst.shared.u%d\t[%s], ", 1526 POINTER_SIZE, reg_names[SOFTSTACK_SLOT_REGNUM]); 1527 output_reg (asm_out_file, src_regno, VOIDmode); 1528 fprintf (asm_out_file, ";\n"); 1529 } 1530 return ""; 1531 } 1532 /* Output a return instruction. Also copy the return value to its outgoing 1533 location. 
*/ 1534 1535 const char * 1536 nvptx_output_return (void) 1537 { 1538 machine_mode mode = (machine_mode)cfun->machine->return_mode; 1539 1540 if (mode != VOIDmode) 1541 fprintf (asm_out_file, "\tst.param%s\t[%s_out], %s;\n", 1542 nvptx_ptx_type_from_mode (mode, false), 1543 reg_names[NVPTX_RETURN_REGNUM], 1544 reg_names[NVPTX_RETURN_REGNUM]); 1545 1546 return "ret;"; 1547 } 1548 1549 /* Terminate a function by writing a closing brace to FILE. */ 1550 1551 void 1552 nvptx_function_end (FILE *file) 1553 { 1554 fprintf (file, "}\n"); 1555 } 1556 1557 /* Decide whether we can make a sibling call to a function. For ptx, we 1558 can't. */ 1559 1560 static bool 1561 nvptx_function_ok_for_sibcall (tree, tree) 1562 { 1563 return false; 1564 } 1565 1566 /* Return Dynamic ReAlignment Pointer RTX. For PTX there isn't any. */ 1567 1568 static rtx 1569 nvptx_get_drap_rtx (void) 1570 { 1571 if (TARGET_SOFT_STACK && stack_realign_drap) 1572 return arg_pointer_rtx; 1573 return NULL_RTX; 1574 } 1575 1576 /* Implement the TARGET_CALL_ARGS hook. Record information about one 1577 argument to the next call. */ 1578 1579 static void 1580 nvptx_call_args (rtx arg, tree fntype) 1581 { 1582 if (!cfun->machine->doing_call) 1583 { 1584 cfun->machine->doing_call = true; 1585 cfun->machine->is_varadic = false; 1586 cfun->machine->num_args = 0; 1587 1588 if (fntype && stdarg_p (fntype)) 1589 { 1590 cfun->machine->is_varadic = true; 1591 cfun->machine->has_varadic = true; 1592 cfun->machine->num_args++; 1593 } 1594 } 1595 1596 if (REG_P (arg) && arg != pc_rtx) 1597 { 1598 cfun->machine->num_args++; 1599 cfun->machine->call_args = alloc_EXPR_LIST (VOIDmode, arg, 1600 cfun->machine->call_args); 1601 } 1602 } 1603 1604 /* Implement the corresponding END_CALL_ARGS hook. Clear and free the 1605 information we recorded. */ 1606 1607 static void 1608 nvptx_end_call_args (void) 1609 { 1610 cfun->machine->doing_call = false; 1611 free_EXPR_LIST_list (&cfun->machine->call_args); 1612 } 1613 1614 /* Emit the sequence for a call to ADDRESS, setting RETVAL. Keep 1615 track of whether calls involving static chains or varargs were seen 1616 in the current function. 1617 For libcalls, maintain a hash table of decls we have seen, and 1618 record a function decl for later when encountering a new one. */ 1619 1620 void 1621 nvptx_expand_call (rtx retval, rtx address) 1622 { 1623 rtx callee = XEXP (address, 0); 1624 rtx varargs = NULL_RTX; 1625 unsigned parallel = 0; 1626 1627 if (!call_insn_operand (callee, Pmode)) 1628 { 1629 callee = force_reg (Pmode, callee); 1630 address = change_address (address, QImode, callee); 1631 } 1632 1633 if (GET_CODE (callee) == SYMBOL_REF) 1634 { 1635 tree decl = SYMBOL_REF_DECL (callee); 1636 if (decl != NULL_TREE) 1637 { 1638 if (DECL_STATIC_CHAIN (decl)) 1639 cfun->machine->has_chain = true; 1640 1641 tree attr = oacc_get_fn_attrib (decl); 1642 if (attr) 1643 { 1644 tree dims = TREE_VALUE (attr); 1645 1646 parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1; 1647 for (int ix = 0; ix != GOMP_DIM_MAX; ix++) 1648 { 1649 if (TREE_PURPOSE (dims) 1650 && !integer_zerop (TREE_PURPOSE (dims))) 1651 break; 1652 /* Not on this axis. 
		     */
		  parallel ^= GOMP_DIM_MASK (ix);
		  dims = TREE_CHAIN (dims);
		}
	    }
	}
    }

  unsigned nargs = cfun->machine->num_args;
  if (cfun->machine->is_varadic)
    {
      varargs = gen_reg_rtx (Pmode);
      emit_move_insn (varargs, stack_pointer_rtx);
    }

  rtvec vec = rtvec_alloc (nargs + 1);
  rtx pat = gen_rtx_PARALLEL (VOIDmode, vec);
  int vec_pos = 0;

  rtx call = gen_rtx_CALL (VOIDmode, address, const0_rtx);
  rtx tmp_retval = retval;
  if (retval)
    {
      if (!nvptx_register_operand (retval, GET_MODE (retval)))
	tmp_retval = gen_reg_rtx (GET_MODE (retval));
      call = gen_rtx_SET (tmp_retval, call);
    }
  XVECEXP (pat, 0, vec_pos++) = call;

  /* Construct the call insn, including a USE for each argument pseudo
     register.  These will be used when printing the insn.  */
  for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, XEXP (arg, 0));

  if (varargs)
    XVECEXP (pat, 0, vec_pos++) = gen_rtx_USE (VOIDmode, varargs);

  gcc_assert (vec_pos == XVECLEN (pat, 0));

  nvptx_emit_forking (parallel, true);
  emit_call_insn (pat);
  nvptx_emit_joining (parallel, true);

  if (tmp_retval != retval)
    emit_move_insn (retval, tmp_retval);
}

/* Emit a comparison COMPARE, and return the new test to be used in the
   jump.  */

rtx
nvptx_expand_compare (rtx compare)
{
  rtx pred = gen_reg_rtx (BImode);
  rtx cmp = gen_rtx_fmt_ee (GET_CODE (compare), BImode,
			    XEXP (compare, 0), XEXP (compare, 1));
  emit_insn (gen_rtx_SET (pred, cmp));
  return gen_rtx_NE (BImode, pred, const0_rtx);
}

/* Expand the oacc fork & join primitive into ptx-required unspecs.  */

void
nvptx_expand_oacc_fork (unsigned mode)
{
  nvptx_emit_forking (GOMP_DIM_MASK (mode), false);
}

void
nvptx_expand_oacc_join (unsigned mode)
{
  nvptx_emit_joining (GOMP_DIM_MASK (mode), false);
}

/* Generate instruction(s) to unpack a 64 bit object into 2 32 bit
   objects.  */

static rtx
nvptx_gen_unpack (rtx dst0, rtx dst1, rtx src)
{
  rtx res;

  switch (GET_MODE (src))
    {
    case E_DImode:
      res = gen_unpackdisi2 (dst0, dst1, src);
      break;
    case E_DFmode:
      res = gen_unpackdfsi2 (dst0, dst1, src);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate instruction(s) to pack 2 32 bit objects into a 64 bit
   object.  */

static rtx
nvptx_gen_pack (rtx dst, rtx src0, rtx src1)
{
  rtx res;

  switch (GET_MODE (dst))
    {
    case E_DImode:
      res = gen_packsidi2 (dst, src0, src1);
      break;
    case E_DFmode:
      res = gen_packsidf2 (dst, src0, src1);
      break;
    default: gcc_unreachable ();
    }
  return res;
}

/* Generate an instruction or sequence to shuffle SRC into DST across
   the lanes of a single warp, according to IDX and KIND.
*/ 1770 1771 rtx 1772 nvptx_gen_shuffle (rtx dst, rtx src, rtx idx, nvptx_shuffle_kind kind) 1773 { 1774 rtx res; 1775 1776 switch (GET_MODE (dst)) 1777 { 1778 case E_SImode: 1779 res = gen_nvptx_shufflesi (dst, src, idx, GEN_INT (kind)); 1780 break; 1781 case E_SFmode: 1782 res = gen_nvptx_shufflesf (dst, src, idx, GEN_INT (kind)); 1783 break; 1784 case E_DImode: 1785 case E_DFmode: 1786 { 1787 rtx tmp0 = gen_reg_rtx (SImode); 1788 rtx tmp1 = gen_reg_rtx (SImode); 1789 1790 start_sequence (); 1791 emit_insn (nvptx_gen_unpack (tmp0, tmp1, src)); 1792 emit_insn (nvptx_gen_shuffle (tmp0, tmp0, idx, kind)); 1793 emit_insn (nvptx_gen_shuffle (tmp1, tmp1, idx, kind)); 1794 emit_insn (nvptx_gen_pack (dst, tmp0, tmp1)); 1795 res = get_insns (); 1796 end_sequence (); 1797 } 1798 break; 1799 case E_BImode: 1800 { 1801 rtx tmp = gen_reg_rtx (SImode); 1802 1803 start_sequence (); 1804 emit_insn (gen_sel_truesi (tmp, src, GEN_INT (1), const0_rtx)); 1805 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind)); 1806 emit_insn (gen_rtx_SET (dst, gen_rtx_NE (BImode, tmp, const0_rtx))); 1807 res = get_insns (); 1808 end_sequence (); 1809 } 1810 break; 1811 case E_QImode: 1812 case E_HImode: 1813 { 1814 rtx tmp = gen_reg_rtx (SImode); 1815 1816 start_sequence (); 1817 emit_insn (gen_rtx_SET (tmp, gen_rtx_fmt_e (ZERO_EXTEND, SImode, src))); 1818 emit_insn (nvptx_gen_shuffle (tmp, tmp, idx, kind)); 1819 emit_insn (gen_rtx_SET (dst, gen_rtx_fmt_e (TRUNCATE, GET_MODE (dst), 1820 tmp))); 1821 res = get_insns (); 1822 end_sequence (); 1823 } 1824 break; 1825 1826 default: 1827 gcc_unreachable (); 1828 } 1829 return res; 1830 } 1831 1832 /* Generate an instruction or sequence to broadcast register REG 1833 across the vectors of a single warp. */ 1834 1835 static rtx 1836 nvptx_gen_warp_bcast (rtx reg) 1837 { 1838 return nvptx_gen_shuffle (reg, reg, const0_rtx, SHUFFLE_IDX); 1839 } 1840 1841 /* Structure used when generating a worker-level spill or fill. */ 1842 1843 struct broadcast_data_t 1844 { 1845 rtx base; /* Register holding base addr of buffer. */ 1846 rtx ptr; /* Iteration var, if needed. */ 1847 unsigned offset; /* Offset into worker buffer. */ 1848 }; 1849 1850 /* Direction of the spill/fill and looping setup/teardown indicator. */ 1851 1852 enum propagate_mask 1853 { 1854 PM_read = 1 << 0, 1855 PM_write = 1 << 1, 1856 PM_loop_begin = 1 << 2, 1857 PM_loop_end = 1 << 3, 1858 1859 PM_read_write = PM_read | PM_write 1860 }; 1861 1862 /* Generate instruction(s) to spill or fill register REG to/from the 1863 worker broadcast array. PM indicates what is to be done, REP 1864 how many loop iterations will be executed (0 for not a loop). 
*/ 1865 1866 static rtx 1867 nvptx_gen_shared_bcast (rtx reg, propagate_mask pm, unsigned rep, 1868 broadcast_data_t *data, bool vector) 1869 { 1870 rtx res; 1871 machine_mode mode = GET_MODE (reg); 1872 1873 switch (mode) 1874 { 1875 case E_BImode: 1876 { 1877 rtx tmp = gen_reg_rtx (SImode); 1878 1879 start_sequence (); 1880 if (pm & PM_read) 1881 emit_insn (gen_sel_truesi (tmp, reg, GEN_INT (1), const0_rtx)); 1882 emit_insn (nvptx_gen_shared_bcast (tmp, pm, rep, data, vector)); 1883 if (pm & PM_write) 1884 emit_insn (gen_rtx_SET (reg, gen_rtx_NE (BImode, tmp, const0_rtx))); 1885 res = get_insns (); 1886 end_sequence (); 1887 } 1888 break; 1889 1890 default: 1891 { 1892 rtx addr = data->ptr; 1893 1894 if (!addr) 1895 { 1896 unsigned align = GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT; 1897 1898 oacc_bcast_align = MAX (oacc_bcast_align, align); 1899 data->offset = ROUND_UP (data->offset, align); 1900 addr = data->base; 1901 gcc_assert (data->base != NULL); 1902 if (data->offset) 1903 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (data->offset)); 1904 } 1905 1906 addr = gen_rtx_MEM (mode, addr); 1907 if (pm == PM_read) 1908 res = gen_rtx_SET (addr, reg); 1909 else if (pm == PM_write) 1910 res = gen_rtx_SET (reg, addr); 1911 else 1912 gcc_unreachable (); 1913 1914 if (data->ptr) 1915 { 1916 /* We're using a ptr, increment it. */ 1917 start_sequence (); 1918 1919 emit_insn (res); 1920 emit_insn (gen_adddi3 (data->ptr, data->ptr, 1921 GEN_INT (GET_MODE_SIZE (GET_MODE (reg))))); 1922 res = get_insns (); 1923 end_sequence (); 1924 } 1925 else 1926 rep = 1; 1927 data->offset += rep * GET_MODE_SIZE (GET_MODE (reg)); 1928 } 1929 break; 1930 } 1931 return res; 1932 } 1933 1934 /* Returns true if X is a valid address for use in a memory reference. */ 1935 1936 static bool 1937 nvptx_legitimate_address_p (machine_mode, rtx x, bool) 1938 { 1939 enum rtx_code code = GET_CODE (x); 1940 1941 switch (code) 1942 { 1943 case REG: 1944 return true; 1945 1946 case PLUS: 1947 if (REG_P (XEXP (x, 0)) && CONST_INT_P (XEXP (x, 1))) 1948 return true; 1949 return false; 1950 1951 case CONST: 1952 case SYMBOL_REF: 1953 case LABEL_REF: 1954 return true; 1955 1956 default: 1957 return false; 1958 } 1959 } 1960 1961 /* Machinery to output constant initializers. When beginning an 1962 initializer, we decide on a fragment size (which is visible in ptx 1963 in the type used), and then all initializer data is buffered until 1964 a fragment is filled and ready to be written out. */ 1965 1966 static struct 1967 { 1968 unsigned HOST_WIDE_INT mask; /* Mask for storing fragment. */ 1969 unsigned HOST_WIDE_INT val; /* Current fragment value. */ 1970 unsigned HOST_WIDE_INT remaining; /* Remaining bytes to be written 1971 out. */ 1972 unsigned size; /* Fragment size to accumulate. */ 1973 unsigned offset; /* Offset within current fragment. */ 1974 bool started; /* Whether we've output any initializer. */ 1975 } init_frag; 1976 1977 /* The current fragment is full, write it out. SYM may provide a 1978 symbolic reference we should output, in which case the fragment 1979 value is the addend. */ 1980 1981 static void 1982 output_init_frag (rtx sym) 1983 { 1984 fprintf (asm_out_file, init_frag.started ? 
", " : " = { "); 1985 unsigned HOST_WIDE_INT val = init_frag.val; 1986 1987 init_frag.started = true; 1988 init_frag.val = 0; 1989 init_frag.offset = 0; 1990 init_frag.remaining--; 1991 1992 if (sym) 1993 { 1994 bool function = (SYMBOL_REF_DECL (sym) 1995 && (TREE_CODE (SYMBOL_REF_DECL (sym)) == FUNCTION_DECL)); 1996 if (!function) 1997 fprintf (asm_out_file, "generic("); 1998 output_address (VOIDmode, sym); 1999 if (!function) 2000 fprintf (asm_out_file, ")"); 2001 if (val) 2002 fprintf (asm_out_file, " + "); 2003 } 2004 2005 if (!sym || val) 2006 fprintf (asm_out_file, HOST_WIDE_INT_PRINT_DEC, val); 2007 } 2008 2009 /* Add value VAL of size SIZE to the data we're emitting, and keep 2010 writing out chunks as they fill up. */ 2011 2012 static void 2013 nvptx_assemble_value (unsigned HOST_WIDE_INT val, unsigned size) 2014 { 2015 val &= ((unsigned HOST_WIDE_INT)2 << (size * BITS_PER_UNIT - 1)) - 1; 2016 2017 for (unsigned part = 0; size; size -= part) 2018 { 2019 val >>= part * BITS_PER_UNIT; 2020 part = init_frag.size - init_frag.offset; 2021 part = MIN (part, size); 2022 2023 unsigned HOST_WIDE_INT partial 2024 = val << (init_frag.offset * BITS_PER_UNIT); 2025 init_frag.val |= partial & init_frag.mask; 2026 init_frag.offset += part; 2027 2028 if (init_frag.offset == init_frag.size) 2029 output_init_frag (NULL); 2030 } 2031 } 2032 2033 /* Target hook for assembling integer object X of size SIZE. */ 2034 2035 static bool 2036 nvptx_assemble_integer (rtx x, unsigned int size, int ARG_UNUSED (aligned_p)) 2037 { 2038 HOST_WIDE_INT val = 0; 2039 2040 switch (GET_CODE (x)) 2041 { 2042 default: 2043 /* Let the generic machinery figure it out, usually for a 2044 CONST_WIDE_INT. */ 2045 return false; 2046 2047 case CONST_INT: 2048 nvptx_assemble_value (INTVAL (x), size); 2049 break; 2050 2051 case CONST: 2052 x = XEXP (x, 0); 2053 gcc_assert (GET_CODE (x) == PLUS); 2054 val = INTVAL (XEXP (x, 1)); 2055 x = XEXP (x, 0); 2056 gcc_assert (GET_CODE (x) == SYMBOL_REF); 2057 /* FALLTHROUGH */ 2058 2059 case SYMBOL_REF: 2060 gcc_assert (size == init_frag.size); 2061 if (init_frag.offset) 2062 sorry ("cannot emit unaligned pointers in ptx assembly"); 2063 2064 nvptx_maybe_record_fnsym (x); 2065 init_frag.val = val; 2066 output_init_frag (x); 2067 break; 2068 } 2069 2070 return true; 2071 } 2072 2073 /* Output SIZE zero bytes. We ignore the FILE argument since the 2074 functions we're calling to perform the output just use 2075 asm_out_file. */ 2076 2077 void 2078 nvptx_output_skip (FILE *, unsigned HOST_WIDE_INT size) 2079 { 2080 /* Finish the current fragment, if it's started. */ 2081 if (init_frag.offset) 2082 { 2083 unsigned part = init_frag.size - init_frag.offset; 2084 part = MIN (part, (unsigned)size); 2085 size -= part; 2086 nvptx_assemble_value (0, part); 2087 } 2088 2089 /* If this skip doesn't terminate the initializer, write as many 2090 remaining pieces as possible directly. */ 2091 if (size < init_frag.remaining * init_frag.size) 2092 { 2093 while (size >= init_frag.size) 2094 { 2095 size -= init_frag.size; 2096 output_init_frag (NULL_RTX); 2097 } 2098 if (size) 2099 nvptx_assemble_value (0, size); 2100 } 2101 } 2102 2103 /* Output a string STR with length SIZE. As in nvptx_output_skip we 2104 ignore the FILE arg. 
*/ 2105 2106 void 2107 nvptx_output_ascii (FILE *, const char *str, unsigned HOST_WIDE_INT size) 2108 { 2109 for (unsigned HOST_WIDE_INT i = 0; i < size; i++) 2110 nvptx_assemble_value (str[i], 1); 2111 } 2112 2113 /* Return true if TYPE is a record type where the last field is an array without 2114 given dimension. */ 2115 2116 static bool 2117 flexible_array_member_type_p (const_tree type) 2118 { 2119 if (TREE_CODE (type) != RECORD_TYPE) 2120 return false; 2121 2122 const_tree last_field = NULL_TREE; 2123 for (const_tree f = TYPE_FIELDS (type); f; f = TREE_CHAIN (f)) 2124 last_field = f; 2125 2126 if (!last_field) 2127 return false; 2128 2129 const_tree last_field_type = TREE_TYPE (last_field); 2130 if (TREE_CODE (last_field_type) != ARRAY_TYPE) 2131 return false; 2132 2133 return (! TYPE_DOMAIN (last_field_type) 2134 || ! TYPE_MAX_VALUE (TYPE_DOMAIN (last_field_type))); 2135 } 2136 2137 /* Emit a PTX variable decl and prepare for emission of its 2138 initializer. NAME is the symbol name and SETION the PTX data 2139 area. The type is TYPE, object size SIZE and alignment is ALIGN. 2140 The caller has already emitted any indentation and linkage 2141 specifier. It is responsible for any initializer, terminating ; 2142 and newline. SIZE is in bytes, ALIGN is in bits -- confusingly 2143 this is the opposite way round that PTX wants them! */ 2144 2145 static void 2146 nvptx_assemble_decl_begin (FILE *file, const char *name, const char *section, 2147 const_tree type, HOST_WIDE_INT size, unsigned align, 2148 bool undefined = false) 2149 { 2150 bool atype = (TREE_CODE (type) == ARRAY_TYPE) 2151 && (TYPE_DOMAIN (type) == NULL_TREE); 2152 2153 if (undefined && flexible_array_member_type_p (type)) 2154 { 2155 size = 0; 2156 atype = true; 2157 } 2158 2159 while (TREE_CODE (type) == ARRAY_TYPE) 2160 type = TREE_TYPE (type); 2161 2162 if (TREE_CODE (type) == VECTOR_TYPE 2163 || TREE_CODE (type) == COMPLEX_TYPE) 2164 /* Neither vector nor complex types can contain the other. */ 2165 type = TREE_TYPE (type); 2166 2167 unsigned elt_size = int_size_in_bytes (type); 2168 2169 /* Largest mode we're prepared to accept. For BLKmode types we 2170 don't know if it'll contain pointer constants, so have to choose 2171 pointer size, otherwise we can choose DImode. */ 2172 machine_mode elt_mode = TYPE_MODE (type) == BLKmode ? Pmode : DImode; 2173 2174 elt_size |= GET_MODE_SIZE (elt_mode); 2175 elt_size &= -elt_size; /* Extract LSB set. */ 2176 2177 init_frag.size = elt_size; 2178 /* Avoid undefined shift behavior by using '2'. */ 2179 init_frag.mask = ((unsigned HOST_WIDE_INT)2 2180 << (elt_size * BITS_PER_UNIT - 1)) - 1; 2181 init_frag.val = 0; 2182 init_frag.offset = 0; 2183 init_frag.started = false; 2184 /* Size might not be a multiple of elt size, if there's an 2185 initialized trailing struct array with smaller type than 2186 elt_size. */ 2187 init_frag.remaining = (size + elt_size - 1) / elt_size; 2188 2189 fprintf (file, "%s .align %d .u%d ", 2190 section, align / BITS_PER_UNIT, 2191 elt_size * BITS_PER_UNIT); 2192 assemble_name (file, name); 2193 2194 if (size) 2195 /* We make everything an array, to simplify any initialization 2196 emission. */ 2197 fprintf (file, "[" HOST_WIDE_INT_PRINT_DEC "]", init_frag.remaining); 2198 else if (atype) 2199 fprintf (file, "[]"); 2200 } 2201 2202 /* Called when the initializer for a decl has been completely output through 2203 combinations of the three functions above. 
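
   As a worked example (assumed for illustration, not actual compiler
   output), a three-byte string "hi\0" pushed through
   nvptx_assemble_decl_begin, nvptx_output_ascii and
   nvptx_assemble_decl_end with a one-byte fragment size would come out
   roughly as

     .const .align 1 .u8 name[3] = { 104, 105, 0 };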
*/ 2204 2205 static void 2206 nvptx_assemble_decl_end (void) 2207 { 2208 if (init_frag.offset) 2209 /* This can happen with a packed struct with trailing array member. */ 2210 nvptx_assemble_value (0, init_frag.size - init_frag.offset); 2211 fprintf (asm_out_file, init_frag.started ? " };\n" : ";\n"); 2212 } 2213 2214 /* Output an uninitialized common or file-scope variable. */ 2215 2216 void 2217 nvptx_output_aligned_decl (FILE *file, const char *name, 2218 const_tree decl, HOST_WIDE_INT size, unsigned align) 2219 { 2220 write_var_marker (file, true, TREE_PUBLIC (decl), name); 2221 2222 /* If this is public, it is common. The nearest thing we have to 2223 common is weak. */ 2224 fprintf (file, "\t%s", TREE_PUBLIC (decl) ? ".weak " : ""); 2225 2226 nvptx_assemble_decl_begin (file, name, section_for_decl (decl), 2227 TREE_TYPE (decl), size, align); 2228 nvptx_assemble_decl_end (); 2229 } 2230 2231 /* Implement TARGET_ASM_DECLARE_CONSTANT_NAME. Begin the process of 2232 writing a constant variable EXP with NAME and SIZE and its 2233 initializer to FILE. */ 2234 2235 static void 2236 nvptx_asm_declare_constant_name (FILE *file, const char *name, 2237 const_tree exp, HOST_WIDE_INT obj_size) 2238 { 2239 write_var_marker (file, true, false, name); 2240 2241 fprintf (file, "\t"); 2242 2243 tree type = TREE_TYPE (exp); 2244 nvptx_assemble_decl_begin (file, name, ".const", type, obj_size, 2245 TYPE_ALIGN (type)); 2246 } 2247 2248 /* Implement the ASM_DECLARE_OBJECT_NAME macro. Used to start writing 2249 a variable DECL with NAME to FILE. */ 2250 2251 void 2252 nvptx_declare_object_name (FILE *file, const char *name, const_tree decl) 2253 { 2254 write_var_marker (file, true, TREE_PUBLIC (decl), name); 2255 2256 fprintf (file, "\t%s", (!TREE_PUBLIC (decl) ? "" 2257 : DECL_WEAK (decl) ? ".weak " : ".visible ")); 2258 2259 tree type = TREE_TYPE (decl); 2260 HOST_WIDE_INT obj_size = tree_to_shwi (DECL_SIZE_UNIT (decl)); 2261 nvptx_assemble_decl_begin (file, name, section_for_decl (decl), 2262 type, obj_size, DECL_ALIGN (decl)); 2263 } 2264 2265 /* Implement TARGET_ASM_GLOBALIZE_LABEL by doing nothing. */ 2266 2267 static void 2268 nvptx_globalize_label (FILE *, const char *) 2269 { 2270 } 2271 2272 /* Implement TARGET_ASM_ASSEMBLE_UNDEFINED_DECL. Write an extern 2273 declaration only for variable DECL with NAME to FILE. */ 2274 2275 static void 2276 nvptx_assemble_undefined_decl (FILE *file, const char *name, const_tree decl) 2277 { 2278 /* The middle end can place constant pool decls into the varpool as 2279 undefined. Until that is fixed, catch the problem here. */ 2280 if (DECL_IN_CONSTANT_POOL (decl)) 2281 return; 2282 2283 /* We support weak defintions, and hence have the right 2284 ASM_WEAKEN_DECL definition. Diagnose the problem here. */ 2285 if (DECL_WEAK (decl)) 2286 error_at (DECL_SOURCE_LOCATION (decl), 2287 "PTX does not support weak declarations" 2288 " (only weak definitions)"); 2289 write_var_marker (file, false, TREE_PUBLIC (decl), name); 2290 2291 fprintf (file, "\t.extern "); 2292 tree size = DECL_SIZE_UNIT (decl); 2293 nvptx_assemble_decl_begin (file, name, section_for_decl (decl), 2294 TREE_TYPE (decl), size ? tree_to_shwi (size) : 0, 2295 DECL_ALIGN (decl), true); 2296 nvptx_assemble_decl_end (); 2297 } 2298 2299 /* Output a pattern for a move instruction. */ 2300 2301 const char * 2302 nvptx_output_mov_insn (rtx dst, rtx src) 2303 { 2304 machine_mode dst_mode = GET_MODE (dst); 2305 machine_mode dst_inner = (GET_CODE (dst) == SUBREG 2306 ? 
GET_MODE (XEXP (dst, 0)) : dst_mode); 2307 machine_mode src_inner = (GET_CODE (src) == SUBREG 2308 ? GET_MODE (XEXP (src, 0)) : dst_mode); 2309 2310 rtx sym = src; 2311 if (GET_CODE (sym) == CONST) 2312 sym = XEXP (XEXP (sym, 0), 0); 2313 if (SYMBOL_REF_P (sym)) 2314 { 2315 if (SYMBOL_DATA_AREA (sym) != DATA_AREA_GENERIC) 2316 return "%.\tcvta%D1%t0\t%0, %1;"; 2317 nvptx_maybe_record_fnsym (sym); 2318 } 2319 2320 if (src_inner == dst_inner) 2321 return "%.\tmov%t0\t%0, %1;"; 2322 2323 if (CONSTANT_P (src)) 2324 return (GET_MODE_CLASS (dst_inner) == MODE_INT 2325 && GET_MODE_CLASS (src_inner) != MODE_FLOAT 2326 ? "%.\tmov%t0\t%0, %1;" : "%.\tmov.b%T0\t%0, %1;"); 2327 2328 if (GET_MODE_SIZE (dst_inner) == GET_MODE_SIZE (src_inner)) 2329 { 2330 if (GET_MODE_BITSIZE (dst_mode) == 128 2331 && GET_MODE_BITSIZE (GET_MODE (src)) == 128) 2332 { 2333 /* mov.b128 is not supported. */ 2334 if (dst_inner == V2DImode && src_inner == TImode) 2335 return "%.\tmov.u64\t%0.x, %L1;\n\t%.\tmov.u64\t%0.y, %H1;"; 2336 else if (dst_inner == TImode && src_inner == V2DImode) 2337 return "%.\tmov.u64\t%L0, %1.x;\n\t%.\tmov.u64\t%H0, %1.y;"; 2338 2339 gcc_unreachable (); 2340 } 2341 return "%.\tmov.b%T0\t%0, %1;"; 2342 } 2343 2344 return "%.\tcvt%t0%t1\t%0, %1;"; 2345 } 2346 2347 static void nvptx_print_operand (FILE *, rtx, int); 2348 2349 /* Output INSN, which is a call to CALLEE with result RESULT. For ptx, this 2350 involves writing .param declarations and in/out copies into them. For 2351 indirect calls, also write the .callprototype. */ 2352 2353 const char * 2354 nvptx_output_call_insn (rtx_insn *insn, rtx result, rtx callee) 2355 { 2356 char buf[16]; 2357 static int labelno; 2358 bool needs_tgt = register_operand (callee, Pmode); 2359 rtx pat = PATTERN (insn); 2360 if (GET_CODE (pat) == COND_EXEC) 2361 pat = COND_EXEC_CODE (pat); 2362 int arg_end = XVECLEN (pat, 0); 2363 tree decl = NULL_TREE; 2364 2365 fprintf (asm_out_file, "\t{\n"); 2366 if (result != NULL) 2367 fprintf (asm_out_file, "\t\t.param%s %s_in;\n", 2368 nvptx_ptx_type_from_mode (GET_MODE (result), false), 2369 reg_names[NVPTX_RETURN_REGNUM]); 2370 2371 /* Ensure we have a ptx declaration in the output if necessary. */ 2372 if (GET_CODE (callee) == SYMBOL_REF) 2373 { 2374 decl = SYMBOL_REF_DECL (callee); 2375 if (!decl 2376 || (DECL_EXTERNAL (decl) && !TYPE_ARG_TYPES (TREE_TYPE (decl)))) 2377 nvptx_record_libfunc (callee, result, pat); 2378 else if (DECL_EXTERNAL (decl)) 2379 nvptx_record_fndecl (decl); 2380 } 2381 2382 if (needs_tgt) 2383 { 2384 ASM_GENERATE_INTERNAL_LABEL (buf, "LCT", labelno); 2385 labelno++; 2386 ASM_OUTPUT_LABEL (asm_out_file, buf); 2387 std::stringstream s; 2388 write_fn_proto_from_insn (s, NULL, result, pat); 2389 fputs (s.str().c_str(), asm_out_file); 2390 } 2391 2392 for (int argno = 1; argno < arg_end; argno++) 2393 { 2394 rtx t = XEXP (XVECEXP (pat, 0, argno), 0); 2395 machine_mode mode = GET_MODE (t); 2396 const char *ptx_type = nvptx_ptx_type_from_mode (mode, false); 2397 2398 /* Mode splitting has already been done. */ 2399 fprintf (asm_out_file, "\t\t.param%s %%out_arg%d;\n" 2400 "\t\tst.param%s [%%out_arg%d], ", 2401 ptx_type, argno, ptx_type, argno); 2402 output_reg (asm_out_file, REGNO (t), VOIDmode); 2403 fprintf (asm_out_file, ";\n"); 2404 } 2405 2406 /* The '.' stands for the call's predicate, if any. 
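
   Putting the pieces together, a predicated direct call that takes one
   argument and returns a value is printed along these lines (a rough
   sketch only -- register and parameter names here are invented):

	{
		.param.u32 %value_in;
		.param.u32 %out_arg1;
		st.param.u32 [%out_arg1], %r23;
	@%r99	call (%value_in), foo, (%out_arg1);
		ld.param.u32 %r24, [%value_in];
	}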
*/
  nvptx_print_operand (asm_out_file, NULL_RTX, '.');
  fprintf (asm_out_file, "\t\tcall ");
  if (result != NULL_RTX)
    fprintf (asm_out_file, "(%s_in), ", reg_names[NVPTX_RETURN_REGNUM]);

  if (decl)
    {
      const char *name = get_fnname_from_decl (decl);
      name = nvptx_name_replacement (name);
      assemble_name (asm_out_file, name);
    }
  else
    output_address (VOIDmode, callee);

  const char *open = "(";
  for (int argno = 1; argno < arg_end; argno++)
    {
      fprintf (asm_out_file, ", %s%%out_arg%d", open, argno);
      open = "";
    }
  if (decl && DECL_STATIC_CHAIN (decl))
    {
      fprintf (asm_out_file, ", %s%s", open, reg_names[STATIC_CHAIN_REGNUM]);
      open = "";
    }
  if (!open[0])
    fprintf (asm_out_file, ")");

  if (needs_tgt)
    {
      fprintf (asm_out_file, ", ");
      assemble_name (asm_out_file, buf);
    }
  fprintf (asm_out_file, ";\n");

  if (find_reg_note (insn, REG_NORETURN, NULL))
    {
      /* Noreturn functions confuse the PTX JIT, as it doesn't realize
         the flow control barrier they imply.  It can seg fault if it
         encounters what looks like an unexitable loop.  Emit a trailing
         trap and exit, which it does grok.  */
      fprintf (asm_out_file, "\t\ttrap; // (noreturn)\n");
      fprintf (asm_out_file, "\t\texit; // (noreturn)\n");
    }

  if (result)
    {
      static char rval[sizeof ("\tld.param%%t0\t%%0, [%%%s_in];\n\t}") + 8];

      if (!rval[0])
        /* We must escape the '%' that starts RETURN_REGNUM.  */
        sprintf (rval, "\tld.param%%t0\t%%0, [%%%s_in];\n\t}",
                 reg_names[NVPTX_RETURN_REGNUM]);
      return rval;
    }

  return "}";
}

/* Implement TARGET_PRINT_OPERAND_PUNCT_VALID_P.  */

static bool
nvptx_print_operand_punct_valid_p (unsigned char c)
{
  return c == '.' || c == '#';
}

/* Subroutine of nvptx_print_operand; used to print a memory reference X
   to FILE.  */

static void
nvptx_print_address_operand (FILE *file, rtx x, machine_mode)
{
  rtx off;
  if (GET_CODE (x) == CONST)
    x = XEXP (x, 0);
  switch (GET_CODE (x))
    {
    case PLUS:
      off = XEXP (x, 1);
      output_address (VOIDmode, XEXP (x, 0));
      fprintf (file, "+");
      output_address (VOIDmode, off);
      break;

    case SYMBOL_REF:
    case LABEL_REF:
      output_addr_const (file, x);
      break;

    default:
      gcc_assert (GET_CODE (x) != MEM);
      nvptx_print_operand (file, x, 0);
      break;
    }
}

/* Write assembly language output for the address ADDR to FILE.  */

static void
nvptx_print_operand_address (FILE *file, machine_mode mode, rtx addr)
{
  nvptx_print_address_operand (file, addr, mode);
}

/* Print an operand, X, to FILE, with an optional modifier in CODE.

   Meaning of CODE:
   . -- print the predicate for the instruction or an empty string for an
	unconditional one.
   # -- print a rounding mode for the instruction

   A -- print a data area for a MEM
   c -- print an opcode suffix for a comparison operator, including a type code
   D -- print a data area for a MEM operand
   S -- print a shuffle kind specified by CONST_INT
   t -- print a type opcode suffix, promoting QImode to 32 bits
   T -- print a type size in bits
   u -- print a type opcode suffix without promotions.
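
   For instance (approximate, for illustration): on an SImode register
   operand, %t0 prints a suffix such as ".u32" and %T0 prints "32";
   for a signed SImode comparison such as (lt ...), %c prints ".lt.s32".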
*/ 2525 2526 static void 2527 nvptx_print_operand (FILE *file, rtx x, int code) 2528 { 2529 if (code == '.') 2530 { 2531 x = current_insn_predicate; 2532 if (x) 2533 { 2534 fputs ("@", file); 2535 if (GET_CODE (x) == EQ) 2536 fputs ("!", file); 2537 output_reg (file, REGNO (XEXP (x, 0)), VOIDmode); 2538 } 2539 return; 2540 } 2541 else if (code == '#') 2542 { 2543 fputs (".rn", file); 2544 return; 2545 } 2546 2547 enum rtx_code x_code = GET_CODE (x); 2548 machine_mode mode = GET_MODE (x); 2549 2550 switch (code) 2551 { 2552 case 'A': 2553 x = XEXP (x, 0); 2554 /* FALLTHROUGH. */ 2555 2556 case 'D': 2557 if (GET_CODE (x) == CONST) 2558 x = XEXP (x, 0); 2559 if (GET_CODE (x) == PLUS) 2560 x = XEXP (x, 0); 2561 2562 if (GET_CODE (x) == SYMBOL_REF) 2563 fputs (section_for_sym (x), file); 2564 break; 2565 2566 case 't': 2567 case 'u': 2568 if (x_code == SUBREG) 2569 { 2570 machine_mode inner_mode = GET_MODE (SUBREG_REG (x)); 2571 if (VECTOR_MODE_P (inner_mode) 2572 && (GET_MODE_SIZE (mode) 2573 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode)))) 2574 mode = GET_MODE_INNER (inner_mode); 2575 else if (split_mode_p (inner_mode)) 2576 mode = maybe_split_mode (inner_mode); 2577 else 2578 mode = inner_mode; 2579 } 2580 fprintf (file, "%s", nvptx_ptx_type_from_mode (mode, code == 't')); 2581 break; 2582 2583 case 'H': 2584 case 'L': 2585 { 2586 rtx inner_x = SUBREG_REG (x); 2587 machine_mode inner_mode = GET_MODE (inner_x); 2588 machine_mode split = maybe_split_mode (inner_mode); 2589 2590 output_reg (file, REGNO (inner_x), split, 2591 (code == 'H' 2592 ? GET_MODE_SIZE (inner_mode) / 2 2593 : 0)); 2594 } 2595 break; 2596 2597 case 'S': 2598 { 2599 nvptx_shuffle_kind kind = (nvptx_shuffle_kind) UINTVAL (x); 2600 /* Same order as nvptx_shuffle_kind. */ 2601 static const char *const kinds[] = 2602 {".up", ".down", ".bfly", ".idx"}; 2603 fputs (kinds[kind], file); 2604 } 2605 break; 2606 2607 case 'T': 2608 fprintf (file, "%d", GET_MODE_BITSIZE (mode)); 2609 break; 2610 2611 case 'j': 2612 fprintf (file, "@"); 2613 goto common; 2614 2615 case 'J': 2616 fprintf (file, "@!"); 2617 goto common; 2618 2619 case 'c': 2620 mode = GET_MODE (XEXP (x, 0)); 2621 switch (x_code) 2622 { 2623 case EQ: 2624 fputs (".eq", file); 2625 break; 2626 case NE: 2627 if (FLOAT_MODE_P (mode)) 2628 fputs (".neu", file); 2629 else 2630 fputs (".ne", file); 2631 break; 2632 case LE: 2633 case LEU: 2634 fputs (".le", file); 2635 break; 2636 case GE: 2637 case GEU: 2638 fputs (".ge", file); 2639 break; 2640 case LT: 2641 case LTU: 2642 fputs (".lt", file); 2643 break; 2644 case GT: 2645 case GTU: 2646 fputs (".gt", file); 2647 break; 2648 case LTGT: 2649 fputs (".ne", file); 2650 break; 2651 case UNEQ: 2652 fputs (".equ", file); 2653 break; 2654 case UNLE: 2655 fputs (".leu", file); 2656 break; 2657 case UNGE: 2658 fputs (".geu", file); 2659 break; 2660 case UNLT: 2661 fputs (".ltu", file); 2662 break; 2663 case UNGT: 2664 fputs (".gtu", file); 2665 break; 2666 case UNORDERED: 2667 fputs (".nan", file); 2668 break; 2669 case ORDERED: 2670 fputs (".num", file); 2671 break; 2672 default: 2673 gcc_unreachable (); 2674 } 2675 if (FLOAT_MODE_P (mode) 2676 || x_code == EQ || x_code == NE 2677 || x_code == GEU || x_code == GTU 2678 || x_code == LEU || x_code == LTU) 2679 fputs (nvptx_ptx_type_from_mode (mode, true), file); 2680 else 2681 fprintf (file, ".s%d", GET_MODE_BITSIZE (mode)); 2682 break; 2683 default: 2684 common: 2685 switch (x_code) 2686 { 2687 case SUBREG: 2688 { 2689 rtx inner_x = SUBREG_REG (x); 2690 machine_mode inner_mode = 
GET_MODE (inner_x); 2691 machine_mode split = maybe_split_mode (inner_mode); 2692 2693 if (VECTOR_MODE_P (inner_mode) 2694 && (GET_MODE_SIZE (mode) 2695 <= GET_MODE_SIZE (GET_MODE_INNER (inner_mode)))) 2696 { 2697 output_reg (file, REGNO (inner_x), VOIDmode); 2698 fprintf (file, ".%s", SUBREG_BYTE (x) == 0 ? "x" : "y"); 2699 } 2700 else if (split_mode_p (inner_mode) 2701 && (GET_MODE_SIZE (inner_mode) == GET_MODE_SIZE (mode))) 2702 output_reg (file, REGNO (inner_x), split); 2703 else 2704 output_reg (file, REGNO (inner_x), split, SUBREG_BYTE (x)); 2705 } 2706 break; 2707 2708 case REG: 2709 output_reg (file, REGNO (x), maybe_split_mode (mode)); 2710 break; 2711 2712 case MEM: 2713 fputc ('[', file); 2714 nvptx_print_address_operand (file, XEXP (x, 0), mode); 2715 fputc (']', file); 2716 break; 2717 2718 case CONST_INT: 2719 output_addr_const (file, x); 2720 break; 2721 2722 case CONST: 2723 case SYMBOL_REF: 2724 case LABEL_REF: 2725 /* We could use output_addr_const, but that can print things like 2726 "x-8", which breaks ptxas. Need to ensure it is output as 2727 "x+-8". */ 2728 nvptx_print_address_operand (file, x, VOIDmode); 2729 break; 2730 2731 case CONST_DOUBLE: 2732 long vals[2]; 2733 real_to_target (vals, CONST_DOUBLE_REAL_VALUE (x), mode); 2734 vals[0] &= 0xffffffff; 2735 vals[1] &= 0xffffffff; 2736 if (mode == SFmode) 2737 fprintf (file, "0f%08lx", vals[0]); 2738 else 2739 fprintf (file, "0d%08lx%08lx", vals[1], vals[0]); 2740 break; 2741 2742 case CONST_VECTOR: 2743 { 2744 unsigned n = CONST_VECTOR_NUNITS (x); 2745 fprintf (file, "{ "); 2746 for (unsigned i = 0; i < n; ++i) 2747 { 2748 if (i != 0) 2749 fprintf (file, ", "); 2750 2751 rtx elem = CONST_VECTOR_ELT (x, i); 2752 output_addr_const (file, elem); 2753 } 2754 fprintf (file, " }"); 2755 } 2756 break; 2757 2758 default: 2759 output_addr_const (file, x); 2760 } 2761 } 2762 } 2763 2764 /* Record replacement regs used to deal with subreg operands. */ 2765 struct reg_replace 2766 { 2767 rtx replacement[MAX_RECOG_OPERANDS]; 2768 machine_mode mode; 2769 int n_allocated; 2770 int n_in_use; 2771 }; 2772 2773 /* Allocate or reuse a replacement in R and return the rtx. */ 2774 2775 static rtx 2776 get_replacement (struct reg_replace *r) 2777 { 2778 if (r->n_allocated == r->n_in_use) 2779 r->replacement[r->n_allocated++] = gen_reg_rtx (r->mode); 2780 return r->replacement[r->n_in_use++]; 2781 } 2782 2783 /* Clean up subreg operands. In ptx assembly, everything is typed, and 2784 the presence of subregs would break the rules for most instructions. 2785 Replace them with a suitable new register of the right size, plus 2786 conversion copyin/copyout instructions. 
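
   As a rough sketch (illustrative register numbers), an input operand
   (subreg:QI (reg:SI 25) 0) is rewritten to use a fresh QImode register
   preceded by a copy-in of the form

     (set (reg:QI NEW) (truncate:QI (reg:SI 25)))

   while an output operand gets the analogous zero_extend copy-out
   emitted after the insn.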
*/ 2787 2788 static void 2789 nvptx_reorg_subreg (void) 2790 { 2791 struct reg_replace qiregs, hiregs, siregs, diregs; 2792 rtx_insn *insn, *next; 2793 2794 qiregs.n_allocated = 0; 2795 hiregs.n_allocated = 0; 2796 siregs.n_allocated = 0; 2797 diregs.n_allocated = 0; 2798 qiregs.mode = QImode; 2799 hiregs.mode = HImode; 2800 siregs.mode = SImode; 2801 diregs.mode = DImode; 2802 2803 for (insn = get_insns (); insn; insn = next) 2804 { 2805 next = NEXT_INSN (insn); 2806 if (!NONDEBUG_INSN_P (insn) 2807 || asm_noperands (PATTERN (insn)) >= 0 2808 || GET_CODE (PATTERN (insn)) == USE 2809 || GET_CODE (PATTERN (insn)) == CLOBBER) 2810 continue; 2811 2812 qiregs.n_in_use = 0; 2813 hiregs.n_in_use = 0; 2814 siregs.n_in_use = 0; 2815 diregs.n_in_use = 0; 2816 extract_insn (insn); 2817 enum attr_subregs_ok s_ok = get_attr_subregs_ok (insn); 2818 2819 for (int i = 0; i < recog_data.n_operands; i++) 2820 { 2821 rtx op = recog_data.operand[i]; 2822 if (GET_CODE (op) != SUBREG) 2823 continue; 2824 2825 rtx inner = SUBREG_REG (op); 2826 2827 machine_mode outer_mode = GET_MODE (op); 2828 machine_mode inner_mode = GET_MODE (inner); 2829 gcc_assert (s_ok); 2830 if (s_ok 2831 && (GET_MODE_PRECISION (inner_mode) 2832 >= GET_MODE_PRECISION (outer_mode))) 2833 continue; 2834 gcc_assert (SCALAR_INT_MODE_P (outer_mode)); 2835 struct reg_replace *r = (outer_mode == QImode ? &qiregs 2836 : outer_mode == HImode ? &hiregs 2837 : outer_mode == SImode ? &siregs 2838 : &diregs); 2839 rtx new_reg = get_replacement (r); 2840 2841 if (recog_data.operand_type[i] != OP_OUT) 2842 { 2843 enum rtx_code code; 2844 if (GET_MODE_PRECISION (inner_mode) 2845 < GET_MODE_PRECISION (outer_mode)) 2846 code = ZERO_EXTEND; 2847 else 2848 code = TRUNCATE; 2849 2850 rtx pat = gen_rtx_SET (new_reg, 2851 gen_rtx_fmt_e (code, outer_mode, inner)); 2852 emit_insn_before (pat, insn); 2853 } 2854 2855 if (recog_data.operand_type[i] != OP_IN) 2856 { 2857 enum rtx_code code; 2858 if (GET_MODE_PRECISION (inner_mode) 2859 < GET_MODE_PRECISION (outer_mode)) 2860 code = TRUNCATE; 2861 else 2862 code = ZERO_EXTEND; 2863 2864 rtx pat = gen_rtx_SET (inner, 2865 gen_rtx_fmt_e (code, inner_mode, new_reg)); 2866 emit_insn_after (pat, insn); 2867 } 2868 validate_change (insn, recog_data.operand_loc[i], new_reg, false); 2869 } 2870 } 2871 } 2872 2873 /* Return a SImode "master lane index" register for uniform-simt, allocating on 2874 first use. */ 2875 2876 static rtx 2877 nvptx_get_unisimt_master () 2878 { 2879 rtx &master = cfun->machine->unisimt_master; 2880 return master ? master : master = gen_reg_rtx (SImode); 2881 } 2882 2883 /* Return a BImode "predicate" register for uniform-simt, similar to above. */ 2884 2885 static rtx 2886 nvptx_get_unisimt_predicate () 2887 { 2888 rtx &pred = cfun->machine->unisimt_predicate; 2889 return pred ? pred : pred = gen_reg_rtx (BImode); 2890 } 2891 2892 /* Return true if given call insn references one of the functions provided by 2893 the CUDA runtime: malloc, free, vprintf. 
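
   (Such calls matter to the uniform-simt transformation below: they are
   made conditionally executed so that effectively a single lane performs
   them, and any register they set is then propagated to the other lanes
   with a warp shuffle.)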
*/ 2894 2895 static bool 2896 nvptx_call_insn_is_syscall_p (rtx_insn *insn) 2897 { 2898 rtx pat = PATTERN (insn); 2899 gcc_checking_assert (GET_CODE (pat) == PARALLEL); 2900 pat = XVECEXP (pat, 0, 0); 2901 if (GET_CODE (pat) == SET) 2902 pat = SET_SRC (pat); 2903 gcc_checking_assert (GET_CODE (pat) == CALL 2904 && GET_CODE (XEXP (pat, 0)) == MEM); 2905 rtx addr = XEXP (XEXP (pat, 0), 0); 2906 if (GET_CODE (addr) != SYMBOL_REF) 2907 return false; 2908 const char *name = XSTR (addr, 0); 2909 /* Ordinary malloc/free are redirected to __nvptx_{malloc,free), so only the 2910 references with forced assembler name refer to PTX syscalls. For vprintf, 2911 accept both normal and forced-assembler-name references. */ 2912 return (!strcmp (name, "vprintf") || !strcmp (name, "*vprintf") 2913 || !strcmp (name, "*malloc") 2914 || !strcmp (name, "*free")); 2915 } 2916 2917 /* If SET subexpression of INSN sets a register, emit a shuffle instruction to 2918 propagate its value from lane MASTER to current lane. */ 2919 2920 static void 2921 nvptx_unisimt_handle_set (rtx set, rtx_insn *insn, rtx master) 2922 { 2923 rtx reg; 2924 if (GET_CODE (set) == SET && REG_P (reg = SET_DEST (set))) 2925 emit_insn_after (nvptx_gen_shuffle (reg, reg, master, SHUFFLE_IDX), insn); 2926 } 2927 2928 /* Adjust code for uniform-simt code generation variant by making atomics and 2929 "syscalls" conditionally executed, and inserting shuffle-based propagation 2930 for registers being set. */ 2931 2932 static void 2933 nvptx_reorg_uniform_simt () 2934 { 2935 rtx_insn *insn, *next; 2936 2937 for (insn = get_insns (); insn; insn = next) 2938 { 2939 next = NEXT_INSN (insn); 2940 if (!(CALL_P (insn) && nvptx_call_insn_is_syscall_p (insn)) 2941 && !(NONJUMP_INSN_P (insn) 2942 && GET_CODE (PATTERN (insn)) == PARALLEL 2943 && get_attr_atomic (insn))) 2944 continue; 2945 rtx pat = PATTERN (insn); 2946 rtx master = nvptx_get_unisimt_master (); 2947 for (int i = 0; i < XVECLEN (pat, 0); i++) 2948 nvptx_unisimt_handle_set (XVECEXP (pat, 0, i), insn, master); 2949 rtx pred = nvptx_get_unisimt_predicate (); 2950 pred = gen_rtx_NE (BImode, pred, const0_rtx); 2951 pat = gen_rtx_COND_EXEC (VOIDmode, pred, pat); 2952 validate_change (insn, &PATTERN (insn), pat, false); 2953 } 2954 } 2955 2956 /* Offloading function attributes. */ 2957 2958 struct offload_attrs 2959 { 2960 unsigned mask; 2961 int num_gangs; 2962 int num_workers; 2963 int vector_length; 2964 }; 2965 2966 /* Define entries for cfun->machine->axis_dim. */ 2967 2968 #define MACH_VECTOR_LENGTH 0 2969 #define MACH_MAX_WORKERS 1 2970 2971 static void populate_offload_attrs (offload_attrs *oa); 2972 2973 static void 2974 init_axis_dim (void) 2975 { 2976 offload_attrs oa; 2977 int max_workers; 2978 2979 populate_offload_attrs (&oa); 2980 2981 if (oa.num_workers == 0) 2982 max_workers = PTX_CTA_SIZE / oa.vector_length; 2983 else 2984 max_workers = oa.num_workers; 2985 2986 cfun->machine->axis_dim[MACH_VECTOR_LENGTH] = oa.vector_length; 2987 cfun->machine->axis_dim[MACH_MAX_WORKERS] = max_workers; 2988 cfun->machine->axis_dim_init_p = true; 2989 } 2990 2991 static int ATTRIBUTE_UNUSED 2992 nvptx_mach_max_workers () 2993 { 2994 if (!cfun->machine->axis_dim_init_p) 2995 init_axis_dim (); 2996 return cfun->machine->axis_dim[MACH_MAX_WORKERS]; 2997 } 2998 2999 static int ATTRIBUTE_UNUSED 3000 nvptx_mach_vector_length () 3001 { 3002 if (!cfun->machine->axis_dim_init_p) 3003 init_axis_dim (); 3004 return cfun->machine->axis_dim[MACH_VECTOR_LENGTH]; 3005 } 3006 3007 /* Loop structure of the function. 
The entire function is described as 3008 a NULL loop. */ 3009 3010 struct parallel 3011 { 3012 /* Parent parallel. */ 3013 parallel *parent; 3014 3015 /* Next sibling parallel. */ 3016 parallel *next; 3017 3018 /* First child parallel. */ 3019 parallel *inner; 3020 3021 /* Partitioning mask of the parallel. */ 3022 unsigned mask; 3023 3024 /* Partitioning used within inner parallels. */ 3025 unsigned inner_mask; 3026 3027 /* Location of parallel forked and join. The forked is the first 3028 block in the parallel and the join is the first block after of 3029 the partition. */ 3030 basic_block forked_block; 3031 basic_block join_block; 3032 3033 rtx_insn *forked_insn; 3034 rtx_insn *join_insn; 3035 3036 rtx_insn *fork_insn; 3037 rtx_insn *joining_insn; 3038 3039 /* Basic blocks in this parallel, but not in child parallels. The 3040 FORKED and JOINING blocks are in the partition. The FORK and JOIN 3041 blocks are not. */ 3042 auto_vec<basic_block> blocks; 3043 3044 public: 3045 parallel (parallel *parent, unsigned mode); 3046 ~parallel (); 3047 }; 3048 3049 /* Constructor links the new parallel into it's parent's chain of 3050 children. */ 3051 3052 parallel::parallel (parallel *parent_, unsigned mask_) 3053 :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0) 3054 { 3055 forked_block = join_block = 0; 3056 forked_insn = join_insn = 0; 3057 fork_insn = joining_insn = 0; 3058 3059 if (parent) 3060 { 3061 next = parent->inner; 3062 parent->inner = this; 3063 } 3064 } 3065 3066 parallel::~parallel () 3067 { 3068 delete inner; 3069 delete next; 3070 } 3071 3072 /* Map of basic blocks to insns */ 3073 typedef hash_map<basic_block, rtx_insn *> bb_insn_map_t; 3074 3075 /* A tuple of an insn of interest and the BB in which it resides. */ 3076 typedef std::pair<rtx_insn *, basic_block> insn_bb_t; 3077 typedef auto_vec<insn_bb_t> insn_bb_vec_t; 3078 3079 /* Split basic blocks such that each forked and join unspecs are at 3080 the start of their basic blocks. Thus afterwards each block will 3081 have a single partitioning mode. We also do the same for return 3082 insns, as they are executed by every thread. Return the 3083 partitioning mode of the function as a whole. Populate MAP with 3084 head and tail blocks. We also clear the BB visited flag, which is 3085 used when finding partitions. */ 3086 3087 static void 3088 nvptx_split_blocks (bb_insn_map_t *map) 3089 { 3090 insn_bb_vec_t worklist; 3091 basic_block block; 3092 rtx_insn *insn; 3093 3094 /* Locate all the reorg instructions of interest. */ 3095 FOR_ALL_BB_FN (block, cfun) 3096 { 3097 bool seen_insn = false; 3098 3099 /* Clear visited flag, for use by parallel locator */ 3100 block->flags &= ~BB_VISITED; 3101 3102 FOR_BB_INSNS (block, insn) 3103 { 3104 if (!INSN_P (insn)) 3105 continue; 3106 switch (recog_memoized (insn)) 3107 { 3108 default: 3109 seen_insn = true; 3110 continue; 3111 case CODE_FOR_nvptx_forked: 3112 case CODE_FOR_nvptx_join: 3113 break; 3114 3115 case CODE_FOR_return: 3116 /* We also need to split just before return insns, as 3117 that insn needs executing by all threads, but the 3118 block it is in probably does not. */ 3119 break; 3120 } 3121 3122 if (seen_insn) 3123 /* We've found an instruction that must be at the start of 3124 a block, but isn't. Add it to the worklist. */ 3125 worklist.safe_push (insn_bb_t (insn, block)); 3126 else 3127 /* It was already the first instruction. Just add it to 3128 the map. 
*/ 3129 map->get_or_insert (block) = insn; 3130 seen_insn = true; 3131 } 3132 } 3133 3134 /* Split blocks on the worklist. */ 3135 unsigned ix; 3136 insn_bb_t *elt; 3137 basic_block remap = 0; 3138 for (ix = 0; worklist.iterate (ix, &elt); ix++) 3139 { 3140 if (remap != elt->second) 3141 { 3142 block = elt->second; 3143 remap = block; 3144 } 3145 3146 /* Split block before insn. The insn is in the new block */ 3147 edge e = split_block (block, PREV_INSN (elt->first)); 3148 3149 block = e->dest; 3150 map->get_or_insert (block) = elt->first; 3151 } 3152 } 3153 3154 /* Return true if MASK contains parallelism that requires shared 3155 memory to broadcast. */ 3156 3157 static bool 3158 nvptx_needs_shared_bcast (unsigned mask) 3159 { 3160 bool worker = mask & GOMP_DIM_MASK (GOMP_DIM_WORKER); 3161 bool large_vector = (mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) 3162 && nvptx_mach_vector_length () != PTX_WARP_SIZE; 3163 3164 return worker || large_vector; 3165 } 3166 3167 /* BLOCK is a basic block containing a head or tail instruction. 3168 Locate the associated prehead or pretail instruction, which must be 3169 in the single predecessor block. */ 3170 3171 static rtx_insn * 3172 nvptx_discover_pre (basic_block block, int expected) 3173 { 3174 gcc_assert (block->preds->length () == 1); 3175 basic_block pre_block = (*block->preds)[0]->src; 3176 rtx_insn *pre_insn; 3177 3178 for (pre_insn = BB_END (pre_block); !INSN_P (pre_insn); 3179 pre_insn = PREV_INSN (pre_insn)) 3180 gcc_assert (pre_insn != BB_HEAD (pre_block)); 3181 3182 gcc_assert (recog_memoized (pre_insn) == expected); 3183 return pre_insn; 3184 } 3185 3186 /* Dump this parallel and all its inner parallels. */ 3187 3188 static void 3189 nvptx_dump_pars (parallel *par, unsigned depth) 3190 { 3191 fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n", 3192 depth, par->mask, 3193 par->forked_block ? par->forked_block->index : -1, 3194 par->join_block ? par->join_block->index : -1); 3195 3196 fprintf (dump_file, " blocks:"); 3197 3198 basic_block block; 3199 for (unsigned ix = 0; par->blocks.iterate (ix, &block); ix++) 3200 fprintf (dump_file, " %d", block->index); 3201 fprintf (dump_file, "\n"); 3202 if (par->inner) 3203 nvptx_dump_pars (par->inner, depth + 1); 3204 3205 if (par->next) 3206 nvptx_dump_pars (par->next, depth); 3207 } 3208 3209 /* If BLOCK contains a fork/join marker, process it to create or 3210 terminate a loop structure. Add this block to the current loop, 3211 and then walk successor blocks. */ 3212 3213 static parallel * 3214 nvptx_find_par (bb_insn_map_t *map, parallel *par, basic_block block) 3215 { 3216 if (block->flags & BB_VISITED) 3217 return par; 3218 block->flags |= BB_VISITED; 3219 3220 if (rtx_insn **endp = map->get (block)) 3221 { 3222 rtx_insn *end = *endp; 3223 3224 /* This is a block head or tail, or return instruction. */ 3225 switch (recog_memoized (end)) 3226 { 3227 case CODE_FOR_return: 3228 /* Return instructions are in their own block, and we 3229 don't need to do anything more. */ 3230 return par; 3231 3232 case CODE_FOR_nvptx_forked: 3233 /* Loop head, create a new inner loop and add it into 3234 our parent's child list. */ 3235 { 3236 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0)); 3237 3238 gcc_assert (mask); 3239 par = new parallel (par, mask); 3240 par->forked_block = block; 3241 par->forked_insn = end; 3242 if (nvptx_needs_shared_bcast (mask)) 3243 par->fork_insn 3244 = nvptx_discover_pre (block, CODE_FOR_nvptx_fork); 3245 } 3246 break; 3247 3248 case CODE_FOR_nvptx_join: 3249 /* A loop tail. 
Finish the current loop and return to 3250 parent. */ 3251 { 3252 unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0)); 3253 3254 gcc_assert (par->mask == mask); 3255 gcc_assert (par->join_block == NULL); 3256 par->join_block = block; 3257 par->join_insn = end; 3258 if (nvptx_needs_shared_bcast (mask)) 3259 par->joining_insn 3260 = nvptx_discover_pre (block, CODE_FOR_nvptx_joining); 3261 par = par->parent; 3262 } 3263 break; 3264 3265 default: 3266 gcc_unreachable (); 3267 } 3268 } 3269 3270 if (par) 3271 /* Add this block onto the current loop's list of blocks. */ 3272 par->blocks.safe_push (block); 3273 else 3274 /* This must be the entry block. Create a NULL parallel. */ 3275 par = new parallel (0, 0); 3276 3277 /* Walk successor blocks. */ 3278 edge e; 3279 edge_iterator ei; 3280 3281 FOR_EACH_EDGE (e, ei, block->succs) 3282 nvptx_find_par (map, par, e->dest); 3283 3284 return par; 3285 } 3286 3287 /* DFS walk the CFG looking for fork & join markers. Construct 3288 loop structures as we go. MAP is a mapping of basic blocks 3289 to head & tail markers, discovered when splitting blocks. This 3290 speeds up the discovery. We rely on the BB visited flag having 3291 been cleared when splitting blocks. */ 3292 3293 static parallel * 3294 nvptx_discover_pars (bb_insn_map_t *map) 3295 { 3296 basic_block block; 3297 3298 /* Mark exit blocks as visited. */ 3299 block = EXIT_BLOCK_PTR_FOR_FN (cfun); 3300 block->flags |= BB_VISITED; 3301 3302 /* And entry block as not. */ 3303 block = ENTRY_BLOCK_PTR_FOR_FN (cfun); 3304 block->flags &= ~BB_VISITED; 3305 3306 parallel *par = nvptx_find_par (map, 0, block); 3307 3308 if (dump_file) 3309 { 3310 fprintf (dump_file, "\nLoops\n"); 3311 nvptx_dump_pars (par, 0); 3312 fprintf (dump_file, "\n"); 3313 } 3314 3315 return par; 3316 } 3317 3318 /* Analyse a group of BBs within a partitioned region and create N 3319 Single-Entry-Single-Exit regions. Some of those regions will be 3320 trivial ones consisting of a single BB. The blocks of a 3321 partitioned region might form a set of disjoint graphs -- because 3322 the region encloses a differently partitoned sub region. 3323 3324 We use the linear time algorithm described in 'Finding Regions Fast: 3325 Single Entry Single Exit and control Regions in Linear Time' 3326 Johnson, Pearson & Pingali. That algorithm deals with complete 3327 CFGs, where a back edge is inserted from END to START, and thus the 3328 problem becomes one of finding equivalent loops. 3329 3330 In this case we have a partial CFG. We complete it by redirecting 3331 any incoming edge to the graph to be from an arbitrary external BB, 3332 and similarly redirecting any outgoing edge to be to that BB. 3333 Thus we end up with a closed graph. 3334 3335 The algorithm works by building a spanning tree of an undirected 3336 graph and keeping track of back edges from nodes further from the 3337 root in the tree to nodes nearer to the root in the tree. In the 3338 description below, the root is up and the tree grows downwards. 3339 3340 We avoid having to deal with degenerate back-edges to the same 3341 block, by splitting each BB into 3 -- one for input edges, one for 3342 the node itself and one for the output edges. Such back edges are 3343 referred to as 'Brackets'. Cycle equivalent nodes will have the 3344 same set of brackets. 3345 3346 Determining bracket equivalency is done by maintaining a list of 3347 brackets in such a manner that the list length and final bracket 3348 uniquely identify the set. 

   We use coloring to mark all BBs with cycle equivalency with the
   same color.  This is the output of the 'Finding Regions Fast'
   algorithm.  Notice it doesn't actually find the set of nodes within
   a particular region, just unordered sets of nodes that are the
   entries and exits of SESE regions.

   After determining cycle equivalency, we need to find the minimal
   set of SESE regions.  Do this with a DFS coloring walk of the
   complete graph.  We're either 'looking' or 'coloring'.  When
   looking, and we're in the subgraph, we start coloring the color of
   the current node, and remember that node as the start of the
   current color's SESE region.  Every time we go to a new node, we
   decrement the count of nodes with that color.  If it reaches zero,
   we remember that node as the end of the current color's SESE region
   and return to 'looking'.  Otherwise we color the node the current
   color.

   This way we end up with coloring the inside of non-trivial SESE
   regions with the color of that region.  */

/* A pair of BBs.  We use this to represent SESE regions.  */
typedef std::pair<basic_block, basic_block> bb_pair_t;
typedef auto_vec<bb_pair_t> bb_pair_vec_t;

/* A node in the undirected CFG.  The discriminator SECOND indicates just
   above or just below the BB indicated by FIRST.  */
typedef std::pair<basic_block, int> pseudo_node_t;

/* A bracket indicates an edge towards the root of the spanning tree of the
   undirected graph.  Each bracket has a color, determined
   from the current set of brackets.  */
struct bracket
{
  pseudo_node_t back;  /* Back target.  */

  /* Current color and size of set.  */
  unsigned color;
  unsigned size;

  bracket (pseudo_node_t back_)
  : back (back_), color (~0u), size (~0u)
  {
  }

  unsigned get_color (auto_vec<unsigned> &color_counts, unsigned length)
  {
    if (length != size)
      {
        size = length;
        color = color_counts.length ();
        color_counts.quick_push (0);
      }
    color_counts[color]++;
    return color;
  }
};

typedef auto_vec<bracket> bracket_vec_t;

/* Basic block info for finding SESE regions.  */

struct bb_sese
{
  int node;  /* Node number in spanning tree.  */
  int parent;  /* Parent node number.  */

  /* The algorithm splits each node A into Ai, A', Ao.  The incoming
     edges arrive at pseudo-node Ai and the outgoing edges leave at
     pseudo-node Ao.  We have to remember which way we arrived at a
     particular node when generating the spanning tree.  dir > 0 means
     we arrived at Ai, dir < 0 means we arrived at Ao.  */
  int dir;

  /* Lowest numbered pseudo-node reached via a backedge from this
     node, or any descendant.  */
  pseudo_node_t high;

  int color;  /* Cycle-equivalence color.  */

  /* Stack of brackets for this node.  */
  bracket_vec_t brackets;

  bb_sese (unsigned node_, unsigned p, int dir_)
  : node (node_), parent (p), dir (dir_)
  {
  }
  ~bb_sese ();

  /* Push a bracket ending at BACK.  */
  void push (const pseudo_node_t &back)
  {
    if (dump_file)
      fprintf (dump_file, "Pushing backedge %d:%+d\n",
               back.first ? back.first->index : 0, back.second);
    brackets.safe_push (bracket (back));
  }

  void append (bb_sese *child);
  void remove (const pseudo_node_t &);

  /* Set node's color.
*/ 3451 void set_color (auto_vec<unsigned> &color_counts) 3452 { 3453 color = brackets.last ().get_color (color_counts, brackets.length ()); 3454 } 3455 }; 3456 3457 bb_sese::~bb_sese () 3458 { 3459 } 3460 3461 /* Destructively append CHILD's brackets. */ 3462 3463 void 3464 bb_sese::append (bb_sese *child) 3465 { 3466 if (int len = child->brackets.length ()) 3467 { 3468 int ix; 3469 3470 if (dump_file) 3471 { 3472 for (ix = 0; ix < len; ix++) 3473 { 3474 const pseudo_node_t &pseudo = child->brackets[ix].back; 3475 fprintf (dump_file, "Appending (%d)'s backedge %d:%+d\n", 3476 child->node, pseudo.first ? pseudo.first->index : 0, 3477 pseudo.second); 3478 } 3479 } 3480 if (!brackets.length ()) 3481 std::swap (brackets, child->brackets); 3482 else 3483 { 3484 brackets.reserve (len); 3485 for (ix = 0; ix < len; ix++) 3486 brackets.quick_push (child->brackets[ix]); 3487 } 3488 } 3489 } 3490 3491 /* Remove brackets that terminate at PSEUDO. */ 3492 3493 void 3494 bb_sese::remove (const pseudo_node_t &pseudo) 3495 { 3496 unsigned removed = 0; 3497 int len = brackets.length (); 3498 3499 for (int ix = 0; ix < len; ix++) 3500 { 3501 if (brackets[ix].back == pseudo) 3502 { 3503 if (dump_file) 3504 fprintf (dump_file, "Removing backedge %d:%+d\n", 3505 pseudo.first ? pseudo.first->index : 0, pseudo.second); 3506 removed++; 3507 } 3508 else if (removed) 3509 brackets[ix-removed] = brackets[ix]; 3510 } 3511 while (removed--) 3512 brackets.pop (); 3513 } 3514 3515 /* Accessors for BB's aux pointer. */ 3516 #define BB_SET_SESE(B, S) ((B)->aux = (S)) 3517 #define BB_GET_SESE(B) ((bb_sese *)(B)->aux) 3518 3519 /* DFS walk creating SESE data structures. Only cover nodes with 3520 BB_VISITED set. Append discovered blocks to LIST. We number in 3521 increments of 3 so that the above and below pseudo nodes can be 3522 implicitly numbered too. */ 3523 3524 static int 3525 nvptx_sese_number (int n, int p, int dir, basic_block b, 3526 auto_vec<basic_block> *list) 3527 { 3528 if (BB_GET_SESE (b)) 3529 return n; 3530 3531 if (dump_file) 3532 fprintf (dump_file, "Block %d(%d), parent (%d), orientation %+d\n", 3533 b->index, n, p, dir); 3534 3535 BB_SET_SESE (b, new bb_sese (n, p, dir)); 3536 p = n; 3537 3538 n += 3; 3539 list->quick_push (b); 3540 3541 /* First walk the nodes on the 'other side' of this node, then walk 3542 the nodes on the same side. */ 3543 for (unsigned ix = 2; ix; ix--) 3544 { 3545 vec<edge, va_gc> *edges = dir > 0 ? b->succs : b->preds; 3546 size_t offset = (dir > 0 ? offsetof (edge_def, dest) 3547 : offsetof (edge_def, src)); 3548 edge e; 3549 edge_iterator ei; 3550 3551 FOR_EACH_EDGE (e, ei, edges) 3552 { 3553 basic_block target = *(basic_block *)((char *)e + offset); 3554 3555 if (target->flags & BB_VISITED) 3556 n = nvptx_sese_number (n, p, dir, target, list); 3557 } 3558 dir = -dir; 3559 } 3560 return n; 3561 } 3562 3563 /* Process pseudo node above (DIR < 0) or below (DIR > 0) ME. 3564 EDGES are the outgoing edges and OFFSET is the offset to the src 3565 or dst block on the edges. 
*/ 3566 3567 static void 3568 nvptx_sese_pseudo (basic_block me, bb_sese *sese, int depth, int dir, 3569 vec<edge, va_gc> *edges, size_t offset) 3570 { 3571 edge e; 3572 edge_iterator ei; 3573 int hi_back = depth; 3574 pseudo_node_t node_back (NULL, depth); 3575 int hi_child = depth; 3576 pseudo_node_t node_child (NULL, depth); 3577 basic_block child = NULL; 3578 unsigned num_children = 0; 3579 int usd = -dir * sese->dir; 3580 3581 if (dump_file) 3582 fprintf (dump_file, "\nProcessing %d(%d) %+d\n", 3583 me->index, sese->node, dir); 3584 3585 if (dir < 0) 3586 { 3587 /* This is the above pseudo-child. It has the BB itself as an 3588 additional child node. */ 3589 node_child = sese->high; 3590 hi_child = node_child.second; 3591 if (node_child.first) 3592 hi_child += BB_GET_SESE (node_child.first)->node; 3593 num_children++; 3594 } 3595 3596 /* Examine each edge. 3597 - if it is a child (a) append its bracket list and (b) record 3598 whether it is the child with the highest reaching bracket. 3599 - if it is an edge to ancestor, record whether it's the highest 3600 reaching backlink. */ 3601 FOR_EACH_EDGE (e, ei, edges) 3602 { 3603 basic_block target = *(basic_block *)((char *)e + offset); 3604 3605 if (bb_sese *t_sese = BB_GET_SESE (target)) 3606 { 3607 if (t_sese->parent == sese->node && !(t_sese->dir + usd)) 3608 { 3609 /* Child node. Append its bracket list. */ 3610 num_children++; 3611 sese->append (t_sese); 3612 3613 /* Compare it's hi value. */ 3614 int t_hi = t_sese->high.second; 3615 3616 if (basic_block child_hi_block = t_sese->high.first) 3617 t_hi += BB_GET_SESE (child_hi_block)->node; 3618 3619 if (hi_child > t_hi) 3620 { 3621 hi_child = t_hi; 3622 node_child = t_sese->high; 3623 child = target; 3624 } 3625 } 3626 else if (t_sese->node < sese->node + dir 3627 && !(dir < 0 && sese->parent == t_sese->node)) 3628 { 3629 /* Non-parental ancestor node -- a backlink. */ 3630 int d = usd * t_sese->dir; 3631 int back = t_sese->node + d; 3632 3633 if (hi_back > back) 3634 { 3635 hi_back = back; 3636 node_back = pseudo_node_t (target, d); 3637 } 3638 } 3639 } 3640 else 3641 { /* Fallen off graph, backlink to entry node. */ 3642 hi_back = 0; 3643 node_back = pseudo_node_t (NULL, 0); 3644 } 3645 } 3646 3647 /* Remove any brackets that terminate at this pseudo node. */ 3648 sese->remove (pseudo_node_t (me, dir)); 3649 3650 /* Now push any backlinks from this pseudo node. */ 3651 FOR_EACH_EDGE (e, ei, edges) 3652 { 3653 basic_block target = *(basic_block *)((char *)e + offset); 3654 if (bb_sese *t_sese = BB_GET_SESE (target)) 3655 { 3656 if (t_sese->node < sese->node + dir 3657 && !(dir < 0 && sese->parent == t_sese->node)) 3658 /* Non-parental ancestor node - backedge from me. */ 3659 sese->push (pseudo_node_t (target, usd * t_sese->dir)); 3660 } 3661 else 3662 { 3663 /* back edge to entry node */ 3664 sese->push (pseudo_node_t (NULL, 0)); 3665 } 3666 } 3667 3668 /* If this node leads directly or indirectly to a no-return region of 3669 the graph, then fake a backedge to entry node. */ 3670 if (!sese->brackets.length () || !edges || !edges->length ()) 3671 { 3672 hi_back = 0; 3673 node_back = pseudo_node_t (NULL, 0); 3674 sese->push (node_back); 3675 } 3676 3677 /* Record the highest reaching backedge from us or a descendant. */ 3678 sese->high = hi_back < hi_child ? node_back : node_child; 3679 3680 if (num_children > 1) 3681 { 3682 /* There is more than one child -- this is a Y shaped piece of 3683 spanning tree. 
We have to insert a fake backedge from this 3684 node to the highest ancestor reached by not-the-highest 3685 reaching child. Note that there may be multiple children 3686 with backedges to the same highest node. That's ok and we 3687 insert the edge to that highest node. */ 3688 hi_child = depth; 3689 if (dir < 0 && child) 3690 { 3691 node_child = sese->high; 3692 hi_child = node_child.second; 3693 if (node_child.first) 3694 hi_child += BB_GET_SESE (node_child.first)->node; 3695 } 3696 3697 FOR_EACH_EDGE (e, ei, edges) 3698 { 3699 basic_block target = *(basic_block *)((char *)e + offset); 3700 3701 if (target == child) 3702 /* Ignore the highest child. */ 3703 continue; 3704 3705 bb_sese *t_sese = BB_GET_SESE (target); 3706 if (!t_sese) 3707 continue; 3708 if (t_sese->parent != sese->node) 3709 /* Not a child. */ 3710 continue; 3711 3712 /* Compare its hi value. */ 3713 int t_hi = t_sese->high.second; 3714 3715 if (basic_block child_hi_block = t_sese->high.first) 3716 t_hi += BB_GET_SESE (child_hi_block)->node; 3717 3718 if (hi_child > t_hi) 3719 { 3720 hi_child = t_hi; 3721 node_child = t_sese->high; 3722 } 3723 } 3724 3725 sese->push (node_child); 3726 } 3727 } 3728 3729 3730 /* DFS walk of BB graph. Color node BLOCK according to COLORING then 3731 proceed to successors. Set SESE entry and exit nodes of 3732 REGIONS. */ 3733 3734 static void 3735 nvptx_sese_color (auto_vec<unsigned> &color_counts, bb_pair_vec_t ®ions, 3736 basic_block block, int coloring) 3737 { 3738 bb_sese *sese = BB_GET_SESE (block); 3739 3740 if (block->flags & BB_VISITED) 3741 { 3742 /* If we've already encountered this block, either we must not 3743 be coloring, or it must have been colored the current color. */ 3744 gcc_assert (coloring < 0 || (sese && coloring == sese->color)); 3745 return; 3746 } 3747 3748 block->flags |= BB_VISITED; 3749 3750 if (sese) 3751 { 3752 if (coloring < 0) 3753 { 3754 /* Start coloring a region. */ 3755 regions[sese->color].first = block; 3756 coloring = sese->color; 3757 } 3758 3759 if (!--color_counts[sese->color] && sese->color == coloring) 3760 { 3761 /* Found final block of SESE region. */ 3762 regions[sese->color].second = block; 3763 coloring = -1; 3764 } 3765 else 3766 /* Color the node, so we can assert on revisiting the node 3767 that the graph is indeed SESE. */ 3768 sese->color = coloring; 3769 } 3770 else 3771 /* Fallen off the subgraph, we cannot be coloring. */ 3772 gcc_assert (coloring < 0); 3773 3774 /* Walk each successor block. */ 3775 if (block->succs && block->succs->length ()) 3776 { 3777 edge e; 3778 edge_iterator ei; 3779 3780 FOR_EACH_EDGE (e, ei, block->succs) 3781 nvptx_sese_color (color_counts, regions, e->dest, coloring); 3782 } 3783 else 3784 gcc_assert (coloring < 0); 3785 } 3786 3787 /* Find minimal set of SESE regions covering BLOCKS. REGIONS might 3788 end up with NULL entries in it. */ 3789 3790 static void 3791 nvptx_find_sese (auto_vec<basic_block> &blocks, bb_pair_vec_t ®ions) 3792 { 3793 basic_block block; 3794 int ix; 3795 3796 /* First clear each BB of the whole function. */ 3797 FOR_ALL_BB_FN (block, cfun) 3798 { 3799 block->flags &= ~BB_VISITED; 3800 BB_SET_SESE (block, 0); 3801 } 3802 3803 /* Mark blocks in the function that are in this graph. */ 3804 for (ix = 0; blocks.iterate (ix, &block); ix++) 3805 block->flags |= BB_VISITED; 3806 3807 /* Counts of nodes assigned to each color. There cannot be more 3808 colors than blocks (and hopefully there will be fewer). 
*/ 3809 auto_vec<unsigned> color_counts; 3810 color_counts.reserve (blocks.length ()); 3811 3812 /* Worklist of nodes in the spanning tree. Again, there cannot be 3813 more nodes in the tree than blocks (there will be fewer if the 3814 CFG of blocks is disjoint). */ 3815 auto_vec<basic_block> spanlist; 3816 spanlist.reserve (blocks.length ()); 3817 3818 /* Make sure every block has its cycle class determined. */ 3819 for (ix = 0; blocks.iterate (ix, &block); ix++) 3820 { 3821 if (BB_GET_SESE (block)) 3822 /* We already met this block in an earlier graph solve. */ 3823 continue; 3824 3825 if (dump_file) 3826 fprintf (dump_file, "Searching graph starting at %d\n", block->index); 3827 3828 /* Number the nodes reachable from block initial DFS order. */ 3829 int depth = nvptx_sese_number (2, 0, +1, block, &spanlist); 3830 3831 /* Now walk in reverse DFS order to find cycle equivalents. */ 3832 while (spanlist.length ()) 3833 { 3834 block = spanlist.pop (); 3835 bb_sese *sese = BB_GET_SESE (block); 3836 3837 /* Do the pseudo node below. */ 3838 nvptx_sese_pseudo (block, sese, depth, +1, 3839 sese->dir > 0 ? block->succs : block->preds, 3840 (sese->dir > 0 ? offsetof (edge_def, dest) 3841 : offsetof (edge_def, src))); 3842 sese->set_color (color_counts); 3843 /* Do the pseudo node above. */ 3844 nvptx_sese_pseudo (block, sese, depth, -1, 3845 sese->dir < 0 ? block->succs : block->preds, 3846 (sese->dir < 0 ? offsetof (edge_def, dest) 3847 : offsetof (edge_def, src))); 3848 } 3849 if (dump_file) 3850 fprintf (dump_file, "\n"); 3851 } 3852 3853 if (dump_file) 3854 { 3855 unsigned count; 3856 const char *comma = ""; 3857 3858 fprintf (dump_file, "Found %d cycle equivalents\n", 3859 color_counts.length ()); 3860 for (ix = 0; color_counts.iterate (ix, &count); ix++) 3861 { 3862 fprintf (dump_file, "%s%d[%d]={", comma, ix, count); 3863 3864 comma = ""; 3865 for (unsigned jx = 0; blocks.iterate (jx, &block); jx++) 3866 if (BB_GET_SESE (block)->color == ix) 3867 { 3868 block->flags |= BB_VISITED; 3869 fprintf (dump_file, "%s%d", comma, block->index); 3870 comma=","; 3871 } 3872 fprintf (dump_file, "}"); 3873 comma = ", "; 3874 } 3875 fprintf (dump_file, "\n"); 3876 } 3877 3878 /* Now we've colored every block in the subgraph. We now need to 3879 determine the minimal set of SESE regions that cover that 3880 subgraph. Do this with a DFS walk of the complete function. 3881 During the walk we're either 'looking' or 'coloring'. When we 3882 reach the last node of a particular color, we stop coloring and 3883 return to looking. */ 3884 3885 /* There cannot be more SESE regions than colors. */ 3886 regions.reserve (color_counts.length ()); 3887 for (ix = color_counts.length (); ix--;) 3888 regions.quick_push (bb_pair_t (0, 0)); 3889 3890 for (ix = 0; blocks.iterate (ix, &block); ix++) 3891 block->flags &= ~BB_VISITED; 3892 3893 nvptx_sese_color (color_counts, regions, ENTRY_BLOCK_PTR_FOR_FN (cfun), -1); 3894 3895 if (dump_file) 3896 { 3897 const char *comma = ""; 3898 int len = regions.length (); 3899 3900 fprintf (dump_file, "SESE regions:"); 3901 for (ix = 0; ix != len; ix++) 3902 { 3903 basic_block from = regions[ix].first; 3904 basic_block to = regions[ix].second; 3905 3906 if (from) 3907 { 3908 fprintf (dump_file, "%s %d{%d", comma, ix, from->index); 3909 if (to != from) 3910 fprintf (dump_file, "->%d", to->index); 3911 3912 int color = BB_GET_SESE (from)->color; 3913 3914 /* Print the blocks within the region (excluding ends). 
*/ 3915 FOR_EACH_BB_FN (block, cfun) 3916 { 3917 bb_sese *sese = BB_GET_SESE (block); 3918 3919 if (sese && sese->color == color 3920 && block != from && block != to) 3921 fprintf (dump_file, ".%d", block->index); 3922 } 3923 fprintf (dump_file, "}"); 3924 } 3925 comma = ","; 3926 } 3927 fprintf (dump_file, "\n\n"); 3928 } 3929 3930 for (ix = 0; blocks.iterate (ix, &block); ix++) 3931 delete BB_GET_SESE (block); 3932 } 3933 3934 #undef BB_SET_SESE 3935 #undef BB_GET_SESE 3936 3937 /* Propagate live state at the start of a partitioned region. IS_CALL 3938 indicates whether the propagation is for a (partitioned) call 3939 instruction. BLOCK provides the live register information, and 3940 might not contain INSN. Propagation is inserted just after INSN. RW 3941 indicates whether we are reading and/or writing state. This 3942 separation is needed for worker-level proppagation where we 3943 essentially do a spill & fill. FN is the underlying worker 3944 function to generate the propagation instructions for single 3945 register. DATA is user data. 3946 3947 Returns true if we didn't emit any instructions. 3948 3949 We propagate the live register set for non-calls and the entire 3950 frame for calls and non-calls. We could do better by (a) 3951 propagating just the live set that is used within the partitioned 3952 regions and (b) only propagating stack entries that are used. The 3953 latter might be quite hard to determine. */ 3954 3955 typedef rtx (*propagator_fn) (rtx, propagate_mask, unsigned, void *, bool); 3956 3957 static bool 3958 nvptx_propagate (bool is_call, basic_block block, rtx_insn *insn, 3959 propagate_mask rw, propagator_fn fn, void *data, bool vector) 3960 { 3961 bitmap live = DF_LIVE_IN (block); 3962 bitmap_iterator iterator; 3963 unsigned ix; 3964 bool empty = true; 3965 3966 /* Copy the frame array. */ 3967 HOST_WIDE_INT fs = get_frame_size (); 3968 if (fs) 3969 { 3970 rtx tmp = gen_reg_rtx (DImode); 3971 rtx idx = NULL_RTX; 3972 rtx ptr = gen_reg_rtx (Pmode); 3973 rtx pred = NULL_RTX; 3974 rtx_code_label *label = NULL; 3975 3976 empty = false; 3977 /* The frame size might not be DImode compatible, but the frame 3978 array's declaration will be. So it's ok to round up here. */ 3979 fs = (fs + GET_MODE_SIZE (DImode) - 1) / GET_MODE_SIZE (DImode); 3980 /* Detect single iteration loop. */ 3981 if (fs == 1) 3982 fs = 0; 3983 3984 start_sequence (); 3985 emit_insn (gen_rtx_SET (ptr, frame_pointer_rtx)); 3986 if (fs) 3987 { 3988 idx = gen_reg_rtx (SImode); 3989 pred = gen_reg_rtx (BImode); 3990 label = gen_label_rtx (); 3991 3992 emit_insn (gen_rtx_SET (idx, GEN_INT (fs))); 3993 /* Allow worker function to initialize anything needed. 
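   Roughly, the sequence assembled by this block (assuming both PM_read
   and PM_write are requested and FS > 1) looks like:

       ptr = frame_pointer;
       idx = fs;
       <loop_begin hook from FN>
     loop:
       idx -= 1;
       tmp = *ptr;               read this frame word
       <FN (tmp, rw, ...)>       propagate it
       *ptr = tmp;               write it back
       pred = idx != 0;
       ptr += sizeof (DImode);
       if (pred) goto loop;
       <loop_end hook from FN>

   This is only a sketch of the RTL emitted here, not literal output.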
*/ 3994 rtx init = fn (tmp, PM_loop_begin, fs, data, vector); 3995 if (init) 3996 emit_insn (init); 3997 emit_label (label); 3998 LABEL_NUSES (label)++; 3999 emit_insn (gen_addsi3 (idx, idx, GEN_INT (-1))); 4000 } 4001 if (rw & PM_read) 4002 emit_insn (gen_rtx_SET (tmp, gen_rtx_MEM (DImode, ptr))); 4003 emit_insn (fn (tmp, rw, fs, data, vector)); 4004 if (rw & PM_write) 4005 emit_insn (gen_rtx_SET (gen_rtx_MEM (DImode, ptr), tmp)); 4006 if (fs) 4007 { 4008 emit_insn (gen_rtx_SET (pred, gen_rtx_NE (BImode, idx, const0_rtx))); 4009 emit_insn (gen_adddi3 (ptr, ptr, GEN_INT (GET_MODE_SIZE (DImode)))); 4010 emit_insn (gen_br_true_uni (pred, label)); 4011 rtx fini = fn (tmp, PM_loop_end, fs, data, vector); 4012 if (fini) 4013 emit_insn (fini); 4014 emit_insn (gen_rtx_CLOBBER (GET_MODE (idx), idx)); 4015 } 4016 emit_insn (gen_rtx_CLOBBER (GET_MODE (tmp), tmp)); 4017 emit_insn (gen_rtx_CLOBBER (GET_MODE (ptr), ptr)); 4018 rtx cpy = get_insns (); 4019 end_sequence (); 4020 insn = emit_insn_after (cpy, insn); 4021 } 4022 4023 if (!is_call) 4024 /* Copy live registers. */ 4025 EXECUTE_IF_SET_IN_BITMAP (live, 0, ix, iterator) 4026 { 4027 rtx reg = regno_reg_rtx[ix]; 4028 4029 if (REGNO (reg) >= FIRST_PSEUDO_REGISTER) 4030 { 4031 rtx bcast = fn (reg, rw, 0, data, vector); 4032 4033 insn = emit_insn_after (bcast, insn); 4034 empty = false; 4035 } 4036 } 4037 return empty; 4038 } 4039 4040 /* Worker for nvptx_warp_propagate. */ 4041 4042 static rtx 4043 warp_prop_gen (rtx reg, propagate_mask pm, 4044 unsigned ARG_UNUSED (count), void *ARG_UNUSED (data), 4045 bool ARG_UNUSED (vector)) 4046 { 4047 if (!(pm & PM_read_write)) 4048 return 0; 4049 4050 return nvptx_gen_warp_bcast (reg); 4051 } 4052 4053 /* Propagate state that is live at start of BLOCK across the vectors 4054 of a single warp. Propagation is inserted just after INSN. 4055 IS_CALL and return as for nvptx_propagate. */ 4056 4057 static bool 4058 nvptx_warp_propagate (bool is_call, basic_block block, rtx_insn *insn) 4059 { 4060 return nvptx_propagate (is_call, block, insn, PM_read_write, 4061 warp_prop_gen, 0, false); 4062 } 4063 4064 /* Worker for nvptx_shared_propagate. */ 4065 4066 static rtx 4067 shared_prop_gen (rtx reg, propagate_mask pm, unsigned rep, void *data_, 4068 bool vector) 4069 { 4070 broadcast_data_t *data = (broadcast_data_t *)data_; 4071 4072 if (pm & PM_loop_begin) 4073 { 4074 /* Starting a loop, initialize pointer. */ 4075 unsigned align = GET_MODE_ALIGNMENT (GET_MODE (reg)) / BITS_PER_UNIT; 4076 4077 oacc_bcast_align = MAX (oacc_bcast_align, align); 4078 data->offset = ROUND_UP (data->offset, align); 4079 4080 data->ptr = gen_reg_rtx (Pmode); 4081 4082 return gen_adddi3 (data->ptr, data->base, GEN_INT (data->offset)); 4083 } 4084 else if (pm & PM_loop_end) 4085 { 4086 rtx clobber = gen_rtx_CLOBBER (GET_MODE (data->ptr), data->ptr); 4087 data->ptr = NULL_RTX; 4088 return clobber; 4089 } 4090 else 4091 return nvptx_gen_shared_bcast (reg, pm, rep, data, vector); 4092 } 4093 4094 /* Spill or fill live state that is live at start of BLOCK. PRE_P 4095 indicates if this is just before partitioned mode (do spill), or 4096 just after it starts (do fill). Sequence is inserted just after 4097 INSN. IS_CALL and return as for nvptx_propagate. 
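   In other words: with PRE_P set the live values are copied from
   registers into the shared broadcast buffer (the spill); with PRE_P
   clear they are copied back from the buffer into registers (the
   fill).  The caller separates the two with a CTA-level barrier (see
   nvptx_process_pars) so that every thread of the partition observes
   the values stored by the single active thread.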
*/ 4098 4099 static bool 4100 nvptx_shared_propagate (bool pre_p, bool is_call, basic_block block, 4101 rtx_insn *insn, bool vector) 4102 { 4103 broadcast_data_t data; 4104 4105 data.base = gen_reg_rtx (Pmode); 4106 data.offset = 0; 4107 data.ptr = NULL_RTX; 4108 4109 bool empty = nvptx_propagate (is_call, block, insn, 4110 pre_p ? PM_read : PM_write, shared_prop_gen, 4111 &data, vector); 4112 gcc_assert (empty == !data.offset); 4113 if (data.offset) 4114 { 4115 rtx bcast_sym = oacc_bcast_sym; 4116 4117 /* Stuff was emitted, initialize the base pointer now. */ 4118 if (vector && nvptx_mach_max_workers () > 1) 4119 { 4120 if (!cfun->machine->bcast_partition) 4121 { 4122 /* It would be nice to place this register in 4123 DATA_AREA_SHARED. */ 4124 cfun->machine->bcast_partition = gen_reg_rtx (DImode); 4125 } 4126 if (!cfun->machine->sync_bar) 4127 cfun->machine->sync_bar = gen_reg_rtx (SImode); 4128 4129 bcast_sym = cfun->machine->bcast_partition; 4130 } 4131 4132 rtx init = gen_rtx_SET (data.base, bcast_sym); 4133 emit_insn_after (init, insn); 4134 4135 unsigned int psize = ROUND_UP (data.offset, oacc_bcast_align); 4136 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE 4137 ? nvptx_mach_max_workers () + 1 4138 : 1); 4139 4140 oacc_bcast_partition = MAX (oacc_bcast_partition, psize); 4141 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum); 4142 } 4143 return empty; 4144 } 4145 4146 /* Emit a CTA-level synchronization barrier. LOCK is the barrier number, 4147 which is an integer or a register. THREADS is the number of threads 4148 controlled by the barrier. */ 4149 4150 static rtx 4151 nvptx_cta_sync (rtx lock, int threads) 4152 { 4153 return gen_nvptx_barsync (lock, GEN_INT (threads)); 4154 } 4155 4156 #if WORKAROUND_PTXJIT_BUG 4157 /* Return first real insn in BB, or return NULL_RTX if BB does not contain 4158 real insns. */ 4159 4160 static rtx_insn * 4161 bb_first_real_insn (basic_block bb) 4162 { 4163 rtx_insn *insn; 4164 4165 /* Find first insn of from block. */ 4166 FOR_BB_INSNS (bb, insn) 4167 if (INSN_P (insn)) 4168 return insn; 4169 4170 return 0; 4171 } 4172 #endif 4173 4174 /* Return true if INSN needs neutering. */ 4175 4176 static bool 4177 needs_neutering_p (rtx_insn *insn) 4178 { 4179 if (!INSN_P (insn)) 4180 return false; 4181 4182 switch (recog_memoized (insn)) 4183 { 4184 case CODE_FOR_nvptx_fork: 4185 case CODE_FOR_nvptx_forked: 4186 case CODE_FOR_nvptx_joining: 4187 case CODE_FOR_nvptx_join: 4188 case CODE_FOR_nvptx_barsync: 4189 return false; 4190 default: 4191 return true; 4192 } 4193 } 4194 4195 /* Verify position of VECTOR_{JUMP,LABEL} and WORKER_{JUMP,LABEL} in FROM. 
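   The shape we expect to see, and that the asserts below check, is
   roughly:

       worker jump    (skip taken by all but one worker)
       vector jump    (skip taken by all but one vector lane)
         ... single-threaded code ...
       vector label
       worker label

   with any bar.sync insns appearing only outside the neutered ranges.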
*/ 4196 4197 static bool 4198 verify_neutering_jumps (basic_block from, 4199 rtx_insn *vector_jump, rtx_insn *worker_jump, 4200 rtx_insn *vector_label, rtx_insn *worker_label) 4201 { 4202 basic_block bb = from; 4203 rtx_insn *insn = BB_HEAD (bb); 4204 bool seen_worker_jump = false; 4205 bool seen_vector_jump = false; 4206 bool seen_worker_label = false; 4207 bool seen_vector_label = false; 4208 bool worker_neutered = false; 4209 bool vector_neutered = false; 4210 while (true) 4211 { 4212 if (insn == worker_jump) 4213 { 4214 seen_worker_jump = true; 4215 worker_neutered = true; 4216 gcc_assert (!vector_neutered); 4217 } 4218 else if (insn == vector_jump) 4219 { 4220 seen_vector_jump = true; 4221 vector_neutered = true; 4222 } 4223 else if (insn == worker_label) 4224 { 4225 seen_worker_label = true; 4226 gcc_assert (worker_neutered); 4227 worker_neutered = false; 4228 } 4229 else if (insn == vector_label) 4230 { 4231 seen_vector_label = true; 4232 gcc_assert (vector_neutered); 4233 vector_neutered = false; 4234 } 4235 else if (INSN_P (insn)) 4236 switch (recog_memoized (insn)) 4237 { 4238 case CODE_FOR_nvptx_barsync: 4239 gcc_assert (!vector_neutered && !worker_neutered); 4240 break; 4241 default: 4242 break; 4243 } 4244 4245 if (insn != BB_END (bb)) 4246 insn = NEXT_INSN (insn); 4247 else if (JUMP_P (insn) && single_succ_p (bb) 4248 && !seen_vector_jump && !seen_worker_jump) 4249 { 4250 bb = single_succ (bb); 4251 insn = BB_HEAD (bb); 4252 } 4253 else 4254 break; 4255 } 4256 4257 gcc_assert (!(vector_jump && !seen_vector_jump)); 4258 gcc_assert (!(worker_jump && !seen_worker_jump)); 4259 4260 if (seen_vector_label || seen_worker_label) 4261 { 4262 gcc_assert (!(vector_label && !seen_vector_label)); 4263 gcc_assert (!(worker_label && !seen_worker_label)); 4264 4265 return true; 4266 } 4267 4268 return false; 4269 } 4270 4271 /* Verify position of VECTOR_LABEL and WORKER_LABEL in TO. */ 4272 4273 static void 4274 verify_neutering_labels (basic_block to, rtx_insn *vector_label, 4275 rtx_insn *worker_label) 4276 { 4277 basic_block bb = to; 4278 rtx_insn *insn = BB_END (bb); 4279 bool seen_worker_label = false; 4280 bool seen_vector_label = false; 4281 while (true) 4282 { 4283 if (insn == worker_label) 4284 { 4285 seen_worker_label = true; 4286 gcc_assert (!seen_vector_label); 4287 } 4288 else if (insn == vector_label) 4289 seen_vector_label = true; 4290 else if (INSN_P (insn)) 4291 switch (recog_memoized (insn)) 4292 { 4293 case CODE_FOR_nvptx_barsync: 4294 gcc_assert (!seen_vector_label && !seen_worker_label); 4295 break; 4296 } 4297 4298 if (insn != BB_HEAD (bb)) 4299 insn = PREV_INSN (insn); 4300 else 4301 break; 4302 } 4303 4304 gcc_assert (!(vector_label && !seen_vector_label)); 4305 gcc_assert (!(worker_label && !seen_worker_label)); 4306 } 4307 4308 /* Single neutering according to MASK. FROM is the incoming block and 4309 TO is the outgoing block. These may be the same block. Insert at 4310 start of FROM: 4311 4312 if (tid.<axis>) goto end. 4313 4314 and insert before ending branch of TO (if there is such an insn): 4315 4316 end: 4317 <possibly-broadcast-cond> 4318 <branch> 4319 4320 We currently only use differnt FROM and TO when skipping an entire 4321 loop. We could do more if we detected superblocks. */ 4322 4323 static void 4324 nvptx_single (unsigned mask, basic_block from, basic_block to) 4325 { 4326 rtx_insn *head = BB_HEAD (from); 4327 rtx_insn *tail = BB_END (to); 4328 unsigned skip_mask = mask; 4329 4330 while (true) 4331 { 4332 /* Find first insn of from block. 
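   If the block turns out to contain nothing that needs neutering and
   merely jumps unconditionally to a block with no other predecessors,
   step FROM forward to that block and look again, so the neutering
   branch lands in front of real work rather than in front of an empty
   forwarder block.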
*/ 4333 while (head != BB_END (from) && !needs_neutering_p (head)) 4334 head = NEXT_INSN (head); 4335 4336 if (from == to) 4337 break; 4338 4339 if (!(JUMP_P (head) && single_succ_p (from))) 4340 break; 4341 4342 basic_block jump_target = single_succ (from); 4343 if (!single_pred_p (jump_target)) 4344 break; 4345 4346 from = jump_target; 4347 head = BB_HEAD (from); 4348 } 4349 4350 /* Find last insn of to block */ 4351 rtx_insn *limit = from == to ? head : BB_HEAD (to); 4352 while (tail != limit && !INSN_P (tail) && !LABEL_P (tail)) 4353 tail = PREV_INSN (tail); 4354 4355 /* Detect if tail is a branch. */ 4356 rtx tail_branch = NULL_RTX; 4357 rtx cond_branch = NULL_RTX; 4358 if (tail && INSN_P (tail)) 4359 { 4360 tail_branch = PATTERN (tail); 4361 if (GET_CODE (tail_branch) != SET || SET_DEST (tail_branch) != pc_rtx) 4362 tail_branch = NULL_RTX; 4363 else 4364 { 4365 cond_branch = SET_SRC (tail_branch); 4366 if (GET_CODE (cond_branch) != IF_THEN_ELSE) 4367 cond_branch = NULL_RTX; 4368 } 4369 } 4370 4371 if (tail == head) 4372 { 4373 /* If this is empty, do nothing. */ 4374 if (!head || !needs_neutering_p (head)) 4375 return; 4376 4377 if (cond_branch) 4378 { 4379 /* If we're only doing vector single, there's no need to 4380 emit skip code because we'll not insert anything. */ 4381 if (!(mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))) 4382 skip_mask = 0; 4383 } 4384 else if (tail_branch) 4385 /* Block with only unconditional branch. Nothing to do. */ 4386 return; 4387 } 4388 4389 /* Insert the vector test inside the worker test. */ 4390 unsigned mode; 4391 rtx_insn *before = tail; 4392 rtx_insn *neuter_start = NULL; 4393 rtx_insn *worker_label = NULL, *vector_label = NULL; 4394 rtx_insn *worker_jump = NULL, *vector_jump = NULL; 4395 rtx_insn *warp_sync = NULL; 4396 for (mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) 4397 if (GOMP_DIM_MASK (mode) & skip_mask) 4398 { 4399 rtx_code_label *label = gen_label_rtx (); 4400 rtx pred = cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER]; 4401 rtx_insn **mode_jump 4402 = mode == GOMP_DIM_VECTOR ? &vector_jump : &worker_jump; 4403 rtx_insn **mode_label 4404 = mode == GOMP_DIM_VECTOR ? 
&vector_label : &worker_label; 4405 4406 if (!pred) 4407 { 4408 pred = gen_reg_rtx (BImode); 4409 cfun->machine->axis_predicate[mode - GOMP_DIM_WORKER] = pred; 4410 } 4411 4412 rtx br; 4413 if (mode == GOMP_DIM_VECTOR) 4414 br = gen_br_true (pred, label); 4415 else 4416 br = gen_br_true_uni (pred, label); 4417 if (neuter_start) 4418 neuter_start = emit_insn_after (br, neuter_start); 4419 else 4420 neuter_start = emit_insn_before (br, head); 4421 *mode_jump = neuter_start; 4422 4423 LABEL_NUSES (label)++; 4424 rtx_insn *label_insn; 4425 if (tail_branch) 4426 { 4427 label_insn = emit_label_before (label, before); 4428 if (mode == GOMP_DIM_VECTOR) 4429 { 4430 if (TARGET_PTX_6_0) 4431 warp_sync = emit_insn_after (gen_nvptx_warpsync (), 4432 label_insn); 4433 else 4434 warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (), 4435 label_insn); 4436 } 4437 before = label_insn; 4438 } 4439 else 4440 { 4441 label_insn = emit_label_after (label, tail); 4442 if (mode == GOMP_DIM_VECTOR) 4443 { 4444 if (TARGET_PTX_6_0) 4445 warp_sync = emit_insn_after (gen_nvptx_warpsync (), 4446 label_insn); 4447 else 4448 warp_sync = emit_insn_after (gen_nvptx_uniform_warp_check (), 4449 label_insn); 4450 } 4451 if ((mode == GOMP_DIM_VECTOR || mode == GOMP_DIM_WORKER) 4452 && CALL_P (tail) && find_reg_note (tail, REG_NORETURN, NULL)) 4453 emit_insn_after (gen_exit (), label_insn); 4454 } 4455 4456 *mode_label = label_insn; 4457 } 4458 4459 /* Now deal with propagating the branch condition. */ 4460 if (cond_branch) 4461 { 4462 rtx pvar = XEXP (XEXP (cond_branch, 0), 0); 4463 4464 if (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask 4465 && nvptx_mach_vector_length () == PTX_WARP_SIZE) 4466 { 4467 /* Vector mode only, do a shuffle. */ 4468 #if WORKAROUND_PTXJIT_BUG 4469 /* The branch condition %rcond is propagated like this: 4470 4471 { 4472 .reg .u32 %x; 4473 mov.u32 %x,%tid.x; 4474 setp.ne.u32 %rnotvzero,%x,0; 4475 } 4476 4477 @%rnotvzero bra Lskip; 4478 setp.<op>.<type> %rcond,op1,op2; 4479 Lskip: 4480 selp.u32 %rcondu32,1,0,%rcond; 4481 shfl.idx.b32 %rcondu32,%rcondu32,0,31; 4482 setp.ne.u32 %rcond,%rcondu32,0; 4483 4484 There seems to be a bug in the ptx JIT compiler (observed at driver 4485 version 381.22, at -O1 and higher for sm_61), that drops the shfl 4486 unless %rcond is initialized to something before 'bra Lskip'. The 4487 bug is not observed with ptxas from cuda 8.0.61. 4488 4489 It is true that the code is non-trivial: at Lskip, %rcond is 4490 uninitialized in threads 1-31, and after the selp the same holds 4491 for %rcondu32. But shfl propagates the defined value in thread 0 4492 to threads 1-31, so after the shfl %rcondu32 is defined in threads 4493 0-31, and after the setp.ne %rcond is defined in threads 0-31. 4494 4495 There is nothing in the PTX spec to suggest that this is wrong, or 4496 to explain why the extra initialization is needed. 
So, we classify 4497 it as a JIT bug, and the extra initialization as workaround: 4498 4499 { 4500 .reg .u32 %x; 4501 mov.u32 %x,%tid.x; 4502 setp.ne.u32 %rnotvzero,%x,0; 4503 } 4504 4505 +.reg .pred %rcond2; 4506 +setp.eq.u32 %rcond2, 1, 0; 4507 4508 @%rnotvzero bra Lskip; 4509 setp.<op>.<type> %rcond,op1,op2; 4510 +mov.pred %rcond2, %rcond; 4511 Lskip: 4512 +mov.pred %rcond, %rcond2; 4513 selp.u32 %rcondu32,1,0,%rcond; 4514 shfl.idx.b32 %rcondu32,%rcondu32,0,31; 4515 setp.ne.u32 %rcond,%rcondu32,0; 4516 */ 4517 rtx_insn *label = PREV_INSN (tail); 4518 if (label == warp_sync) 4519 label = PREV_INSN (label); 4520 gcc_assert (label && LABEL_P (label)); 4521 rtx tmp = gen_reg_rtx (BImode); 4522 emit_insn_before (gen_movbi (tmp, const0_rtx), 4523 bb_first_real_insn (from)); 4524 emit_insn_before (gen_rtx_SET (tmp, pvar), label); 4525 emit_insn_before (gen_rtx_SET (pvar, tmp), tail); 4526 #endif 4527 emit_insn_before (nvptx_gen_warp_bcast (pvar), tail); 4528 } 4529 else 4530 { 4531 /* Includes worker mode, do spill & fill. By construction 4532 we should never have worker mode only. */ 4533 broadcast_data_t data; 4534 unsigned size = GET_MODE_SIZE (SImode); 4535 bool vector = (GOMP_DIM_MASK (GOMP_DIM_VECTOR) == mask) != 0; 4536 bool worker = (GOMP_DIM_MASK (GOMP_DIM_WORKER) == mask) != 0; 4537 rtx barrier = GEN_INT (0); 4538 int threads = 0; 4539 4540 data.base = oacc_bcast_sym; 4541 data.ptr = 0; 4542 4543 bool use_partitioning_p = (vector && !worker 4544 && nvptx_mach_max_workers () > 1 4545 && cfun->machine->bcast_partition); 4546 if (use_partitioning_p) 4547 { 4548 data.base = cfun->machine->bcast_partition; 4549 barrier = cfun->machine->sync_bar; 4550 threads = nvptx_mach_vector_length (); 4551 } 4552 gcc_assert (data.base != NULL); 4553 gcc_assert (barrier); 4554 4555 unsigned int psize = ROUND_UP (size, oacc_bcast_align); 4556 unsigned int pnum = (nvptx_mach_vector_length () > PTX_WARP_SIZE 4557 ? nvptx_mach_max_workers () + 1 4558 : 1); 4559 4560 oacc_bcast_partition = MAX (oacc_bcast_partition, psize); 4561 oacc_bcast_size = MAX (oacc_bcast_size, psize * pnum); 4562 4563 data.offset = 0; 4564 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_read, 0, &data, 4565 vector), 4566 before); 4567 4568 /* Barrier so other workers can see the write. */ 4569 emit_insn_before (nvptx_cta_sync (barrier, threads), tail); 4570 data.offset = 0; 4571 emit_insn_before (nvptx_gen_shared_bcast (pvar, PM_write, 0, &data, 4572 vector), 4573 tail); 4574 /* This barrier is needed to avoid worker zero clobbering 4575 the broadcast buffer before all the other workers have 4576 had a chance to read this instance of it. */ 4577 emit_insn_before (nvptx_cta_sync (barrier, threads), tail); 4578 } 4579 4580 extract_insn (tail); 4581 rtx unsp = gen_rtx_UNSPEC (BImode, gen_rtvec (1, pvar), 4582 UNSPEC_BR_UNIFIED); 4583 validate_change (tail, recog_data.operand_loc[0], unsp, false); 4584 } 4585 4586 bool seen_label = verify_neutering_jumps (from, vector_jump, worker_jump, 4587 vector_label, worker_label); 4588 if (!seen_label) 4589 verify_neutering_labels (to, vector_label, worker_label); 4590 } 4591 4592 /* PAR is a parallel that is being skipped in its entirety according to 4593 MASK. Treat this as skipping a superblock starting at forked 4594 and ending at joining. 
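   In practice that superblock runs from PAR's forked block down to the
   single predecessor of its join block; the asserts below require the
   join block to have exactly one predecessor, and that predecessor to
   have exactly one successor.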
*/ 4595 4596 static void 4597 nvptx_skip_par (unsigned mask, parallel *par) 4598 { 4599 basic_block tail = par->join_block; 4600 gcc_assert (tail->preds->length () == 1); 4601 4602 basic_block pre_tail = (*tail->preds)[0]->src; 4603 gcc_assert (pre_tail->succs->length () == 1); 4604 4605 nvptx_single (mask, par->forked_block, pre_tail); 4606 } 4607 4608 /* If PAR has a single inner parallel and PAR itself only contains 4609 empty entry and exit blocks, swallow the inner PAR. */ 4610 4611 static void 4612 nvptx_optimize_inner (parallel *par) 4613 { 4614 parallel *inner = par->inner; 4615 4616 /* We mustn't be the outer dummy par. */ 4617 if (!par->mask) 4618 return; 4619 4620 /* We must have a single inner par. */ 4621 if (!inner || inner->next) 4622 return; 4623 4624 /* We must only contain 2 blocks ourselves -- the head and tail of 4625 the inner par. */ 4626 if (par->blocks.length () != 2) 4627 return; 4628 4629 /* We must be disjoint partitioning. As we only have vector and 4630 worker partitioning, this is sufficient to guarantee the pars 4631 have adjacent partitioning. */ 4632 if ((par->mask & inner->mask) & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1)) 4633 /* This indicates malformed code generation. */ 4634 return; 4635 4636 /* The outer forked insn should be immediately followed by the inner 4637 fork insn. */ 4638 rtx_insn *forked = par->forked_insn; 4639 rtx_insn *fork = BB_END (par->forked_block); 4640 4641 if (NEXT_INSN (forked) != fork) 4642 return; 4643 gcc_checking_assert (recog_memoized (fork) == CODE_FOR_nvptx_fork); 4644 4645 /* The outer joining insn must immediately follow the inner join 4646 insn. */ 4647 rtx_insn *joining = par->joining_insn; 4648 rtx_insn *join = inner->join_insn; 4649 if (NEXT_INSN (join) != joining) 4650 return; 4651 4652 /* Preconditions met. Swallow the inner par. */ 4653 if (dump_file) 4654 fprintf (dump_file, "Merging loop %x [%d,%d] into %x [%d,%d]\n", 4655 inner->mask, inner->forked_block->index, 4656 inner->join_block->index, 4657 par->mask, par->forked_block->index, par->join_block->index); 4658 4659 par->mask |= inner->mask & (GOMP_DIM_MASK (GOMP_DIM_MAX) - 1); 4660 4661 par->blocks.reserve (inner->blocks.length ()); 4662 while (inner->blocks.length ()) 4663 par->blocks.quick_push (inner->blocks.pop ()); 4664 4665 par->inner = inner->inner; 4666 inner->inner = NULL; 4667 4668 delete inner; 4669 } 4670 4671 /* Process the parallel PAR and all its contained 4672 parallels. We do everything but the neutering. Return mask of 4673 partitioned modes used within this parallel. */ 4674 4675 static unsigned 4676 nvptx_process_pars (parallel *par) 4677 { 4678 if (nvptx_optimize) 4679 nvptx_optimize_inner (par); 4680 4681 unsigned inner_mask = par->mask; 4682 4683 /* Do the inner parallels first. 
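   Doing them first lets PAR->INNER_MASK record which partitioning
   levels are already handled inside; nvptx_neuter_pars later uses that
   mask to decide whether a mode must be neutered block-by-block here
   or can be skipped wholesale.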
*/ 4684 if (par->inner) 4685 { 4686 par->inner_mask = nvptx_process_pars (par->inner); 4687 inner_mask |= par->inner_mask; 4688 } 4689 4690 bool is_call = (par->mask & GOMP_DIM_MASK (GOMP_DIM_MAX)) != 0; 4691 bool worker = (par->mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)); 4692 bool large_vector = ((par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) 4693 && nvptx_mach_vector_length () > PTX_WARP_SIZE); 4694 4695 if (worker || large_vector) 4696 { 4697 nvptx_shared_propagate (false, is_call, par->forked_block, 4698 par->forked_insn, !worker); 4699 bool no_prop_p 4700 = nvptx_shared_propagate (true, is_call, par->forked_block, 4701 par->fork_insn, !worker); 4702 bool empty_loop_p 4703 = !is_call && (NEXT_INSN (par->forked_insn) 4704 && NEXT_INSN (par->forked_insn) == par->joining_insn); 4705 rtx barrier = GEN_INT (0); 4706 int threads = 0; 4707 4708 if (!worker && cfun->machine->sync_bar) 4709 { 4710 barrier = cfun->machine->sync_bar; 4711 threads = nvptx_mach_vector_length (); 4712 } 4713 4714 if (no_prop_p && empty_loop_p) 4715 ; 4716 else if (no_prop_p && is_call) 4717 ; 4718 else 4719 { 4720 /* Insert begin and end synchronizations. */ 4721 emit_insn_before (nvptx_cta_sync (barrier, threads), 4722 par->forked_insn); 4723 emit_insn_before (nvptx_cta_sync (barrier, threads), par->join_insn); 4724 } 4725 } 4726 else if (par->mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR)) 4727 nvptx_warp_propagate (is_call, par->forked_block, par->forked_insn); 4728 4729 /* Now do siblings. */ 4730 if (par->next) 4731 inner_mask |= nvptx_process_pars (par->next); 4732 return inner_mask; 4733 } 4734 4735 /* Neuter the parallel described by PAR. We recurse in depth-first 4736 order. MODES are the partitioning of the execution and OUTER is 4737 the partitioning of the parallels we are contained in. */ 4738 4739 static void 4740 nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer) 4741 { 4742 unsigned me = (par->mask 4743 & (GOMP_DIM_MASK (GOMP_DIM_WORKER) 4744 | GOMP_DIM_MASK (GOMP_DIM_VECTOR))); 4745 unsigned skip_mask = 0, neuter_mask = 0; 4746 4747 if (par->inner) 4748 nvptx_neuter_pars (par->inner, modes, outer | me); 4749 4750 for (unsigned mode = GOMP_DIM_WORKER; mode <= GOMP_DIM_VECTOR; mode++) 4751 { 4752 if ((outer | me) & GOMP_DIM_MASK (mode)) 4753 {} /* Mode is partitioned: no neutering. */ 4754 else if (!(modes & GOMP_DIM_MASK (mode))) 4755 {} /* Mode is not used: nothing to do. */ 4756 else if (par->inner_mask & GOMP_DIM_MASK (mode) 4757 || !par->forked_insn) 4758 /* Partitioned in inner parallels, or we're not a partitioned 4759 at all: neuter individual blocks. */ 4760 neuter_mask |= GOMP_DIM_MASK (mode); 4761 else if (!par->parent || !par->parent->forked_insn 4762 || par->parent->inner_mask & GOMP_DIM_MASK (mode)) 4763 /* Parent isn't a parallel or contains this paralleling: skip 4764 parallel at this level. */ 4765 skip_mask |= GOMP_DIM_MASK (mode); 4766 else 4767 {} /* Parent will skip this parallel itself. */ 4768 } 4769 4770 if (neuter_mask) 4771 { 4772 int ix, len; 4773 4774 if (nvptx_optimize) 4775 { 4776 /* Neuter whole SESE regions. */ 4777 bb_pair_vec_t regions; 4778 4779 nvptx_find_sese (par->blocks, regions); 4780 len = regions.length (); 4781 for (ix = 0; ix != len; ix++) 4782 { 4783 basic_block from = regions[ix].first; 4784 basic_block to = regions[ix].second; 4785 4786 if (from) 4787 nvptx_single (neuter_mask, from, to); 4788 else 4789 gcc_assert (!to); 4790 } 4791 } 4792 else 4793 { 4794 /* Neuter each BB individually. 
*/ 4795 len = par->blocks.length (); 4796 for (ix = 0; ix != len; ix++) 4797 { 4798 basic_block block = par->blocks[ix]; 4799 4800 nvptx_single (neuter_mask, block, block); 4801 } 4802 } 4803 } 4804 4805 if (skip_mask) 4806 nvptx_skip_par (skip_mask, par); 4807 4808 if (par->next) 4809 nvptx_neuter_pars (par->next, modes, outer); 4810 } 4811 4812 static void 4813 populate_offload_attrs (offload_attrs *oa) 4814 { 4815 tree attr = oacc_get_fn_attrib (current_function_decl); 4816 tree dims = TREE_VALUE (attr); 4817 unsigned ix; 4818 4819 oa->mask = 0; 4820 4821 for (ix = 0; ix != GOMP_DIM_MAX; ix++, dims = TREE_CHAIN (dims)) 4822 { 4823 tree t = TREE_VALUE (dims); 4824 int size = (t == NULL_TREE) ? -1 : TREE_INT_CST_LOW (t); 4825 tree allowed = TREE_PURPOSE (dims); 4826 4827 if (size != 1 && !(allowed && integer_zerop (allowed))) 4828 oa->mask |= GOMP_DIM_MASK (ix); 4829 4830 switch (ix) 4831 { 4832 case GOMP_DIM_GANG: 4833 oa->num_gangs = size; 4834 break; 4835 4836 case GOMP_DIM_WORKER: 4837 oa->num_workers = size; 4838 break; 4839 4840 case GOMP_DIM_VECTOR: 4841 oa->vector_length = size; 4842 break; 4843 } 4844 } 4845 } 4846 4847 #if WORKAROUND_PTXJIT_BUG_2 4848 /* Variant of pc_set that only requires JUMP_P (INSN) if STRICT. This variant 4849 is needed in the nvptx target because the branches generated for 4850 parititioning are NONJUMP_INSN_P, not JUMP_P. */ 4851 4852 static rtx 4853 nvptx_pc_set (const rtx_insn *insn, bool strict = true) 4854 { 4855 rtx pat; 4856 if ((strict && !JUMP_P (insn)) 4857 || (!strict && !INSN_P (insn))) 4858 return NULL_RTX; 4859 pat = PATTERN (insn); 4860 4861 /* The set is allowed to appear either as the insn pattern or 4862 the first set in a PARALLEL. */ 4863 if (GET_CODE (pat) == PARALLEL) 4864 pat = XVECEXP (pat, 0, 0); 4865 if (GET_CODE (pat) == SET && GET_CODE (SET_DEST (pat)) == PC) 4866 return pat; 4867 4868 return NULL_RTX; 4869 } 4870 4871 /* Variant of condjump_label that only requires JUMP_P (INSN) if STRICT. */ 4872 4873 static rtx 4874 nvptx_condjump_label (const rtx_insn *insn, bool strict = true) 4875 { 4876 rtx x = nvptx_pc_set (insn, strict); 4877 4878 if (!x) 4879 return NULL_RTX; 4880 x = SET_SRC (x); 4881 if (GET_CODE (x) == LABEL_REF) 4882 return x; 4883 if (GET_CODE (x) != IF_THEN_ELSE) 4884 return NULL_RTX; 4885 if (XEXP (x, 2) == pc_rtx && GET_CODE (XEXP (x, 1)) == LABEL_REF) 4886 return XEXP (x, 1); 4887 if (XEXP (x, 1) == pc_rtx && GET_CODE (XEXP (x, 2)) == LABEL_REF) 4888 return XEXP (x, 2); 4889 return NULL_RTX; 4890 } 4891 4892 /* Insert a dummy ptx insn when encountering a branch to a label with no ptx 4893 insn inbetween the branch and the label. This works around a JIT bug 4894 observed at driver version 384.111, at -O0 for sm_50. 
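   The problematic shape is a conditional branch whose target label
   follows immediately, with nothing but notes, debug insns or
   fork/join markers in between, e.g. (illustrative PTX, not actual
   compiler output):

       @%p bra $L1;
   $L1:
       ...

   In that case a dummy insn (gen_fake_nop) is inserted just before the
   label so that the JIT sees a non-empty branch body.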
*/ 4895 4896 static void 4897 prevent_branch_around_nothing (void) 4898 { 4899 rtx_insn *seen_label = NULL; 4900 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) 4901 { 4902 if (INSN_P (insn) && condjump_p (insn)) 4903 { 4904 seen_label = label_ref_label (nvptx_condjump_label (insn, false)); 4905 continue; 4906 } 4907 4908 if (seen_label == NULL) 4909 continue; 4910 4911 if (NOTE_P (insn) || DEBUG_INSN_P (insn)) 4912 continue; 4913 4914 if (INSN_P (insn)) 4915 switch (recog_memoized (insn)) 4916 { 4917 case CODE_FOR_nvptx_fork: 4918 case CODE_FOR_nvptx_forked: 4919 case CODE_FOR_nvptx_joining: 4920 case CODE_FOR_nvptx_join: 4921 continue; 4922 default: 4923 seen_label = NULL; 4924 continue; 4925 } 4926 4927 if (LABEL_P (insn) && insn == seen_label) 4928 emit_insn_before (gen_fake_nop (), insn); 4929 4930 seen_label = NULL; 4931 } 4932 } 4933 #endif 4934 4935 #ifdef WORKAROUND_PTXJIT_BUG_3 4936 /* Insert two membar.cta insns inbetween two subsequent bar.sync insns. This 4937 works around a hang observed at driver version 390.48 for sm_50. */ 4938 4939 static void 4940 workaround_barsyncs (void) 4941 { 4942 bool seen_barsync = false; 4943 for (rtx_insn *insn = get_insns (); insn; insn = NEXT_INSN (insn)) 4944 { 4945 if (INSN_P (insn) && recog_memoized (insn) == CODE_FOR_nvptx_barsync) 4946 { 4947 if (seen_barsync) 4948 { 4949 emit_insn_before (gen_nvptx_membar_cta (), insn); 4950 emit_insn_before (gen_nvptx_membar_cta (), insn); 4951 } 4952 4953 seen_barsync = true; 4954 continue; 4955 } 4956 4957 if (!seen_barsync) 4958 continue; 4959 4960 if (NOTE_P (insn) || DEBUG_INSN_P (insn)) 4961 continue; 4962 else if (INSN_P (insn)) 4963 switch (recog_memoized (insn)) 4964 { 4965 case CODE_FOR_nvptx_fork: 4966 case CODE_FOR_nvptx_forked: 4967 case CODE_FOR_nvptx_joining: 4968 case CODE_FOR_nvptx_join: 4969 continue; 4970 default: 4971 break; 4972 } 4973 4974 seen_barsync = false; 4975 } 4976 } 4977 #endif 4978 4979 /* PTX-specific reorganization 4980 - Split blocks at fork and join instructions 4981 - Compute live registers 4982 - Mark now-unused registers, so function begin doesn't declare 4983 unused registers. 4984 - Insert state propagation when entering partitioned mode 4985 - Insert neutering instructions when in single mode 4986 - Replace subregs with suitable sequences. 4987 */ 4988 4989 static void 4990 nvptx_reorg (void) 4991 { 4992 /* We are freeing block_for_insn in the toplev to keep compatibility 4993 with old MDEP_REORGS that are not CFG based. Recompute it now. */ 4994 compute_bb_for_insn (); 4995 4996 thread_prologue_and_epilogue_insns (); 4997 4998 /* Split blocks and record interesting unspecs. */ 4999 bb_insn_map_t bb_insn_map; 5000 5001 nvptx_split_blocks (&bb_insn_map); 5002 5003 /* Compute live regs */ 5004 df_clear_flags (DF_LR_RUN_DCE); 5005 df_set_flags (DF_NO_INSN_RESCAN | DF_NO_HARD_REGS); 5006 df_live_add_problem (); 5007 df_live_set_all_dirty (); 5008 df_analyze (); 5009 regstat_init_n_sets_and_refs (); 5010 5011 if (dump_file) 5012 df_dump (dump_file); 5013 5014 /* Mark unused regs as unused. */ 5015 int max_regs = max_reg_num (); 5016 for (int i = LAST_VIRTUAL_REGISTER + 1; i < max_regs; i++) 5017 if (REG_N_SETS (i) == 0 && REG_N_REFS (i) == 0) 5018 regno_reg_rtx[i] = const0_rtx; 5019 5020 /* Determine launch dimensions of the function. If it is not an 5021 offloaded function (i.e. this is a regular compiler), the 5022 function has no neutering. 
*/ 5023 tree attr = oacc_get_fn_attrib (current_function_decl); 5024 if (attr) 5025 { 5026 /* If we determined this mask before RTL expansion, we could 5027 elide emission of some levels of forks and joins. */ 5028 offload_attrs oa; 5029 5030 populate_offload_attrs (&oa); 5031 5032 /* If there is worker neutering, there must be vector 5033 neutering. Otherwise the hardware will fail. */ 5034 gcc_assert (!(oa.mask & GOMP_DIM_MASK (GOMP_DIM_WORKER)) 5035 || (oa.mask & GOMP_DIM_MASK (GOMP_DIM_VECTOR))); 5036 5037 /* Discover & process partitioned regions. */ 5038 parallel *pars = nvptx_discover_pars (&bb_insn_map); 5039 nvptx_process_pars (pars); 5040 nvptx_neuter_pars (pars, oa.mask, 0); 5041 delete pars; 5042 } 5043 5044 /* Replace subregs. */ 5045 nvptx_reorg_subreg (); 5046 5047 if (TARGET_UNIFORM_SIMT) 5048 nvptx_reorg_uniform_simt (); 5049 5050 #if WORKAROUND_PTXJIT_BUG_2 5051 prevent_branch_around_nothing (); 5052 #endif 5053 5054 #ifdef WORKAROUND_PTXJIT_BUG_3 5055 workaround_barsyncs (); 5056 #endif 5057 5058 regstat_free_n_sets_and_refs (); 5059 5060 df_finish_pass (true); 5061 } 5062 5063 /* Handle a "kernel" attribute; arguments as in 5064 struct attribute_spec.handler. */ 5065 5066 static tree 5067 nvptx_handle_kernel_attribute (tree *node, tree name, tree ARG_UNUSED (args), 5068 int ARG_UNUSED (flags), bool *no_add_attrs) 5069 { 5070 tree decl = *node; 5071 5072 if (TREE_CODE (decl) != FUNCTION_DECL) 5073 { 5074 error ("%qE attribute only applies to functions", name); 5075 *no_add_attrs = true; 5076 } 5077 else if (!VOID_TYPE_P (TREE_TYPE (TREE_TYPE (decl)))) 5078 { 5079 error ("%qE attribute requires a void return type", name); 5080 *no_add_attrs = true; 5081 } 5082 5083 return NULL_TREE; 5084 } 5085 5086 /* Handle a "shared" attribute; arguments as in 5087 struct attribute_spec.handler. */ 5088 5089 static tree 5090 nvptx_handle_shared_attribute (tree *node, tree name, tree ARG_UNUSED (args), 5091 int ARG_UNUSED (flags), bool *no_add_attrs) 5092 { 5093 tree decl = *node; 5094 5095 if (TREE_CODE (decl) != VAR_DECL) 5096 { 5097 error ("%qE attribute only applies to variables", name); 5098 *no_add_attrs = true; 5099 } 5100 else if (!(TREE_PUBLIC (decl) || TREE_STATIC (decl))) 5101 { 5102 error ("%qE attribute not allowed with auto storage class", name); 5103 *no_add_attrs = true; 5104 } 5105 5106 return NULL_TREE; 5107 } 5108 5109 /* Table of valid machine attributes. */ 5110 static const struct attribute_spec nvptx_attribute_table[] = 5111 { 5112 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, 5113 affects_type_identity, handler, exclude } */ 5114 { "kernel", 0, 0, true, false, false, false, nvptx_handle_kernel_attribute, 5115 NULL }, 5116 { "shared", 0, 0, true, false, false, false, nvptx_handle_shared_attribute, 5117 NULL }, 5118 { NULL, 0, 0, false, false, false, false, NULL, NULL } 5119 }; 5120 5121 /* Limit vector alignments to BIGGEST_ALIGNMENT. */ 5122 5123 static HOST_WIDE_INT 5124 nvptx_vector_alignment (const_tree type) 5125 { 5126 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type)); 5127 5128 return MIN (align, BIGGEST_ALIGNMENT); 5129 } 5130 5131 /* Indicate that INSN cannot be duplicated. 
*/ 5132 5133 static bool 5134 nvptx_cannot_copy_insn_p (rtx_insn *insn) 5135 { 5136 switch (recog_memoized (insn)) 5137 { 5138 case CODE_FOR_nvptx_shufflesi: 5139 case CODE_FOR_nvptx_shufflesf: 5140 case CODE_FOR_nvptx_barsync: 5141 case CODE_FOR_nvptx_fork: 5142 case CODE_FOR_nvptx_forked: 5143 case CODE_FOR_nvptx_joining: 5144 case CODE_FOR_nvptx_join: 5145 return true; 5146 default: 5147 return false; 5148 } 5149 } 5150 5151 /* Section anchors do not work. Initialization for flag_section_anchor 5152 probes the existence of the anchoring target hooks and prevents 5153 anchoring if they don't exist. However, we may be being used with 5154 a host-side compiler that does support anchoring, and hence see 5155 the anchor flag set (as it's not recalculated). So provide an 5156 implementation denying anchoring. */ 5157 5158 static bool 5159 nvptx_use_anchors_for_symbol_p (const_rtx ARG_UNUSED (a)) 5160 { 5161 return false; 5162 } 5163 5164 /* Record a symbol for mkoffload to enter into the mapping table. */ 5165 5166 static void 5167 nvptx_record_offload_symbol (tree decl) 5168 { 5169 switch (TREE_CODE (decl)) 5170 { 5171 case VAR_DECL: 5172 fprintf (asm_out_file, "//:VAR_MAP \"%s\"\n", 5173 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl))); 5174 break; 5175 5176 case FUNCTION_DECL: 5177 { 5178 tree attr = oacc_get_fn_attrib (decl); 5179 /* OpenMP offloading does not set this attribute. */ 5180 tree dims = attr ? TREE_VALUE (attr) : NULL_TREE; 5181 5182 fprintf (asm_out_file, "//:FUNC_MAP \"%s\"", 5183 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl))); 5184 5185 for (; dims; dims = TREE_CHAIN (dims)) 5186 { 5187 int size = TREE_INT_CST_LOW (TREE_VALUE (dims)); 5188 5189 gcc_assert (!TREE_PURPOSE (dims)); 5190 fprintf (asm_out_file, ", %#x", size); 5191 } 5192 5193 fprintf (asm_out_file, "\n"); 5194 } 5195 break; 5196 5197 default: 5198 gcc_unreachable (); 5199 } 5200 } 5201 5202 /* Implement TARGET_ASM_FILE_START. Write the kinds of things ptxas expects 5203 at the start of a file. */ 5204 5205 static void 5206 nvptx_file_start (void) 5207 { 5208 fputs ("// BEGIN PREAMBLE\n", asm_out_file); 5209 fputs ("\t.version\t3.1\n", asm_out_file); 5210 if (TARGET_SM35) 5211 fputs ("\t.target\tsm_35\n", asm_out_file); 5212 else 5213 fputs ("\t.target\tsm_30\n", asm_out_file); 5214 fprintf (asm_out_file, "\t.address_size %d\n", GET_MODE_BITSIZE (Pmode)); 5215 fputs ("// END PREAMBLE\n", asm_out_file); 5216 } 5217 5218 /* Emit a declaration for a worker and vector-level buffer in .shared 5219 memory. */ 5220 5221 static void 5222 write_shared_buffer (FILE *file, rtx sym, unsigned align, unsigned size) 5223 { 5224 const char *name = XSTR (sym, 0); 5225 5226 write_var_marker (file, true, false, name); 5227 fprintf (file, ".shared .align %d .u8 %s[%d];\n", 5228 align, name, size); 5229 } 5230 5231 /* Write out the function declarations we've collected and declare storage 5232 for the broadcast buffer. 
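   The buffers come out as bare .shared byte arrays, along the lines of
   (names, sizes and alignments here are purely illustrative):

       .shared .align 8 .u8 __oacc_bcast[256];
       .shared .align 8 .u8 __worker_red[64];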
*/ 5233 5234 static void 5235 nvptx_file_end (void) 5236 { 5237 hash_table<tree_hasher>::iterator iter; 5238 tree decl; 5239 FOR_EACH_HASH_TABLE_ELEMENT (*needed_fndecls_htab, decl, tree, iter) 5240 nvptx_record_fndecl (decl); 5241 fputs (func_decls.str().c_str(), asm_out_file); 5242 5243 if (oacc_bcast_size) 5244 write_shared_buffer (asm_out_file, oacc_bcast_sym, 5245 oacc_bcast_align, oacc_bcast_size); 5246 5247 if (worker_red_size) 5248 write_shared_buffer (asm_out_file, worker_red_sym, 5249 worker_red_align, worker_red_size); 5250 5251 if (vector_red_size) 5252 write_shared_buffer (asm_out_file, vector_red_sym, 5253 vector_red_align, vector_red_size); 5254 5255 if (need_softstack_decl) 5256 { 5257 write_var_marker (asm_out_file, false, true, "__nvptx_stacks"); 5258 /* 32 is the maximum number of warps in a block. Even though it's an 5259 external declaration, emit the array size explicitly; otherwise, it 5260 may fail at PTX JIT time if the definition is later in link order. */ 5261 fprintf (asm_out_file, ".extern .shared .u%d __nvptx_stacks[32];\n", 5262 POINTER_SIZE); 5263 } 5264 if (need_unisimt_decl) 5265 { 5266 write_var_marker (asm_out_file, false, true, "__nvptx_uni"); 5267 fprintf (asm_out_file, ".extern .shared .u32 __nvptx_uni[32];\n"); 5268 } 5269 } 5270 5271 /* Expander for the shuffle builtins. */ 5272 5273 static rtx 5274 nvptx_expand_shuffle (tree exp, rtx target, machine_mode mode, int ignore) 5275 { 5276 if (ignore) 5277 return target; 5278 5279 rtx src = expand_expr (CALL_EXPR_ARG (exp, 0), 5280 NULL_RTX, mode, EXPAND_NORMAL); 5281 if (!REG_P (src)) 5282 src = copy_to_mode_reg (mode, src); 5283 5284 rtx idx = expand_expr (CALL_EXPR_ARG (exp, 1), 5285 NULL_RTX, SImode, EXPAND_NORMAL); 5286 rtx op = expand_expr (CALL_EXPR_ARG (exp, 2), 5287 NULL_RTX, SImode, EXPAND_NORMAL); 5288 5289 if (!REG_P (idx) && GET_CODE (idx) != CONST_INT) 5290 idx = copy_to_mode_reg (SImode, idx); 5291 5292 rtx pat = nvptx_gen_shuffle (target, src, idx, 5293 (nvptx_shuffle_kind) INTVAL (op)); 5294 if (pat) 5295 emit_insn (pat); 5296 5297 return target; 5298 } 5299 5300 const char * 5301 nvptx_output_red_partition (rtx dst, rtx offset) 5302 { 5303 const char *zero_offset = "\t\tmov.u64\t%%r%d, %%r%d; // vred buffer\n"; 5304 const char *with_offset = "\t\tadd.u64\t%%r%d, %%r%d, %d; // vred buffer\n"; 5305 5306 if (offset == const0_rtx) 5307 fprintf (asm_out_file, zero_offset, REGNO (dst), 5308 REGNO (cfun->machine->red_partition)); 5309 else 5310 fprintf (asm_out_file, with_offset, REGNO (dst), 5311 REGNO (cfun->machine->red_partition), UINTVAL (offset)); 5312 5313 return ""; 5314 } 5315 5316 /* Shared-memory reduction address expander. 
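   For the worker-level buffer the result is just a constant address,
   essentially the worker reduction symbol plus OFFSET.  For the
   vector-level buffer each worker owns its own slice of a partitioned
   array, so the address is formed at run time from the per-function
   partition base register set up via nvptx_red_partition, again plus
   OFFSET; the array as a whole is sized to hold one
   ROUND_UP (SIZE + OFFSET, ALIGN) partition per worker.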
*/ 5317 5318 static rtx 5319 nvptx_expand_shared_addr (tree exp, rtx target, 5320 machine_mode ARG_UNUSED (mode), int ignore, 5321 int vector) 5322 { 5323 if (ignore) 5324 return target; 5325 5326 unsigned align = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 2)); 5327 unsigned offset = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 0)); 5328 unsigned size = TREE_INT_CST_LOW (CALL_EXPR_ARG (exp, 1)); 5329 rtx addr = worker_red_sym; 5330 5331 if (vector) 5332 { 5333 offload_attrs oa; 5334 5335 populate_offload_attrs (&oa); 5336 5337 unsigned int psize = ROUND_UP (size + offset, align); 5338 unsigned int pnum = nvptx_mach_max_workers (); 5339 vector_red_partition = MAX (vector_red_partition, psize); 5340 vector_red_size = MAX (vector_red_size, psize * pnum); 5341 vector_red_align = MAX (vector_red_align, align); 5342 5343 if (cfun->machine->red_partition == NULL) 5344 cfun->machine->red_partition = gen_reg_rtx (Pmode); 5345 5346 addr = gen_reg_rtx (Pmode); 5347 emit_insn (gen_nvptx_red_partition (addr, GEN_INT (offset))); 5348 } 5349 else 5350 { 5351 worker_red_align = MAX (worker_red_align, align); 5352 worker_red_size = MAX (worker_red_size, size + offset); 5353 5354 if (offset) 5355 { 5356 addr = gen_rtx_PLUS (Pmode, addr, GEN_INT (offset)); 5357 addr = gen_rtx_CONST (Pmode, addr); 5358 } 5359 } 5360 5361 emit_move_insn (target, addr); 5362 return target; 5363 } 5364 5365 /* Expand the CMP_SWAP PTX builtins. We have our own versions that do 5366 not require taking the address of any object, other than the memory 5367 cell being operated on. */ 5368 5369 static rtx 5370 nvptx_expand_cmp_swap (tree exp, rtx target, 5371 machine_mode ARG_UNUSED (m), int ARG_UNUSED (ignore)) 5372 { 5373 machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); 5374 5375 if (!target) 5376 target = gen_reg_rtx (mode); 5377 5378 rtx mem = expand_expr (CALL_EXPR_ARG (exp, 0), 5379 NULL_RTX, Pmode, EXPAND_NORMAL); 5380 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1), 5381 NULL_RTX, mode, EXPAND_NORMAL); 5382 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2), 5383 NULL_RTX, mode, EXPAND_NORMAL); 5384 rtx pat; 5385 5386 mem = gen_rtx_MEM (mode, mem); 5387 if (!REG_P (cmp)) 5388 cmp = copy_to_mode_reg (mode, cmp); 5389 if (!REG_P (src)) 5390 src = copy_to_mode_reg (mode, src); 5391 5392 if (mode == SImode) 5393 pat = gen_atomic_compare_and_swapsi_1 (target, mem, cmp, src, const0_rtx); 5394 else 5395 pat = gen_atomic_compare_and_swapdi_1 (target, mem, cmp, src, const0_rtx); 5396 5397 emit_insn (pat); 5398 5399 return target; 5400 } 5401 5402 5403 /* Codes for all the NVPTX builtins. */ 5404 enum nvptx_builtins 5405 { 5406 NVPTX_BUILTIN_SHUFFLE, 5407 NVPTX_BUILTIN_SHUFFLELL, 5408 NVPTX_BUILTIN_WORKER_ADDR, 5409 NVPTX_BUILTIN_VECTOR_ADDR, 5410 NVPTX_BUILTIN_CMP_SWAP, 5411 NVPTX_BUILTIN_CMP_SWAPLL, 5412 NVPTX_BUILTIN_MAX 5413 }; 5414 5415 static GTY(()) tree nvptx_builtin_decls[NVPTX_BUILTIN_MAX]; 5416 5417 /* Return the NVPTX builtin for CODE. */ 5418 5419 static tree 5420 nvptx_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p)) 5421 { 5422 if (code >= NVPTX_BUILTIN_MAX) 5423 return error_mark_node; 5424 5425 return nvptx_builtin_decls[code]; 5426 } 5427 5428 /* Set up all builtin functions for this target. 
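   In C terms the declarations created below correspond roughly to:

       unsigned           __builtin_nvptx_shuffle (unsigned, unsigned, unsigned);
       unsigned long long __builtin_nvptx_shufflell (unsigned long long, unsigned, unsigned);
       void              *__builtin_nvptx_worker_addr (size_t, unsigned, unsigned);
       void              *__builtin_nvptx_vector_addr (size_t, unsigned, unsigned);
       unsigned           __builtin_nvptx_cmp_swap (void *, unsigned, unsigned);
       unsigned long long __builtin_nvptx_cmp_swapll (void *, unsigned long long, unsigned long long);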
*/ 5429 5430 static void 5431 nvptx_init_builtins (void) 5432 { 5433 #define DEF(ID, NAME, T) \ 5434 (nvptx_builtin_decls[NVPTX_BUILTIN_ ## ID] \ 5435 = add_builtin_function ("__builtin_nvptx_" NAME, \ 5436 build_function_type_list T, \ 5437 NVPTX_BUILTIN_ ## ID, BUILT_IN_MD, NULL, NULL)) 5438 #define ST sizetype 5439 #define UINT unsigned_type_node 5440 #define LLUINT long_long_unsigned_type_node 5441 #define PTRVOID ptr_type_node 5442 5443 DEF (SHUFFLE, "shuffle", (UINT, UINT, UINT, UINT, NULL_TREE)); 5444 DEF (SHUFFLELL, "shufflell", (LLUINT, LLUINT, UINT, UINT, NULL_TREE)); 5445 DEF (WORKER_ADDR, "worker_addr", 5446 (PTRVOID, ST, UINT, UINT, NULL_TREE)); 5447 DEF (VECTOR_ADDR, "vector_addr", 5448 (PTRVOID, ST, UINT, UINT, NULL_TREE)); 5449 DEF (CMP_SWAP, "cmp_swap", (UINT, PTRVOID, UINT, UINT, NULL_TREE)); 5450 DEF (CMP_SWAPLL, "cmp_swapll", (LLUINT, PTRVOID, LLUINT, LLUINT, NULL_TREE)); 5451 5452 #undef DEF 5453 #undef ST 5454 #undef UINT 5455 #undef LLUINT 5456 #undef PTRVOID 5457 } 5458 5459 /* Expand an expression EXP that calls a built-in function, 5460 with result going to TARGET if that's convenient 5461 (and in mode MODE if that's convenient). 5462 SUBTARGET may be used as the target for computing one of EXP's operands. 5463 IGNORE is nonzero if the value is to be ignored. */ 5464 5465 static rtx 5466 nvptx_expand_builtin (tree exp, rtx target, rtx ARG_UNUSED (subtarget), 5467 machine_mode mode, int ignore) 5468 { 5469 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); 5470 switch (DECL_MD_FUNCTION_CODE (fndecl)) 5471 { 5472 case NVPTX_BUILTIN_SHUFFLE: 5473 case NVPTX_BUILTIN_SHUFFLELL: 5474 return nvptx_expand_shuffle (exp, target, mode, ignore); 5475 5476 case NVPTX_BUILTIN_WORKER_ADDR: 5477 return nvptx_expand_shared_addr (exp, target, mode, ignore, false); 5478 5479 case NVPTX_BUILTIN_VECTOR_ADDR: 5480 return nvptx_expand_shared_addr (exp, target, mode, ignore, true); 5481 5482 case NVPTX_BUILTIN_CMP_SWAP: 5483 case NVPTX_BUILTIN_CMP_SWAPLL: 5484 return nvptx_expand_cmp_swap (exp, target, mode, ignore); 5485 5486 default: gcc_unreachable (); 5487 } 5488 } 5489 5490 /* Implement TARGET_SIMT_VF target hook: number of threads in a warp. */ 5491 5492 static int 5493 nvptx_simt_vf () 5494 { 5495 return PTX_WARP_SIZE; 5496 } 5497 5498 /* Return 1 if TRAIT NAME is present in the OpenMP context's 5499 device trait set, return 0 if not present in any OpenMP context in the 5500 whole translation unit, or -1 if not present in the current OpenMP context 5501 but might be present in another OpenMP context in the same TU. */ 5502 5503 int 5504 nvptx_omp_device_kind_arch_isa (enum omp_device_kind_arch_isa trait, 5505 const char *name) 5506 { 5507 switch (trait) 5508 { 5509 case omp_device_kind: 5510 return strcmp (name, "gpu") == 0; 5511 case omp_device_arch: 5512 return strcmp (name, "nvptx") == 0; 5513 case omp_device_isa: 5514 if (strcmp (name, "sm_30") == 0) 5515 return !TARGET_SM35; 5516 if (strcmp (name, "sm_35") == 0) 5517 return TARGET_SM35; 5518 return 0; 5519 default: 5520 gcc_unreachable (); 5521 } 5522 } 5523 5524 static bool 5525 nvptx_welformed_vector_length_p (int l) 5526 { 5527 gcc_assert (l > 0); 5528 return l % PTX_WARP_SIZE == 0; 5529 } 5530 5531 static void 5532 nvptx_apply_dim_limits (int dims[]) 5533 { 5534 /* Check that the vector_length is not too large. */ 5535 if (dims[GOMP_DIM_VECTOR] > PTX_MAX_VECTOR_LENGTH) 5536 dims[GOMP_DIM_VECTOR] = PTX_MAX_VECTOR_LENGTH; 5537 5538 /* Check that the number of workers is not too large. 
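   As a purely illustrative example of the clamping done here: asking
   for num_workers (16) and vector_length (64) keeps the product within
   PTX_CTA_SIZE, but because the vector length exceeds PTX_WARP_SIZE a
   per-worker barrier is needed, so the worker count is capped at
   PTX_NUM_PER_WORKER_BARRIERS further down.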
*/ 5539 if (dims[GOMP_DIM_WORKER] > PTX_WORKER_LENGTH) 5540 dims[GOMP_DIM_WORKER] = PTX_WORKER_LENGTH; 5541 5542 /* Ensure that num_worker * vector_length <= cta size. */ 5543 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0 5544 && dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] > PTX_CTA_SIZE) 5545 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; 5546 5547 /* If we need a per-worker barrier ... . */ 5548 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0 5549 && dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE) 5550 /* Don't use more barriers than available. */ 5551 dims[GOMP_DIM_WORKER] = MIN (dims[GOMP_DIM_WORKER], 5552 PTX_NUM_PER_WORKER_BARRIERS); 5553 } 5554 5555 /* Return true if FNDECL contains calls to vector-partitionable routines. */ 5556 5557 static bool 5558 has_vector_partitionable_routine_calls_p (tree fndecl) 5559 { 5560 if (!fndecl) 5561 return false; 5562 5563 basic_block bb; 5564 FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (fndecl)) 5565 for (gimple_stmt_iterator i = gsi_start_bb (bb); !gsi_end_p (i); 5566 gsi_next_nondebug (&i)) 5567 { 5568 gimple *stmt = gsi_stmt (i); 5569 if (gimple_code (stmt) != GIMPLE_CALL) 5570 continue; 5571 5572 tree callee = gimple_call_fndecl (stmt); 5573 if (!callee) 5574 continue; 5575 5576 tree attrs = oacc_get_fn_attrib (callee); 5577 if (attrs == NULL_TREE) 5578 return false; 5579 5580 int partition_level = oacc_fn_attrib_level (attrs); 5581 bool seq_routine_p = partition_level == GOMP_DIM_MAX; 5582 if (!seq_routine_p) 5583 return true; 5584 } 5585 5586 return false; 5587 } 5588 5589 /* As nvptx_goacc_validate_dims, but does not return bool to indicate whether 5590 DIMS has changed. */ 5591 5592 static void 5593 nvptx_goacc_validate_dims_1 (tree decl, int dims[], int fn_level, unsigned used) 5594 { 5595 bool oacc_default_dims_p = false; 5596 bool oacc_min_dims_p = false; 5597 bool offload_region_p = false; 5598 bool routine_p = false; 5599 bool routine_seq_p = false; 5600 int default_vector_length = -1; 5601 5602 if (decl == NULL_TREE) 5603 { 5604 if (fn_level == -1) 5605 oacc_default_dims_p = true; 5606 else if (fn_level == -2) 5607 oacc_min_dims_p = true; 5608 else 5609 gcc_unreachable (); 5610 } 5611 else if (fn_level == -1) 5612 offload_region_p = true; 5613 else if (0 <= fn_level && fn_level <= GOMP_DIM_MAX) 5614 { 5615 routine_p = true; 5616 routine_seq_p = fn_level == GOMP_DIM_MAX; 5617 } 5618 else 5619 gcc_unreachable (); 5620 5621 if (oacc_min_dims_p) 5622 { 5623 gcc_assert (dims[GOMP_DIM_VECTOR] == 1); 5624 gcc_assert (dims[GOMP_DIM_WORKER] == 1); 5625 gcc_assert (dims[GOMP_DIM_GANG] == 1); 5626 5627 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; 5628 return; 5629 } 5630 5631 if (routine_p) 5632 { 5633 if (!routine_seq_p) 5634 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; 5635 5636 return; 5637 } 5638 5639 if (oacc_default_dims_p) 5640 { 5641 /* -1 : not set 5642 0 : set at runtime, f.i. -fopenacc-dims=- 5643 >= 1: set at compile time, f.i. -fopenacc-dims=1. */ 5644 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1); 5645 gcc_assert (dims[GOMP_DIM_WORKER] >= -1); 5646 gcc_assert (dims[GOMP_DIM_GANG] >= -1); 5647 5648 /* But -fopenacc-dims=- is not yet supported on trunk. */ 5649 gcc_assert (dims[GOMP_DIM_VECTOR] != 0); 5650 gcc_assert (dims[GOMP_DIM_WORKER] != 0); 5651 gcc_assert (dims[GOMP_DIM_GANG] != 0); 5652 } 5653 5654 if (offload_region_p) 5655 { 5656 /* -1 : not set 5657 0 : set using variable, f.i. num_gangs (n) 5658 >= 1: set using constant, f.i. num_gangs (1). 
*/ 5659 gcc_assert (dims[GOMP_DIM_VECTOR] >= -1); 5660 gcc_assert (dims[GOMP_DIM_WORKER] >= -1); 5661 gcc_assert (dims[GOMP_DIM_GANG] >= -1); 5662 } 5663 5664 if (offload_region_p) 5665 default_vector_length = oacc_get_default_dim (GOMP_DIM_VECTOR); 5666 else 5667 /* oacc_default_dims_p. */ 5668 default_vector_length = PTX_DEFAULT_VECTOR_LENGTH; 5669 5670 int old_dims[GOMP_DIM_MAX]; 5671 unsigned int i; 5672 for (i = 0; i < GOMP_DIM_MAX; ++i) 5673 old_dims[i] = dims[i]; 5674 5675 const char *vector_reason = NULL; 5676 if (offload_region_p && has_vector_partitionable_routine_calls_p (decl)) 5677 { 5678 default_vector_length = PTX_WARP_SIZE; 5679 5680 if (dims[GOMP_DIM_VECTOR] > PTX_WARP_SIZE) 5681 { 5682 vector_reason = G_("using vector_length (%d) due to call to" 5683 " vector-partitionable routine, ignoring %d"); 5684 dims[GOMP_DIM_VECTOR] = PTX_WARP_SIZE; 5685 } 5686 } 5687 5688 if (dims[GOMP_DIM_VECTOR] == 0) 5689 { 5690 vector_reason = G_("using vector_length (%d), ignoring runtime setting"); 5691 dims[GOMP_DIM_VECTOR] = default_vector_length; 5692 } 5693 5694 if (dims[GOMP_DIM_VECTOR] > 0 5695 && !nvptx_welformed_vector_length_p (dims[GOMP_DIM_VECTOR])) 5696 dims[GOMP_DIM_VECTOR] = default_vector_length; 5697 5698 nvptx_apply_dim_limits (dims); 5699 5700 if (dims[GOMP_DIM_VECTOR] != old_dims[GOMP_DIM_VECTOR]) 5701 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, 5702 vector_reason != NULL 5703 ? vector_reason 5704 : G_("using vector_length (%d), ignoring %d"), 5705 dims[GOMP_DIM_VECTOR], old_dims[GOMP_DIM_VECTOR]); 5706 5707 if (dims[GOMP_DIM_WORKER] != old_dims[GOMP_DIM_WORKER]) 5708 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 0, 5709 G_("using num_workers (%d), ignoring %d"), 5710 dims[GOMP_DIM_WORKER], old_dims[GOMP_DIM_WORKER]); 5711 5712 if (oacc_default_dims_p) 5713 { 5714 if (dims[GOMP_DIM_VECTOR] < 0) 5715 dims[GOMP_DIM_VECTOR] = default_vector_length; 5716 if (dims[GOMP_DIM_WORKER] < 0) 5717 dims[GOMP_DIM_WORKER] = PTX_DEFAULT_RUNTIME_DIM; 5718 if (dims[GOMP_DIM_GANG] < 0) 5719 dims[GOMP_DIM_GANG] = PTX_DEFAULT_RUNTIME_DIM; 5720 nvptx_apply_dim_limits (dims); 5721 } 5722 5723 if (offload_region_p) 5724 { 5725 for (i = 0; i < GOMP_DIM_MAX; i++) 5726 { 5727 if (!(dims[i] < 0)) 5728 continue; 5729 5730 if ((used & GOMP_DIM_MASK (i)) == 0) 5731 /* Function oacc_validate_dims will apply the minimal dimension. */ 5732 continue; 5733 5734 dims[i] = (i == GOMP_DIM_VECTOR 5735 ? default_vector_length 5736 : oacc_get_default_dim (i)); 5737 } 5738 5739 nvptx_apply_dim_limits (dims); 5740 } 5741 } 5742 5743 /* Validate compute dimensions of an OpenACC offload or routine, fill 5744 in non-unity defaults. FN_LEVEL indicates the level at which a 5745 routine might spawn a loop. It is negative for non-routines. If 5746 DECL is null, we are validating the default dimensions. 
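   Returns true iff this call changed any entry of DIMS.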
*/ 5747 5748 static bool 5749 nvptx_goacc_validate_dims (tree decl, int dims[], int fn_level, unsigned used) 5750 { 5751 int old_dims[GOMP_DIM_MAX]; 5752 unsigned int i; 5753 5754 for (i = 0; i < GOMP_DIM_MAX; ++i) 5755 old_dims[i] = dims[i]; 5756 5757 nvptx_goacc_validate_dims_1 (decl, dims, fn_level, used); 5758 5759 gcc_assert (dims[GOMP_DIM_VECTOR] != 0); 5760 if (dims[GOMP_DIM_WORKER] > 0 && dims[GOMP_DIM_VECTOR] > 0) 5761 gcc_assert (dims[GOMP_DIM_WORKER] * dims[GOMP_DIM_VECTOR] <= PTX_CTA_SIZE); 5762 5763 for (i = 0; i < GOMP_DIM_MAX; ++i) 5764 if (old_dims[i] != dims[i]) 5765 return true; 5766 5767 return false; 5768 } 5769 5770 /* Return maximum dimension size, or zero for unbounded. */ 5771 5772 static int 5773 nvptx_dim_limit (int axis) 5774 { 5775 switch (axis) 5776 { 5777 case GOMP_DIM_VECTOR: 5778 return PTX_MAX_VECTOR_LENGTH; 5779 5780 default: 5781 break; 5782 } 5783 return 0; 5784 } 5785 5786 /* Determine whether fork & joins are needed. */ 5787 5788 static bool 5789 nvptx_goacc_fork_join (gcall *call, const int dims[], 5790 bool ARG_UNUSED (is_fork)) 5791 { 5792 tree arg = gimple_call_arg (call, 2); 5793 unsigned axis = TREE_INT_CST_LOW (arg); 5794 5795 /* We only care about worker and vector partitioning. */ 5796 if (axis < GOMP_DIM_WORKER) 5797 return false; 5798 5799 /* If the size is 1, there's no partitioning. */ 5800 if (dims[axis] == 1) 5801 return false; 5802 5803 return true; 5804 } 5805 5806 /* Generate a PTX builtin function call that returns the address in 5807 the worker reduction buffer at OFFSET. TYPE is the type of the 5808 data at that location. */ 5809 5810 static tree 5811 nvptx_get_shared_red_addr (tree type, tree offset, bool vector) 5812 { 5813 enum nvptx_builtins addr_dim = NVPTX_BUILTIN_WORKER_ADDR; 5814 if (vector) 5815 addr_dim = NVPTX_BUILTIN_VECTOR_ADDR; 5816 machine_mode mode = TYPE_MODE (type); 5817 tree fndecl = nvptx_builtin_decl (addr_dim, true); 5818 tree size = build_int_cst (unsigned_type_node, GET_MODE_SIZE (mode)); 5819 tree align = build_int_cst (unsigned_type_node, 5820 GET_MODE_ALIGNMENT (mode) / BITS_PER_UNIT); 5821 tree call = build_call_expr (fndecl, 3, offset, size, align); 5822 5823 return fold_convert (build_pointer_type (type), call); 5824 } 5825 5826 /* Emit a SHFL.DOWN using index SHFL of VAR into DEST_VAR. This function 5827 will cast the variable if necessary. */ 5828 5829 static void 5830 nvptx_generate_vector_shuffle (location_t loc, 5831 tree dest_var, tree var, unsigned shift, 5832 gimple_seq *seq) 5833 { 5834 unsigned fn = NVPTX_BUILTIN_SHUFFLE; 5835 tree_code code = NOP_EXPR; 5836 tree arg_type = unsigned_type_node; 5837 tree var_type = TREE_TYPE (var); 5838 tree dest_type = var_type; 5839 5840 if (TREE_CODE (var_type) == COMPLEX_TYPE) 5841 var_type = TREE_TYPE (var_type); 5842 5843 if (TREE_CODE (var_type) == REAL_TYPE) 5844 code = VIEW_CONVERT_EXPR; 5845 5846 if (TYPE_SIZE (var_type) 5847 == TYPE_SIZE (long_long_unsigned_type_node)) 5848 { 5849 fn = NVPTX_BUILTIN_SHUFFLELL; 5850 arg_type = long_long_unsigned_type_node; 5851 } 5852 5853 tree call = nvptx_builtin_decl (fn, true); 5854 tree bits = build_int_cst (unsigned_type_node, shift); 5855 tree kind = build_int_cst (unsigned_type_node, SHUFFLE_DOWN); 5856 tree expr; 5857 5858 if (var_type != dest_type) 5859 { 5860 /* Do real and imaginary parts separately. 
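      For example, for a _Complex double VAR the expression built below
      corresponds roughly to this sketch (illustrative only; SHUFFLE stands
      for the 64-bit shuffle builtin selected above):

        real = VIEW_CONVERT <double> (SHUFFLE (VIEW_CONVERT <uint64> (__real VAR),
                                               SHIFT, SHUFFLE_DOWN));
        imag = VIEW_CONVERT <double> (SHUFFLE (VIEW_CONVERT <uint64> (__imag VAR),
                                               SHIFT, SHUFFLE_DOWN));
        DEST_VAR = COMPLEX_EXPR <real, imag>;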
*/
5861 tree real = fold_build1 (REALPART_EXPR, var_type, var);
5862 real = fold_build1 (code, arg_type, real);
5863 real = build_call_expr_loc (loc, call, 3, real, bits, kind);
5864 real = fold_build1 (code, var_type, real);
5865
5866 tree imag = fold_build1 (IMAGPART_EXPR, var_type, var);
5867 imag = fold_build1 (code, arg_type, imag);
5868 imag = build_call_expr_loc (loc, call, 3, imag, bits, kind);
5869 imag = fold_build1 (code, var_type, imag);
5870
5871 expr = fold_build2 (COMPLEX_EXPR, dest_type, real, imag);
5872 }
5873 else
5874 {
5875 expr = fold_build1 (code, arg_type, var);
5876 expr = build_call_expr_loc (loc, call, 3, expr, bits, kind);
5877 expr = fold_build1 (code, dest_type, expr);
5878 }
5879
5880 gimplify_assign (dest_var, expr, seq);
5881 }
5882
5883 /* Lazily generate the global lock var decl and return its address. */
5884
5885 static tree
5886 nvptx_global_lock_addr ()
5887 {
5888 tree v = global_lock_var;
5889
5890 if (!v)
5891 {
5892 tree name = get_identifier ("__reduction_lock");
5893 tree type = build_qualified_type (unsigned_type_node,
5894 TYPE_QUAL_VOLATILE);
5895 v = build_decl (BUILTINS_LOCATION, VAR_DECL, name, type);
5896 global_lock_var = v;
5897 DECL_ARTIFICIAL (v) = 1;
5898 DECL_EXTERNAL (v) = 1;
5899 TREE_STATIC (v) = 1;
5900 TREE_PUBLIC (v) = 1;
5901 TREE_USED (v) = 1;
5902 mark_addressable (v);
5903 mark_decl_referenced (v);
5904 }
5905
5906 return build_fold_addr_expr (v);
5907 }
5908
5909 /* Insert code to locklessly update *PTR with *PTR OP VAR just before
5910 GSI. We use a lockless scheme for nearly all cases, which looks
5911 like:
5912 actual = initval(OP);
5913 do {
5914 guess = actual;
5915 write = guess OP myval;
5916 actual = cmp&swap (ptr, guess, write)
5917 } while (actual bit-different-to guess);
5918 return write;
5919
5920 This relies on a cmp&swap instruction, which is available for 32-
5921 and 64-bit types. Larger types must use a locking scheme. */
5922
5923 static tree
5924 nvptx_lockless_update (location_t loc, gimple_stmt_iterator *gsi,
5925 tree ptr, tree var, tree_code op)
5926 {
5927 unsigned fn = NVPTX_BUILTIN_CMP_SWAP;
5928 tree_code code = NOP_EXPR;
5929 tree arg_type = unsigned_type_node;
5930 tree var_type = TREE_TYPE (var);
5931
5932 if (TREE_CODE (var_type) == COMPLEX_TYPE
5933 || TREE_CODE (var_type) == REAL_TYPE)
5934 code = VIEW_CONVERT_EXPR;
5935
5936 if (TYPE_SIZE (var_type) == TYPE_SIZE (long_long_unsigned_type_node))
5937 {
5938 arg_type = long_long_unsigned_type_node;
5939 fn = NVPTX_BUILTIN_CMP_SWAPLL;
5940 }
5941
5942 tree swap_fn = nvptx_builtin_decl (fn, true);
5943
5944 gimple_seq init_seq = NULL;
5945 tree init_var = make_ssa_name (arg_type);
5946 tree init_expr = omp_reduction_init_op (loc, op, var_type);
5947 init_expr = fold_build1 (code, arg_type, init_expr);
5948 gimplify_assign (init_var, init_expr, &init_seq);
5949 gimple *init_end = gimple_seq_last (init_seq);
5950
5951 gsi_insert_seq_before (gsi, init_seq, GSI_SAME_STMT);
5952
5953 /* Split the block just after the init stmts. */
5954 basic_block pre_bb = gsi_bb (*gsi);
5955 edge pre_edge = split_block (pre_bb, init_end);
5956 basic_block loop_bb = pre_edge->dest;
5957 pre_bb = pre_edge->src;
5958 /* Reset the iterator. */
5959 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
5960
5961 tree expect_var = make_ssa_name (arg_type);
5962 tree actual_var = make_ssa_name (arg_type);
5963 tree write_var = make_ssa_name (arg_type);
5964
5965 /* Build and insert the reduction calculation.
*/ 5966 gimple_seq red_seq = NULL; 5967 tree write_expr = fold_build1 (code, var_type, expect_var); 5968 write_expr = fold_build2 (op, var_type, write_expr, var); 5969 write_expr = fold_build1 (code, arg_type, write_expr); 5970 gimplify_assign (write_var, write_expr, &red_seq); 5971 5972 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT); 5973 5974 /* Build & insert the cmp&swap sequence. */ 5975 gimple_seq latch_seq = NULL; 5976 tree swap_expr = build_call_expr_loc (loc, swap_fn, 3, 5977 ptr, expect_var, write_var); 5978 gimplify_assign (actual_var, swap_expr, &latch_seq); 5979 5980 gcond *cond = gimple_build_cond (EQ_EXPR, actual_var, expect_var, 5981 NULL_TREE, NULL_TREE); 5982 gimple_seq_add_stmt (&latch_seq, cond); 5983 5984 gimple *latch_end = gimple_seq_last (latch_seq); 5985 gsi_insert_seq_before (gsi, latch_seq, GSI_SAME_STMT); 5986 5987 /* Split the block just after the latch stmts. */ 5988 edge post_edge = split_block (loop_bb, latch_end); 5989 basic_block post_bb = post_edge->dest; 5990 loop_bb = post_edge->src; 5991 *gsi = gsi_for_stmt (gsi_stmt (*gsi)); 5992 5993 post_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU; 5994 post_edge->probability = profile_probability::even (); 5995 edge loop_edge = make_edge (loop_bb, loop_bb, EDGE_FALSE_VALUE); 5996 loop_edge->probability = profile_probability::even (); 5997 set_immediate_dominator (CDI_DOMINATORS, loop_bb, pre_bb); 5998 set_immediate_dominator (CDI_DOMINATORS, post_bb, loop_bb); 5999 6000 gphi *phi = create_phi_node (expect_var, loop_bb); 6001 add_phi_arg (phi, init_var, pre_edge, loc); 6002 add_phi_arg (phi, actual_var, loop_edge, loc); 6003 6004 loop *loop = alloc_loop (); 6005 loop->header = loop_bb; 6006 loop->latch = loop_bb; 6007 add_loop (loop, loop_bb->loop_father); 6008 6009 return fold_build1 (code, var_type, write_var); 6010 } 6011 6012 /* Insert code to lockfully update *PTR with *PTR OP VAR just before 6013 GSI. This is necessary for types larger than 64 bits, where there 6014 is no cmp&swap instruction to implement a lockless scheme. We use 6015 a lock variable in global memory. 6016 6017 while (cmp&swap (&lock_var, 0, 1)) 6018 continue; 6019 T accum = *ptr; 6020 accum = accum OP var; 6021 *ptr = accum; 6022 cmp&swap (&lock_var, 1, 0); 6023 return accum; 6024 6025 A lock in global memory is necessary to force execution engine 6026 descheduling and avoid resource starvation that can occur if the 6027 lock is in .shared memory. */ 6028 6029 static tree 6030 nvptx_lockfull_update (location_t loc, gimple_stmt_iterator *gsi, 6031 tree ptr, tree var, tree_code op) 6032 { 6033 tree var_type = TREE_TYPE (var); 6034 tree swap_fn = nvptx_builtin_decl (NVPTX_BUILTIN_CMP_SWAP, true); 6035 tree uns_unlocked = build_int_cst (unsigned_type_node, 0); 6036 tree uns_locked = build_int_cst (unsigned_type_node, 1); 6037 6038 /* Split the block just before the gsi. Insert a gimple nop to make 6039 this easier. */ 6040 gimple *nop = gimple_build_nop (); 6041 gsi_insert_before (gsi, nop, GSI_SAME_STMT); 6042 basic_block entry_bb = gsi_bb (*gsi); 6043 edge entry_edge = split_block (entry_bb, nop); 6044 basic_block lock_bb = entry_edge->dest; 6045 /* Reset the iterator. */ 6046 *gsi = gsi_for_stmt (gsi_stmt (*gsi)); 6047 6048 /* Build and insert the locking sequence. 
*/
6049 gimple_seq lock_seq = NULL;
6050 tree lock_var = make_ssa_name (unsigned_type_node);
6051 tree lock_expr = nvptx_global_lock_addr ();
6052 lock_expr = build_call_expr_loc (loc, swap_fn, 3, lock_expr,
6053 uns_unlocked, uns_locked);
6054 gimplify_assign (lock_var, lock_expr, &lock_seq);
6055 gcond *cond = gimple_build_cond (EQ_EXPR, lock_var, uns_unlocked,
6056 NULL_TREE, NULL_TREE);
6057 gimple_seq_add_stmt (&lock_seq, cond);
6058 gimple *lock_end = gimple_seq_last (lock_seq);
6059 gsi_insert_seq_before (gsi, lock_seq, GSI_SAME_STMT);
6060
6061 /* Split the block just after the lock sequence. */
6062 edge locked_edge = split_block (lock_bb, lock_end);
6063 basic_block update_bb = locked_edge->dest;
6064 lock_bb = locked_edge->src;
6065 *gsi = gsi_for_stmt (gsi_stmt (*gsi));
6066
6067 /* Create the lock loop ... */
6068 locked_edge->flags ^= EDGE_TRUE_VALUE | EDGE_FALLTHRU;
6069 locked_edge->probability = profile_probability::even ();
6070 edge loop_edge = make_edge (lock_bb, lock_bb, EDGE_FALSE_VALUE);
6071 loop_edge->probability = profile_probability::even ();
6072 set_immediate_dominator (CDI_DOMINATORS, lock_bb, entry_bb);
6073 set_immediate_dominator (CDI_DOMINATORS, update_bb, lock_bb);
6074
6075 /* ... and the loop structure. */
6076 loop *lock_loop = alloc_loop ();
6077 lock_loop->header = lock_bb;
6078 lock_loop->latch = lock_bb;
6079 lock_loop->nb_iterations_estimate = 1;
6080 lock_loop->any_estimate = true;
6081 add_loop (lock_loop, entry_bb->loop_father);
6082
6083 /* Build and insert the reduction calculation. */
6084 gimple_seq red_seq = NULL;
6085 tree acc_in = make_ssa_name (var_type);
6086 tree ref_in = build_simple_mem_ref (ptr);
6087 TREE_THIS_VOLATILE (ref_in) = 1;
6088 gimplify_assign (acc_in, ref_in, &red_seq);
6089
6090 tree acc_out = make_ssa_name (var_type);
6091 tree update_expr = fold_build2 (op, var_type, ref_in, var);
6092 gimplify_assign (acc_out, update_expr, &red_seq);
6093
6094 tree ref_out = build_simple_mem_ref (ptr);
6095 TREE_THIS_VOLATILE (ref_out) = 1;
6096 gimplify_assign (ref_out, acc_out, &red_seq);
6097
6098 gsi_insert_seq_before (gsi, red_seq, GSI_SAME_STMT);
6099
6100 /* Build & insert the unlock sequence. */
6101 gimple_seq unlock_seq = NULL;
6102 tree unlock_expr = nvptx_global_lock_addr ();
6103 unlock_expr = build_call_expr_loc (loc, swap_fn, 3, unlock_expr,
6104 uns_locked, uns_unlocked);
6105 gimplify_and_add (unlock_expr, &unlock_seq);
6106 gsi_insert_seq_before (gsi, unlock_seq, GSI_SAME_STMT);
6107
6108 return acc_out;
6109 }
6110
6111 /* Emit a sequence to update a reduction accumulator at *PTR with the
6112 value held in VAR using operator OP. Return the updated value.
6113
6114 TODO: optimize for atomic ops and independent complex ops. */
6115
6116 static tree
6117 nvptx_reduction_update (location_t loc, gimple_stmt_iterator *gsi,
6118 tree ptr, tree var, tree_code op)
6119 {
6120 tree type = TREE_TYPE (var);
6121 tree size = TYPE_SIZE (type);
6122
6123 if (size == TYPE_SIZE (unsigned_type_node)
6124 || size == TYPE_SIZE (long_long_unsigned_type_node))
6125 return nvptx_lockless_update (loc, gsi, ptr, var, op);
6126 else
6127 return nvptx_lockfull_update (loc, gsi, ptr, var, op);
6128 }
6129
6130 /* NVPTX implementation of GOACC_REDUCTION_SETUP.
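
   For reference, the internal calls handled by this and the following
   expanders have the shape (argument names here are descriptive only):

     V = GOACC_REDUCTION (CODE,        // arg 0: SETUP/INIT/FINI/TEARDOWN
                          REF_TO_RES,  // arg 1: receiver object, or 0
                          LOCAL_VAR,   // arg 2: private copy of the variable
                          LEVEL,       // arg 3: GOMP_DIM_{GANG,WORKER,VECTOR}
                          OP,          // arg 4: reduction operator tree code
                          OFFSET)      // arg 5: offset into the shared buffer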
*/ 6131 6132 static void 6133 nvptx_goacc_reduction_setup (gcall *call, offload_attrs *oa) 6134 { 6135 gimple_stmt_iterator gsi = gsi_for_stmt (call); 6136 tree lhs = gimple_call_lhs (call); 6137 tree var = gimple_call_arg (call, 2); 6138 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); 6139 gimple_seq seq = NULL; 6140 6141 push_gimplify_context (true); 6142 6143 if (level != GOMP_DIM_GANG) 6144 { 6145 /* Copy the receiver object. */ 6146 tree ref_to_res = gimple_call_arg (call, 1); 6147 6148 if (!integer_zerop (ref_to_res)) 6149 var = build_simple_mem_ref (ref_to_res); 6150 } 6151 6152 if (level == GOMP_DIM_WORKER 6153 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE)) 6154 { 6155 /* Store incoming value to worker reduction buffer. */ 6156 tree offset = gimple_call_arg (call, 5); 6157 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset, 6158 level == GOMP_DIM_VECTOR); 6159 tree ptr = make_ssa_name (TREE_TYPE (call)); 6160 6161 gimplify_assign (ptr, call, &seq); 6162 tree ref = build_simple_mem_ref (ptr); 6163 TREE_THIS_VOLATILE (ref) = 1; 6164 gimplify_assign (ref, var, &seq); 6165 } 6166 6167 if (lhs) 6168 gimplify_assign (lhs, var, &seq); 6169 6170 pop_gimplify_context (NULL); 6171 gsi_replace_with_seq (&gsi, seq, true); 6172 } 6173 6174 /* NVPTX implementation of GOACC_REDUCTION_INIT. */ 6175 6176 static void 6177 nvptx_goacc_reduction_init (gcall *call, offload_attrs *oa) 6178 { 6179 gimple_stmt_iterator gsi = gsi_for_stmt (call); 6180 tree lhs = gimple_call_lhs (call); 6181 tree var = gimple_call_arg (call, 2); 6182 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); 6183 enum tree_code rcode 6184 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4)); 6185 tree init = omp_reduction_init_op (gimple_location (call), rcode, 6186 TREE_TYPE (var)); 6187 gimple_seq seq = NULL; 6188 6189 push_gimplify_context (true); 6190 6191 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE) 6192 { 6193 /* Initialize vector-non-zeroes to INIT_VAL (OP). */ 6194 tree tid = make_ssa_name (integer_type_node); 6195 tree dim_vector = gimple_call_arg (call, 3); 6196 gimple *tid_call = gimple_build_call_internal (IFN_GOACC_DIM_POS, 1, 6197 dim_vector); 6198 gimple *cond_stmt = gimple_build_cond (NE_EXPR, tid, integer_zero_node, 6199 NULL_TREE, NULL_TREE); 6200 6201 gimple_call_set_lhs (tid_call, tid); 6202 gimple_seq_add_stmt (&seq, tid_call); 6203 gimple_seq_add_stmt (&seq, cond_stmt); 6204 6205 /* Split the block just after the call. */ 6206 edge init_edge = split_block (gsi_bb (gsi), call); 6207 basic_block init_bb = init_edge->dest; 6208 basic_block call_bb = init_edge->src; 6209 6210 /* Fixup flags from call_bb to init_bb. */ 6211 init_edge->flags ^= EDGE_FALLTHRU | EDGE_TRUE_VALUE; 6212 init_edge->probability = profile_probability::even (); 6213 6214 /* Set the initialization stmts. */ 6215 gimple_seq init_seq = NULL; 6216 tree init_var = make_ssa_name (TREE_TYPE (var)); 6217 gimplify_assign (init_var, init, &init_seq); 6218 gsi = gsi_start_bb (init_bb); 6219 gsi_insert_seq_before (&gsi, init_seq, GSI_SAME_STMT); 6220 6221 /* Split block just after the init stmt. */ 6222 gsi_prev (&gsi); 6223 edge inited_edge = split_block (gsi_bb (gsi), gsi_stmt (gsi)); 6224 basic_block dst_bb = inited_edge->dest; 6225 6226 /* Create false edge from call_bb to dst_bb. */ 6227 edge nop_edge = make_edge (call_bb, dst_bb, EDGE_FALSE_VALUE); 6228 nop_edge->probability = profile_probability::even (); 6229 6230 /* Create phi node in dst block. 
*/ 6231 gphi *phi = create_phi_node (lhs, dst_bb); 6232 add_phi_arg (phi, init_var, inited_edge, gimple_location (call)); 6233 add_phi_arg (phi, var, nop_edge, gimple_location (call)); 6234 6235 /* Reset dominator of dst bb. */ 6236 set_immediate_dominator (CDI_DOMINATORS, dst_bb, call_bb); 6237 6238 /* Reset the gsi. */ 6239 gsi = gsi_for_stmt (call); 6240 } 6241 else 6242 { 6243 if (level == GOMP_DIM_GANG) 6244 { 6245 /* If there's no receiver object, propagate the incoming VAR. */ 6246 tree ref_to_res = gimple_call_arg (call, 1); 6247 if (integer_zerop (ref_to_res)) 6248 init = var; 6249 } 6250 6251 if (lhs != NULL_TREE) 6252 gimplify_assign (lhs, init, &seq); 6253 } 6254 6255 pop_gimplify_context (NULL); 6256 gsi_replace_with_seq (&gsi, seq, true); 6257 } 6258 6259 /* NVPTX implementation of GOACC_REDUCTION_FINI. */ 6260 6261 static void 6262 nvptx_goacc_reduction_fini (gcall *call, offload_attrs *oa) 6263 { 6264 gimple_stmt_iterator gsi = gsi_for_stmt (call); 6265 tree lhs = gimple_call_lhs (call); 6266 tree ref_to_res = gimple_call_arg (call, 1); 6267 tree var = gimple_call_arg (call, 2); 6268 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); 6269 enum tree_code op 6270 = (enum tree_code)TREE_INT_CST_LOW (gimple_call_arg (call, 4)); 6271 gimple_seq seq = NULL; 6272 tree r = NULL_TREE;; 6273 6274 push_gimplify_context (true); 6275 6276 if (level == GOMP_DIM_VECTOR && oa->vector_length == PTX_WARP_SIZE) 6277 { 6278 /* Emit binary shuffle tree. TODO. Emit this as an actual loop, 6279 but that requires a method of emitting a unified jump at the 6280 gimple level. */ 6281 for (int shfl = PTX_WARP_SIZE / 2; shfl > 0; shfl = shfl >> 1) 6282 { 6283 tree other_var = make_ssa_name (TREE_TYPE (var)); 6284 nvptx_generate_vector_shuffle (gimple_location (call), 6285 other_var, var, shfl, &seq); 6286 6287 r = make_ssa_name (TREE_TYPE (var)); 6288 gimplify_assign (r, fold_build2 (op, TREE_TYPE (var), 6289 var, other_var), &seq); 6290 var = r; 6291 } 6292 } 6293 else 6294 { 6295 tree accum = NULL_TREE; 6296 6297 if (level == GOMP_DIM_WORKER || level == GOMP_DIM_VECTOR) 6298 { 6299 /* Get reduction buffer address. */ 6300 tree offset = gimple_call_arg (call, 5); 6301 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset, 6302 level == GOMP_DIM_VECTOR); 6303 tree ptr = make_ssa_name (TREE_TYPE (call)); 6304 6305 gimplify_assign (ptr, call, &seq); 6306 accum = ptr; 6307 } 6308 else if (integer_zerop (ref_to_res)) 6309 r = var; 6310 else 6311 accum = ref_to_res; 6312 6313 if (accum) 6314 { 6315 /* UPDATE the accumulator. */ 6316 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT); 6317 seq = NULL; 6318 r = nvptx_reduction_update (gimple_location (call), &gsi, 6319 accum, var, op); 6320 } 6321 } 6322 6323 if (lhs) 6324 gimplify_assign (lhs, r, &seq); 6325 pop_gimplify_context (NULL); 6326 6327 gsi_replace_with_seq (&gsi, seq, true); 6328 } 6329 6330 /* NVPTX implementation of GOACC_REDUCTION_TEARDOWN. */ 6331 6332 static void 6333 nvptx_goacc_reduction_teardown (gcall *call, offload_attrs *oa) 6334 { 6335 gimple_stmt_iterator gsi = gsi_for_stmt (call); 6336 tree lhs = gimple_call_lhs (call); 6337 tree var = gimple_call_arg (call, 2); 6338 int level = TREE_INT_CST_LOW (gimple_call_arg (call, 3)); 6339 gimple_seq seq = NULL; 6340 6341 push_gimplify_context (true); 6342 if (level == GOMP_DIM_WORKER 6343 || (level == GOMP_DIM_VECTOR && oa->vector_length > PTX_WARP_SIZE)) 6344 { 6345 /* Read the worker reduction buffer. 
*/ 6346 tree offset = gimple_call_arg (call, 5); 6347 tree call = nvptx_get_shared_red_addr (TREE_TYPE (var), offset, 6348 level == GOMP_DIM_VECTOR); 6349 tree ptr = make_ssa_name (TREE_TYPE (call)); 6350 6351 gimplify_assign (ptr, call, &seq); 6352 var = build_simple_mem_ref (ptr); 6353 TREE_THIS_VOLATILE (var) = 1; 6354 } 6355 6356 if (level != GOMP_DIM_GANG) 6357 { 6358 /* Write to the receiver object. */ 6359 tree ref_to_res = gimple_call_arg (call, 1); 6360 6361 if (!integer_zerop (ref_to_res)) 6362 gimplify_assign (build_simple_mem_ref (ref_to_res), var, &seq); 6363 } 6364 6365 if (lhs) 6366 gimplify_assign (lhs, var, &seq); 6367 6368 pop_gimplify_context (NULL); 6369 6370 gsi_replace_with_seq (&gsi, seq, true); 6371 } 6372 6373 /* NVPTX reduction expander. */ 6374 6375 static void 6376 nvptx_goacc_reduction (gcall *call) 6377 { 6378 unsigned code = (unsigned)TREE_INT_CST_LOW (gimple_call_arg (call, 0)); 6379 offload_attrs oa; 6380 6381 populate_offload_attrs (&oa); 6382 6383 switch (code) 6384 { 6385 case IFN_GOACC_REDUCTION_SETUP: 6386 nvptx_goacc_reduction_setup (call, &oa); 6387 break; 6388 6389 case IFN_GOACC_REDUCTION_INIT: 6390 nvptx_goacc_reduction_init (call, &oa); 6391 break; 6392 6393 case IFN_GOACC_REDUCTION_FINI: 6394 nvptx_goacc_reduction_fini (call, &oa); 6395 break; 6396 6397 case IFN_GOACC_REDUCTION_TEARDOWN: 6398 nvptx_goacc_reduction_teardown (call, &oa); 6399 break; 6400 6401 default: 6402 gcc_unreachable (); 6403 } 6404 } 6405 6406 static bool 6407 nvptx_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, 6408 rtx x ATTRIBUTE_UNUSED) 6409 { 6410 return true; 6411 } 6412 6413 static bool 6414 nvptx_vector_mode_supported (machine_mode mode) 6415 { 6416 return (mode == V2SImode 6417 || mode == V2DImode); 6418 } 6419 6420 /* Return the preferred mode for vectorizing scalar MODE. */ 6421 6422 static machine_mode 6423 nvptx_preferred_simd_mode (scalar_mode mode) 6424 { 6425 switch (mode) 6426 { 6427 case E_DImode: 6428 return V2DImode; 6429 case E_SImode: 6430 return V2SImode; 6431 6432 default: 6433 return default_preferred_simd_mode (mode); 6434 } 6435 } 6436 6437 unsigned int 6438 nvptx_data_alignment (const_tree type, unsigned int basic_align) 6439 { 6440 if (TREE_CODE (type) == INTEGER_TYPE) 6441 { 6442 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type)); 6443 if (size == GET_MODE_SIZE (TImode)) 6444 return GET_MODE_BITSIZE (maybe_split_mode (TImode)); 6445 } 6446 6447 return basic_align; 6448 } 6449 6450 /* Implement TARGET_MODES_TIEABLE_P. */ 6451 6452 static bool 6453 nvptx_modes_tieable_p (machine_mode, machine_mode) 6454 { 6455 return false; 6456 } 6457 6458 /* Implement TARGET_HARD_REGNO_NREGS. */ 6459 6460 static unsigned int 6461 nvptx_hard_regno_nregs (unsigned int, machine_mode) 6462 { 6463 return 1; 6464 } 6465 6466 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. 
*/ 6467 6468 static bool 6469 nvptx_can_change_mode_class (machine_mode, machine_mode, reg_class_t) 6470 { 6471 return false; 6472 } 6473 6474 static GTY(()) tree nvptx_previous_fndecl; 6475 6476 static void 6477 nvptx_set_current_function (tree fndecl) 6478 { 6479 if (!fndecl || fndecl == nvptx_previous_fndecl) 6480 return; 6481 6482 nvptx_previous_fndecl = fndecl; 6483 vector_red_partition = 0; 6484 oacc_bcast_partition = 0; 6485 } 6486 6487 #undef TARGET_OPTION_OVERRIDE 6488 #define TARGET_OPTION_OVERRIDE nvptx_option_override 6489 6490 #undef TARGET_ATTRIBUTE_TABLE 6491 #define TARGET_ATTRIBUTE_TABLE nvptx_attribute_table 6492 6493 #undef TARGET_LRA_P 6494 #define TARGET_LRA_P hook_bool_void_false 6495 6496 #undef TARGET_LEGITIMATE_ADDRESS_P 6497 #define TARGET_LEGITIMATE_ADDRESS_P nvptx_legitimate_address_p 6498 6499 #undef TARGET_PROMOTE_FUNCTION_MODE 6500 #define TARGET_PROMOTE_FUNCTION_MODE nvptx_promote_function_mode 6501 6502 #undef TARGET_FUNCTION_ARG 6503 #define TARGET_FUNCTION_ARG nvptx_function_arg 6504 #undef TARGET_FUNCTION_INCOMING_ARG 6505 #define TARGET_FUNCTION_INCOMING_ARG nvptx_function_incoming_arg 6506 #undef TARGET_FUNCTION_ARG_ADVANCE 6507 #define TARGET_FUNCTION_ARG_ADVANCE nvptx_function_arg_advance 6508 #undef TARGET_FUNCTION_ARG_BOUNDARY 6509 #define TARGET_FUNCTION_ARG_BOUNDARY nvptx_function_arg_boundary 6510 #undef TARGET_PASS_BY_REFERENCE 6511 #define TARGET_PASS_BY_REFERENCE nvptx_pass_by_reference 6512 #undef TARGET_FUNCTION_VALUE_REGNO_P 6513 #define TARGET_FUNCTION_VALUE_REGNO_P nvptx_function_value_regno_p 6514 #undef TARGET_FUNCTION_VALUE 6515 #define TARGET_FUNCTION_VALUE nvptx_function_value 6516 #undef TARGET_LIBCALL_VALUE 6517 #define TARGET_LIBCALL_VALUE nvptx_libcall_value 6518 #undef TARGET_FUNCTION_OK_FOR_SIBCALL 6519 #define TARGET_FUNCTION_OK_FOR_SIBCALL nvptx_function_ok_for_sibcall 6520 #undef TARGET_GET_DRAP_RTX 6521 #define TARGET_GET_DRAP_RTX nvptx_get_drap_rtx 6522 #undef TARGET_SPLIT_COMPLEX_ARG 6523 #define TARGET_SPLIT_COMPLEX_ARG hook_bool_const_tree_true 6524 #undef TARGET_RETURN_IN_MEMORY 6525 #define TARGET_RETURN_IN_MEMORY nvptx_return_in_memory 6526 #undef TARGET_OMIT_STRUCT_RETURN_REG 6527 #define TARGET_OMIT_STRUCT_RETURN_REG true 6528 #undef TARGET_STRICT_ARGUMENT_NAMING 6529 #define TARGET_STRICT_ARGUMENT_NAMING nvptx_strict_argument_naming 6530 #undef TARGET_CALL_ARGS 6531 #define TARGET_CALL_ARGS nvptx_call_args 6532 #undef TARGET_END_CALL_ARGS 6533 #define TARGET_END_CALL_ARGS nvptx_end_call_args 6534 6535 #undef TARGET_ASM_FILE_START 6536 #define TARGET_ASM_FILE_START nvptx_file_start 6537 #undef TARGET_ASM_FILE_END 6538 #define TARGET_ASM_FILE_END nvptx_file_end 6539 #undef TARGET_ASM_GLOBALIZE_LABEL 6540 #define TARGET_ASM_GLOBALIZE_LABEL nvptx_globalize_label 6541 #undef TARGET_ASM_ASSEMBLE_UNDEFINED_DECL 6542 #define TARGET_ASM_ASSEMBLE_UNDEFINED_DECL nvptx_assemble_undefined_decl 6543 #undef TARGET_PRINT_OPERAND 6544 #define TARGET_PRINT_OPERAND nvptx_print_operand 6545 #undef TARGET_PRINT_OPERAND_ADDRESS 6546 #define TARGET_PRINT_OPERAND_ADDRESS nvptx_print_operand_address 6547 #undef TARGET_PRINT_OPERAND_PUNCT_VALID_P 6548 #define TARGET_PRINT_OPERAND_PUNCT_VALID_P nvptx_print_operand_punct_valid_p 6549 #undef TARGET_ASM_INTEGER 6550 #define TARGET_ASM_INTEGER nvptx_assemble_integer 6551 #undef TARGET_ASM_DECL_END 6552 #define TARGET_ASM_DECL_END nvptx_assemble_decl_end 6553 #undef TARGET_ASM_DECLARE_CONSTANT_NAME 6554 #define TARGET_ASM_DECLARE_CONSTANT_NAME nvptx_asm_declare_constant_name 6555 #undef 
TARGET_USE_BLOCKS_FOR_CONSTANT_P 6556 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P hook_bool_mode_const_rtx_true 6557 #undef TARGET_ASM_NEED_VAR_DECL_BEFORE_USE 6558 #define TARGET_ASM_NEED_VAR_DECL_BEFORE_USE true 6559 6560 #undef TARGET_MACHINE_DEPENDENT_REORG 6561 #define TARGET_MACHINE_DEPENDENT_REORG nvptx_reorg 6562 #undef TARGET_NO_REGISTER_ALLOCATION 6563 #define TARGET_NO_REGISTER_ALLOCATION true 6564 6565 #undef TARGET_ENCODE_SECTION_INFO 6566 #define TARGET_ENCODE_SECTION_INFO nvptx_encode_section_info 6567 #undef TARGET_RECORD_OFFLOAD_SYMBOL 6568 #define TARGET_RECORD_OFFLOAD_SYMBOL nvptx_record_offload_symbol 6569 6570 #undef TARGET_VECTOR_ALIGNMENT 6571 #define TARGET_VECTOR_ALIGNMENT nvptx_vector_alignment 6572 6573 #undef TARGET_CANNOT_COPY_INSN_P 6574 #define TARGET_CANNOT_COPY_INSN_P nvptx_cannot_copy_insn_p 6575 6576 #undef TARGET_USE_ANCHORS_FOR_SYMBOL_P 6577 #define TARGET_USE_ANCHORS_FOR_SYMBOL_P nvptx_use_anchors_for_symbol_p 6578 6579 #undef TARGET_INIT_BUILTINS 6580 #define TARGET_INIT_BUILTINS nvptx_init_builtins 6581 #undef TARGET_EXPAND_BUILTIN 6582 #define TARGET_EXPAND_BUILTIN nvptx_expand_builtin 6583 #undef TARGET_BUILTIN_DECL 6584 #define TARGET_BUILTIN_DECL nvptx_builtin_decl 6585 6586 #undef TARGET_SIMT_VF 6587 #define TARGET_SIMT_VF nvptx_simt_vf 6588 6589 #undef TARGET_OMP_DEVICE_KIND_ARCH_ISA 6590 #define TARGET_OMP_DEVICE_KIND_ARCH_ISA nvptx_omp_device_kind_arch_isa 6591 6592 #undef TARGET_GOACC_VALIDATE_DIMS 6593 #define TARGET_GOACC_VALIDATE_DIMS nvptx_goacc_validate_dims 6594 6595 #undef TARGET_GOACC_DIM_LIMIT 6596 #define TARGET_GOACC_DIM_LIMIT nvptx_dim_limit 6597 6598 #undef TARGET_GOACC_FORK_JOIN 6599 #define TARGET_GOACC_FORK_JOIN nvptx_goacc_fork_join 6600 6601 #undef TARGET_GOACC_REDUCTION 6602 #define TARGET_GOACC_REDUCTION nvptx_goacc_reduction 6603 6604 #undef TARGET_CANNOT_FORCE_CONST_MEM 6605 #define TARGET_CANNOT_FORCE_CONST_MEM nvptx_cannot_force_const_mem 6606 6607 #undef TARGET_VECTOR_MODE_SUPPORTED_P 6608 #define TARGET_VECTOR_MODE_SUPPORTED_P nvptx_vector_mode_supported 6609 6610 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE 6611 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE \ 6612 nvptx_preferred_simd_mode 6613 6614 #undef TARGET_MODES_TIEABLE_P 6615 #define TARGET_MODES_TIEABLE_P nvptx_modes_tieable_p 6616 6617 #undef TARGET_HARD_REGNO_NREGS 6618 #define TARGET_HARD_REGNO_NREGS nvptx_hard_regno_nregs 6619 6620 #undef TARGET_CAN_CHANGE_MODE_CLASS 6621 #define TARGET_CAN_CHANGE_MODE_CLASS nvptx_can_change_mode_class 6622 6623 #undef TARGET_HAVE_SPECULATION_SAFE_VALUE 6624 #define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed 6625 6626 #undef TARGET_SET_CURRENT_FUNCTION 6627 #define TARGET_SET_CURRENT_FUNCTION nvptx_set_current_function 6628 6629 struct gcc_target targetm = TARGET_INITIALIZER; 6630 6631 #include "gt-nvptx.h" 6632