/* Copyright (C) 2016-2019 Free Software Foundation, Inc.

   This file is free software; you can redistribute it and/or modify it under
   the terms of the GNU General Public License as published by the Free
   Software Foundation; either version 3 of the License, or (at your option)
   any later version.

   This file is distributed in the hope that it will be useful, but WITHOUT
   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
   for more details.

   You should have received a copy of the GNU General Public License
   along with GCC; see the file COPYING3.  If not see
   <http://www.gnu.org/licenses/>.  */

/* {{{ Includes.  */

/* We want GET_MODE_SIZE et al to return integers, please.  */
#define IN_TARGET_CODE 1

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "memmodel.h"
#include "rtl.h"
#include "tree.h"
#include "df.h"
#include "tm_p.h"
#include "stringpool.h"
#include "optabs.h"
#include "regs.h"
#include "emit-rtl.h"
#include "recog.h"
#include "diagnostic-core.h"
#include "insn-attr.h"
#include "fold-const.h"
#include "calls.h"
#include "explow.h"
#include "expr.h"
#include "output.h"
#include "cfgrtl.h"
#include "langhooks.h"
#include "builtins.h"
#include "omp-general.h"
#include "print-rtl.h"
#include "attribs.h"
#include "varasm.h"
#include "intl.h"
#include "rtl-iter.h"

/* This file should be included last.  */
#include "target-def.h"

/* }}} */
/* {{{ Global variables.  */

/* Constants used by FP instructions.  */

static REAL_VALUE_TYPE dconst4, dconst1over2pi;
static bool ext_gcn_constants_init = 0;

/* Holds the ISA variant, derived from the command line parameters.  */

int gcn_isa = 3;		/* Default to GCN3.  */

/* Reserve this much space for LDS (for propagating variables from
   worker-single mode to worker-partitioned mode), per workgroup.  Global
   analysis could calculate an exact bound, but we don't do that yet.

   We reserve the whole LDS, which also prevents any other workgroup
   sharing the Compute Unit.  */

#define LDS_SIZE 65536

/* }}} */
/* {{{ Initialization and options.  */

/* Initialize machine_function.  */

static struct machine_function *
gcn_init_machine_status (void)
{
  struct machine_function *f;

  f = ggc_cleared_alloc<machine_function> ();

  /* Set up LDS allocation for broadcasting for this function.  */
  f->lds_allocated = 32;
  f->lds_allocs = hash_map<tree, int>::create_ggc (64);

  /* And LDS temporary decls for worker reductions.  */
  vec_alloc (f->reduc_decls, 0);

  if (TARGET_GCN3)
    f->use_flat_addressing = true;

  return f;
}

/* Implement TARGET_OPTION_OVERRIDE.

   Override option settings where defaults are variable, or we have specific
   needs to consider.  */

static void
gcn_option_override (void)
{
  init_machine_status = gcn_init_machine_status;

  /* The HSA runtime does not respect ELF load addresses, so force PIE.  */
  if (!flag_pie)
    flag_pie = 2;
  if (!flag_pic)
    flag_pic = flag_pie;

  gcn_isa = gcn_arch == PROCESSOR_VEGA ? 5 : 3;

  /* The default stack size needs to be small for offload kernels because
     there may be many, many threads.  Also, a smaller stack gives a
     measurable performance boost.
     But, a small stack is insufficient for running the testsuite, so we
     use a larger default for the stand-alone case.  */
  if (stack_size_opt == -1)
    {
      if (flag_openacc || flag_openmp)
	/* 512 bytes per work item = 32kB total.  */
	stack_size_opt = 512 * 64;
      else
	/* 1MB total.  */
	stack_size_opt = 1048576;
    }
}

/* }}} */
/* {{{ Attributes.  */

/* This table defines the arguments that are permitted in
   __attribute__ ((amdgpu_hsa_kernel (...))).

   The names and values correspond to the HSA metadata that is encoded
   into the assembler file and binary.  */

static const struct gcn_kernel_arg_type
{
  const char *name;
  const char *header_pseudo;
  machine_mode mode;

  /* This should be set to -1 or -2 for a dynamically allocated register
     number.  Use -1 if this argument contributes to the user_sgpr_count,
     -2 otherwise.  */
  int fixed_regno;
} gcn_kernel_arg_types[] = {
  {"exec", NULL, DImode, EXEC_REG},
#define PRIVATE_SEGMENT_BUFFER_ARG 1
  {"private_segment_buffer",
   "enable_sgpr_private_segment_buffer", TImode, -1},
#define DISPATCH_PTR_ARG 2
  {"dispatch_ptr", "enable_sgpr_dispatch_ptr", DImode, -1},
#define QUEUE_PTR_ARG 3
  {"queue_ptr", "enable_sgpr_queue_ptr", DImode, -1},
#define KERNARG_SEGMENT_PTR_ARG 4
  {"kernarg_segment_ptr", "enable_sgpr_kernarg_segment_ptr", DImode, -1},
  {"dispatch_id", "enable_sgpr_dispatch_id", DImode, -1},
#define FLAT_SCRATCH_INIT_ARG 6
  {"flat_scratch_init", "enable_sgpr_flat_scratch_init", DImode, -1},
#define FLAT_SCRATCH_SEGMENT_SIZE_ARG 7
  {"private_segment_size", "enable_sgpr_private_segment_size", SImode, -1},
  {"grid_workgroup_count_X",
   "enable_sgpr_grid_workgroup_count_x", SImode, -1},
  {"grid_workgroup_count_Y",
   "enable_sgpr_grid_workgroup_count_y", SImode, -1},
  {"grid_workgroup_count_Z",
   "enable_sgpr_grid_workgroup_count_z", SImode, -1},
#define WORKGROUP_ID_X_ARG 11
  {"workgroup_id_X", "enable_sgpr_workgroup_id_x", SImode, -2},
  {"workgroup_id_Y", "enable_sgpr_workgroup_id_y", SImode, -2},
  {"workgroup_id_Z", "enable_sgpr_workgroup_id_z", SImode, -2},
  {"workgroup_info", "enable_sgpr_workgroup_info", SImode, -1},
#define PRIVATE_SEGMENT_WAVE_OFFSET_ARG 15
  {"private_segment_wave_offset",
   "enable_sgpr_private_segment_wave_byte_offset", SImode, -2},
#define WORK_ITEM_ID_X_ARG 16
  {"work_item_id_X", NULL, V64SImode, FIRST_VGPR_REG},
#define WORK_ITEM_ID_Y_ARG 17
  {"work_item_id_Y", NULL, V64SImode, FIRST_VGPR_REG + 1},
#define WORK_ITEM_ID_Z_ARG 18
  {"work_item_id_Z", NULL, V64SImode, FIRST_VGPR_REG + 2}
};

/* Extract parameter settings from __attribute__((amdgpu_hsa_kernel ())).
   This function also sets the default values for some arguments.

   Return true if an error is found in the parameter list; otherwise
   return false with ARGS populated.  */

static bool
gcn_parse_amdgpu_hsa_kernel_attribute (struct gcn_kernel_args *args,
				       tree list)
{
  bool err = false;
  args->requested = ((1 << PRIVATE_SEGMENT_BUFFER_ARG)
		     | (1 << QUEUE_PTR_ARG)
		     | (1 << KERNARG_SEGMENT_PTR_ARG)
		     | (1 << PRIVATE_SEGMENT_WAVE_OFFSET_ARG));
  args->nargs = 0;

  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
    args->reg[a] = -1;

  for (; list; list = TREE_CHAIN (list))
    {
      const char *str;
      if (TREE_CODE (TREE_VALUE (list)) != STRING_CST)
	{
	  error ("amdgpu_hsa_kernel attribute requires string constant "
		 "arguments");
	  break;
	}
      str = TREE_STRING_POINTER (TREE_VALUE (list));
      int a;
      for (a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
	{
	  if (!strcmp (str, gcn_kernel_arg_types[a].name))
	    break;
	}
      if (a == GCN_KERNEL_ARG_TYPES)
	{
	  error ("unknown specifier %s in amdgpu_hsa_kernel attribute", str);
	  err = true;
	  break;
	}
      if (args->requested & (1 << a))
	{
	  error ("duplicated parameter specifier %s in amdgpu_hsa_kernel "
		 "attribute", str);
	  err = true;
	  break;
	}
      args->requested |= (1 << a);
      args->order[args->nargs++] = a;
    }
  args->requested |= (1 << WORKGROUP_ID_X_ARG);
  args->requested |= (1 << WORK_ITEM_ID_Z_ARG);

  /* Requesting WORK_ITEM_ID_Z_ARG implies requesting WORK_ITEM_ID_X_ARG and
     WORK_ITEM_ID_Y_ARG.  Similarly, requesting WORK_ITEM_ID_Y_ARG implies
     requesting WORK_ITEM_ID_X_ARG.  */
  if (args->requested & (1 << WORK_ITEM_ID_Z_ARG))
    args->requested |= (1 << WORK_ITEM_ID_Y_ARG);
  if (args->requested & (1 << WORK_ITEM_ID_Y_ARG))
    args->requested |= (1 << WORK_ITEM_ID_X_ARG);

  /* Always enable this so that kernargs is in a predictable place for
     gomp_print, etc.  */
  args->requested |= (1 << DISPATCH_PTR_ARG);

  int sgpr_regno = FIRST_SGPR_REG;
  args->nsgprs = 0;
  for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++)
    {
      if (!(args->requested & (1 << a)))
	continue;

      if (gcn_kernel_arg_types[a].fixed_regno >= 0)
	args->reg[a] = gcn_kernel_arg_types[a].fixed_regno;
      else
	{
	  int reg_count;

	  switch (gcn_kernel_arg_types[a].mode)
	    {
	    case E_SImode:
	      reg_count = 1;
	      break;
	    case E_DImode:
	      reg_count = 2;
	      break;
	    case E_TImode:
	      reg_count = 4;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  args->reg[a] = sgpr_regno;
	  sgpr_regno += reg_count;
	  if (gcn_kernel_arg_types[a].fixed_regno == -1)
	    args->nsgprs += reg_count;
	}
    }
  if (sgpr_regno > FIRST_SGPR_REG + 16)
    {
      error ("too many arguments passed in sgpr registers");
    }
  return err;
}
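
/* Illustrative sketch only (editorial addition, not part of the port): a
   kernel entry point might request specific dispatch values by naming
   entries from gcn_kernel_arg_types above, roughly like this hypothetical
   declaration:

     __attribute__ ((amdgpu_hsa_kernel ("exec", "dispatch_ptr",
					"work_item_id_X")))
     void my_kernel (void *args);

   Each string must match a .name field exactly; unknown or duplicated
   names are rejected by gcn_parse_amdgpu_hsa_kernel_attribute.  The
   function name and argument list shown here are made up for
   illustration.  */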

/* Referenced by TARGET_ATTRIBUTE_TABLE.

   Validates target specific attributes.  */

static tree
gcn_handle_amdgpu_hsa_kernel_attribute (tree *node, tree name,
					tree args, int, bool *no_add_attrs)
{
  if (!FUNC_OR_METHOD_TYPE_P (*node)
      && TREE_CODE (*node) != FIELD_DECL
      && TREE_CODE (*node) != TYPE_DECL)
    {
      warning (OPT_Wattributes, "%qE attribute only applies to functions",
	       name);
      *no_add_attrs = true;
      return NULL_TREE;
    }

  /* Check that the dispatch values requested by the attribute are valid.  */
  if (is_attribute_p ("gcnhsa_kernel", name))
    {
      struct gcn_kernel_args kernelarg;

      if (gcn_parse_amdgpu_hsa_kernel_attribute (&kernelarg, args))
	*no_add_attrs = true;

      return NULL_TREE;
    }

  return NULL_TREE;
}

/* Implement TARGET_ATTRIBUTE_TABLE.

   Create target-specific __attribute__ types.  */

static const struct attribute_spec gcn_attribute_table[] = {
  /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
       affects_type_identity, handler, exclude } */
  {"amdgpu_hsa_kernel", 0, GCN_KERNEL_ARG_TYPES, false, true,
   true, true, gcn_handle_amdgpu_hsa_kernel_attribute, NULL},
  /* End element.  */
  {NULL, 0, 0, false, false, false, false, NULL, NULL}
};

/* }}} */
/* {{{ Registers and modes.  */

/* Implement TARGET_CLASS_MAX_NREGS.

   Return the number of hard registers needed to hold a value of MODE in
   a register of class RCLASS.  */

static unsigned char
gcn_class_max_nregs (reg_class_t rclass, machine_mode mode)
{
  /* Scalar registers are 32-bit; vector registers are in fact tuples of
     64 lanes.  */
  if (rclass == VGPR_REGS)
    {
      if (vgpr_1reg_mode_p (mode))
	return 1;
      if (vgpr_2reg_mode_p (mode))
	return 2;
      /* TImode is used by DImode compare_and_swap.  */
      if (mode == TImode)
	return 4;
    }
  else if (rclass == VCC_CONDITIONAL_REG && mode == BImode)
    return 2;
  return CEIL (GET_MODE_SIZE (mode), 4);
}

/* Implement TARGET_HARD_REGNO_NREGS.

   Return the number of hard registers needed to hold a value of MODE in
   REGNO.  */

unsigned int
gcn_hard_regno_nregs (unsigned int regno, machine_mode mode)
{
  return gcn_class_max_nregs (REGNO_REG_CLASS (regno), mode);
}

/* Implement TARGET_HARD_REGNO_MODE_OK.

   Return true if REGNO can hold value in MODE.  */

bool
gcn_hard_regno_mode_ok (unsigned int regno, machine_mode mode)
{
  /* Treat a complex mode as if it were a scalar mode of the same overall
     size for the purposes of allocating hard registers.  */
  if (COMPLEX_MODE_P (mode))
    switch (mode)
      {
      case E_CQImode:
      case E_CHImode:
	mode = SImode;
	break;
      case E_CSImode:
	mode = DImode;
	break;
      case E_CDImode:
	mode = TImode;
	break;
      case E_HCmode:
	mode = SFmode;
	break;
      case E_SCmode:
	mode = DFmode;
	break;
      default:
	/* Not supported.  */
	return false;
      }

  switch (regno)
    {
    case FLAT_SCRATCH_LO_REG:
    case XNACK_MASK_LO_REG:
    case TBA_LO_REG:
    case TMA_LO_REG:
      return (mode == SImode || mode == DImode);
    case VCC_LO_REG:
    case EXEC_LO_REG:
      return (mode == BImode || mode == SImode || mode == DImode);
    case M0_REG:
    case FLAT_SCRATCH_HI_REG:
    case XNACK_MASK_HI_REG:
    case TBA_HI_REG:
    case TMA_HI_REG:
      return mode == SImode;
    case VCC_HI_REG:
      return false;
    case EXEC_HI_REG:
      return mode == SImode /*|| mode == V32BImode */ ;
    case SCC_REG:
    case VCCZ_REG:
    case EXECZ_REG:
      return mode == BImode;
    }
  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
    return true;
  if (SGPR_REGNO_P (regno))
    /* We restrict double register values to aligned registers.  */
    return (sgpr_1reg_mode_p (mode)
	    || (!((regno - FIRST_SGPR_REG) & 1) && sgpr_2reg_mode_p (mode))
	    || (((regno - FIRST_SGPR_REG) & 3) == 0 && mode == TImode));
  if (VGPR_REGNO_P (regno))
    return (vgpr_1reg_mode_p (mode) || vgpr_2reg_mode_p (mode)
	    /* TImode is used by DImode compare_and_swap.  */
	    || mode == TImode);
  return false;
}

/* Implement REGNO_REG_CLASS via gcn.h.

   Return smallest class containing REGNO.  */

enum reg_class
gcn_regno_reg_class (int regno)
{
  switch (regno)
    {
    case SCC_REG:
      return SCC_CONDITIONAL_REG;
    case VCCZ_REG:
      return VCCZ_CONDITIONAL_REG;
    case EXECZ_REG:
      return EXECZ_CONDITIONAL_REG;
    case EXEC_LO_REG:
    case EXEC_HI_REG:
      return EXEC_MASK_REG;
    }
  if (VGPR_REGNO_P (regno))
    return VGPR_REGS;
  if (SGPR_REGNO_P (regno))
    return SGPR_REGS;
  if (regno < FIRST_VGPR_REG)
    return GENERAL_REGS;
  if (regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM)
    return AFP_REGS;
  return ALL_REGS;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.

   GCC assumes that lowpart contains first part of value as stored in memory.
   This is not the case for vector registers.  */

bool
gcn_can_change_mode_class (machine_mode from, machine_mode to,
			   reg_class_t regclass)
{
  if (!vgpr_vector_mode_p (from) && !vgpr_vector_mode_p (to))
    return true;
  return (gcn_class_max_nregs (regclass, from)
	  == gcn_class_max_nregs (regclass, to));
}

/* Implement TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P.

   When this hook returns true for MODE, the compiler allows
   registers explicitly used in the rtl to be used as spill registers
   but prevents the compiler from extending the lifetime of these
   registers.  */

bool
gcn_small_register_classes_for_mode_p (machine_mode mode)
{
  /* We allocate into exec and vcc regs.  Those make small register class.  */
  return mode == DImode || mode == SImode;
}

/* Implement TARGET_CLASS_LIKELY_SPILLED_P.

   Returns true if pseudos that have been assigned to registers of class
   RCLASS would likely be spilled because registers of RCLASS are needed
   for spill registers.  */

static bool
gcn_class_likely_spilled_p (reg_class_t rclass)
{
  return (rclass == EXEC_MASK_REG
	  || reg_classes_intersect_p (ALL_CONDITIONAL_REGS, rclass));
}

/* Implement TARGET_MODES_TIEABLE_P.

   Returns true if a value of MODE1 is accessible in MODE2 without
   copying.  */

bool
gcn_modes_tieable_p (machine_mode mode1, machine_mode mode2)
{
  return (GET_MODE_BITSIZE (mode1) <= MAX_FIXED_MODE_SIZE
	  && GET_MODE_BITSIZE (mode2) <= MAX_FIXED_MODE_SIZE);
}

/* Implement TARGET_TRULY_NOOP_TRUNCATION.

   Returns true if it is safe to "convert" a value of INPREC bits to one of
   OUTPREC bits (where OUTPREC is smaller than INPREC) by merely operating on
   it as if it had only OUTPREC bits.  */

bool
gcn_truly_noop_truncation (poly_uint64 outprec, poly_uint64 inprec)
{
  return ((inprec <= 32) && (outprec <= inprec));
}

/* Return N-th part of value occupying multiple registers.  */

rtx
gcn_operand_part (machine_mode mode, rtx op, int n)
{
  if (GET_MODE_SIZE (mode) >= 256)
    {
      /*gcc_assert (GET_MODE_SIZE (mode) == 256 || n == 0); */

      if (REG_P (op))
	{
	  gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
	  return gen_rtx_REG (V64SImode, REGNO (op) + n);
	}
      if (GET_CODE (op) == CONST_VECTOR)
	{
	  int units = GET_MODE_NUNITS (mode);
	  rtvec v = rtvec_alloc (units);

	  for (int i = 0; i < units; ++i)
	    RTVEC_ELT (v, i) = gcn_operand_part (GET_MODE_INNER (mode),
						 CONST_VECTOR_ELT (op, i), n);

	  return gen_rtx_CONST_VECTOR (V64SImode, v);
	}
      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
	return gcn_gen_undef (V64SImode);
      gcc_unreachable ();
    }
  else if (GET_MODE_SIZE (mode) == 8 && REG_P (op))
    {
      gcc_assert (REGNO (op) + n < FIRST_PSEUDO_REGISTER);
      return gen_rtx_REG (SImode, REGNO (op) + n);
    }
  else
    {
      if (GET_CODE (op) == UNSPEC && XINT (op, 1) == UNSPEC_VECTOR)
	return gcn_gen_undef (SImode);

      /* If it's a constant then let's assume it is of the largest mode
	 available, otherwise simplify_gen_subreg will fail.  */
      if (mode == VOIDmode && CONST_INT_P (op))
	mode = DImode;
      return simplify_gen_subreg (SImode, op, mode, n * 4);
    }
}

/* Return N-th part of value occupying multiple registers.  */

rtx
gcn_operand_doublepart (machine_mode mode, rtx op, int n)
{
  return simplify_gen_subreg (DImode, op, mode, n * 8);
}

/* Return true if OP can be split into subregs or high/low parts.
   This is always true for scalars, but not normally true for vectors.
   However, for vectors in hardregs we can use the low and high registers.  */

bool
gcn_can_split_p (machine_mode, rtx op)
{
  if (vgpr_vector_mode_p (GET_MODE (op)))
    {
      if (GET_CODE (op) == SUBREG)
	op = SUBREG_REG (op);
      if (!REG_P (op))
	return true;
      return REGNO (op) <= FIRST_PSEUDO_REGISTER;
    }
  return true;
}

/* Implement TARGET_SPILL_CLASS.

   Return class of registers which could be used for pseudo of MODE
   and of class RCLASS for spilling instead of memory.  Return NO_REGS
   if it is not possible or non-profitable.  */

static reg_class_t
gcn_spill_class (reg_class_t c, machine_mode /*mode */ )
{
  if (reg_classes_intersect_p (ALL_CONDITIONAL_REGS, c))
    return SGPR_REGS;
  else
    return NO_REGS;
}

/* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.

   Change allocno class for given pseudo from allocno and best class
   calculated by IRA.  */

static reg_class_t
gcn_ira_change_pseudo_allocno_class (int regno, reg_class_t cl,
				     reg_class_t best_cl)
{
  /* Avoid returning classes that contain both vgpr and sgpr registers.  */
  if (cl != ALL_REGS && cl != SRCDST_REGS && cl != ALL_GPR_REGS)
    return cl;
  if (best_cl != ALL_REGS && best_cl != SRCDST_REGS
      && best_cl != ALL_GPR_REGS)
    return best_cl;

  machine_mode mode = PSEUDO_REGNO_MODE (regno);
  if (vgpr_vector_mode_p (mode))
    return VGPR_REGS;

  return GENERAL_REGS;
}

/* Create a new DImode pseudo reg and emit an instruction to initialize
   it to VAL.  */

static rtx
get_exec (int64_t val)
{
  rtx reg = gen_reg_rtx (DImode);
  emit_insn (gen_rtx_SET (reg, gen_int_mode (val, DImode)));
  return reg;
}
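
/* Worked example (editorial comment, illustrative only): get_exec (0x1)
   yields a pseudo holding DImode 1, so a subsequent *_exec pattern touches
   only lane 0; get_exec (-1) enables all 64 lanes, which is what
   gcn_full_exec_reg below returns.  A masked move built with
   gen_mov_with_exec (defined later in this file) places that pseudo in the
   VEC_MERGE selector operand.  */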

/* Return value of scalar exec register.  */

rtx
gcn_scalar_exec ()
{
  return const1_rtx;
}

/* Return pseudo holding scalar exec register.  */

rtx
gcn_scalar_exec_reg ()
{
  return get_exec (1);
}

/* Return value of full exec register.  */

rtx
gcn_full_exec ()
{
  return constm1_rtx;
}

/* Return pseudo holding full exec register.  */

rtx
gcn_full_exec_reg ()
{
  return get_exec (-1);
}

/* }}} */
/* {{{ Immediate constants.  */

/* Initialize shared numeric constants.  */

static void
init_ext_gcn_constants (void)
{
  real_from_integer (&dconst4, DFmode, 4, SIGNED);

  /* FIXME: this constant probably does not match what hardware really loads.
     Reality check it eventually.  */
  real_from_string (&dconst1over2pi,
		    "0.1591549430918953357663423455968866839");
  real_convert (&dconst1over2pi, SFmode, &dconst1over2pi);

  ext_gcn_constants_init = 1;
}

/* Return non-zero if X is a constant that can appear as an inline operand.
   This is 0, 0.5, -0.5, 1, -1, 2, -2, 4, -4, 1/(2*pi)
   Or a vector of those.
   The value returned should be the encoding of this constant.  */

int
gcn_inline_fp_constant_p (rtx x, bool allow_vector)
{
  machine_mode mode = GET_MODE (x);

  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
      && allow_vector)
    {
      int n;
      if (GET_CODE (x) != CONST_VECTOR)
	return 0;
      n = gcn_inline_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
      if (!n)
	return 0;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return 0;
      return 1;
    }

  if (mode != HFmode && mode != SFmode && mode != DFmode)
    return 0;

  const REAL_VALUE_TYPE *r;

  if (x == CONST0_RTX (mode))
    return 128;
  if (x == CONST1_RTX (mode))
    return 242;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (real_identical (r, &dconstm1))
    return 243;

  if (real_identical (r, &dconsthalf))
    return 240;
  if (real_identical (r, &dconstm1))
    return 243;
  if (real_identical (r, &dconst2))
    return 244;
  if (real_identical (r, &dconst4))
    return 246;
  if (real_identical (r, &dconst1over2pi))
    return 248;
  if (!ext_gcn_constants_init)
    init_ext_gcn_constants ();
  real_value_negate (r);
  if (real_identical (r, &dconsthalf))
    return 241;
  if (real_identical (r, &dconst2))
    return 245;
  if (real_identical (r, &dconst4))
    return 247;

  /* FIXME: add 4, -4 and 1/(2*PI).  */

  return 0;
}
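
/* For reference, a sketch of the mapping implemented above (derived only
   from the returns in gcn_inline_fp_constant_p): 0 -> 128, 0.5 -> 240,
   -0.5 -> 241, 1.0 -> 242, -1.0 -> 243, 2.0 -> 244, -2.0 -> 245,
   4.0 -> 246, -4.0 -> 247, 1/(2*pi) -> 248.  Anything else is not an
   inline FP operand and the function returns 0.  */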

/* Return true if X is a constant that can appear as an immediate operand.
   This is any inline constant (see gcn_inline_fp_constant_p above) or,
   for HFmode and SFmode, any other value that fits a 32-bit literal.
   Or a vector of those.  */

bool
gcn_fp_constant_p (rtx x, bool allow_vector)
{
  machine_mode mode = GET_MODE (x);

  if ((mode == V64HFmode || mode == V64SFmode || mode == V64DFmode)
      && allow_vector)
    {
      int n;
      if (GET_CODE (x) != CONST_VECTOR)
	return false;
      n = gcn_fp_constant_p (CONST_VECTOR_ELT (x, 0), false);
      if (!n)
	return false;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;
      return true;
    }
  if (mode != HFmode && mode != SFmode && mode != DFmode)
    return false;

  if (gcn_inline_fp_constant_p (x, false))
    return true;
  /* FIXME: It is not clear how 32-bit immediates are interpreted here.  */
  return (mode != DFmode);
}

/* Return true if X is a constant representable as an inline immediate
   constant in a 32-bit instruction encoding.  */

bool
gcn_inline_constant_p (rtx x)
{
  if (GET_CODE (x) == CONST_INT)
    return INTVAL (x) >= -16 && INTVAL (x) < 64;
  if (GET_CODE (x) == CONST_DOUBLE)
    return gcn_inline_fp_constant_p (x, false);
  if (GET_CODE (x) == CONST_VECTOR)
    {
      int n;
      if (!vgpr_vector_mode_p (GET_MODE (x)))
	return false;
      n = gcn_inline_constant_p (CONST_VECTOR_ELT (x, 0));
      if (!n)
	return false;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;
      return true;
    }
  return false;
}

/* Return true if X is a constant representable as an immediate constant
   in a 32 or 64-bit instruction encoding.  */

bool
gcn_constant_p (rtx x)
{
  switch (GET_CODE (x))
    {
    case CONST_INT:
      return true;

    case CONST_DOUBLE:
      return gcn_fp_constant_p (x, false);

    case CONST_VECTOR:
      {
	int n;
	if (!vgpr_vector_mode_p (GET_MODE (x)))
	  return false;
	n = gcn_constant_p (CONST_VECTOR_ELT (x, 0));
	if (!n)
	  return false;
	for (int i = 1; i < 64; i++)
	  if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	    return false;
	return true;
      }

    case SYMBOL_REF:
    case LABEL_REF:
      return true;

    default:
      ;
    }

  return false;
}

/* Return true if X is a constant representable as two inline immediate
   constants in a 64-bit instruction that is split into two 32-bit
   instructions.  */

bool
gcn_inline_constant64_p (rtx x)
{
  if (GET_CODE (x) == CONST_VECTOR)
    {
      if (!vgpr_vector_mode_p (GET_MODE (x)))
	return false;
      if (!gcn_inline_constant64_p (CONST_VECTOR_ELT (x, 0)))
	return false;
      for (int i = 1; i < 64; i++)
	if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
	  return false;

      return true;
    }

  if (GET_CODE (x) != CONST_INT)
    return false;

  rtx val_lo = gcn_operand_part (DImode, x, 0);
  rtx val_hi = gcn_operand_part (DImode, x, 1);
  return gcn_inline_constant_p (val_lo) && gcn_inline_constant_p (val_hi);
}

/* Return true if X is a constant representable as an immediate constant
   in a 32 or 64-bit instruction encoding where the hardware will
   extend the immediate to 64-bits.  */

bool
gcn_constant64_p (rtx x)
{
  if (!gcn_constant_p (x))
    return false;

  if (GET_CODE (x) != CONST_INT)
    return true;

  /* Negative numbers are only allowed if they can be encoded within src0,
     because the 32-bit immediates do not get sign-extended.
     Unsigned numbers must not be encodable as 32-bit -1..-16, because the
     assembler will use a src0 inline immediate and that will get
     sign-extended.  */
  HOST_WIDE_INT val = INTVAL (x);
  return (((val & 0xffffffff) == val	/* Positive 32-bit.  */
	   && (val & 0xfffffff0) != 0xfffffff0)	/* Not -1..-16.  */
	  || gcn_inline_constant_p (x));	/* Src0.  */
}

/* Implement TARGET_LEGITIMATE_CONSTANT_P.

   Returns true if X is a legitimate constant for a MODE immediate operand.  */

bool
gcn_legitimate_constant_p (machine_mode, rtx x)
{
  return gcn_constant_p (x);
}

/* Return true if X is a CONST_VECTOR of single constant.  */

static bool
single_cst_vector_p (rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return false;
  for (int i = 1; i < 64; i++)
    if (CONST_VECTOR_ELT (x, i) != CONST_VECTOR_ELT (x, 0))
      return false;
  return true;
}

/* Create a CONST_VECTOR of duplicated value A.  */

rtx
gcn_vec_constant (machine_mode mode, int a)
{
  /*if (!a)
    return CONST0_RTX (mode);
  if (a == -1)
    return CONSTM1_RTX (mode);
  if (a == 1)
    return CONST1_RTX (mode);
  if (a == 2)
    return CONST2_RTX (mode);*/

  int units = GET_MODE_NUNITS (mode);
  rtx tem = gen_int_mode (a, GET_MODE_INNER (mode));
  rtvec v = rtvec_alloc (units);

  for (int i = 0; i < units; ++i)
    RTVEC_ELT (v, i) = tem;

  return gen_rtx_CONST_VECTOR (mode, v);
}

/* Create a CONST_VECTOR of duplicated value A.  */

rtx
gcn_vec_constant (machine_mode mode, rtx a)
{
  int units = GET_MODE_NUNITS (mode);
  rtvec v = rtvec_alloc (units);

  for (int i = 0; i < units; ++i)
    RTVEC_ELT (v, i) = a;

  return gen_rtx_CONST_VECTOR (mode, v);
}

/* Create an undefined vector value, used where an insn operand is
   optional.  */

rtx
gcn_gen_undef (machine_mode mode)
{
  return gen_rtx_UNSPEC (mode, gen_rtvec (1, const0_rtx), UNSPEC_VECTOR);
}

/* }}} */
/* {{{ Addresses, pointers and moves.  */

/* Return true if REG is a valid place to store a pointer,
   for instructions that require an SGPR.
   FIXME rename.  */

static bool
gcn_address_register_p (rtx reg, machine_mode mode, bool strict)
{
  if (GET_CODE (reg) == SUBREG)
    reg = SUBREG_REG (reg);

  if (!REG_P (reg))
    return false;

  if (GET_MODE (reg) != mode)
    return false;

  int regno = REGNO (reg);

  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (!strict)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  return (SGPR_REGNO_P (regno) || regno == M0_REG
	  || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
}

/* Return true if REG is a valid place to store a pointer,
   for instructions that require a VGPR.  */

static bool
gcn_vec_address_register_p (rtx reg, machine_mode mode, bool strict)
{
  if (GET_CODE (reg) == SUBREG)
    reg = SUBREG_REG (reg);

  if (!REG_P (reg))
    return false;

  if (GET_MODE (reg) != mode)
    return false;

  int regno = REGNO (reg);

  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (!strict)
	return true;

      if (!reg_renumber)
	return false;

      regno = reg_renumber[regno];
    }

  return VGPR_REGNO_P (regno);
}

/* Return true if X would be valid inside a MEM using the Flat address
   space.  */

bool
gcn_flat_address_p (rtx x, machine_mode mode)
{
  bool vec_mode = (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
		   || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT);

  if (vec_mode && gcn_address_register_p (x, DImode, false))
    return true;

  if (!vec_mode && gcn_vec_address_register_p (x, DImode, false))
    return true;

  if (TARGET_GCN5_PLUS
      && GET_CODE (x) == PLUS
      && gcn_vec_address_register_p (XEXP (x, 0), DImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if X would be valid inside a MEM using the Scalar Flat
   address space.  */

bool
gcn_scalar_flat_address_p (rtx x)
{
  if (gcn_address_register_p (x, DImode, false))
    return true;

  if (GET_CODE (x) == PLUS
      && gcn_address_register_p (XEXP (x, 0), DImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if MEM X would be valid for the Scalar Flat address space.  */

bool
gcn_scalar_flat_mem_p (rtx x)
{
  if (!MEM_P (x))
    return false;

  if (GET_MODE_SIZE (GET_MODE (x)) < 4)
    return false;

  return gcn_scalar_flat_address_p (XEXP (x, 0));
}

/* Return true if X would be valid inside a MEM using the LDS or GDS
   address spaces.  */

bool
gcn_ds_address_p (rtx x)
{
  if (gcn_vec_address_register_p (x, SImode, false))
    return true;

  if (GET_CODE (x) == PLUS
      && gcn_vec_address_register_p (XEXP (x, 0), SImode, false)
      && CONST_INT_P (XEXP (x, 1)))
    return true;

  return false;
}

/* Return true if ADDR would be valid inside a MEM using the Global
   address space.  */

bool
gcn_global_address_p (rtx addr)
{
  if (gcn_address_register_p (addr, DImode, false)
      || gcn_vec_address_register_p (addr, DImode, false))
    return true;

  if (GET_CODE (addr) == PLUS)
    {
      rtx base = XEXP (addr, 0);
      rtx offset = XEXP (addr, 1);
      bool immediate_p = (CONST_INT_P (offset)
			  && INTVAL (offset) >= -(1 << 12)
			  && INTVAL (offset) < (1 << 12));

      if ((gcn_address_register_p (base, DImode, false)
	   || gcn_vec_address_register_p (base, DImode, false))
	  && immediate_p)
	/* SGPR + CONST or VGPR + CONST  */
	return true;

      if (gcn_address_register_p (base, DImode, false)
	  && gcn_vgpr_register_operand (offset, SImode))
	/* SGPR + VGPR  */
	return true;

      if (GET_CODE (base) == PLUS
	  && gcn_address_register_p (XEXP (base, 0), DImode, false)
	  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
	  && immediate_p)
	/* (SGPR + VGPR) + CONST  */
	return true;
    }

  return false;
}

/* Implement TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P.

   Recognizes RTL expressions that are valid memory addresses for an
   instruction.  The MODE argument is the machine mode for the MEM
   expression that wants to use this address.

   It only recognizes address in canonical form.  LEGITIMIZE_ADDRESS should
   convert common non-canonical forms to canonical form so that they will
   be recognized.  */

static bool
gcn_addr_space_legitimate_address_p (machine_mode mode, rtx x, bool strict,
				     addr_space_t as)
{
  /* All vector instructions need to work on addresses in registers.  */
  if (!TARGET_GCN5_PLUS && (vgpr_vector_mode_p (mode) && !REG_P (x)))
    return false;

  if (AS_SCALAR_FLAT_P (as))
    {
      if (mode == QImode || mode == HImode)
	return 0;

      switch (GET_CODE (x))
	{
	case REG:
	  return gcn_address_register_p (x, DImode, strict);
	/* Addresses are in the form BASE+OFFSET
	   OFFSET is either 20bit unsigned immediate, SGPR or M0.
	   Writes and atomics do not accept SGPR.  */
	case PLUS:
	  {
	    rtx x0 = XEXP (x, 0);
	    rtx x1 = XEXP (x, 1);
	    if (!gcn_address_register_p (x0, DImode, strict))
	      return false;
	    /* FIXME: This is disabled because of the mode mismatch between
	       SImode (for the address or m0 register) and the DImode PLUS.
	       We'll need a zero_extend or similar.

	    if (gcn_m0_register_p (x1, SImode, strict)
		|| gcn_address_register_p (x1, SImode, strict))
	      return true;
	    else*/
	    if (GET_CODE (x1) == CONST_INT)
	      {
		if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20)
		    /* The low bits of the offset are ignored, even when
		       they're meant to realign the pointer.  */
		    && !(INTVAL (x1) & 0x3))
		  return true;
	      }
	    return false;
	  }

	default:
	  break;
	}
    }
  else if (AS_SCRATCH_P (as))
    return gcn_address_register_p (x, SImode, strict);
  else if (AS_FLAT_P (as) || AS_FLAT_SCRATCH_P (as))
    {
      if (TARGET_GCN3 || GET_CODE (x) == REG)
	return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
		 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
		? gcn_address_register_p (x, DImode, strict)
		: gcn_vec_address_register_p (x, DImode, strict));
      else
	{
	  gcc_assert (TARGET_GCN5_PLUS);

	  if (GET_CODE (x) == PLUS)
	    {
	      rtx x1 = XEXP (x, 1);

	      if (VECTOR_MODE_P (mode)
		  ? !gcn_address_register_p (x, DImode, strict)
		  : !gcn_vec_address_register_p (x, DImode, strict))
		return false;

	      if (GET_CODE (x1) == CONST_INT)
		{
		  if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 12)
		      /* The low bits of the offset are ignored, even when
			 they're meant to realign the pointer.  */
		      && !(INTVAL (x1) & 0x3))
		    return true;
		}
	    }
	  return false;
	}
    }
  else if (AS_GLOBAL_P (as))
    {
      gcc_assert (TARGET_GCN5_PLUS);

      if (GET_CODE (x) == REG)
	return (gcn_address_register_p (x, DImode, strict)
		|| (!VECTOR_MODE_P (mode)
		    && gcn_vec_address_register_p (x, DImode, strict)));
      else if (GET_CODE (x) == PLUS)
	{
	  rtx base = XEXP (x, 0);
	  rtx offset = XEXP (x, 1);

	  bool immediate_p = (GET_CODE (offset) == CONST_INT
			      /* Signed 13-bit immediate.  */
			      && INTVAL (offset) >= -(1 << 12)
			      && INTVAL (offset) < (1 << 12)
			      /* The low bits of the offset are ignored, even
				 when they're meant to realign the
				 pointer.  */
			      && !(INTVAL (offset) & 0x3));

	  if (!VECTOR_MODE_P (mode))
	    {
	      if ((gcn_address_register_p (base, DImode, strict)
		   || gcn_vec_address_register_p (base, DImode, strict))
		  && immediate_p)
		/* SGPR + CONST or VGPR + CONST  */
		return true;

	      if (gcn_address_register_p (base, DImode, strict)
		  && gcn_vgpr_register_operand (offset, SImode))
		/* SGPR + VGPR  */
		return true;

	      if (GET_CODE (base) == PLUS
		  && gcn_address_register_p (XEXP (base, 0), DImode, strict)
		  && gcn_vgpr_register_operand (XEXP (base, 1), SImode)
		  && immediate_p)
		/* (SGPR + VGPR) + CONST  */
		return true;
	    }
	  else
	    {
	      if (gcn_address_register_p (base, DImode, strict)
		  && immediate_p)
		/* SGPR + CONST  */
		return true;
	    }
	}
      else
	return false;
    }
  else if (AS_ANY_DS_P (as))
    switch (GET_CODE (x))
      {
      case REG:
	return (VECTOR_MODE_P (mode)
		? gcn_address_register_p (x, SImode, strict)
		: gcn_vec_address_register_p (x, SImode, strict));
      /* Addresses are in the form BASE+OFFSET
	 OFFSET is either 20bit unsigned immediate, SGPR or M0.
	 Writes and atomics do not accept SGPR.  */
      case PLUS:
	{
	  rtx x0 = XEXP (x, 0);
	  rtx x1 = XEXP (x, 1);
	  if (!gcn_vec_address_register_p (x0, DImode, strict))
	    return false;
	  if (GET_CODE (x1) == REG)
	    {
	      if (GET_CODE (x1) != REG
		  || (REGNO (x1) <= FIRST_PSEUDO_REGISTER
		      && !gcn_ssrc_register_operand (x1, DImode)))
		return false;
	    }
	  else if (GET_CODE (x1) == CONST_VECTOR
		   && GET_CODE (CONST_VECTOR_ELT (x1, 0)) == CONST_INT
		   && single_cst_vector_p (x1))
	    {
	      x1 = CONST_VECTOR_ELT (x1, 0);
	      if (INTVAL (x1) >= 0 && INTVAL (x1) < (1 << 20))
		return true;
	    }
	  return false;
	}

      default:
	break;
      }
  else
    gcc_unreachable ();
  return false;
}
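
/* Quick reference (editorial paraphrase of the checks above, not
   authoritative): scalar-flat accepts an SGPR-pair base, optionally plus
   an unsigned 20-bit offset with the low two bits clear; flat and global
   addressing on GCN5+ additionally accept small aligned immediate offsets
   (12-bit unsigned and 13-bit signed respectively, as coded above);
   LDS/GDS accept a register base plus either a register or a constant
   offset.  */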

/* Implement TARGET_ADDR_SPACE_POINTER_MODE.

   Return the appropriate mode for a named address pointer.  */

static scalar_int_mode
gcn_addr_space_pointer_mode (addr_space_t addrspace)
{
  switch (addrspace)
    {
    case ADDR_SPACE_SCRATCH:
    case ADDR_SPACE_LDS:
    case ADDR_SPACE_GDS:
      return SImode;
    case ADDR_SPACE_DEFAULT:
    case ADDR_SPACE_FLAT:
    case ADDR_SPACE_FLAT_SCRATCH:
    case ADDR_SPACE_SCALAR_FLAT:
      return DImode;
    default:
      gcc_unreachable ();
    }
}

/* Implement TARGET_ADDR_SPACE_ADDRESS_MODE.

   Return the appropriate mode for a named address space address.  */

static scalar_int_mode
gcn_addr_space_address_mode (addr_space_t addrspace)
{
  return gcn_addr_space_pointer_mode (addrspace);
}

/* Implement TARGET_ADDR_SPACE_SUBSET_P.

   Determine if one named address space is a subset of another.  */

static bool
gcn_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
{
  if (subset == superset)
    return true;
  /* FIXME is this true?  */
  if (AS_FLAT_P (superset) || AS_SCALAR_FLAT_P (superset))
    return true;
  return false;
}

/* Convert from one address space to another.  */

static rtx
gcn_addr_space_convert (rtx op, tree from_type, tree to_type)
{
  gcc_assert (POINTER_TYPE_P (from_type));
  gcc_assert (POINTER_TYPE_P (to_type));

  addr_space_t as_from = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
  addr_space_t as_to = TYPE_ADDR_SPACE (TREE_TYPE (to_type));

  if (AS_LDS_P (as_from) && AS_FLAT_P (as_to))
    {
      rtx queue = gen_rtx_REG (DImode,
			       cfun->machine->args.reg[QUEUE_PTR_ARG]);
      rtx group_seg_aperture_hi
	= gen_rtx_MEM (SImode,
		       gen_rtx_PLUS (DImode, queue,
				     gen_int_mode (64, SImode)));
      rtx tmp = gen_reg_rtx (DImode);

      emit_move_insn (gen_lowpart (SImode, tmp), op);
      emit_move_insn (gen_highpart_mode (SImode, DImode, tmp),
		      group_seg_aperture_hi);

      return tmp;
    }
  else if (as_from == as_to)
    return op;
  else
    gcc_unreachable ();
}


/* Implement REGNO_MODE_CODE_OK_FOR_BASE_P via gcn.h

   Return true if REGNO is OK for memory addressing.  */

bool
gcn_regno_mode_code_ok_for_base_p (int regno,
				   machine_mode, addr_space_t as, int, int)
{
  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (reg_renumber)
	regno = reg_renumber[regno];
      else
	return true;
    }
  if (AS_FLAT_P (as))
    return (VGPR_REGNO_P (regno)
	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
  else if (AS_SCALAR_FLAT_P (as))
    return (SGPR_REGNO_P (regno)
	    || regno == ARG_POINTER_REGNUM || regno == FRAME_POINTER_REGNUM);
  else if (AS_GLOBAL_P (as))
    {
      return (SGPR_REGNO_P (regno)
	      || VGPR_REGNO_P (regno)
	      || regno == ARG_POINTER_REGNUM
	      || regno == FRAME_POINTER_REGNUM);
    }
  else
    /* For now.  */
    return false;
}
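
/* Worked example for gcn_addr_space_convert above (editorial comment,
   illustrative only): converting an LDS pointer with value 0x100 to the
   flat address space loads the 32-bit group-segment aperture from offset
   64 of the HSA queue object (QUEUE_PTR_ARG) and forms the DImode value
   ((uint64_t) aperture_hi << 32) | 0x100; the aperture supplies the high
   half and the LDS offset the low half.  */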

/* Implement MODE_CODE_BASE_REG_CLASS via gcn.h.

   Return a suitable register class for memory addressing.  */

reg_class
gcn_mode_code_base_reg_class (machine_mode mode, addr_space_t as, int oc,
			      int ic)
{
  switch (as)
    {
    case ADDR_SPACE_DEFAULT:
      return gcn_mode_code_base_reg_class (mode, DEFAULT_ADDR_SPACE, oc, ic);
    case ADDR_SPACE_SCALAR_FLAT:
    case ADDR_SPACE_SCRATCH:
      return SGPR_REGS;
      break;
    case ADDR_SPACE_FLAT:
    case ADDR_SPACE_FLAT_SCRATCH:
    case ADDR_SPACE_LDS:
    case ADDR_SPACE_GDS:
      return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
	      ? SGPR_REGS : VGPR_REGS);
    case ADDR_SPACE_GLOBAL:
      return ((GET_MODE_CLASS (mode) == MODE_VECTOR_INT
	       || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
	      ? SGPR_REGS : ALL_GPR_REGS);
    }
  gcc_unreachable ();
}

/* Implement REGNO_OK_FOR_INDEX_P via gcn.h.

   Return true if REGNO is OK for index of memory addressing.  */

bool
regno_ok_for_index_p (int regno)
{
  if (regno >= FIRST_PSEUDO_REGISTER)
    {
      if (reg_renumber)
	regno = reg_renumber[regno];
      else
	return true;
    }
  return regno == M0_REG || VGPR_REGNO_P (regno);
}

/* Generate move which uses the exec flags.  If EXEC is NULL, then it is
   assumed that all lanes normally relevant to the mode of the move are
   affected.  If PREV is NULL, then a sensible default is supplied for
   the inactive lanes.  */

static rtx
gen_mov_with_exec (rtx op0, rtx op1, rtx exec = NULL, rtx prev = NULL)
{
  machine_mode mode = GET_MODE (op0);

  if (vgpr_vector_mode_p (mode))
    {
      if (exec && exec != CONSTM1_RTX (DImode))
	{
	  if (!prev)
	    prev = op0;
	}
      else
	{
	  if (!prev)
	    prev = gcn_gen_undef (mode);
	  exec = gcn_full_exec_reg ();
	}

      rtx set = gen_rtx_SET (op0, gen_rtx_VEC_MERGE (mode, op1, prev, exec));

      return gen_rtx_PARALLEL (VOIDmode,
	       gen_rtvec (2, set,
			  gen_rtx_CLOBBER (VOIDmode,
					   gen_rtx_SCRATCH (V64DImode))));
    }

  return (gen_rtx_PARALLEL
	  (VOIDmode,
	   gen_rtvec (2, gen_rtx_SET (op0, op1),
		      gen_rtx_USE (VOIDmode,
				   exec ? exec : gcn_scalar_exec ()))));
}

/* Generate masked move.  */

static rtx
gen_duplicate_load (rtx op0, rtx op1, rtx op2 = NULL, rtx exec = NULL)
{
  if (exec)
    return (gen_rtx_SET (op0,
			 gen_rtx_VEC_MERGE (GET_MODE (op0),
					    gen_rtx_VEC_DUPLICATE (GET_MODE
								   (op0), op1),
					    op2, exec)));
  else
    return (gen_rtx_SET (op0, gen_rtx_VEC_DUPLICATE (GET_MODE (op0), op1)));
}

/* Expand vector init of OP0 by VEC.
   Implements vec_init instruction pattern.  */

void
gcn_expand_vector_init (rtx op0, rtx vec)
{
  int64_t initialized_mask = 0;
  int64_t curr_mask = 1;
  machine_mode mode = GET_MODE (op0);

  rtx val = XVECEXP (vec, 0, 0);

  for (int i = 1; i < 64; i++)
    if (rtx_equal_p (val, XVECEXP (vec, 0, i)))
      curr_mask |= (int64_t) 1 << i;

  if (gcn_constant_p (val))
    emit_move_insn (op0, gcn_vec_constant (mode, val));
  else
    {
      val = force_reg (GET_MODE_INNER (mode), val);
      emit_insn (gen_duplicate_load (op0, val));
    }
  initialized_mask |= curr_mask;
  for (int i = 1; i < 64; i++)
    if (!(initialized_mask & ((int64_t) 1 << i)))
      {
	curr_mask = (int64_t) 1 << i;
	rtx val = XVECEXP (vec, 0, i);

	for (int j = i + 1; j < 64; j++)
	  if (rtx_equal_p (val, XVECEXP (vec, 0, j)))
	    curr_mask |= (int64_t) 1 << j;
	if (gcn_constant_p (val))
	  emit_insn (gen_mov_with_exec (op0, gcn_vec_constant (mode, val),
					get_exec (curr_mask)));
	else
	  {
	    val = force_reg (GET_MODE_INNER (mode), val);
	    emit_insn (gen_duplicate_load (op0, val, op0,
					   get_exec (curr_mask)));
	  }
	initialized_mask |= curr_mask;
      }
}

/* Load vector constant where n-th lane contains BASE+n*VAL.  */

static rtx
strided_constant (machine_mode mode, int base, int val)
{
  rtx x = gen_reg_rtx (mode);
  emit_move_insn (x, gcn_vec_constant (mode, base));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 32),
				 x, get_exec (0xffffffff00000000)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 16),
				 x, get_exec (0xffff0000ffff0000)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 8),
				 x, get_exec (0xff00ff00ff00ff00)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 4),
				 x, get_exec (0xf0f0f0f0f0f0f0f0)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 2),
				 x, get_exec (0xcccccccccccccccc)));
  emit_insn (gen_addv64si3_exec (x, x, gcn_vec_constant (mode, val * 1),
				 x, get_exec (0xaaaaaaaaaaaaaaaa)));
  return x;
}
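
/* Worked example for strided_constant above (editorial comment): for
   BASE=0 and VAL=4 the six masked adds leave lane N holding N*4.  The
   first add gives lanes 32..63 an extra 128, the next gives every lane
   with bit 4 of its lane number set an extra 64, and so on down to the
   final add of 4 for every odd lane; each lane therefore accumulates VAL
   times its lane number.  */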

/* Implement TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS.  */

static rtx
gcn_addr_space_legitimize_address (rtx x, rtx old, machine_mode mode,
				   addr_space_t as)
{
  switch (as)
    {
    case ADDR_SPACE_DEFAULT:
      return gcn_addr_space_legitimize_address (x, old, mode,
						DEFAULT_ADDR_SPACE);
    case ADDR_SPACE_SCALAR_FLAT:
    case ADDR_SPACE_SCRATCH:
      /* Instructions working on vectors need the address to be in
	 a register.  */
      if (vgpr_vector_mode_p (mode))
	return force_reg (GET_MODE (x), x);

      return x;
    case ADDR_SPACE_FLAT:
    case ADDR_SPACE_FLAT_SCRATCH:
    case ADDR_SPACE_GLOBAL:
      return TARGET_GCN3 ? force_reg (DImode, x) : x;
    case ADDR_SPACE_LDS:
    case ADDR_SPACE_GDS:
      /* FIXME: LDS supports offsets; handle them!  */
      if (vgpr_vector_mode_p (mode) && GET_MODE (x) != V64SImode)
	{
	  rtx addrs = gen_reg_rtx (V64SImode);
	  rtx base = force_reg (SImode, x);
	  rtx offsets = strided_constant (V64SImode, 0,
					  GET_MODE_UNIT_SIZE (mode));

	  emit_insn (gen_vec_duplicatev64si (addrs, base));
	  emit_insn (gen_addv64si3 (addrs, offsets, addrs));
	  return addrs;
	}
      return x;
    }
  gcc_unreachable ();
}

/* Convert a (mem:<MODE> (reg:DI)) to (mem:<MODE> (reg:V64DI)) with the
   proper vector of stepped addresses.

   MEM will be a DImode address of a vector in an SGPR.
   TMP will be a V64DImode VGPR pair or (scratch:V64DI).  */

rtx
gcn_expand_scalar_to_vector_address (machine_mode mode, rtx exec, rtx mem,
				     rtx tmp)
{
  gcc_assert (MEM_P (mem));
  rtx mem_base = XEXP (mem, 0);
  rtx mem_index = NULL_RTX;

  if (!TARGET_GCN5_PLUS)
    {
      /* gcn_addr_space_legitimize_address should have put the address in a
	 register.  If not, it is too late to do anything about it.  */
      gcc_assert (REG_P (mem_base));
    }

  if (GET_CODE (mem_base) == PLUS)
    {
      mem_index = XEXP (mem_base, 1);
      mem_base = XEXP (mem_base, 0);
    }

  /* RF and RM base registers for vector modes should be always an SGPR.  */
  gcc_assert (SGPR_REGNO_P (REGNO (mem_base))
	      || REGNO (mem_base) >= FIRST_PSEUDO_REGISTER);

  machine_mode inner = GET_MODE_INNER (mode);
  int shift = exact_log2 (GET_MODE_SIZE (inner));
  rtx ramp = gen_rtx_REG (V64SImode, VGPR_REGNO (1));
  rtx undef_v64si = gcn_gen_undef (V64SImode);
  rtx new_base = NULL_RTX;
  addr_space_t as = MEM_ADDR_SPACE (mem);

  rtx tmplo = (REG_P (tmp)
	       ? gcn_operand_part (V64DImode, tmp, 0)
	       : gen_reg_rtx (V64SImode));

  /* tmplo[:] = ramp[:] << shift  */
  if (exec)
    emit_insn (gen_ashlv64si3_exec (tmplo, ramp,
				    gen_int_mode (shift, SImode),
				    undef_v64si, exec));
  else
    emit_insn (gen_ashlv64si3 (tmplo, ramp, gen_int_mode (shift, SImode)));

  if (AS_FLAT_P (as))
    {
      if (REG_P (tmp))
	{
	  rtx vcc = gen_rtx_REG (DImode, CC_SAVE_REG);
	  rtx mem_base_lo = gcn_operand_part (DImode, mem_base, 0);
	  rtx mem_base_hi = gcn_operand_part (DImode, mem_base, 1);
	  rtx tmphi = gcn_operand_part (V64DImode, tmp, 1);

	  /* tmphi[:] = mem_base_hi  */
	  if (exec)
	    emit_insn (gen_vec_duplicatev64si_exec (tmphi, mem_base_hi,
						    undef_v64si, exec));
	  else
	    emit_insn (gen_vec_duplicatev64si (tmphi, mem_base_hi));

	  /* tmp[:] += zext (mem_base)  */
	  if (exec)
	    {
	      rtx undef_di = gcn_gen_undef (DImode);
	      emit_insn (gen_addv64si3_vcc_dup_exec (tmplo, mem_base_lo, tmplo,
						     vcc, undef_v64si, exec));
	      emit_insn (gen_addcv64si3_exec (tmphi, tmphi, const0_rtx,
					      vcc, vcc, undef_v64si, exec));
	    }
	  else
	    emit_insn (gen_addv64di3_zext_dup (tmp, mem_base_lo, tmp));
	}
      else
	{
	  tmp = gen_reg_rtx (V64DImode);
	  if (exec)
	    emit_insn (gen_addv64di3_zext_dup2_exec (tmp, tmplo, mem_base,
						     gcn_gen_undef (V64DImode),
						     exec));
	  else
	    emit_insn (gen_addv64di3_zext_dup2 (tmp, tmplo, mem_base));
	}

      new_base = tmp;
    }
  else if (AS_ANY_DS_P (as))
    {
      if (!exec)
	emit_insn (gen_addv64si3_dup (tmplo, tmplo, mem_base));
      else
	emit_insn (gen_addv64si3_dup_exec (tmplo, tmplo, mem_base,
					   gcn_gen_undef (V64SImode), exec));
      new_base = tmplo;
    }
  else
    {
      mem_base = gen_rtx_VEC_DUPLICATE (V64DImode, mem_base);
      new_base = gen_rtx_PLUS (V64DImode, mem_base,
			       gen_rtx_SIGN_EXTEND (V64DImode, tmplo));
    }

  return gen_rtx_PLUS (GET_MODE (new_base), new_base,
		       gen_rtx_VEC_DUPLICATE (GET_MODE (new_base),
					      (mem_index ? mem_index
					       : const0_rtx)));
}

/* Convert a BASE address, a vector of OFFSETS, and a SCALE, to addresses
   suitable for the given address space.  This is intended for use in
   gather/scatter patterns.

   The offsets may be signed or unsigned, according to UNSIGNED_P.
   If EXEC is set then _exec patterns will be used, otherwise plain.

   Return values.
     ADDR_SPACE_FLAT   - return V64DImode vector of absolute addresses.
     ADDR_SPACE_GLOBAL - return V64SImode vector of offsets.  */

rtx
gcn_expand_scaled_offsets (addr_space_t as, rtx base, rtx offsets, rtx scale,
			   bool unsigned_p, rtx exec)
{
  /* Convert the offsets to V64SImode.
     TODO: more conversions will be needed when more types are vectorized.  */
  if (GET_MODE (offsets) == V64DImode)
    {
      rtx tmp = gen_reg_rtx (V64SImode);
      emit_insn (gen_vec_truncatev64div64si (tmp, offsets));
      offsets = tmp;
    }

  rtx tmpsi = gen_reg_rtx (V64SImode);
  rtx tmpdi = gen_reg_rtx (V64DImode);
  rtx undefsi = exec ? gcn_gen_undef (V64SImode) : NULL;
  rtx undefdi = exec ? gcn_gen_undef (V64DImode) : NULL;

  if (CONST_INT_P (scale)
      && INTVAL (scale) > 0
      && exact_log2 (INTVAL (scale)) >= 0)
    emit_insn (gen_ashlv64si3 (tmpsi, offsets,
			       GEN_INT (exact_log2 (INTVAL (scale)))));
  else
    (exec
     ? emit_insn (gen_mulv64si3_dup_exec (tmpsi, offsets, scale, undefsi,
					  exec))
     : emit_insn (gen_mulv64si3_dup (tmpsi, offsets, scale)));

  /* "Global" instructions do not support negative register offsets.  */
  if (as == ADDR_SPACE_FLAT || !unsigned_p)
    {
      if (unsigned_p)
	(exec
	 ? emit_insn (gen_addv64di3_zext_dup2_exec (tmpdi, tmpsi, base,
						    undefdi, exec))
	 : emit_insn (gen_addv64di3_zext_dup2 (tmpdi, tmpsi, base)));
      else
	(exec
	 ? emit_insn (gen_addv64di3_sext_dup2_exec (tmpdi, tmpsi, base,
						    undefdi, exec))
	 : emit_insn (gen_addv64di3_sext_dup2 (tmpdi, tmpsi, base)));
      return tmpdi;
    }
  else if (as == ADDR_SPACE_GLOBAL)
    return tmpsi;

  gcc_unreachable ();
}

/* Return true if move from OP0 to OP1 is known to be executed in vector
   unit.  */

bool
gcn_vgpr_move_p (rtx op0, rtx op1)
{
  if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
    return true;
  if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
    return true;
  return ((REG_P (op0) && VGPR_REGNO_P (REGNO (op0)))
	  || (REG_P (op1) && VGPR_REGNO_P (REGNO (op1)))
	  || vgpr_vector_mode_p (GET_MODE (op0)));
}
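
/* Worked example for gcn_expand_scaled_offsets above (editorial comment,
   illustrative only): gathering SImode elements with SCALE=4 takes the
   power-of-two path and simply shifts the offsets left by 2.  For
   ADDR_SPACE_FLAT the (sign- or zero-extended) results are then added to
   a broadcast of BASE, giving a V64DImode vector of absolute addresses;
   for ADDR_SPACE_GLOBAL the V64SImode offsets are returned and the base
   remains a separate scalar operand.  */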

/* Return true if move from OP0 to OP1 is known to be executed in scalar
   unit.  Used in the machine description.  */

bool
gcn_sgpr_move_p (rtx op0, rtx op1)
{
  if (MEM_P (op0) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op0)))
    return true;
  if (MEM_P (op1) && AS_SCALAR_FLAT_P (MEM_ADDR_SPACE (op1)))
    return true;
  if (!REG_P (op0) || REGNO (op0) >= FIRST_PSEUDO_REGISTER
      || VGPR_REGNO_P (REGNO (op0)))
    return false;
  if (REG_P (op1)
      && REGNO (op1) < FIRST_PSEUDO_REGISTER
      && !VGPR_REGNO_P (REGNO (op1)))
    return true;
  return immediate_operand (op1, VOIDmode) || memory_operand (op1, VOIDmode);
}

/* Implement TARGET_SECONDARY_RELOAD.

   The address space determines which registers can be used for loads and
   stores.  */

static reg_class_t
gcn_secondary_reload (bool in_p, rtx x, reg_class_t rclass,
		      machine_mode reload_mode, secondary_reload_info *sri)
{
  reg_class_t result = NO_REGS;
  bool spilled_pseudo =
    (REG_P (x) || GET_CODE (x) == SUBREG) && true_regnum (x) == -1;

  if (dump_file && (dump_flags & TDF_DETAILS))
    {
      fprintf (dump_file, "gcn_secondary_reload: ");
      dump_value_slim (dump_file, x, 1);
      fprintf (dump_file, " %s %s:%s", (in_p ? "->" : "<-"),
	       reg_class_names[rclass], GET_MODE_NAME (reload_mode));
      if (REG_P (x) || GET_CODE (x) == SUBREG)
	fprintf (dump_file, " (true regnum: %d \"%s\")", true_regnum (x),
		 (true_regnum (x) >= 0
		  && true_regnum (x) < FIRST_PSEUDO_REGISTER
		  ? reg_names[true_regnum (x)]
		  : (spilled_pseudo ? "stack spill" : "??")));
      fprintf (dump_file, "\n");
    }

  /* Some callers don't use or initialize icode.  */
  sri->icode = CODE_FOR_nothing;

  if (MEM_P (x) || spilled_pseudo)
    {
      addr_space_t as = DEFAULT_ADDR_SPACE;

      /* If we have a spilled pseudo, we can't find the address space
	 directly, but we know it's in ADDR_SPACE_FLAT space for GCN3 or
	 ADDR_SPACE_GLOBAL for GCN5.  */
      if (MEM_P (x))
	as = MEM_ADDR_SPACE (x);

      if (as == ADDR_SPACE_DEFAULT)
	as = DEFAULT_ADDR_SPACE;

      switch (as)
	{
	case ADDR_SPACE_SCALAR_FLAT:
	  result =
	    ((!MEM_P (x) || rclass == SGPR_REGS) ? NO_REGS : SGPR_REGS);
	  break;
	case ADDR_SPACE_FLAT:
	case ADDR_SPACE_FLAT_SCRATCH:
	case ADDR_SPACE_GLOBAL:
	  if (GET_MODE_CLASS (reload_mode) == MODE_VECTOR_INT
	      || GET_MODE_CLASS (reload_mode) == MODE_VECTOR_FLOAT)
	    {
	      if (in_p)
		switch (reload_mode)
		  {
		  case E_V64SImode:
		    sri->icode = CODE_FOR_reload_inv64si;
		    break;
		  case E_V64SFmode:
		    sri->icode = CODE_FOR_reload_inv64sf;
		    break;
		  case E_V64HImode:
		    sri->icode = CODE_FOR_reload_inv64hi;
		    break;
		  case E_V64HFmode:
		    sri->icode = CODE_FOR_reload_inv64hf;
		    break;
		  case E_V64QImode:
		    sri->icode = CODE_FOR_reload_inv64qi;
		    break;
		  case E_V64DImode:
		    sri->icode = CODE_FOR_reload_inv64di;
		    break;
		  case E_V64DFmode:
		    sri->icode = CODE_FOR_reload_inv64df;
		    break;
		  default:
		    gcc_unreachable ();
		  }
	      else
		switch (reload_mode)
		  {
		  case E_V64SImode:
		    sri->icode = CODE_FOR_reload_outv64si;
		    break;
		  case E_V64SFmode:
		    sri->icode = CODE_FOR_reload_outv64sf;
		    break;
		  case E_V64HImode:
		    sri->icode = CODE_FOR_reload_outv64hi;
		    break;
		  case E_V64HFmode:
		    sri->icode = CODE_FOR_reload_outv64hf;
		    break;
		  case E_V64QImode:
		    sri->icode = CODE_FOR_reload_outv64qi;
		    break;
		  case E_V64DImode:
		    sri->icode = CODE_FOR_reload_outv64di;
		    break;
		  case E_V64DFmode:
		    sri->icode = CODE_FOR_reload_outv64df;
		    break;
		  default:
		    gcc_unreachable ();
		  }
	      break;
	    }
	  /* Fallthrough.  */
	case ADDR_SPACE_LDS:
	case ADDR_SPACE_GDS:
	case ADDR_SPACE_SCRATCH:
	  result = (rclass == VGPR_REGS ? NO_REGS : VGPR_REGS);
	  break;
	}
    }

  if (dump_file && (dump_flags & TDF_DETAILS))
    fprintf (dump_file, " <= %s (icode: %s)\n", reg_class_names[result],
	     get_insn_name (sri->icode));

  return result;
}

/* Update register usage after having seen the compiler flags and kernel
   attributes.  We typically want to fix registers that contain values
   set by the HSA runtime.  */

static void
gcn_conditional_register_usage (void)
{
  int i;

  /* FIXME: Do we need to reset fixed_regs?  */

  /* Limit ourselves to 1/16 the register file for maximum sized workgroups.
     There are enough SGPRs not to limit those.
     TODO: Adjust this more dynamically.  */
  for (i = FIRST_VGPR_REG + 64; i <= LAST_VGPR_REG; i++)
    fixed_regs[i] = 1, call_used_regs[i] = 1;

  if (!cfun || !cfun->machine || cfun->machine->normal_function)
    {
      /* Normal functions can't know what kernel argument registers are
	 live, so just fix the bottom 16 SGPRs, and bottom 3 VGPRs.  */
      for (i = 0; i < 16; i++)
	fixed_regs[FIRST_SGPR_REG + i] = 1;
      for (i = 0; i < 3; i++)
	fixed_regs[FIRST_VGPR_REG + i] = 1;
      return;
    }

  /* Fix the runtime argument registers containing values that may be
     needed later.  FLAT_SCRATCH_* should not be needed after the prologue
     so there's no need to fix those.  */
*/ 2070 if (cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG] >= 0) 2071 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]] = 1; 2072 if (cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] >= 0) 2073 { 2074 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG]] = 1; 2075 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 1] = 1; 2076 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 2] = 1; 2077 fixed_regs[cfun->machine->args.reg[PRIVATE_SEGMENT_BUFFER_ARG] + 3] = 1; 2078 } 2079 if (cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] >= 0) 2080 { 2081 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG]] = 1; 2082 fixed_regs[cfun->machine->args.reg[KERNARG_SEGMENT_PTR_ARG] + 1] = 1; 2083 } 2084 if (cfun->machine->args.reg[DISPATCH_PTR_ARG] >= 0) 2085 { 2086 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG]] = 1; 2087 fixed_regs[cfun->machine->args.reg[DISPATCH_PTR_ARG] + 1] = 1; 2088 } 2089 if (cfun->machine->args.reg[WORKGROUP_ID_X_ARG] >= 0) 2090 fixed_regs[cfun->machine->args.reg[WORKGROUP_ID_X_ARG]] = 1; 2091 if (cfun->machine->args.reg[WORK_ITEM_ID_X_ARG] >= 0) 2092 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_X_ARG]] = 1; 2093 if (cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG] >= 0) 2094 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Y_ARG]] = 1; 2095 if (cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG] >= 0) 2096 fixed_regs[cfun->machine->args.reg[WORK_ITEM_ID_Z_ARG]] = 1; 2097 2098 if (TARGET_GCN5_PLUS) 2099 /* v0 is always zero, for global nul-offsets. */ 2100 fixed_regs[VGPR_REGNO (0)] = 1; 2101 } 2102 2103 /* Determine if a load or store is valid, according to the register classes 2104 and address space. Used primarily by the machine description to decide 2105 when to split a move into two steps. 
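Broadly, each clause below pairs an address space with the register class its access instructions require: FLAT and GLOBAL accesses want the data in a VGPR, SCALAR_FLAT accesses want an SGPR source or destination, and LDS/GDS accesses again use VGPRs. Register-to-register moves are always valid; anything that matches none of these patterns is rejected so that the move can be split.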
*/ 2106 2107 bool 2108 gcn_valid_move_p (machine_mode mode, rtx dest, rtx src) 2109 { 2110 if (!MEM_P (dest) && !MEM_P (src)) 2111 return true; 2112 2113 if (MEM_P (dest) 2114 && AS_FLAT_P (MEM_ADDR_SPACE (dest)) 2115 && (gcn_flat_address_p (XEXP (dest, 0), mode) 2116 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF 2117 || GET_CODE (XEXP (dest, 0)) == LABEL_REF) 2118 && gcn_vgpr_register_operand (src, mode)) 2119 return true; 2120 else if (MEM_P (src) 2121 && AS_FLAT_P (MEM_ADDR_SPACE (src)) 2122 && (gcn_flat_address_p (XEXP (src, 0), mode) 2123 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF 2124 || GET_CODE (XEXP (src, 0)) == LABEL_REF) 2125 && gcn_vgpr_register_operand (dest, mode)) 2126 return true; 2127 2128 if (MEM_P (dest) 2129 && AS_GLOBAL_P (MEM_ADDR_SPACE (dest)) 2130 && (gcn_global_address_p (XEXP (dest, 0)) 2131 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF 2132 || GET_CODE (XEXP (dest, 0)) == LABEL_REF) 2133 && gcn_vgpr_register_operand (src, mode)) 2134 return true; 2135 else if (MEM_P (src) 2136 && AS_GLOBAL_P (MEM_ADDR_SPACE (src)) 2137 && (gcn_global_address_p (XEXP (src, 0)) 2138 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF 2139 || GET_CODE (XEXP (src, 0)) == LABEL_REF) 2140 && gcn_vgpr_register_operand (dest, mode)) 2141 return true; 2142 2143 if (MEM_P (dest) 2144 && MEM_ADDR_SPACE (dest) == ADDR_SPACE_SCALAR_FLAT 2145 && (gcn_scalar_flat_address_p (XEXP (dest, 0)) 2146 || GET_CODE (XEXP (dest, 0)) == SYMBOL_REF 2147 || GET_CODE (XEXP (dest, 0)) == LABEL_REF) 2148 && gcn_ssrc_register_operand (src, mode)) 2149 return true; 2150 else if (MEM_P (src) 2151 && MEM_ADDR_SPACE (src) == ADDR_SPACE_SCALAR_FLAT 2152 && (gcn_scalar_flat_address_p (XEXP (src, 0)) 2153 || GET_CODE (XEXP (src, 0)) == SYMBOL_REF 2154 || GET_CODE (XEXP (src, 0)) == LABEL_REF) 2155 && gcn_sdst_register_operand (dest, mode)) 2156 return true; 2157 2158 if (MEM_P (dest) 2159 && AS_ANY_DS_P (MEM_ADDR_SPACE (dest)) 2160 && gcn_ds_address_p (XEXP (dest, 0)) 2161 && gcn_vgpr_register_operand (src, mode)) 2162 return true; 2163 else if (MEM_P (src) 2164 && AS_ANY_DS_P (MEM_ADDR_SPACE (src)) 2165 && gcn_ds_address_p (XEXP (src, 0)) 2166 && gcn_vgpr_register_operand (dest, mode)) 2167 return true; 2168 2169 return false; 2170 } 2171 2172 /* }}} */ 2173 /* {{{ Functions and ABI. */ 2174 2175 /* Implement TARGET_FUNCTION_VALUE. 2176 2177 Define how to find the value returned by a function. 2178 The register location is always the same, but the mode depends on 2179 VALTYPE. */ 2180 2181 static rtx 2182 gcn_function_value (const_tree valtype, const_tree, bool) 2183 { 2184 machine_mode mode = TYPE_MODE (valtype); 2185 2186 if (INTEGRAL_TYPE_P (valtype) 2187 && GET_MODE_CLASS (mode) == MODE_INT 2188 && GET_MODE_SIZE (mode) < 4) 2189 mode = SImode; 2190 2191 return gen_rtx_REG (mode, SGPR_REGNO (RETURN_VALUE_REG)); 2192 } 2193 2194 /* Implement TARGET_FUNCTION_VALUE_REGNO_P. 2195 2196 Return true if N is a possible register number for the function return 2197 value. */ 2198 2199 static bool 2200 gcn_function_value_regno_p (const unsigned int n) 2201 { 2202 return n == RETURN_VALUE_REG; 2203 } 2204 2205 /* Calculate the number of registers required to hold a function argument 2206 of MODE and TYPE. 
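The count is the argument size rounded up to whole words; for example, assuming 4-byte words, a DImode argument takes two registers and a 6-byte BLKmode aggregate also rounds up to two. Arguments that must be passed on the stack count as zero registers.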
*/ 2207 2208 static int 2209 num_arg_regs (machine_mode mode, const_tree type) 2210 { 2211 int size; 2212 2213 if (targetm.calls.must_pass_in_stack (mode, type)) 2214 return 0; 2215 2216 if (type && mode == BLKmode) 2217 size = int_size_in_bytes (type); 2218 else 2219 size = GET_MODE_SIZE (mode); 2220 2221 return (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD; 2222 } 2223 2224 /* Implement TARGET_STRICT_ARGUMENT_NAMING. 2225 2226 Return true if the location where a function argument is passed 2227 depends on whether or not it is a named argument 2228 2229 For gcn, we know how to handle functions declared as stdarg: by 2230 passing an extra pointer to the unnamed arguments. However, the 2231 Fortran frontend can produce a different situation, where a 2232 function pointer is declared with no arguments, but the actual 2233 function and calls to it take more arguments. In that case, we 2234 want to ensure the call matches the definition of the function. */ 2235 2236 static bool 2237 gcn_strict_argument_naming (cumulative_args_t cum_v) 2238 { 2239 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 2240 2241 return cum->fntype == NULL_TREE || stdarg_p (cum->fntype); 2242 } 2243 2244 /* Implement TARGET_PRETEND_OUTGOING_VARARGS_NAMED. 2245 2246 See comment on gcn_strict_argument_naming. */ 2247 2248 static bool 2249 gcn_pretend_outgoing_varargs_named (cumulative_args_t cum_v) 2250 { 2251 return !gcn_strict_argument_naming (cum_v); 2252 } 2253 2254 /* Implement TARGET_FUNCTION_ARG. 2255 2256 Return an RTX indicating whether a function argument is passed in a register 2257 and if so, which register. */ 2258 2259 static rtx 2260 gcn_function_arg (cumulative_args_t cum_v, machine_mode mode, const_tree type, 2261 bool named) 2262 { 2263 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 2264 if (cum->normal_function) 2265 { 2266 if (!named || mode == VOIDmode) 2267 return 0; 2268 2269 if (targetm.calls.must_pass_in_stack (mode, type)) 2270 return 0; 2271 2272 int reg_num = FIRST_PARM_REG + cum->num; 2273 int num_regs = num_arg_regs (mode, type); 2274 if (num_regs > 0) 2275 while (reg_num % num_regs != 0) 2276 reg_num++; 2277 if (reg_num + num_regs <= FIRST_PARM_REG + NUM_PARM_REGS) 2278 return gen_rtx_REG (mode, reg_num); 2279 } 2280 else 2281 { 2282 if (cum->num >= cum->args.nargs) 2283 { 2284 cum->offset = (cum->offset + TYPE_ALIGN (type) / 8 - 1) 2285 & -(TYPE_ALIGN (type) / 8); 2286 cfun->machine->kernarg_segment_alignment 2287 = MAX ((unsigned) cfun->machine->kernarg_segment_alignment, 2288 TYPE_ALIGN (type) / 8); 2289 rtx addr = gen_rtx_REG (DImode, 2290 cum->args.reg[KERNARG_SEGMENT_PTR_ARG]); 2291 if (cum->offset) 2292 addr = gen_rtx_PLUS (DImode, addr, 2293 gen_int_mode (cum->offset, DImode)); 2294 rtx mem = gen_rtx_MEM (mode, addr); 2295 set_mem_attributes (mem, const_cast<tree>(type), 1); 2296 set_mem_addr_space (mem, ADDR_SPACE_SCALAR_FLAT); 2297 MEM_READONLY_P (mem) = 1; 2298 return mem; 2299 } 2300 2301 int a = cum->args.order[cum->num]; 2302 if (mode != gcn_kernel_arg_types[a].mode) 2303 { 2304 error ("wrong type of argument %s", gcn_kernel_arg_types[a].name); 2305 return 0; 2306 } 2307 return gen_rtx_REG ((machine_mode) gcn_kernel_arg_types[a].mode, 2308 cum->args.reg[a]); 2309 } 2310 return 0; 2311 } 2312 2313 /* Implement TARGET_FUNCTION_ARG_ADVANCE. 2314 2315 Updates the summarizer variable pointed to by CUM_V to advance past an 2316 argument in the argument list. 
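For normal functions this mirrors gcn_function_arg: the register counter is first bumped so that a multi-register argument starts on a suitably aligned register number, then advanced past the argument. For kernel arguments the offset into the kernarg segment is advanced by the argument's size instead, and the running total is recorded as the segment's byte size.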
*/ 2317 2318 static void 2319 gcn_function_arg_advance (cumulative_args_t cum_v, machine_mode mode, 2320 const_tree type, bool named) 2321 { 2322 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 2323 2324 if (cum->normal_function) 2325 { 2326 if (!named) 2327 return; 2328 2329 int num_regs = num_arg_regs (mode, type); 2330 if (num_regs > 0) 2331 while ((FIRST_PARM_REG + cum->num) % num_regs != 0) 2332 cum->num++; 2333 cum->num += num_regs; 2334 } 2335 else 2336 { 2337 if (cum->num < cum->args.nargs) 2338 cum->num++; 2339 else 2340 { 2341 cum->offset += tree_to_uhwi (TYPE_SIZE_UNIT (type)); 2342 cfun->machine->kernarg_segment_byte_size = cum->offset; 2343 } 2344 } 2345 } 2346 2347 /* Implement TARGET_ARG_PARTIAL_BYTES. 2348 2349 Returns the number of bytes at the beginning of an argument that must be put 2350 in registers. The value must be zero for arguments that are passed entirely 2351 in registers or that are entirely pushed on the stack. */ 2352 2353 static int 2354 gcn_arg_partial_bytes (cumulative_args_t cum_v, machine_mode mode, tree type, 2355 bool named) 2356 { 2357 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 2358 2359 if (!named) 2360 return 0; 2361 2362 if (targetm.calls.must_pass_in_stack (mode, type)) 2363 return 0; 2364 2365 if (cum->num >= NUM_PARM_REGS) 2366 return 0; 2367 2368 /* If the argument fits entirely in registers, return 0. */ 2369 if (cum->num + num_arg_regs (mode, type) <= NUM_PARM_REGS) 2370 return 0; 2371 2372 return (NUM_PARM_REGS - cum->num) * UNITS_PER_WORD; 2373 } 2374 2375 /* A normal function which takes a pointer argument (to a scalar) may be 2376 passed a pointer to LDS space (via a high-bits-set aperture), and that only 2377 works with FLAT addressing, not GLOBAL. Force FLAT addressing if the 2378 function has an incoming pointer-to-scalar parameter. */ 2379 2380 static void 2381 gcn_detect_incoming_pointer_arg (tree fndecl) 2382 { 2383 gcc_assert (cfun && cfun->machine); 2384 2385 for (tree arg = TYPE_ARG_TYPES (TREE_TYPE (fndecl)); 2386 arg; 2387 arg = TREE_CHAIN (arg)) 2388 if (POINTER_TYPE_P (TREE_VALUE (arg)) 2389 && !AGGREGATE_TYPE_P (TREE_TYPE (TREE_VALUE (arg)))) 2390 cfun->machine->use_flat_addressing = true; 2391 } 2392 2393 /* Implement INIT_CUMULATIVE_ARGS, via gcn.h. 2394 2395 Initialize a variable CUM of type CUMULATIVE_ARGS for a call to a function 2396 whose data type is FNTYPE. For a library call, FNTYPE is 0. */ 2397 2398 void 2399 gcn_init_cumulative_args (CUMULATIVE_ARGS *cum /* Argument info to init */ , 2400 tree fntype /* tree ptr for function decl */ , 2401 rtx libname /* SYMBOL_REF of library name or 0 */ , 2402 tree fndecl, int caller) 2403 { 2404 memset (cum, 0, sizeof (*cum)); 2405 cum->fntype = fntype; 2406 if (libname) 2407 { 2408 gcc_assert (cfun && cfun->machine); 2409 cum->normal_function = true; 2410 if (!caller) 2411 { 2412 cfun->machine->normal_function = true; 2413 gcn_detect_incoming_pointer_arg (fndecl); 2414 } 2415 return; 2416 } 2417 tree attr = NULL; 2418 if (fndecl) 2419 attr = lookup_attribute ("amdgpu_hsa_kernel", DECL_ATTRIBUTES (fndecl)); 2420 if (fndecl && !attr) 2421 attr = lookup_attribute ("amdgpu_hsa_kernel", 2422 TYPE_ATTRIBUTES (TREE_TYPE (fndecl))); 2423 if (!attr && fntype) 2424 attr = lookup_attribute ("amdgpu_hsa_kernel", TYPE_ATTRIBUTES (fntype)); 2425 /* Handle main () as kernel, so we can run testsuite. 2426 Handle OpenACC kernels similarly to main. 
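A function with no attribute is treated as a kernel only in these special cases, using the default argument set. Otherwise, the attribute's value (when present) is parsed to find which ABI registers the kernel requests, and functions lacking the attribute are marked as normal functions.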
*/ 2427 if (!attr && !caller && fndecl 2428 && (MAIN_NAME_P (DECL_NAME (fndecl)) 2429 || lookup_attribute ("omp target entrypoint", 2430 DECL_ATTRIBUTES (fndecl)) != NULL_TREE)) 2431 gcn_parse_amdgpu_hsa_kernel_attribute (&cum->args, NULL_TREE); 2432 else 2433 { 2434 if (!attr || caller) 2435 { 2436 gcc_assert (cfun && cfun->machine); 2437 cum->normal_function = true; 2438 if (!caller) 2439 cfun->machine->normal_function = true; 2440 } 2441 gcn_parse_amdgpu_hsa_kernel_attribute 2442 (&cum->args, attr ? TREE_VALUE (attr) : NULL_TREE); 2443 } 2444 cfun->machine->args = cum->args; 2445 if (!caller && cfun->machine->normal_function) 2446 gcn_detect_incoming_pointer_arg (fndecl); 2447 } 2448 2449 static bool 2450 gcn_return_in_memory (const_tree type, const_tree ARG_UNUSED (fntype)) 2451 { 2452 machine_mode mode = TYPE_MODE (type); 2453 HOST_WIDE_INT size = int_size_in_bytes (type); 2454 2455 if (AGGREGATE_TYPE_P (type)) 2456 return true; 2457 2458 if (mode == BLKmode) 2459 return true; 2460 2461 if (size > 2 * UNITS_PER_WORD) 2462 return true; 2463 2464 return false; 2465 } 2466 2467 /* Implement TARGET_PROMOTE_FUNCTION_MODE. 2468 2469 Return the mode to use for outgoing function arguments. */ 2470 2471 machine_mode 2472 gcn_promote_function_mode (const_tree ARG_UNUSED (type), machine_mode mode, 2473 int *ARG_UNUSED (punsignedp), 2474 const_tree ARG_UNUSED (funtype), 2475 int ARG_UNUSED (for_return)) 2476 { 2477 if (GET_MODE_CLASS (mode) == MODE_INT && GET_MODE_SIZE (mode) < 4) 2478 return SImode; 2479 2480 return mode; 2481 } 2482 2483 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. 2484 2485 Derived from hppa_gimplify_va_arg_expr. The generic routine doesn't handle 2486 ARGS_GROW_DOWNWARDS. */ 2487 2488 static tree 2489 gcn_gimplify_va_arg_expr (tree valist, tree type, 2490 gimple_seq *ARG_UNUSED (pre_p), 2491 gimple_seq *ARG_UNUSED (post_p)) 2492 { 2493 tree ptr = build_pointer_type (type); 2494 tree valist_type; 2495 tree t, u; 2496 bool indirect; 2497 2498 indirect = pass_by_reference (NULL, TYPE_MODE (type), type, 0); 2499 if (indirect) 2500 { 2501 type = ptr; 2502 ptr = build_pointer_type (type); 2503 } 2504 valist_type = TREE_TYPE (valist); 2505 2506 /* Args grow down. Not handled by generic routines. */ 2507 2508 u = fold_convert (sizetype, size_in_bytes (type)); 2509 u = fold_build1 (NEGATE_EXPR, sizetype, u); 2510 t = fold_build_pointer_plus (valist, u); 2511 2512 /* Align to 8 byte boundary. */ 2513 2514 u = build_int_cst (TREE_TYPE (t), -8); 2515 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, u); 2516 t = fold_convert (valist_type, t); 2517 2518 t = build2 (MODIFY_EXPR, valist_type, valist, t); 2519 2520 t = fold_convert (ptr, t); 2521 t = build_va_arg_indirect_ref (t); 2522 2523 if (indirect) 2524 t = build_va_arg_indirect_ref (t); 2525 2526 return t; 2527 } 2528 2529 /* Calculate stack offsets needed to create prologues and epilogues. */ 2530 2531 static struct machine_function * 2532 gcn_compute_frame_offsets (void) 2533 { 2534 machine_function *offsets = cfun->machine; 2535 2536 if (reload_completed) 2537 return offsets; 2538 2539 offsets->need_frame_pointer = frame_pointer_needed; 2540 2541 offsets->outgoing_args_size = crtl->outgoing_args_size; 2542 offsets->pretend_size = crtl->args.pretend_args_size; 2543 2544 offsets->local_vars = get_frame_size (); 2545 2546 offsets->lr_needs_saving = (!leaf_function_p () 2547 || df_regs_ever_live_p (LR_REGNUM) 2548 || df_regs_ever_live_p (LR_REGNUM + 1)); 2549 2550 offsets->callee_saves = offsets->lr_needs_saving ? 
8 : 0; 2551 2552 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) 2553 if ((df_regs_ever_live_p (regno) && !call_used_regs[regno]) 2554 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM 2555 && frame_pointer_needed)) 2556 offsets->callee_saves += (VGPR_REGNO_P (regno) ? 256 : 4); 2557 2558 /* Round up to 64-bit boundary to maintain stack alignment. */ 2559 offsets->callee_saves = (offsets->callee_saves + 7) & ~7; 2560 2561 return offsets; 2562 } 2563 2564 /* Insert code into the prologue or epilogue to store or load any 2565 callee-save register to/from the stack. 2566 2567 Helper function for gcn_expand_prologue and gcn_expand_epilogue. */ 2568 2569 static void 2570 move_callee_saved_registers (rtx sp, machine_function *offsets, 2571 bool prologue) 2572 { 2573 int regno, offset, saved_scalars; 2574 rtx exec = gen_rtx_REG (DImode, EXEC_REG); 2575 rtx vcc = gen_rtx_REG (DImode, VCC_LO_REG); 2576 rtx offreg = gen_rtx_REG (SImode, SGPR_REGNO (22)); 2577 rtx as = gen_rtx_CONST_INT (VOIDmode, STACK_ADDR_SPACE); 2578 HOST_WIDE_INT exec_set = 0; 2579 int offreg_set = 0; 2580 2581 start_sequence (); 2582 2583 /* Move scalars into two vector registers. */ 2584 for (regno = 0, saved_scalars = 0; regno < FIRST_VGPR_REG; regno++) 2585 if ((df_regs_ever_live_p (regno) && !call_used_regs[regno]) 2586 || ((regno & ~1) == LINK_REGNUM && offsets->lr_needs_saving) 2587 || ((regno & ~1) == HARD_FRAME_POINTER_REGNUM 2588 && offsets->need_frame_pointer)) 2589 { 2590 rtx reg = gen_rtx_REG (SImode, regno); 2591 rtx vreg = gen_rtx_REG (V64SImode, 2592 VGPR_REGNO (6 + (saved_scalars / 64))); 2593 int lane = saved_scalars % 64; 2594 2595 if (prologue) 2596 emit_insn (gen_vec_setv64si (vreg, reg, GEN_INT (lane))); 2597 else 2598 emit_insn (gen_vec_extractv64sisi (reg, vreg, GEN_INT (lane))); 2599 2600 saved_scalars++; 2601 } 2602 2603 rtx move_scalars = get_insns (); 2604 end_sequence (); 2605 start_sequence (); 2606 2607 /* Ensure that all vector lanes are moved. */ 2608 exec_set = -1; 2609 emit_move_insn (exec, GEN_INT (exec_set)); 2610 2611 /* Set up a vector stack pointer. */ 2612 rtx _0_1_2_3 = gen_rtx_REG (V64SImode, VGPR_REGNO (1)); 2613 rtx _0_4_8_12 = gen_rtx_REG (V64SImode, VGPR_REGNO (3)); 2614 emit_insn (gen_ashlv64si3_exec (_0_4_8_12, _0_1_2_3, GEN_INT (2), 2615 gcn_gen_undef (V64SImode), exec)); 2616 rtx vsp = gen_rtx_REG (V64DImode, VGPR_REGNO (4)); 2617 emit_insn (gen_vec_duplicatev64di_exec (vsp, sp, gcn_gen_undef (V64DImode), 2618 exec)); 2619 emit_insn (gen_addv64si3_vcc_exec (gcn_operand_part (V64SImode, vsp, 0), 2620 gcn_operand_part (V64SImode, vsp, 0), 2621 _0_4_8_12, vcc, gcn_gen_undef (V64SImode), 2622 exec)); 2623 emit_insn (gen_addcv64si3_exec (gcn_operand_part (V64SImode, vsp, 1), 2624 gcn_operand_part (V64SImode, vsp, 1), 2625 const0_rtx, vcc, vcc, 2626 gcn_gen_undef (V64SImode), exec)); 2627 2628 /* Move vectors. 
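Each live VGPR is saved with a scatter (or restored with a gather) through the vector stack pointer set up above, 256 bytes per full 64-lane register. The registers holding the saved scalars (v6 and v7) may be only partly used, in which case the EXEC mask is trimmed so that just the occupied lanes are written and the size used to advance VSP is reduced to match.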
*/ 2629 for (regno = FIRST_VGPR_REG, offset = offsets->pretend_size; 2630 regno < FIRST_PSEUDO_REGISTER; regno++) 2631 if ((df_regs_ever_live_p (regno) && !call_used_regs[regno]) 2632 || (regno == VGPR_REGNO (6) && saved_scalars > 0) 2633 || (regno == VGPR_REGNO (7) && saved_scalars > 63)) 2634 { 2635 rtx reg = gen_rtx_REG (V64SImode, regno); 2636 int size = 256; 2637 2638 if (regno == VGPR_REGNO (6) && saved_scalars < 64) 2639 size = saved_scalars * 4; 2640 else if (regno == VGPR_REGNO (7) && saved_scalars < 128) 2641 size = (saved_scalars - 64) * 4; 2642 2643 if (size != 256 || exec_set != -1) 2644 { 2645 exec_set = ((unsigned HOST_WIDE_INT) 1 << (size / 4)) - 1; 2646 emit_move_insn (exec, gen_int_mode (exec_set, DImode)); 2647 } 2648 2649 if (prologue) 2650 emit_insn (gen_scatterv64si_insn_1offset_exec (vsp, const0_rtx, reg, 2651 as, const0_rtx, exec)); 2652 else 2653 emit_insn (gen_gatherv64si_insn_1offset_exec 2654 (reg, vsp, const0_rtx, as, const0_rtx, 2655 gcn_gen_undef (V64SImode), exec)); 2656 2657 /* Move our VSP to the next stack entry. */ 2658 if (offreg_set != size) 2659 { 2660 offreg_set = size; 2661 emit_move_insn (offreg, GEN_INT (size)); 2662 } 2663 if (exec_set != -1) 2664 { 2665 exec_set = -1; 2666 emit_move_insn (exec, GEN_INT (exec_set)); 2667 } 2668 emit_insn (gen_addv64si3_vcc_dup_exec 2669 (gcn_operand_part (V64SImode, vsp, 0), 2670 offreg, gcn_operand_part (V64SImode, vsp, 0), 2671 vcc, gcn_gen_undef (V64SImode), exec)); 2672 emit_insn (gen_addcv64si3_exec 2673 (gcn_operand_part (V64SImode, vsp, 1), 2674 gcn_operand_part (V64SImode, vsp, 1), 2675 const0_rtx, vcc, vcc, gcn_gen_undef (V64SImode), exec)); 2676 2677 offset += size; 2678 } 2679 2680 rtx move_vectors = get_insns (); 2681 end_sequence (); 2682 2683 if (prologue) 2684 { 2685 emit_insn (move_scalars); 2686 emit_insn (move_vectors); 2687 } 2688 else 2689 { 2690 emit_insn (move_vectors); 2691 emit_insn (move_scalars); 2692 } 2693 } 2694 2695 /* Generate prologue. Called from gen_prologue during pro_and_epilogue pass. 2696 2697 For a non-kernel function, the stack layout looks like this (interim), 2698 growing *upwards*: 2699 2700 hi | + ... 2701 |__________________| <-- current SP 2702 | outgoing args | 2703 |__________________| 2704 | (alloca space) | 2705 |__________________| 2706 | local vars | 2707 |__________________| <-- FP/hard FP 2708 | callee-save regs | 2709 |__________________| <-- soft arg pointer 2710 | pretend args | 2711 |__________________| <-- incoming SP 2712 | incoming args | 2713 lo |..................| 2714 2715 This implies arguments (beyond the first N in registers) must grow 2716 downwards (as, apparently, PA has them do). 2717 2718 For a kernel function we have the simpler: 2719 2720 hi | + ... 2721 |__________________| <-- current SP 2722 | outgoing args | 2723 |__________________| 2724 | (alloca space) | 2725 |__________________| 2726 | local vars | 2727 lo |__________________| <-- FP/hard FP 2728 2729 */ 2730 2731 void 2732 gcn_expand_prologue () 2733 { 2734 machine_function *offsets = gcn_compute_frame_offsets (); 2735 2736 if (!cfun || !cfun->machine || cfun->machine->normal_function) 2737 { 2738 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM); 2739 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM); 2740 2741 start_sequence (); 2742 2743 if (offsets->pretend_size > 0) 2744 { 2745 /* FIXME: Do the actual saving of register pretend args to the stack. 2746 Register order needs consideration. */ 2747 } 2748 2749 /* Save callee-save regs. 
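The helper packs the live SGPRs (plus the link register and frame pointer, when they need saving) into lanes of v6/v7, then stores those and any live VGPRs to the stack; the epilogue calls it again with PROLOGUE false to reverse the process.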
*/ 2750 move_callee_saved_registers (sp, offsets, true); 2751 2752 HOST_WIDE_INT sp_adjust = offsets->pretend_size 2753 + offsets->callee_saves 2754 + offsets->local_vars + offsets->outgoing_args_size; 2755 if (sp_adjust > 0) 2756 emit_insn (gen_adddi3_scc (sp, sp, gen_int_mode (sp_adjust, DImode))); 2757 2758 if (offsets->need_frame_pointer) 2759 emit_insn (gen_adddi3_scc (fp, sp, 2760 gen_int_mode 2761 (-(offsets->local_vars + 2762 offsets->outgoing_args_size), 2763 DImode))); 2764 2765 rtx_insn *seq = get_insns (); 2766 end_sequence (); 2767 2768 /* FIXME: Prologue insns should have this flag set for debug output, etc. 2769 but it causes issues for now. 2770 for (insn = seq; insn; insn = NEXT_INSN (insn)) 2771 if (INSN_P (insn)) 2772 RTX_FRAME_RELATED_P (insn) = 1;*/ 2773 2774 emit_insn (seq); 2775 } 2776 else 2777 { 2778 rtx wave_offset = gen_rtx_REG (SImode, 2779 cfun->machine->args. 2780 reg[PRIVATE_SEGMENT_WAVE_OFFSET_ARG]); 2781 2782 if (TARGET_GCN5_PLUS) 2783 { 2784 /* v0 is reserved for constant zero so that "global" 2785 memory instructions can have a nul-offset without 2786 causing reloads. */ 2787 emit_insn (gen_vec_duplicatev64si 2788 (gen_rtx_REG (V64SImode, VGPR_REGNO (0)), const0_rtx)); 2789 } 2790 2791 if (cfun->machine->args.requested & (1 << FLAT_SCRATCH_INIT_ARG)) 2792 { 2793 rtx fs_init_lo = 2794 gen_rtx_REG (SImode, 2795 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG]); 2796 rtx fs_init_hi = 2797 gen_rtx_REG (SImode, 2798 cfun->machine->args.reg[FLAT_SCRATCH_INIT_ARG] + 1); 2799 rtx fs_reg_lo = gen_rtx_REG (SImode, FLAT_SCRATCH_REG); 2800 rtx fs_reg_hi = gen_rtx_REG (SImode, FLAT_SCRATCH_REG + 1); 2801 2802 /*rtx queue = gen_rtx_REG(DImode, 2803 cfun->machine->args.reg[QUEUE_PTR_ARG]); 2804 rtx aperture = gen_rtx_MEM (SImode, 2805 gen_rtx_PLUS (DImode, queue, 2806 gen_int_mode (68, SImode))); 2807 set_mem_addr_space (aperture, ADDR_SPACE_SCALAR_FLAT);*/ 2808 2809 /* Set up flat_scratch. */ 2810 emit_insn (gen_addsi3_scc (fs_reg_hi, fs_init_lo, wave_offset)); 2811 emit_insn (gen_lshrsi3_scc (fs_reg_hi, fs_reg_hi, 2812 gen_int_mode (8, SImode))); 2813 emit_move_insn (fs_reg_lo, fs_init_hi); 2814 } 2815 2816 /* Set up frame pointer and stack pointer. */ 2817 rtx sp = gen_rtx_REG (DImode, STACK_POINTER_REGNUM); 2818 rtx fp = gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM); 2819 rtx fp_hi = simplify_gen_subreg (SImode, fp, DImode, 4); 2820 rtx fp_lo = simplify_gen_subreg (SImode, fp, DImode, 0); 2821 2822 HOST_WIDE_INT sp_adjust = (offsets->local_vars 2823 + offsets->outgoing_args_size); 2824 2825 /* Initialise FP and SP from the buffer descriptor in s[0:3]. */ 2826 emit_move_insn (fp_lo, gen_rtx_REG (SImode, 0)); 2827 emit_insn (gen_andsi3_scc (fp_hi, gen_rtx_REG (SImode, 1), 2828 gen_int_mode (0xffff, SImode))); 2829 rtx scc = gen_rtx_REG (BImode, SCC_REG); 2830 emit_insn (gen_addsi3_scalar_carry (fp_lo, fp_lo, wave_offset, scc)); 2831 emit_insn (gen_addcsi3_scalar_zero (fp_hi, fp_hi, scc)); 2832 2833 if (sp_adjust > 0) 2834 emit_insn (gen_adddi3_scc (sp, fp, gen_int_mode (sp_adjust, DImode))); 2835 else 2836 emit_move_insn (sp, fp); 2837 2838 /* Make sure the flat scratch reg doesn't get optimised away. */ 2839 emit_insn (gen_prologue_use (gen_rtx_REG (DImode, FLAT_SCRATCH_REG))); 2840 } 2841 2842 /* Ensure that the scheduler doesn't do anything unexpected. 
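The blockage pattern below acts as a scheduling and code-motion barrier, keeping the frame set-up above from being interleaved with the body of the function.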
*/ 2843 emit_insn (gen_blockage ()); 2844 2845 emit_move_insn (gen_rtx_REG (SImode, M0_REG), 2846 gen_int_mode (LDS_SIZE, SImode)); 2847 2848 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, M0_REG))); 2849 if (TARGET_GCN5_PLUS) 2850 emit_insn (gen_prologue_use (gen_rtx_REG (SImode, VGPR_REGNO (0)))); 2851 2852 if (cfun && cfun->machine && !cfun->machine->normal_function && flag_openmp) 2853 { 2854 /* OpenMP kernels have an implicit call to gomp_gcn_enter_kernel. */ 2855 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG); 2856 emit_move_insn (fn_reg, gen_rtx_SYMBOL_REF (Pmode, 2857 "gomp_gcn_enter_kernel")); 2858 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx)); 2859 } 2860 } 2861 2862 /* Generate epilogue. Called from gen_epilogue during pro_and_epilogue pass. 2863 2864 See gcn_expand_prologue for stack details. */ 2865 2866 void 2867 gcn_expand_epilogue (void) 2868 { 2869 /* Ensure that the scheduler doesn't do anything unexpected. */ 2870 emit_insn (gen_blockage ()); 2871 2872 if (!cfun || !cfun->machine || cfun->machine->normal_function) 2873 { 2874 machine_function *offsets = gcn_compute_frame_offsets (); 2875 rtx sp = gen_rtx_REG (Pmode, STACK_POINTER_REGNUM); 2876 rtx fp = gen_rtx_REG (Pmode, HARD_FRAME_POINTER_REGNUM); 2877 2878 HOST_WIDE_INT sp_adjust = offsets->callee_saves + offsets->pretend_size; 2879 2880 if (offsets->need_frame_pointer) 2881 { 2882 /* Restore old SP from the frame pointer. */ 2883 if (sp_adjust > 0) 2884 emit_insn (gen_subdi3 (sp, fp, gen_int_mode (sp_adjust, DImode))); 2885 else 2886 emit_move_insn (sp, fp); 2887 } 2888 else 2889 { 2890 /* Restore old SP from current SP. */ 2891 sp_adjust += offsets->outgoing_args_size + offsets->local_vars; 2892 2893 if (sp_adjust > 0) 2894 emit_insn (gen_subdi3 (sp, sp, gen_int_mode (sp_adjust, DImode))); 2895 } 2896 2897 move_callee_saved_registers (sp, offsets, false); 2898 2899 /* There's no explicit use of the link register on the return insn. Emit 2900 one here instead. */ 2901 if (offsets->lr_needs_saving) 2902 emit_use (gen_rtx_REG (DImode, LINK_REGNUM)); 2903 2904 /* Similar for frame pointer. */ 2905 if (offsets->need_frame_pointer) 2906 emit_use (gen_rtx_REG (DImode, HARD_FRAME_POINTER_REGNUM)); 2907 } 2908 else if (flag_openmp) 2909 { 2910 /* OpenMP kernels have an implicit call to gomp_gcn_exit_kernel. */ 2911 rtx fn_reg = gen_rtx_REG (Pmode, FIRST_PARM_REG); 2912 emit_move_insn (fn_reg, 2913 gen_rtx_SYMBOL_REF (Pmode, "gomp_gcn_exit_kernel")); 2914 emit_call_insn (gen_gcn_indirect_call (fn_reg, const0_rtx)); 2915 } 2916 else if (TREE_CODE (TREE_TYPE (DECL_RESULT (cfun->decl))) != VOID_TYPE) 2917 { 2918 /* Assume that an exit value compatible with gcn-run is expected. 2919 That is, the third input parameter is an int*. 2920 2921 We can't allocate any new registers, but the kernarg_reg is 2922 dead after this, so we'll use that. */ 2923 rtx kernarg_reg = gen_rtx_REG (DImode, cfun->machine->args.reg 2924 [KERNARG_SEGMENT_PTR_ARG]); 2925 rtx retptr_mem = gen_rtx_MEM (DImode, 2926 gen_rtx_PLUS (DImode, kernarg_reg, 2927 GEN_INT (16))); 2928 set_mem_addr_space (retptr_mem, ADDR_SPACE_SCALAR_FLAT); 2929 emit_move_insn (kernarg_reg, retptr_mem); 2930 2931 rtx retval_mem = gen_rtx_MEM (SImode, kernarg_reg); 2932 set_mem_addr_space (retval_mem, ADDR_SPACE_SCALAR_FLAT); 2933 emit_move_insn (retval_mem, 2934 gen_rtx_REG (SImode, SGPR_REGNO (RETURN_VALUE_REG))); 2935 } 2936 2937 emit_jump_insn (gen_gcn_return ()); 2938 } 2939 2940 /* Implement TARGET_CAN_ELIMINATE. 
2941 2942 Return true if the compiler is allowed to try to replace register number 2943 FROM_REG with register number TO_REG. 2944 2945 FIXME: is the default "true" not enough? Should this be a negative set? */ 2946 2947 bool 2948 gcn_can_eliminate_p (int /*from_reg */ , int to_reg) 2949 { 2950 return (to_reg == HARD_FRAME_POINTER_REGNUM 2951 || to_reg == STACK_POINTER_REGNUM); 2952 } 2953 2954 /* Implement INITIAL_ELIMINATION_OFFSET. 2955 2956 Returns the initial difference between the specified pair of registers, in 2957 terms of stack position. */ 2958 2959 HOST_WIDE_INT 2960 gcn_initial_elimination_offset (int from, int to) 2961 { 2962 machine_function *offsets = gcn_compute_frame_offsets (); 2963 2964 switch (from) 2965 { 2966 case ARG_POINTER_REGNUM: 2967 if (to == STACK_POINTER_REGNUM) 2968 return -(offsets->callee_saves + offsets->local_vars 2969 + offsets->outgoing_args_size); 2970 else if (to == FRAME_POINTER_REGNUM || to == HARD_FRAME_POINTER_REGNUM) 2971 return -offsets->callee_saves; 2972 else 2973 gcc_unreachable (); 2974 break; 2975 2976 case FRAME_POINTER_REGNUM: 2977 if (to == STACK_POINTER_REGNUM) 2978 return -(offsets->local_vars + offsets->outgoing_args_size); 2979 else if (to == HARD_FRAME_POINTER_REGNUM) 2980 return 0; 2981 else 2982 gcc_unreachable (); 2983 break; 2984 2985 default: 2986 gcc_unreachable (); 2987 } 2988 } 2989 2990 /* Implement HARD_REGNO_RENAME_OK. 2991 2992 Return true if it is permissible to rename a hard register from 2993 FROM_REG to TO_REG. */ 2994 2995 bool 2996 gcn_hard_regno_rename_ok (unsigned int from_reg, unsigned int to_reg) 2997 { 2998 if (from_reg == SCC_REG 2999 || from_reg == VCC_LO_REG || from_reg == VCC_HI_REG 3000 || from_reg == EXEC_LO_REG || from_reg == EXEC_HI_REG 3001 || to_reg == SCC_REG 3002 || to_reg == VCC_LO_REG || to_reg == VCC_HI_REG 3003 || to_reg == EXEC_LO_REG || to_reg == EXEC_HI_REG) 3004 return false; 3005 3006 /* Allow the link register to be used if it was saved. */ 3007 if ((to_reg & ~1) == LINK_REGNUM) 3008 return !cfun || cfun->machine->lr_needs_saving; 3009 3010 /* Allow the registers used for the static chain to be used if the chain is 3011 not in active use. */ 3012 if ((to_reg & ~1) == STATIC_CHAIN_REGNUM) 3013 return !cfun 3014 || !(cfun->static_chain_decl 3015 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM) 3016 && df_regs_ever_live_p (STATIC_CHAIN_REGNUM + 1)); 3017 3018 return true; 3019 } 3020 3021 /* Implement HARD_REGNO_CALLER_SAVE_MODE. 3022 3023 Which mode is required for saving NREGS of a pseudo-register in 3024 call-clobbered hard register REGNO. */ 3025 3026 machine_mode 3027 gcn_hard_regno_caller_save_mode (unsigned int regno, unsigned int nregs, 3028 machine_mode regmode) 3029 { 3030 machine_mode result = choose_hard_reg_mode (regno, nregs, false); 3031 3032 if (VECTOR_MODE_P (result) && !VECTOR_MODE_P (regmode)) 3033 result = (nregs == 1 ? SImode : DImode); 3034 3035 return result; 3036 } 3037 3038 /* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE. 3039 3040 Output assembler code for a block containing the constant parts 3041 of a trampoline, leaving space for the variable parts. */ 3042 3043 static void 3044 gcn_asm_trampoline_template (FILE *f) 3045 { 3046 /* The source operand of the move instructions must be a 32-bit 3047 constant following the opcode. 
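The 0xffff values below are placeholders: gcn_trampoline_init later rewrites them with the two halves of the static chain value and of the target function's address, so the emitted template is four s_mov_b32 instructions followed by an s_setpc_b64 through the loaded address.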
*/ 3048 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM); 3049 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", STATIC_CHAIN_REGNUM + 1); 3050 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG); 3051 asm_fprintf (f, "\ts_mov_b32\ts%i, 0xffff\n", CC_SAVE_REG + 1); 3052 asm_fprintf (f, "\ts_setpc_b64\ts[%i:%i]\n", CC_SAVE_REG, CC_SAVE_REG + 1); 3053 } 3054 3055 /* Implement TARGET_TRAMPOLINE_INIT. 3056 3057 Emit RTL insns to initialize the variable parts of a trampoline. 3058 FNDECL is the decl of the target address, M_TRAMP is a MEM for 3059 the trampoline, and CHAIN_VALUE is an RTX for the static chain 3060 to be passed to the target function. */ 3061 3062 static void 3063 gcn_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) 3064 { 3065 if (TARGET_GCN5_PLUS) 3066 sorry ("nested function trampolines not supported on GCN5 due to" 3067 " non-executable stacks"); 3068 3069 emit_block_move (m_tramp, assemble_trampoline_template (), 3070 GEN_INT (TRAMPOLINE_SIZE), BLOCK_OP_NORMAL); 3071 3072 rtx fnaddr = XEXP (DECL_RTL (fndecl), 0); 3073 rtx chain_value_reg = copy_to_reg (chain_value); 3074 rtx fnaddr_reg = copy_to_reg (fnaddr); 3075 3076 for (int i = 0; i < 4; i++) 3077 { 3078 rtx mem = adjust_address (m_tramp, SImode, i * 8 + 4); 3079 rtx reg = i < 2 ? chain_value_reg : fnaddr_reg; 3080 emit_move_insn (mem, gen_rtx_SUBREG (SImode, reg, (i % 2) * 4)); 3081 } 3082 3083 rtx tramp_addr = XEXP (m_tramp, 0); 3084 emit_insn (gen_clear_icache (tramp_addr, 3085 plus_constant (ptr_mode, tramp_addr, 3086 TRAMPOLINE_SIZE))); 3087 } 3088 3089 /* }}} */ 3090 /* {{{ Miscellaneous. */ 3091 3092 /* Implement TARGET_CANNOT_COPY_INSN_P. 3093 3094 Return true if INSN must not be duplicated. */ 3095 3096 static bool 3097 gcn_cannot_copy_insn_p (rtx_insn *insn) 3098 { 3099 if (recog_memoized (insn) == CODE_FOR_gcn_wavefront_barrier) 3100 return true; 3101 3102 return false; 3103 } 3104 3105 /* Implement TARGET_DEBUG_UNWIND_INFO. 3106 3107 Defines the mechanism that will be used for describing frame unwind 3108 information to the debugger. */ 3109 3110 static enum unwind_info_type 3111 gcn_debug_unwind_info () 3112 { 3113 /* No support for debug info, yet. */ 3114 return UI_NONE; 3115 } 3116 3117 /* Determine if there is a suitable hardware conversion instruction. 3118 Used primarily by the machine description. 
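Vector conversions are checked element-wise: both modes must be scalar or both vector, and for vectors the inner modes are compared. The switch verifies that the operation class matches the modes (float-to-int for the fix_trunc variants, int-to-float for the float variants, widening or narrowing for extend and trunc), and the final return lists the exact mode pairs the hardware can convert.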
*/ 3119 3120 bool 3121 gcn_valid_cvt_p (machine_mode from, machine_mode to, enum gcn_cvt_t op) 3122 { 3123 if (VECTOR_MODE_P (from) != VECTOR_MODE_P (to)) 3124 return false; 3125 3126 if (VECTOR_MODE_P (from)) 3127 { 3128 from = GET_MODE_INNER (from); 3129 to = GET_MODE_INNER (to); 3130 } 3131 3132 switch (op) 3133 { 3134 case fix_trunc_cvt: 3135 case fixuns_trunc_cvt: 3136 if (GET_MODE_CLASS (from) != MODE_FLOAT 3137 || GET_MODE_CLASS (to) != MODE_INT) 3138 return false; 3139 break; 3140 case float_cvt: 3141 case floatuns_cvt: 3142 if (GET_MODE_CLASS (from) != MODE_INT 3143 || GET_MODE_CLASS (to) != MODE_FLOAT) 3144 return false; 3145 break; 3146 case extend_cvt: 3147 if (GET_MODE_CLASS (from) != MODE_FLOAT 3148 || GET_MODE_CLASS (to) != MODE_FLOAT 3149 || GET_MODE_SIZE (from) >= GET_MODE_SIZE (to)) 3150 return false; 3151 break; 3152 case trunc_cvt: 3153 if (GET_MODE_CLASS (from) != MODE_FLOAT 3154 || GET_MODE_CLASS (to) != MODE_FLOAT 3155 || GET_MODE_SIZE (from) <= GET_MODE_SIZE (to)) 3156 return false; 3157 break; 3158 } 3159 3160 return ((to == HImode && from == HFmode) 3161 || (to == SImode && (from == SFmode || from == DFmode)) 3162 || (to == HFmode && (from == HImode || from == SFmode)) 3163 || (to == SFmode && (from == SImode || from == HFmode 3164 || from == DFmode)) 3165 || (to == DFmode && (from == SImode || from == SFmode))); 3166 } 3167 3168 /* Implement both TARGET_ASM_CONSTRUCTOR and TARGET_ASM_DESTRUCTOR. 3169 3170 The current loader does not support running code outside "main". This 3171 hook implementation can be replaced or removed when that changes. */ 3172 3173 void 3174 gcn_disable_constructors (rtx symbol, int priority __attribute__ ((unused))) 3175 { 3176 tree d = SYMBOL_REF_DECL (symbol); 3177 location_t l = d ? DECL_SOURCE_LOCATION (d) : UNKNOWN_LOCATION; 3178 3179 sorry_at (l, "GCN does not support static constructors or destructors"); 3180 } 3181 3182 /* }}} */ 3183 /* {{{ Costs. */ 3184 3185 /* Implement TARGET_RTX_COSTS. 3186 3187 Compute a (partial) cost for rtx X. Return true if the complete 3188 cost has been computed, and false if subexpressions should be 3189 scanned. In either case, *TOTAL contains the cost result. */ 3190 3191 static bool 3192 gcn_rtx_costs (rtx x, machine_mode, int, int, int *total, bool) 3193 { 3194 enum rtx_code code = GET_CODE (x); 3195 switch (code) 3196 { 3197 case CONST: 3198 case CONST_DOUBLE: 3199 case CONST_VECTOR: 3200 case CONST_INT: 3201 if (gcn_inline_constant_p (x)) 3202 *total = 0; 3203 else if (code == CONST_INT 3204 && ((unsigned HOST_WIDE_INT) INTVAL (x) + 0x8000) < 0x10000) 3205 *total = 1; 3206 else if (gcn_constant_p (x)) 3207 *total = 2; 3208 else 3209 *total = vgpr_vector_mode_p (GET_MODE (x)) ? 64 : 4; 3210 return true; 3211 3212 case DIV: 3213 *total = 100; 3214 return false; 3215 3216 default: 3217 *total = 3; 3218 return false; 3219 } 3220 } 3221 3222 /* Implement TARGET_MEMORY_MOVE_COST. 3223 3224 Return the cost of moving data of mode M between a 3225 register and memory. A value of 2 is the default; this cost is 3226 relative to those in `REGISTER_MOVE_COST'. 3227 3228 This function is used extensively by register_move_cost that is used to 3229 build tables at startup. Make it inline in this case. 3230 When IN is 2, return maximum of in and out move cost. 3231 3232 If moving between registers and memory is more expensive than 3233 between two registers, you should define this macro to express the 3234 relative cost. 
3235 3236 Model also increased moving costs of QImode registers in non 3237 Q_REGS classes. */ 3238 3239 #define LOAD_COST 32 3240 #define STORE_COST 32 3241 static int 3242 gcn_memory_move_cost (machine_mode mode, reg_class_t regclass, bool in) 3243 { 3244 int nregs = CEIL (GET_MODE_SIZE (mode), 4); 3245 switch (regclass) 3246 { 3247 case SCC_CONDITIONAL_REG: 3248 case VCCZ_CONDITIONAL_REG: 3249 case VCC_CONDITIONAL_REG: 3250 case EXECZ_CONDITIONAL_REG: 3251 case ALL_CONDITIONAL_REGS: 3252 case SGPR_REGS: 3253 case SGPR_EXEC_REGS: 3254 case EXEC_MASK_REG: 3255 case SGPR_VOP_SRC_REGS: 3256 case SGPR_MEM_SRC_REGS: 3257 case SGPR_SRC_REGS: 3258 case SGPR_DST_REGS: 3259 case GENERAL_REGS: 3260 case AFP_REGS: 3261 if (!in) 3262 return (STORE_COST + 2) * nregs; 3263 return LOAD_COST * nregs; 3264 case VGPR_REGS: 3265 if (in) 3266 return (LOAD_COST + 2) * nregs; 3267 return STORE_COST * nregs; 3268 case ALL_REGS: 3269 case ALL_GPR_REGS: 3270 case SRCDST_REGS: 3271 if (in) 3272 return (LOAD_COST + 2) * nregs; 3273 return (STORE_COST + 2) * nregs; 3274 default: 3275 gcc_unreachable (); 3276 } 3277 } 3278 3279 /* Implement TARGET_REGISTER_MOVE_COST. 3280 3281 Return the cost of moving data from a register in class CLASS1 to 3282 one in class CLASS2. Base value is 2. */ 3283 3284 static int 3285 gcn_register_move_cost (machine_mode, reg_class_t dst, reg_class_t src) 3286 { 3287 /* Increase cost of moving from and to vector registers. While this is 3288 fast in hardware (I think), it has hidden cost of setting up the exec 3289 flags. */ 3290 if ((src < VGPR_REGS) != (dst < VGPR_REGS)) 3291 return 4; 3292 return 2; 3293 } 3294 3295 /* }}} */ 3296 /* {{{ Builtins. */ 3297 3298 /* Type codes used by GCN built-in definitions. */ 3299 3300 enum gcn_builtin_type_index 3301 { 3302 GCN_BTI_END_OF_PARAMS, 3303 3304 GCN_BTI_VOID, 3305 GCN_BTI_BOOL, 3306 GCN_BTI_INT, 3307 GCN_BTI_UINT, 3308 GCN_BTI_SIZE_T, 3309 GCN_BTI_LLINT, 3310 GCN_BTI_LLUINT, 3311 GCN_BTI_EXEC, 3312 3313 GCN_BTI_SF, 3314 GCN_BTI_V64SI, 3315 GCN_BTI_V64SF, 3316 GCN_BTI_V64PTR, 3317 GCN_BTI_SIPTR, 3318 GCN_BTI_SFPTR, 3319 GCN_BTI_VOIDPTR, 3320 3321 GCN_BTI_LDS_VOIDPTR, 3322 3323 GCN_BTI_MAX 3324 }; 3325 3326 static GTY(()) tree gcn_builtin_types[GCN_BTI_MAX]; 3327 3328 #define exec_type_node (gcn_builtin_types[GCN_BTI_EXEC]) 3329 #define sf_type_node (gcn_builtin_types[GCN_BTI_SF]) 3330 #define v64si_type_node (gcn_builtin_types[GCN_BTI_V64SI]) 3331 #define v64sf_type_node (gcn_builtin_types[GCN_BTI_V64SF]) 3332 #define v64ptr_type_node (gcn_builtin_types[GCN_BTI_V64PTR]) 3333 #define siptr_type_node (gcn_builtin_types[GCN_BTI_SIPTR]) 3334 #define sfptr_type_node (gcn_builtin_types[GCN_BTI_SFPTR]) 3335 #define voidptr_type_node (gcn_builtin_types[GCN_BTI_VOIDPTR]) 3336 #define size_t_type_node (gcn_builtin_types[GCN_BTI_SIZE_T]) 3337 3338 static rtx gcn_expand_builtin_1 (tree, rtx, rtx, machine_mode, int, 3339 struct gcn_builtin_description *); 3340 static rtx gcn_expand_builtin_binop (tree, rtx, rtx, machine_mode, int, 3341 struct gcn_builtin_description *); 3342 3343 struct gcn_builtin_description; 3344 typedef rtx (*gcn_builtin_expander) (tree, rtx, rtx, machine_mode, int, 3345 struct gcn_builtin_description *); 3346 3347 enum gcn_builtin_type 3348 { 3349 B_UNIMPLEMENTED, /* Sorry out */ 3350 B_INSN, /* Emit a pattern */ 3351 B_OVERLOAD /* Placeholder for an overloaded function */ 3352 }; 3353 3354 struct gcn_builtin_description 3355 { 3356 int fcode; 3357 int icode; 3358 const char *name; 3359 enum gcn_builtin_type type; 3360 /* The first 
element of parm is always the return type. The rest 3361 are a zero terminated list of parameters. */ 3362 int parm[6]; 3363 gcn_builtin_expander expander; 3364 }; 3365 3366 /* Read in the GCN builtins from gcn-builtins.def. */ 3367 3368 extern GTY(()) struct gcn_builtin_description gcn_builtins[GCN_BUILTIN_MAX]; 3369 3370 struct gcn_builtin_description gcn_builtins[] = { 3371 #define DEF_BUILTIN(fcode, icode, name, type, params, expander) \ 3372 {GCN_BUILTIN_ ## fcode, icode, name, type, params, expander}, 3373 3374 #define DEF_BUILTIN_BINOP_INT_FP(fcode, ic, name) \ 3375 {GCN_BUILTIN_ ## fcode ## _V64SI, \ 3376 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int", B_INSN, \ 3377 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \ 3378 GCN_BTI_V64SI, GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, \ 3379 {GCN_BUILTIN_ ## fcode ## _V64SI_unspec, \ 3380 CODE_FOR_ ## ic ##v64si3_exec, name "_v64int_unspec", B_INSN, \ 3381 {GCN_BTI_V64SI, GCN_BTI_EXEC, GCN_BTI_V64SI, GCN_BTI_V64SI, \ 3382 GCN_BTI_END_OF_PARAMS}, gcn_expand_builtin_binop}, 3383 3384 #include "gcn-builtins.def" 3385 #undef DEF_BUILTIN_BINOP_INT_FP 3386 #undef DEF_BUILTIN 3387 }; 3388 3389 static GTY(()) tree gcn_builtin_decls[GCN_BUILTIN_MAX]; 3390 3391 /* Implement TARGET_BUILTIN_DECL. 3392 3393 Return the GCN builtin for CODE. */ 3394 3395 tree 3396 gcn_builtin_decl (unsigned code, bool ARG_UNUSED (initialize_p)) 3397 { 3398 if (code >= GCN_BUILTIN_MAX) 3399 return error_mark_node; 3400 3401 return gcn_builtin_decls[code]; 3402 } 3403 3404 /* Helper function for gcn_init_builtins. */ 3405 3406 static void 3407 gcn_init_builtin_types (void) 3408 { 3409 gcn_builtin_types[GCN_BTI_VOID] = void_type_node; 3410 gcn_builtin_types[GCN_BTI_BOOL] = boolean_type_node; 3411 gcn_builtin_types[GCN_BTI_INT] = intSI_type_node; 3412 gcn_builtin_types[GCN_BTI_UINT] = unsigned_type_for (intSI_type_node); 3413 gcn_builtin_types[GCN_BTI_SIZE_T] = size_type_node; 3414 gcn_builtin_types[GCN_BTI_LLINT] = intDI_type_node; 3415 gcn_builtin_types[GCN_BTI_LLUINT] = unsigned_type_for (intDI_type_node); 3416 3417 exec_type_node = unsigned_intDI_type_node; 3418 sf_type_node = float32_type_node; 3419 v64si_type_node = build_vector_type (intSI_type_node, 64); 3420 v64sf_type_node = build_vector_type (float_type_node, 64); 3421 v64ptr_type_node = build_vector_type (unsigned_intDI_type_node 3422 /*build_pointer_type 3423 (integer_type_node) */ 3424 , 64); 3425 tree tmp = build_distinct_type_copy (intSI_type_node); 3426 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT; 3427 siptr_type_node = build_pointer_type (tmp); 3428 3429 tmp = build_distinct_type_copy (float_type_node); 3430 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT; 3431 sfptr_type_node = build_pointer_type (tmp); 3432 3433 tmp = build_distinct_type_copy (void_type_node); 3434 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_FLAT; 3435 voidptr_type_node = build_pointer_type (tmp); 3436 3437 tmp = build_distinct_type_copy (void_type_node); 3438 TYPE_ADDR_SPACE (tmp) = ADDR_SPACE_LDS; 3439 gcn_builtin_types[GCN_BTI_LDS_VOIDPTR] = build_pointer_type (tmp); 3440 } 3441 3442 /* Implement TARGET_INIT_BUILTINS. 3443 3444 Set up all builtin functions for this target. */ 3445 3446 static void 3447 gcn_init_builtins (void) 3448 { 3449 gcn_init_builtin_types (); 3450 3451 struct gcn_builtin_description *d; 3452 unsigned int i; 3453 for (i = 0, d = gcn_builtins; i < GCN_BUILTIN_MAX; i++, d++) 3454 { 3455 tree p; 3456 char name[64]; /* build_function will make a copy. */ 3457 int parm; 3458 3459 /* FIXME: Is this necessary/useful? 
*/ 3460 if (d->name == 0) 3461 continue; 3462 3463 /* Find last parm. */ 3464 for (parm = 1; d->parm[parm] != GCN_BTI_END_OF_PARAMS; parm++) 3465 ; 3466 3467 p = void_list_node; 3468 while (parm > 1) 3469 p = tree_cons (NULL_TREE, gcn_builtin_types[d->parm[--parm]], p); 3470 3471 p = build_function_type (gcn_builtin_types[d->parm[0]], p); 3472 3473 sprintf (name, "__builtin_gcn_%s", d->name); 3474 gcn_builtin_decls[i] 3475 = add_builtin_function (name, p, i, BUILT_IN_MD, NULL, NULL_TREE); 3476 3477 /* These builtins don't throw. */ 3478 TREE_NOTHROW (gcn_builtin_decls[i]) = 1; 3479 } 3480 3481 /* FIXME: remove the ifdef once OpenACC support is merged upstream. */ 3482 #ifdef BUILT_IN_GOACC_SINGLE_START 3483 /* These builtins need to take/return an LDS pointer: override the generic 3484 versions here. */ 3485 3486 set_builtin_decl (BUILT_IN_GOACC_SINGLE_START, 3487 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_START], false); 3488 3489 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_START, 3490 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_START], 3491 false); 3492 3493 set_builtin_decl (BUILT_IN_GOACC_SINGLE_COPY_END, 3494 gcn_builtin_decls[GCN_BUILTIN_ACC_SINGLE_COPY_END], 3495 false); 3496 3497 set_builtin_decl (BUILT_IN_GOACC_BARRIER, 3498 gcn_builtin_decls[GCN_BUILTIN_ACC_BARRIER], false); 3499 #endif 3500 } 3501 3502 /* Expand the CMP_SWAP GCN builtins. We have our own versions that do 3503 not require taking the address of any object, other than the memory 3504 cell being operated on. 3505 3506 Helper function for gcn_expand_builtin_1. */ 3507 3508 static rtx 3509 gcn_expand_cmp_swap (tree exp, rtx target) 3510 { 3511 machine_mode mode = TYPE_MODE (TREE_TYPE (exp)); 3512 addr_space_t as 3513 = TYPE_ADDR_SPACE (TREE_TYPE (TREE_TYPE (CALL_EXPR_ARG (exp, 0)))); 3514 machine_mode as_mode = gcn_addr_space_address_mode (as); 3515 3516 if (!target) 3517 target = gen_reg_rtx (mode); 3518 3519 rtx addr = expand_expr (CALL_EXPR_ARG (exp, 0), 3520 NULL_RTX, as_mode, EXPAND_NORMAL); 3521 rtx cmp = expand_expr (CALL_EXPR_ARG (exp, 1), 3522 NULL_RTX, mode, EXPAND_NORMAL); 3523 rtx src = expand_expr (CALL_EXPR_ARG (exp, 2), 3524 NULL_RTX, mode, EXPAND_NORMAL); 3525 rtx pat; 3526 3527 rtx mem = gen_rtx_MEM (mode, force_reg (as_mode, addr)); 3528 set_mem_addr_space (mem, as); 3529 3530 if (!REG_P (cmp)) 3531 cmp = copy_to_mode_reg (mode, cmp); 3532 if (!REG_P (src)) 3533 src = copy_to_mode_reg (mode, src); 3534 3535 if (mode == SImode) 3536 pat = gen_sync_compare_and_swapsi (target, mem, cmp, src); 3537 else 3538 pat = gen_sync_compare_and_swapdi (target, mem, cmp, src); 3539 3540 emit_insn (pat); 3541 3542 return target; 3543 } 3544 3545 /* Expand many different builtins. 3546 3547 Intended for use in gcn-builtins.def. 
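This is the catch-all expander: it dispatches on DECL_FUNCTION_CODE and emits RTL for each builtin directly, typically forcing the EXEC mask and pointer/offset arguments into registers and then emitting the corresponding masked vector pattern (flat loads and stores, sqrt, the OpenACC single/barrier helpers, and the compare-and-swap helper above).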
*/ 3548 3549 static rtx 3550 gcn_expand_builtin_1 (tree exp, rtx target, rtx /*subtarget */ , 3551 machine_mode /*mode */ , int ignore, 3552 struct gcn_builtin_description *) 3553 { 3554 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); 3555 switch (DECL_FUNCTION_CODE (fndecl)) 3556 { 3557 case GCN_BUILTIN_FLAT_LOAD_INT32: 3558 { 3559 if (ignore) 3560 return target; 3561 /*rtx exec = */ 3562 force_reg (DImode, 3563 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode, 3564 EXPAND_NORMAL)); 3565 /*rtx ptr = */ 3566 force_reg (V64DImode, 3567 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, V64DImode, 3568 EXPAND_NORMAL)); 3569 /*emit_insn (gen_vector_flat_loadv64si 3570 (target, gcn_gen_undef (V64SImode), ptr, exec)); */ 3571 return target; 3572 } 3573 case GCN_BUILTIN_FLAT_LOAD_PTR_INT32: 3574 case GCN_BUILTIN_FLAT_LOAD_PTR_FLOAT: 3575 { 3576 if (ignore) 3577 return target; 3578 rtx exec = force_reg (DImode, 3579 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, 3580 DImode, 3581 EXPAND_NORMAL)); 3582 rtx ptr = force_reg (DImode, 3583 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, 3584 V64DImode, 3585 EXPAND_NORMAL)); 3586 rtx offsets = force_reg (V64SImode, 3587 expand_expr (CALL_EXPR_ARG (exp, 2), 3588 NULL_RTX, V64DImode, 3589 EXPAND_NORMAL)); 3590 rtx addrs = gen_reg_rtx (V64DImode); 3591 rtx tmp = gen_reg_rtx (V64SImode); 3592 emit_insn (gen_ashlv64si3_exec (tmp, offsets, 3593 GEN_INT (2), 3594 gcn_gen_undef (V64SImode), exec)); 3595 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr, 3596 gcn_gen_undef (V64DImode), 3597 exec)); 3598 rtx mem = gen_rtx_MEM (GET_MODE (target), addrs); 3599 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */ 3600 /* FIXME: set attributes. */ 3601 emit_insn (gen_mov_with_exec (target, mem, exec)); 3602 return target; 3603 } 3604 case GCN_BUILTIN_FLAT_STORE_PTR_INT32: 3605 case GCN_BUILTIN_FLAT_STORE_PTR_FLOAT: 3606 { 3607 rtx exec = force_reg (DImode, 3608 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, 3609 DImode, 3610 EXPAND_NORMAL)); 3611 rtx ptr = force_reg (DImode, 3612 expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, 3613 V64DImode, 3614 EXPAND_NORMAL)); 3615 rtx offsets = force_reg (V64SImode, 3616 expand_expr (CALL_EXPR_ARG (exp, 2), 3617 NULL_RTX, V64DImode, 3618 EXPAND_NORMAL)); 3619 machine_mode vmode = TYPE_MODE (TREE_TYPE (CALL_EXPR_ARG (exp, 3620 3))); 3621 rtx val = force_reg (vmode, 3622 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX, 3623 vmode, 3624 EXPAND_NORMAL)); 3625 rtx addrs = gen_reg_rtx (V64DImode); 3626 rtx tmp = gen_reg_rtx (V64SImode); 3627 emit_insn (gen_ashlv64si3_exec (tmp, offsets, 3628 GEN_INT (2), 3629 gcn_gen_undef (V64SImode), exec)); 3630 emit_insn (gen_addv64di3_zext_dup2_exec (addrs, tmp, ptr, 3631 gcn_gen_undef (V64DImode), 3632 exec)); 3633 rtx mem = gen_rtx_MEM (vmode, addrs); 3634 /*set_mem_addr_space (mem, ADDR_SPACE_FLAT); */ 3635 /* FIXME: set attributes. 
*/ 3636 emit_insn (gen_mov_with_exec (mem, val, exec)); 3637 return target; 3638 } 3639 case GCN_BUILTIN_SQRTVF: 3640 { 3641 if (ignore) 3642 return target; 3643 rtx exec = gcn_full_exec_reg (); 3644 rtx arg = force_reg (V64SFmode, 3645 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, 3646 V64SFmode, 3647 EXPAND_NORMAL)); 3648 emit_insn (gen_sqrtv64sf2_exec 3649 (target, arg, gcn_gen_undef (V64SFmode), exec)); 3650 return target; 3651 } 3652 case GCN_BUILTIN_SQRTF: 3653 { 3654 if (ignore) 3655 return target; 3656 rtx arg = force_reg (SFmode, 3657 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, 3658 SFmode, 3659 EXPAND_NORMAL)); 3660 emit_insn (gen_sqrtsf2 (target, arg)); 3661 return target; 3662 } 3663 case GCN_BUILTIN_OMP_DIM_SIZE: 3664 { 3665 if (ignore) 3666 return target; 3667 emit_insn (gen_oacc_dim_size (target, 3668 expand_expr (CALL_EXPR_ARG (exp, 0), 3669 NULL_RTX, SImode, 3670 EXPAND_NORMAL))); 3671 return target; 3672 } 3673 case GCN_BUILTIN_OMP_DIM_POS: 3674 { 3675 if (ignore) 3676 return target; 3677 emit_insn (gen_oacc_dim_pos (target, 3678 expand_expr (CALL_EXPR_ARG (exp, 0), 3679 NULL_RTX, SImode, 3680 EXPAND_NORMAL))); 3681 return target; 3682 } 3683 case GCN_BUILTIN_CMP_SWAP: 3684 case GCN_BUILTIN_CMP_SWAPLL: 3685 return gcn_expand_cmp_swap (exp, target); 3686 3687 case GCN_BUILTIN_ACC_SINGLE_START: 3688 { 3689 if (ignore) 3690 return target; 3691 3692 rtx wavefront = gcn_oacc_dim_pos (1); 3693 rtx cond = gen_rtx_EQ (VOIDmode, wavefront, const0_rtx); 3694 rtx cc = (target && REG_P (target)) ? target : gen_reg_rtx (BImode); 3695 emit_insn (gen_cstoresi4 (cc, cond, wavefront, const0_rtx)); 3696 return cc; 3697 } 3698 3699 case GCN_BUILTIN_ACC_SINGLE_COPY_START: 3700 { 3701 rtx blk = force_reg (SImode, 3702 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, 3703 SImode, EXPAND_NORMAL)); 3704 rtx wavefront = gcn_oacc_dim_pos (1); 3705 rtx cond = gen_rtx_NE (VOIDmode, wavefront, const0_rtx); 3706 rtx not_zero = gen_label_rtx (); 3707 emit_insn (gen_cbranchsi4 (cond, wavefront, const0_rtx, not_zero)); 3708 emit_move_insn (blk, const0_rtx); 3709 emit_label (not_zero); 3710 return blk; 3711 } 3712 3713 case GCN_BUILTIN_ACC_SINGLE_COPY_END: 3714 return target; 3715 3716 case GCN_BUILTIN_ACC_BARRIER: 3717 emit_insn (gen_gcn_wavefront_barrier ()); 3718 return target; 3719 3720 default: 3721 gcc_unreachable (); 3722 } 3723 } 3724 3725 /* Expansion of simple arithmetic and bit binary operation builtins. 3726 3727 Intended for use with gcn_builtins table. 
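By convention the first call argument is the EXEC mask and the next two are the operands; an optional fourth argument supplies the value merged into inactive lanes, otherwise an undefined vector is used. The instruction emitted comes from the icode field of the table entry.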
*/ 3728 3729 static rtx 3730 gcn_expand_builtin_binop (tree exp, rtx target, rtx /*subtarget */ , 3731 machine_mode /*mode */ , int ignore, 3732 struct gcn_builtin_description *d) 3733 { 3734 int icode = d->icode; 3735 if (ignore) 3736 return target; 3737 3738 rtx exec = force_reg (DImode, 3739 expand_expr (CALL_EXPR_ARG (exp, 0), NULL_RTX, DImode, 3740 EXPAND_NORMAL)); 3741 3742 machine_mode m1 = insn_data[icode].operand[1].mode; 3743 rtx arg1 = expand_expr (CALL_EXPR_ARG (exp, 1), NULL_RTX, m1, 3744 EXPAND_NORMAL); 3745 if (!insn_data[icode].operand[1].predicate (arg1, m1)) 3746 arg1 = force_reg (m1, arg1); 3747 3748 machine_mode m2 = insn_data[icode].operand[2].mode; 3749 rtx arg2 = expand_expr (CALL_EXPR_ARG (exp, 2), NULL_RTX, m2, 3750 EXPAND_NORMAL); 3751 if (!insn_data[icode].operand[2].predicate (arg2, m2)) 3752 arg2 = force_reg (m2, arg2); 3753 3754 rtx arg_prev; 3755 if (call_expr_nargs (exp) == 4) 3756 { 3757 machine_mode m_prev = insn_data[icode].operand[4].mode; 3758 arg_prev = force_reg (m_prev, 3759 expand_expr (CALL_EXPR_ARG (exp, 3), NULL_RTX, 3760 m_prev, EXPAND_NORMAL)); 3761 } 3762 else 3763 arg_prev = gcn_gen_undef (GET_MODE (target)); 3764 3765 rtx pat = GEN_FCN (icode) (target, arg1, arg2, exec, arg_prev); 3766 emit_insn (pat); 3767 return target; 3768 } 3769 3770 /* Implement TARGET_EXPAND_BUILTIN. 3771 3772 Expand an expression EXP that calls a built-in function, with result going 3773 to TARGET if that's convenient (and in mode MODE if that's convenient). 3774 SUBTARGET may be used as the target for computing one of EXP's operands. 3775 IGNORE is nonzero if the value is to be ignored. */ 3776 3777 rtx 3778 gcn_expand_builtin (tree exp, rtx target, rtx subtarget, machine_mode mode, 3779 int ignore) 3780 { 3781 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0); 3782 unsigned int fcode = DECL_FUNCTION_CODE (fndecl); 3783 struct gcn_builtin_description *d; 3784 3785 gcc_assert (fcode < GCN_BUILTIN_MAX); 3786 d = &gcn_builtins[fcode]; 3787 3788 if (d->type == B_UNIMPLEMENTED) 3789 sorry ("Builtin not implemented"); 3790 3791 return d->expander (exp, target, subtarget, mode, ignore, d); 3792 } 3793 3794 /* }}} */ 3795 /* {{{ Vectorization. */ 3796 3797 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. 3798 3799 A vector mask is a value that holds one boolean result for every element in 3800 a vector. */ 3801 3802 opt_machine_mode 3803 gcn_vectorize_get_mask_mode (poly_uint64 ARG_UNUSED (nunits), 3804 poly_uint64 ARG_UNUSED (length)) 3805 { 3806 /* GCN uses a DImode bit-mask. */ 3807 return DImode; 3808 } 3809 3810 /* Return an RTX that references a vector with the i-th lane containing 3811 PERM[i]*4. 3812 3813 Helper function for gcn_vectorize_vec_perm_const. */ 3814 3815 static rtx 3816 gcn_make_vec_perm_address (unsigned int *perm) 3817 { 3818 rtx x = gen_reg_rtx (V64SImode); 3819 emit_move_insn (x, gcn_vec_constant (V64SImode, 0)); 3820 3821 /* Permutation addresses use byte addressing. With each vector lane being 3822 4 bytes wide, and with 64 lanes in total, only bits 2..7 are significant, 3823 so only set those. 3824 3825 The permutation given to the vec_perm* patterns range from 0 to 2N-1 to 3826 select between lanes in two vectors, but as the DS_BPERMUTE* instructions 3827 only take one source vector, the most-significant bit can be ignored 3828 here. Instead, we can use EXEC masking to select the relevant part of 3829 each source vector after they are permuted separately. 
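The address vector is therefore built additively: for each of bits 2..7, the bit's value is added only in the lanes whose byte address perm[lane] * 4 has that bit set. For instance, a lane selecting element 5 needs byte address 20, which accumulates as 4 + 16 over two of the six iterations.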
*/ 3830 uint64_t bit_mask = 1 << 2; 3831 for (int i = 2; i < 8; i++, bit_mask <<= 1) 3832 { 3833 uint64_t exec_mask = 0; 3834 uint64_t lane_mask = 1; 3835 for (int j = 0; j < 64; j++, lane_mask <<= 1) 3836 if ((perm[j] * 4) & bit_mask) 3837 exec_mask |= lane_mask; 3838 3839 if (exec_mask) 3840 emit_insn (gen_addv64si3_exec (x, x, 3841 gcn_vec_constant (V64SImode, 3842 bit_mask), 3843 x, get_exec (exec_mask))); 3844 } 3845 3846 return x; 3847 } 3848 3849 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. 3850 3851 Return true if permutation with SEL is possible. 3852 3853 If DST/SRC0/SRC1 are non-null, emit the instructions to perform the 3854 permutations. */ 3855 3856 static bool 3857 gcn_vectorize_vec_perm_const (machine_mode vmode, rtx dst, 3858 rtx src0, rtx src1, 3859 const vec_perm_indices & sel) 3860 { 3861 unsigned int nelt = GET_MODE_NUNITS (vmode); 3862 3863 gcc_assert (VECTOR_MODE_P (vmode)); 3864 gcc_assert (nelt <= 64); 3865 gcc_assert (sel.length () == nelt); 3866 3867 if (!dst) 3868 { 3869 /* All vector permutations are possible on this architecture, 3870 with varying degrees of efficiency depending on the permutation. */ 3871 return true; 3872 } 3873 3874 unsigned int perm[64]; 3875 for (unsigned int i = 0; i < nelt; ++i) 3876 perm[i] = sel[i] & (2 * nelt - 1); 3877 3878 /* Make life a bit easier by swapping operands if necessary so that 3879 the first element always comes from src0. */ 3880 if (perm[0] >= nelt) 3881 { 3882 rtx temp = src0; 3883 src0 = src1; 3884 src1 = temp; 3885 3886 for (unsigned int i = 0; i < nelt; ++i) 3887 if (perm[i] < nelt) 3888 perm[i] += nelt; 3889 else 3890 perm[i] -= nelt; 3891 } 3892 3893 /* TODO: There are more efficient ways to implement certain permutations 3894 using ds_swizzle_b32 and/or DPP. Test for and expand them here, before 3895 this more inefficient generic approach is used. */ 3896 3897 int64_t src1_lanes = 0; 3898 int64_t lane_bit = 1; 3899 3900 for (unsigned int i = 0; i < nelt; ++i, lane_bit <<= 1) 3901 { 3902 /* Set the bits for lanes from src1. */ 3903 if (perm[i] >= nelt) 3904 src1_lanes |= lane_bit; 3905 } 3906 3907 rtx addr = gcn_make_vec_perm_address (perm); 3908 rtx (*ds_bpermute) (rtx, rtx, rtx, rtx); 3909 3910 switch (vmode) 3911 { 3912 case E_V64QImode: 3913 ds_bpermute = gen_ds_bpermutev64qi; 3914 break; 3915 case E_V64HImode: 3916 ds_bpermute = gen_ds_bpermutev64hi; 3917 break; 3918 case E_V64SImode: 3919 ds_bpermute = gen_ds_bpermutev64si; 3920 break; 3921 case E_V64HFmode: 3922 ds_bpermute = gen_ds_bpermutev64hf; 3923 break; 3924 case E_V64SFmode: 3925 ds_bpermute = gen_ds_bpermutev64sf; 3926 break; 3927 case E_V64DImode: 3928 ds_bpermute = gen_ds_bpermutev64di; 3929 break; 3930 case E_V64DFmode: 3931 ds_bpermute = gen_ds_bpermutev64df; 3932 break; 3933 default: 3934 gcc_assert (false); 3935 } 3936 3937 /* Load elements from src0 to dst. */ 3938 gcc_assert (~src1_lanes); 3939 emit_insn (ds_bpermute (dst, addr, src0, gcn_full_exec_reg ())); 3940 3941 /* Load elements from src1 to dst. */ 3942 if (src1_lanes) 3943 { 3944 /* Masking a lane masks both the destination and source lanes for 3945 DS_BPERMUTE, so we need to have all lanes enabled for the permute, 3946 then add an extra masked move to merge the results of permuting 3947 the two source vectors together. 
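That is, DST first receives the permutation of SRC0 across all lanes, SRC1 is permuted into a temporary with a full EXEC mask, and the masked move copies just the lanes recorded in src1_lanes from the temporary into DST.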
3948 */ 3949 rtx tmp = gen_reg_rtx (vmode); 3950 emit_insn (ds_bpermute (tmp, addr, src1, gcn_full_exec_reg ())); 3951 emit_insn (gen_mov_with_exec (dst, tmp, get_exec (src1_lanes))); 3952 } 3953 3954 return true; 3955 } 3956 3957 /* Implements TARGET_VECTOR_MODE_SUPPORTED_P. 3958 3959 Return nonzero if vector MODE is supported with at least move 3960 instructions. */ 3961 3962 static bool 3963 gcn_vector_mode_supported_p (machine_mode mode) 3964 { 3965 /* FIXME: Enable V64QImode and V64HImode. 3966 We should support these modes, but vector operations are usually 3967 assumed to automatically truncate types, and GCN does not. We 3968 need to add explicit truncates and/or use SDWA for QI/HI insns. */ 3969 return (/* mode == V64QImode || mode == V64HImode 3970 ||*/ mode == V64SImode || mode == V64DImode 3971 || mode == V64SFmode || mode == V64DFmode); 3972 } 3973 3974 /* Implement TARGET_VECTORIZE_PREFERRED_SIMD_MODE. 3975 3976 Enables autovectorization for all supported modes. */ 3977 3978 static machine_mode 3979 gcn_vectorize_preferred_simd_mode (scalar_mode mode) 3980 { 3981 switch (mode) 3982 { 3983 case E_QImode: 3984 return V64QImode; 3985 case E_HImode: 3986 return V64HImode; 3987 case E_SImode: 3988 return V64SImode; 3989 case E_DImode: 3990 return V64DImode; 3991 case E_SFmode: 3992 return V64SFmode; 3993 case E_DFmode: 3994 return V64DFmode; 3995 default: 3996 return word_mode; 3997 } 3998 } 3999 4000 /* Implement TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. 4001 4002 Returns the preferred alignment in bits for accesses to vectors of type type 4003 in vectorized code. This might be less than or greater than the ABI-defined 4004 value returned by TARGET_VECTOR_ALIGNMENT. It can be equal to the alignment 4005 of a single element, in which case the vectorizer will not try to optimize 4006 for alignment. */ 4007 4008 static poly_uint64 4009 gcn_preferred_vector_alignment (const_tree type) 4010 { 4011 return TYPE_ALIGN (TREE_TYPE (type)); 4012 } 4013 4014 /* Implement TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT. 4015 4016 Return true if the target supports misaligned vector store/load of a 4017 specific factor denoted in the misalignment parameter. */ 4018 4019 static bool 4020 gcn_vectorize_support_vector_misalignment (machine_mode ARG_UNUSED (mode), 4021 const_tree type, int misalignment, 4022 bool is_packed) 4023 { 4024 if (is_packed) 4025 return false; 4026 4027 /* If the misalignment is unknown, we should be able to handle the access 4028 so long as it is not to a member of a packed data structure. */ 4029 if (misalignment == -1) 4030 return true; 4031 4032 /* Return true if the misalignment is a multiple of the natural alignment 4033 of the vector's element type. This is probably always going to be 4034 true in practice, since we've already established that this isn't a 4035 packed access. */ 4036 return misalignment % TYPE_ALIGN_UNIT (type) == 0; 4037 } 4038 4039 /* Implement TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. 4040 4041 Return true if vector alignment is reachable (by peeling N iterations) for 4042 the given scalar type TYPE. */ 4043 4044 static bool 4045 gcn_vector_alignment_reachable (const_tree ARG_UNUSED (type), bool is_packed) 4046 { 4047 /* Vectors which aren't in packed structures will not be less aligned than 4048 the natural alignment of their element type, so this is safe. */ 4049 return !is_packed; 4050 } 4051 4052 /* Generate DPP instructions used for vector reductions. 4053 4054 The opcode is given by INSN. 
4055 The first operand of the operation is shifted right by SHIFT vector lanes. 4056 SHIFT must be a power of 2. If SHIFT is 16, the 15th lane of each row is 4057 broadcast the next row (thereby acting like a shift of 16 for the end of 4058 each row). If SHIFT is 32, lane 31 is broadcast to all the 4059 following lanes (thereby acting like a shift of 32 for lane 63). */ 4060 4061 char * 4062 gcn_expand_dpp_shr_insn (machine_mode mode, const char *insn, 4063 int unspec, int shift) 4064 { 4065 static char buf[64]; 4066 const char *dpp; 4067 const char *vcc_in = ""; 4068 const char *vcc_out = ""; 4069 4070 /* Add the vcc operand if needed. */ 4071 if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT) 4072 { 4073 if (unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR) 4074 vcc_in = ", vcc"; 4075 4076 if (unspec == UNSPEC_PLUS_CARRY_DPP_SHR 4077 || unspec == UNSPEC_PLUS_CARRY_IN_DPP_SHR) 4078 vcc_out = ", vcc"; 4079 } 4080 4081 /* Add the DPP modifiers. */ 4082 switch (shift) 4083 { 4084 case 1: 4085 dpp = "row_shr:1 bound_ctrl:0"; 4086 break; 4087 case 2: 4088 dpp = "row_shr:2 bound_ctrl:0"; 4089 break; 4090 case 4: 4091 dpp = "row_shr:4 bank_mask:0xe"; 4092 break; 4093 case 8: 4094 dpp = "row_shr:8 bank_mask:0xc"; 4095 break; 4096 case 16: 4097 dpp = "row_bcast:15 row_mask:0xa"; 4098 break; 4099 case 32: 4100 dpp = "row_bcast:31 row_mask:0xc"; 4101 break; 4102 default: 4103 gcc_unreachable (); 4104 } 4105 4106 sprintf (buf, "%s\t%%0%s, %%1, %%2%s %s", insn, vcc_out, vcc_in, dpp); 4107 4108 return buf; 4109 } 4110 4111 /* Generate vector reductions in terms of DPP instructions. 4112 4113 The vector register SRC of mode MODE is reduced using the operation given 4114 by UNSPEC, and the scalar result is returned in lane 63 of a vector 4115 register. */ 4116 4117 rtx 4118 gcn_expand_reduc_scalar (machine_mode mode, rtx src, int unspec) 4119 { 4120 rtx tmp = gen_reg_rtx (mode); 4121 bool use_plus_carry = unspec == UNSPEC_PLUS_DPP_SHR 4122 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT 4123 && (TARGET_GCN3 || mode == V64DImode); 4124 4125 if (use_plus_carry) 4126 unspec = UNSPEC_PLUS_CARRY_DPP_SHR; 4127 4128 /* Perform reduction by first performing the reduction operation on every 4129 pair of lanes, then on every pair of results from the previous 4130 iteration (thereby effectively reducing every 4 lanes) and so on until 4131 all lanes are reduced. */ 4132 for (int i = 0, shift = 1; i < 6; i++, shift <<= 1) 4133 { 4134 rtx shift_val = gen_rtx_CONST_INT (VOIDmode, shift); 4135 rtx insn = gen_rtx_SET (tmp, 4136 gen_rtx_UNSPEC (mode, 4137 gen_rtvec (3, 4138 src, src, shift_val), 4139 unspec)); 4140 4141 /* Add clobber for instructions that set the carry flags. */ 4142 if (use_plus_carry) 4143 { 4144 rtx clobber = gen_rtx_CLOBBER (VOIDmode, 4145 gen_rtx_REG (DImode, VCC_REG)); 4146 insn = gen_rtx_PARALLEL (VOIDmode, 4147 gen_rtvec (2, insn, clobber)); 4148 } 4149 4150 emit_insn (insn); 4151 4152 /* The source operands for every iteration after the first 4153 should be TMP. */ 4154 src = tmp; 4155 } 4156 4157 return tmp; 4158 } 4159 4160 /* Implement TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST. */ 4161 4162 int 4163 gcn_vectorization_cost (enum vect_cost_for_stmt ARG_UNUSED (type_of_cost), 4164 tree ARG_UNUSED (vectype), int ARG_UNUSED (misalign)) 4165 { 4166 /* Always vectorize. */ 4167 return 1; 4168 } 4169 4170 /* }}} */ 4171 /* {{{ md_reorg pass. */ 4172 4173 /* Identify VMEM instructions from their "type" attribute. 
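   These are the buffer and flat memory operations (MUBUF, MTBUF and FLAT);
   scalar memory (SMEM) and LDS (DS) accesses are not counted as VMEM here.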
*/ 4174 4175 static bool 4176 gcn_vmem_insn_p (attr_type type) 4177 { 4178 switch (type) 4179 { 4180 case TYPE_MUBUF: 4181 case TYPE_MTBUF: 4182 case TYPE_FLAT: 4183 return true; 4184 case TYPE_UNKNOWN: 4185 case TYPE_SOP1: 4186 case TYPE_SOP2: 4187 case TYPE_SOPK: 4188 case TYPE_SOPC: 4189 case TYPE_SOPP: 4190 case TYPE_SMEM: 4191 case TYPE_DS: 4192 case TYPE_VOP2: 4193 case TYPE_VOP1: 4194 case TYPE_VOPC: 4195 case TYPE_VOP3A: 4196 case TYPE_VOP3B: 4197 case TYPE_VOP_SDWA: 4198 case TYPE_VOP_DPP: 4199 case TYPE_MULT: 4200 case TYPE_VMULT: 4201 return false; 4202 } 4203 gcc_unreachable (); 4204 return false; 4205 } 4206 4207 /* If INSN sets the EXEC register to a constant value, return the value, 4208 otherwise return zero. */ 4209 4210 static int64_t 4211 gcn_insn_exec_value (rtx_insn *insn) 4212 { 4213 if (!NONDEBUG_INSN_P (insn)) 4214 return 0; 4215 4216 rtx pattern = PATTERN (insn); 4217 4218 if (GET_CODE (pattern) == SET) 4219 { 4220 rtx dest = XEXP (pattern, 0); 4221 rtx src = XEXP (pattern, 1); 4222 4223 if (GET_MODE (dest) == DImode 4224 && REG_P (dest) && REGNO (dest) == EXEC_REG 4225 && CONST_INT_P (src)) 4226 return INTVAL (src); 4227 } 4228 4229 return 0; 4230 } 4231 4232 /* Sets the EXEC register before INSN to the value that it had after 4233 LAST_EXEC_DEF. The constant value of the EXEC register is returned if 4234 known, otherwise it returns zero. */ 4235 4236 static int64_t 4237 gcn_restore_exec (rtx_insn *insn, rtx_insn *last_exec_def, int64_t curr_exec, 4238 bool curr_exec_known, bool &last_exec_def_saved) 4239 { 4240 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG); 4241 rtx exec; 4242 4243 int64_t exec_value = gcn_insn_exec_value (last_exec_def); 4244 4245 if (exec_value) 4246 { 4247 /* If the EXEC value is a constant and it happens to be the same as the 4248 current EXEC value, the restore can be skipped. */ 4249 if (curr_exec_known && exec_value == curr_exec) 4250 return exec_value; 4251 4252 exec = GEN_INT (exec_value); 4253 } 4254 else 4255 { 4256 /* If the EXEC value is not a constant, save it in a register after the 4257 point of definition. */ 4258 rtx exec_save_reg = gen_rtx_REG (DImode, EXEC_SAVE_REG); 4259 4260 if (!last_exec_def_saved) 4261 { 4262 start_sequence (); 4263 emit_move_insn (exec_save_reg, exec_reg); 4264 rtx_insn *seq = get_insns (); 4265 end_sequence (); 4266 4267 emit_insn_after (seq, last_exec_def); 4268 if (dump_file && (dump_flags & TDF_DETAILS)) 4269 fprintf (dump_file, "Saving EXEC after insn %d.\n", 4270 INSN_UID (last_exec_def)); 4271 4272 last_exec_def_saved = true; 4273 } 4274 4275 exec = exec_save_reg; 4276 } 4277 4278 /* Restore EXEC register before the usage. */ 4279 start_sequence (); 4280 emit_move_insn (exec_reg, exec); 4281 rtx_insn *seq = get_insns (); 4282 end_sequence (); 4283 emit_insn_before (seq, insn); 4284 4285 if (dump_file && (dump_flags & TDF_DETAILS)) 4286 { 4287 if (exec_value) 4288 fprintf (dump_file, "Restoring EXEC to %ld before insn %d.\n", 4289 exec_value, INSN_UID (insn)); 4290 else 4291 fprintf (dump_file, 4292 "Restoring EXEC from saved value before insn %d.\n", 4293 INSN_UID (insn)); 4294 } 4295 4296 return exec_value; 4297 } 4298 4299 /* Implement TARGET_MACHINE_DEPENDENT_REORG. 4300 4301 Ensure that pipeline dependencies and lane masking are set correctly. 
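   The pass below works in two phases: it first walks each basic block
   tracking the EXEC register, writing EXEC where the "exec" insn attribute
   requires a particular value and restoring explicitly assigned values
   before any explicit use; it then scans the whole function for register
   dependencies that the hardware does not interlock and inserts the
   required wait-state NOPs.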
*/ 4302 4303 static void 4304 gcn_md_reorg (void) 4305 { 4306 basic_block bb; 4307 rtx exec_reg = gen_rtx_REG (DImode, EXEC_REG); 4308 rtx exec_lo_reg = gen_rtx_REG (SImode, EXEC_LO_REG); 4309 rtx exec_hi_reg = gen_rtx_REG (SImode, EXEC_HI_REG); 4310 regset_head live; 4311 4312 INIT_REG_SET (&live); 4313 4314 compute_bb_for_insn (); 4315 4316 if (!optimize) 4317 { 4318 split_all_insns (); 4319 if (dump_file && (dump_flags & TDF_DETAILS)) 4320 { 4321 fprintf (dump_file, "After split:\n"); 4322 print_rtl_with_bb (dump_file, get_insns (), dump_flags); 4323 } 4324 4325 /* Update data-flow information for split instructions. */ 4326 df_insn_rescan_all (); 4327 } 4328 4329 df_analyze (); 4330 4331 /* This pass ensures that the EXEC register is set correctly, according 4332 to the "exec" attribute. However, care must be taken so that the 4333 value that reaches explicit uses of the EXEC register remains the 4334 same as before. 4335 */ 4336 4337 FOR_EACH_BB_FN (bb, cfun) 4338 { 4339 if (dump_file && (dump_flags & TDF_DETAILS)) 4340 fprintf (dump_file, "BB %d:\n", bb->index); 4341 4342 rtx_insn *insn, *curr; 4343 rtx_insn *last_exec_def = BB_HEAD (bb); 4344 bool last_exec_def_saved = false; 4345 bool curr_exec_explicit = true; 4346 bool curr_exec_known = true; 4347 int64_t curr_exec = 0; /* 0 here means 'the value is that of EXEC 4348 after last_exec_def is executed'. */ 4349 4350 FOR_BB_INSNS_SAFE (bb, insn, curr) 4351 { 4352 if (!NONDEBUG_INSN_P (insn)) 4353 continue; 4354 4355 if (GET_CODE (PATTERN (insn)) == USE 4356 || GET_CODE (PATTERN (insn)) == CLOBBER) 4357 continue; 4358 4359 HARD_REG_SET defs, uses; 4360 CLEAR_HARD_REG_SET (defs); 4361 CLEAR_HARD_REG_SET (uses); 4362 note_stores (PATTERN (insn), record_hard_reg_sets, &defs); 4363 note_uses (&PATTERN (insn), record_hard_reg_uses, &uses); 4364 4365 bool exec_lo_def_p = TEST_HARD_REG_BIT (defs, EXEC_LO_REG); 4366 bool exec_hi_def_p = TEST_HARD_REG_BIT (defs, EXEC_HI_REG); 4367 bool exec_used = (hard_reg_set_intersect_p 4368 (uses, reg_class_contents[(int) EXEC_MASK_REG]) 4369 || TEST_HARD_REG_BIT (uses, EXECZ_REG)); 4370 4371 /* Check the instruction for implicit setting of EXEC via an 4372 attribute. */ 4373 attr_exec exec_attr = get_attr_exec (insn); 4374 int64_t new_exec; 4375 4376 switch (exec_attr) 4377 { 4378 case EXEC_NONE: 4379 new_exec = 0; 4380 break; 4381 4382 case EXEC_SINGLE: 4383 /* Instructions that do not involve memory accesses only require 4384 bit 0 of EXEC to be set. */ 4385 if (gcn_vmem_insn_p (get_attr_type (insn)) 4386 || get_attr_type (insn) == TYPE_DS) 4387 new_exec = 1; 4388 else 4389 new_exec = curr_exec | 1; 4390 break; 4391 4392 case EXEC_FULL: 4393 new_exec = -1; 4394 break; 4395 4396 default: /* Auto-detect what setting is appropriate. */ 4397 { 4398 new_exec = 0; 4399 4400 /* If EXEC is referenced explicitly then we don't need to do 4401 anything to set it, so we're done. */ 4402 if (exec_used) 4403 break; 4404 4405 /* Scan the insn for VGPRs defs or uses. The mode determines 4406 what kind of exec is needed. 
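   For example, an insn touching a VGPR in a scalar mode such as SImode only
   needs lane 0 enabled (EXEC = 1), whereas one touching a VGPR in a vector
   mode such as V64SImode needs all lanes enabled (EXEC = -1).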
*/ 4407 subrtx_iterator::array_type array; 4408 FOR_EACH_SUBRTX (iter, array, PATTERN (insn), NONCONST) 4409 { 4410 const_rtx x = *iter; 4411 if (REG_P (x) && VGPR_REGNO_P (REGNO (x))) 4412 { 4413 if (VECTOR_MODE_P (GET_MODE (x))) 4414 { 4415 new_exec = -1; 4416 break; 4417 } 4418 else 4419 new_exec = 1; 4420 } 4421 } 4422 } 4423 break; 4424 } 4425 4426 if (new_exec && (!curr_exec_known || new_exec != curr_exec)) 4427 { 4428 start_sequence (); 4429 emit_move_insn (exec_reg, GEN_INT (new_exec)); 4430 rtx_insn *seq = get_insns (); 4431 end_sequence (); 4432 emit_insn_before (seq, insn); 4433 4434 if (dump_file && (dump_flags & TDF_DETAILS)) 4435 fprintf (dump_file, "Setting EXEC to %ld before insn %d.\n", 4436 new_exec, INSN_UID (insn)); 4437 4438 curr_exec = new_exec; 4439 curr_exec_explicit = false; 4440 curr_exec_known = true; 4441 } 4442 else if (new_exec && dump_file && (dump_flags & TDF_DETAILS)) 4443 { 4444 fprintf (dump_file, "Exec already is %ld before insn %d.\n", 4445 new_exec, INSN_UID (insn)); 4446 } 4447 4448 /* The state of the EXEC register is unknown after a 4449 function call. */ 4450 if (CALL_P (insn)) 4451 curr_exec_known = false; 4452 4453 /* Handle explicit uses of EXEC. If the instruction is a partial 4454 explicit definition of EXEC, then treat it as an explicit use of 4455 EXEC as well. */ 4456 if (exec_used || exec_lo_def_p != exec_hi_def_p) 4457 { 4458 /* An instruction that explicitly uses EXEC should not also 4459 implicitly define it. */ 4460 gcc_assert (!exec_used || !new_exec); 4461 4462 if (!curr_exec_known || !curr_exec_explicit) 4463 { 4464 /* Restore the previous explicitly defined value. */ 4465 curr_exec = gcn_restore_exec (insn, last_exec_def, 4466 curr_exec, curr_exec_known, 4467 last_exec_def_saved); 4468 curr_exec_explicit = true; 4469 curr_exec_known = true; 4470 } 4471 } 4472 4473 /* Handle explicit definitions of EXEC. */ 4474 if (exec_lo_def_p || exec_hi_def_p) 4475 { 4476 last_exec_def = insn; 4477 last_exec_def_saved = false; 4478 curr_exec = gcn_insn_exec_value (insn); 4479 curr_exec_explicit = true; 4480 curr_exec_known = true; 4481 4482 if (dump_file && (dump_flags & TDF_DETAILS)) 4483 fprintf (dump_file, 4484 "Found %s definition of EXEC at insn %d.\n", 4485 exec_lo_def_p == exec_hi_def_p ? "full" : "partial", 4486 INSN_UID (insn)); 4487 } 4488 } 4489 4490 COPY_REG_SET (&live, DF_LR_OUT (bb)); 4491 df_simulate_initialize_backwards (bb, &live); 4492 4493 /* If EXEC is live after the basic block, restore the value of EXEC 4494 at the end of the block. */ 4495 if ((REGNO_REG_SET_P (&live, EXEC_LO_REG) 4496 || REGNO_REG_SET_P (&live, EXEC_HI_REG)) 4497 && (!curr_exec_known || !curr_exec_explicit)) 4498 { 4499 rtx_insn *end_insn = BB_END (bb); 4500 4501 /* If the instruction is not a jump instruction, do the restore 4502 after the last instruction in the basic block. */ 4503 if (NONJUMP_INSN_P (end_insn)) 4504 end_insn = NEXT_INSN (end_insn); 4505 4506 gcn_restore_exec (end_insn, last_exec_def, curr_exec, 4507 curr_exec_known, last_exec_def_saved); 4508 } 4509 } 4510 4511 CLEAR_REG_SET (&live); 4512 4513 /* "Manually Inserted Wait States (NOPs)." 4514 4515 GCN hardware detects most kinds of register dependencies, but there 4516 are some exceptions documented in the ISA manual. This pass 4517 detects the missed cases, and inserts the documented number of NOPs 4518 required for correct execution. 
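   The cases handled below are: a VALU write to an SGPR that a following
   VMEM instruction reads (5 wait states); a VALU write to VCC or EXEC
   followed by a VALU read of VCCZ or EXECZ (5 wait states); a VALU write to
   an SGPR or VCC used as the lane-select of v_readlane or v_writelane
   (4 wait states); and a VALU write to a VGPR read by a following DPP
   instruction (2 wait states).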
*/ 4519 4520 const int max_waits = 5; 4521 struct ilist 4522 { 4523 rtx_insn *insn; 4524 attr_unit unit; 4525 HARD_REG_SET writes; 4526 int age; 4527 } back[max_waits]; 4528 int oldest = 0; 4529 for (int i = 0; i < max_waits; i++) 4530 back[i].insn = NULL; 4531 4532 rtx_insn *insn, *last_insn = NULL; 4533 for (insn = get_insns (); insn != 0; insn = NEXT_INSN (insn)) 4534 { 4535 if (!NONDEBUG_INSN_P (insn)) 4536 continue; 4537 4538 if (GET_CODE (PATTERN (insn)) == USE 4539 || GET_CODE (PATTERN (insn)) == CLOBBER) 4540 continue; 4541 4542 attr_type itype = get_attr_type (insn); 4543 attr_unit iunit = get_attr_unit (insn); 4544 HARD_REG_SET ireads, iwrites; 4545 CLEAR_HARD_REG_SET (ireads); 4546 CLEAR_HARD_REG_SET (iwrites); 4547 note_stores (PATTERN (insn), record_hard_reg_sets, &iwrites); 4548 note_uses (&PATTERN (insn), record_hard_reg_uses, &ireads); 4549 4550 /* Scan recent previous instructions for dependencies not handled in 4551 hardware. */ 4552 int nops_rqd = 0; 4553 for (int i = oldest; i < oldest + max_waits; i++) 4554 { 4555 struct ilist *prev_insn = &back[i % max_waits]; 4556 4557 if (!prev_insn->insn) 4558 continue; 4559 4560 /* VALU writes SGPR followed by VMEM reading the same SGPR 4561 requires 5 wait states. */ 4562 if ((prev_insn->age + nops_rqd) < 5 4563 && prev_insn->unit == UNIT_VECTOR 4564 && gcn_vmem_insn_p (itype)) 4565 { 4566 HARD_REG_SET regs; 4567 COPY_HARD_REG_SET (regs, prev_insn->writes); 4568 AND_HARD_REG_SET (regs, ireads); 4569 if (hard_reg_set_intersect_p 4570 (regs, reg_class_contents[(int) SGPR_REGS])) 4571 nops_rqd = 5 - prev_insn->age; 4572 } 4573 4574 /* VALU sets VCC/EXEC followed by VALU uses VCCZ/EXECZ 4575 requires 5 wait states. */ 4576 if ((prev_insn->age + nops_rqd) < 5 4577 && prev_insn->unit == UNIT_VECTOR 4578 && iunit == UNIT_VECTOR 4579 && ((hard_reg_set_intersect_p 4580 (prev_insn->writes, 4581 reg_class_contents[(int) EXEC_MASK_REG]) 4582 && TEST_HARD_REG_BIT (ireads, EXECZ_REG)) 4583 || 4584 (hard_reg_set_intersect_p 4585 (prev_insn->writes, 4586 reg_class_contents[(int) VCC_CONDITIONAL_REG]) 4587 && TEST_HARD_REG_BIT (ireads, VCCZ_REG)))) 4588 nops_rqd = 5 - prev_insn->age; 4589 4590 /* VALU writes SGPR/VCC followed by v_{read,write}lane using 4591 SGPR/VCC as lane select requires 4 wait states. */ 4592 if ((prev_insn->age + nops_rqd) < 4 4593 && prev_insn->unit == UNIT_VECTOR 4594 && get_attr_laneselect (insn) == LANESELECT_YES) 4595 { 4596 HARD_REG_SET regs; 4597 COPY_HARD_REG_SET (regs, prev_insn->writes); 4598 AND_HARD_REG_SET (regs, ireads); 4599 if (hard_reg_set_intersect_p 4600 (regs, reg_class_contents[(int) SGPR_REGS]) 4601 || hard_reg_set_intersect_p 4602 (regs, reg_class_contents[(int) VCC_CONDITIONAL_REG])) 4603 nops_rqd = 4 - prev_insn->age; 4604 } 4605 4606 /* VALU writes VGPR followed by VALU_DPP reading that VGPR 4607 requires 2 wait states. */ 4608 if ((prev_insn->age + nops_rqd) < 2 4609 && prev_insn->unit == UNIT_VECTOR 4610 && itype == TYPE_VOP_DPP) 4611 { 4612 HARD_REG_SET regs; 4613 COPY_HARD_REG_SET (regs, prev_insn->writes); 4614 AND_HARD_REG_SET (regs, ireads); 4615 if (hard_reg_set_intersect_p 4616 (regs, reg_class_contents[(int) VGPR_REGS])) 4617 nops_rqd = 2 - prev_insn->age; 4618 } 4619 } 4620 4621 /* Insert the required number of NOPs. */ 4622 for (int i = nops_rqd; i > 0; i--) 4623 emit_insn_after (gen_nop (), last_insn); 4624 4625 /* Age the previous instructions. We can also ignore writes to 4626 registers subsequently overwritten. 
*/ 4627 HARD_REG_SET written; 4628 CLEAR_HARD_REG_SET (written); 4629 for (int i = oldest + max_waits - 1; i > oldest; i--) 4630 { 4631 struct ilist *prev_insn = &back[i % max_waits]; 4632 4633 /* Assume all instructions are equivalent to one "wait", the same 4634 as s_nop. This is probably true for SALU, but not VALU (which 4635 may take longer), so this is not optimal. However, AMD do 4636 not publish the cycle times for instructions. */ 4637 prev_insn->age += 1 + nops_rqd; 4638 4639 IOR_HARD_REG_SET (written, iwrites); 4640 AND_COMPL_HARD_REG_SET (prev_insn->writes, written); 4641 } 4642 4643 /* Track the current instruction as a previous instruction. */ 4644 back[oldest].insn = insn; 4645 back[oldest].unit = iunit; 4646 COPY_HARD_REG_SET (back[oldest].writes, iwrites); 4647 back[oldest].age = 0; 4648 oldest = (oldest + 1) % max_waits; 4649 4650 last_insn = insn; 4651 } 4652 } 4653 4654 /* }}} */ 4655 /* {{{ OpenACC / OpenMP. */ 4656 4657 #define GCN_DEFAULT_GANGS 0 /* Choose at runtime. */ 4658 #define GCN_DEFAULT_WORKERS 0 /* Choose at runtime. */ 4659 #define GCN_DEFAULT_VECTORS 1 /* Use autovectorization only, for now. */ 4660 4661 /* Implement TARGET_GOACC_VALIDATE_DIMS. 4662 4663 Check the launch dimensions provided for an OpenACC compute 4664 region, or routine. */ 4665 4666 static bool 4667 gcn_goacc_validate_dims (tree decl, int dims[], int fn_level, 4668 unsigned /*used*/) 4669 { 4670 bool changed = false; 4671 4672 /* FIXME: remove -facc-experimental-workers when they're ready. */ 4673 int max_workers = flag_worker_partitioning ? 16 : 1; 4674 4675 /* The vector size must appear to be 64, to the user, unless this is a 4676 SEQ routine. The real, internal value is always 1, which means use 4677 autovectorization, but the user should not see that. */ 4678 if (fn_level <= GOMP_DIM_VECTOR && fn_level >= -1 4679 && dims[GOMP_DIM_VECTOR] >= 0) 4680 { 4681 if (fn_level < 0 && dims[GOMP_DIM_VECTOR] >= 0 4682 && dims[GOMP_DIM_VECTOR] != 64) 4683 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 4684 OPT_Wopenacc_dims, 4685 (dims[GOMP_DIM_VECTOR] 4686 ? G_("using vector_length (64), ignoring %d") 4687 : G_("using vector_length (64), " 4688 "ignoring runtime setting")), 4689 dims[GOMP_DIM_VECTOR]); 4690 dims[GOMP_DIM_VECTOR] = 1; 4691 changed = true; 4692 } 4693 4694 /* Check the num workers is not too large. */ 4695 if (dims[GOMP_DIM_WORKER] > max_workers) 4696 { 4697 warning_at (decl ? DECL_SOURCE_LOCATION (decl) : UNKNOWN_LOCATION, 4698 OPT_Wopenacc_dims, 4699 "using num_workers (%d), ignoring %d", 4700 max_workers, dims[GOMP_DIM_WORKER]); 4701 dims[GOMP_DIM_WORKER] = max_workers; 4702 changed = true; 4703 } 4704 4705 /* Set global defaults. */ 4706 if (!decl) 4707 { 4708 dims[GOMP_DIM_VECTOR] = GCN_DEFAULT_VECTORS; 4709 if (dims[GOMP_DIM_WORKER] < 0) 4710 dims[GOMP_DIM_WORKER] = (flag_worker_partitioning 4711 ? GCN_DEFAULT_WORKERS : 1); 4712 if (dims[GOMP_DIM_GANG] < 0) 4713 dims[GOMP_DIM_GANG] = GCN_DEFAULT_GANGS; 4714 changed = true; 4715 } 4716 4717 return changed; 4718 } 4719 4720 /* Helper function for oacc_dim_size instruction. 4721 Also used for OpenMP, via builtin_gcn_dim_size, and the omp_gcn pass. */ 4722 4723 rtx 4724 gcn_oacc_dim_size (int dim) 4725 { 4726 if (dim < 0 || dim > 2) 4727 error ("offload dimension out of range (%d)", dim); 4728 4729 /* Vectors are a special case. */ 4730 if (dim == 2) 4731 return const1_rtx; /* Think of this as 1 times 64. */ 4732 4733 static int offset[] = { 4734 /* Offsets into dispatch packet. 
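      These are assumed to match the standard hsa_kernel_dispatch_packet_t
      layout, in which offset 12 is grid_size_x, 16 is grid_size_y and 20 is
      grid_size_z; a runtime using a different packet layout would need
      different constants here.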
*/ 4735 12, /* X dim = Gang / Team / Work-group. */ 4736 20, /* Z dim = Worker / Thread / Wavefront. */ 4737 16 /* Y dim = Vector / SIMD / Work-item. */ 4738 }; 4739 rtx addr = gen_rtx_PLUS (DImode, 4740 gen_rtx_REG (DImode, 4741 cfun->machine->args. 4742 reg[DISPATCH_PTR_ARG]), 4743 GEN_INT (offset[dim])); 4744 return gen_rtx_MEM (SImode, addr); 4745 } 4746 4747 /* Helper function for oacc_dim_pos instruction. 4748 Also used for OpenMP, via builtin_gcn_dim_pos, and the omp_gcn pass. */ 4749 4750 rtx 4751 gcn_oacc_dim_pos (int dim) 4752 { 4753 if (dim < 0 || dim > 2) 4754 error ("offload dimension out of range (%d)", dim); 4755 4756 static const int reg[] = { 4757 WORKGROUP_ID_X_ARG, /* Gang / Team / Work-group. */ 4758 WORK_ITEM_ID_Z_ARG, /* Worker / Thread / Wavefront. */ 4759 WORK_ITEM_ID_Y_ARG /* Vector / SIMD / Work-item. */ 4760 }; 4761 4762 int reg_num = cfun->machine->args.reg[reg[dim]]; 4763 4764 /* The information must have been requested by the kernel. */ 4765 gcc_assert (reg_num >= 0); 4766 4767 return gen_rtx_REG (SImode, reg_num); 4768 } 4769 4770 /* Implement TARGET_GOACC_FORK_JOIN. */ 4771 4772 static bool 4773 gcn_fork_join (gcall *ARG_UNUSED (call), const int *ARG_UNUSED (dims), 4774 bool ARG_UNUSED (is_fork)) 4775 { 4776 /* GCN does not use the fork/join concept invented for NVPTX. 4777 Instead we use standard autovectorization. */ 4778 return false; 4779 } 4780 4781 /* Implement ??????? 4782 FIXME make this a real hook. 4783 4784 Adjust FNDECL such that options inherited from the host compiler 4785 are made appropriate for the accelerator compiler. */ 4786 4787 void 4788 gcn_fixup_accel_lto_options (tree fndecl) 4789 { 4790 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); 4791 if (!func_optimize) 4792 return; 4793 4794 tree old_optimize = build_optimization_node (&global_options); 4795 tree new_optimize; 4796 4797 /* If the function changed the optimization levels as well as 4798 setting target options, start with the optimizations 4799 specified. */ 4800 if (func_optimize != old_optimize) 4801 cl_optimization_restore (&global_options, 4802 TREE_OPTIMIZATION (func_optimize)); 4803 4804 gcn_option_override (); 4805 4806 /* The target attributes may also change some optimization flags, 4807 so update the optimization options if necessary. */ 4808 new_optimize = build_optimization_node (&global_options); 4809 4810 if (old_optimize != new_optimize) 4811 { 4812 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; 4813 cl_optimization_restore (&global_options, 4814 TREE_OPTIMIZATION (old_optimize)); 4815 } 4816 } 4817 4818 /* }}} */ 4819 /* {{{ ASM Output. */ 4820 4821 /* Implement TARGET_ASM_FILE_START. 4822 4823 Print assembler file header text. */ 4824 4825 static void 4826 output_file_start (void) 4827 { 4828 fprintf (asm_out_file, "\t.text\n"); 4829 fprintf (asm_out_file, "\t.hsa_code_object_version 2,0\n"); 4830 fprintf (asm_out_file, "\t.hsa_code_object_isa\n"); /* Autodetect. */ 4831 fprintf (asm_out_file, "\t.section\t.AMDGPU.config\n"); 4832 fprintf (asm_out_file, "\t.text\n"); 4833 } 4834 4835 /* Implement ASM_DECLARE_FUNCTION_NAME via gcn-hsa.h. 4836 4837 Print the initial definition of a function name. 4838 4839 For GCN kernel entry points this includes all the HSA meta-data, special 4840 alignment constraints that don't apply to regular functions, and magic 4841 comments that pass information to mkoffload. 
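   For OpenACC kernels the launch dimensions are emitted at the end of this
   function as an assembler comment of the form

     ;; OPENACC-DIMS: <gangs>, <workers>, <vectors> : <kernel name>

   which mkoffload reads back out of the generated assembly; see the
   flag_openacc block at the end of this function.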
*/ 4842 4843 void 4844 gcn_hsa_declare_function_name (FILE *file, const char *name, tree) 4845 { 4846 int sgpr, vgpr; 4847 bool xnack_enabled = false; 4848 int extra_regs = 0; 4849 4850 if (cfun && cfun->machine && cfun->machine->normal_function) 4851 { 4852 fputs ("\t.type\t", file); 4853 assemble_name (file, name); 4854 fputs (",@function\n", file); 4855 assemble_name (file, name); 4856 fputs (":\n", file); 4857 return; 4858 } 4859 4860 /* Determine count of sgpr/vgpr registers by looking for last 4861 one used. */ 4862 for (sgpr = 101; sgpr >= 0; sgpr--) 4863 if (df_regs_ever_live_p (FIRST_SGPR_REG + sgpr)) 4864 break; 4865 sgpr++; 4866 for (vgpr = 255; vgpr >= 0; vgpr--) 4867 if (df_regs_ever_live_p (FIRST_VGPR_REG + vgpr)) 4868 break; 4869 vgpr++; 4870 4871 if (xnack_enabled) 4872 extra_regs = 6; 4873 if (df_regs_ever_live_p (FLAT_SCRATCH_LO_REG) 4874 || df_regs_ever_live_p (FLAT_SCRATCH_HI_REG)) 4875 extra_regs = 4; 4876 else if (df_regs_ever_live_p (VCC_LO_REG) 4877 || df_regs_ever_live_p (VCC_HI_REG)) 4878 extra_regs = 2; 4879 4880 if (!leaf_function_p ()) 4881 { 4882 /* We can't know how many registers function calls might use. */ 4883 if (vgpr < 64) 4884 vgpr = 64; 4885 if (sgpr + extra_regs < 102) 4886 sgpr = 102 - extra_regs; 4887 } 4888 4889 fputs ("\t.align\t256\n", file); 4890 fputs ("\t.type\t", file); 4891 assemble_name (file, name); 4892 fputs (",@function\n\t.amdgpu_hsa_kernel\t", file); 4893 assemble_name (file, name); 4894 fputs ("\n", file); 4895 assemble_name (file, name); 4896 fputs (":\n", file); 4897 fprintf (file, "\t.amd_kernel_code_t\n" 4898 "\t\tkernel_code_version_major = 1\n" 4899 "\t\tkernel_code_version_minor = 0\n" "\t\tmachine_kind = 1\n" 4900 /* "\t\tmachine_version_major = 8\n" 4901 "\t\tmachine_version_minor = 0\n" 4902 "\t\tmachine_version_stepping = 1\n" */ 4903 "\t\tkernel_code_entry_byte_offset = 256\n" 4904 "\t\tkernel_code_prefetch_byte_size = 0\n" 4905 "\t\tmax_scratch_backing_memory_byte_size = 0\n" 4906 "\t\tcompute_pgm_rsrc1_vgprs = %i\n" 4907 "\t\tcompute_pgm_rsrc1_sgprs = %i\n" 4908 "\t\tcompute_pgm_rsrc1_priority = 0\n" 4909 "\t\tcompute_pgm_rsrc1_float_mode = 192\n" 4910 "\t\tcompute_pgm_rsrc1_priv = 0\n" 4911 "\t\tcompute_pgm_rsrc1_dx10_clamp = 1\n" 4912 "\t\tcompute_pgm_rsrc1_debug_mode = 0\n" 4913 "\t\tcompute_pgm_rsrc1_ieee_mode = 1\n" 4914 /* We enable scratch memory. */ 4915 "\t\tcompute_pgm_rsrc2_scratch_en = 1\n" 4916 "\t\tcompute_pgm_rsrc2_user_sgpr = %i\n" 4917 "\t\tcompute_pgm_rsrc2_tgid_x_en = 1\n" 4918 "\t\tcompute_pgm_rsrc2_tgid_y_en = 0\n" 4919 "\t\tcompute_pgm_rsrc2_tgid_z_en = 0\n" 4920 "\t\tcompute_pgm_rsrc2_tg_size_en = 0\n" 4921 "\t\tcompute_pgm_rsrc2_tidig_comp_cnt = 0\n" 4922 "\t\tcompute_pgm_rsrc2_excp_en_msb = 0\n" 4923 "\t\tcompute_pgm_rsrc2_lds_size = 0\n" /* Set at runtime. */ 4924 "\t\tcompute_pgm_rsrc2_excp_en = 0\n", 4925 (vgpr - 1) / 4, 4926 /* Must match wavefront_sgpr_count */ 4927 (sgpr + extra_regs + 7) / 8 - 1, 4928 /* The total number of SGPR user data registers requested. This 4929 number must match the number of user data registers enabled. 
*/ 4930 cfun->machine->args.nsgprs); 4931 int reg = FIRST_SGPR_REG; 4932 for (int a = 0; a < GCN_KERNEL_ARG_TYPES; a++) 4933 { 4934 int reg_first = -1; 4935 int reg_last; 4936 if ((cfun->machine->args.requested & (1 << a)) 4937 && (gcn_kernel_arg_types[a].fixed_regno < 0)) 4938 { 4939 reg_first = reg; 4940 reg_last = (reg_first 4941 + (GET_MODE_SIZE (gcn_kernel_arg_types[a].mode) 4942 / UNITS_PER_WORD) - 1); 4943 reg = reg_last + 1; 4944 } 4945 4946 if (gcn_kernel_arg_types[a].header_pseudo) 4947 { 4948 fprintf (file, "\t\t%s = %i", 4949 gcn_kernel_arg_types[a].header_pseudo, 4950 (cfun->machine->args.requested & (1 << a)) != 0); 4951 if (reg_first != -1) 4952 { 4953 fprintf (file, " ; ("); 4954 for (int i = reg_first; i <= reg_last; ++i) 4955 { 4956 if (i != reg_first) 4957 fprintf (file, ", "); 4958 fprintf (file, "%s", reg_names[i]); 4959 } 4960 fprintf (file, ")"); 4961 } 4962 fprintf (file, "\n"); 4963 } 4964 else if (gcn_kernel_arg_types[a].fixed_regno >= 0 4965 && cfun->machine->args.requested & (1 << a)) 4966 fprintf (file, "\t\t; %s = %i (%s)\n", 4967 gcn_kernel_arg_types[a].name, 4968 (cfun->machine->args.requested & (1 << a)) != 0, 4969 reg_names[gcn_kernel_arg_types[a].fixed_regno]); 4970 } 4971 fprintf (file, "\t\tenable_vgpr_workitem_id = %i\n", 4972 (cfun->machine->args.requested & (1 << WORK_ITEM_ID_Z_ARG)) 4973 ? 2 4974 : cfun->machine->args.requested & (1 << WORK_ITEM_ID_Y_ARG) 4975 ? 1 : 0); 4976 fprintf (file, "\t\tenable_ordered_append_gds = 0\n" 4977 "\t\tprivate_element_size = 1\n" 4978 "\t\tis_ptr64 = 1\n" 4979 "\t\tis_dynamic_callstack = 0\n" 4980 "\t\tis_debug_enabled = 0\n" 4981 "\t\tis_xnack_enabled = %i\n" 4982 "\t\tworkitem_private_segment_byte_size = %i\n" 4983 "\t\tworkgroup_group_segment_byte_size = %u\n" 4984 "\t\tgds_segment_byte_size = 0\n" 4985 "\t\tkernarg_segment_byte_size = %i\n" 4986 "\t\tworkgroup_fbarrier_count = 0\n" 4987 "\t\twavefront_sgpr_count = %i\n" 4988 "\t\tworkitem_vgpr_count = %i\n" 4989 "\t\treserved_vgpr_first = 0\n" 4990 "\t\treserved_vgpr_count = 0\n" 4991 "\t\treserved_sgpr_first = 0\n" 4992 "\t\treserved_sgpr_count = 0\n" 4993 "\t\tdebug_wavefront_private_segment_offset_sgpr = 0\n" 4994 "\t\tdebug_private_segment_buffer_sgpr = 0\n" 4995 "\t\tkernarg_segment_alignment = %i\n" 4996 "\t\tgroup_segment_alignment = 4\n" 4997 "\t\tprivate_segment_alignment = %i\n" 4998 "\t\twavefront_size = 6\n" 4999 "\t\tcall_convention = 0\n" 5000 "\t\truntime_loader_kernel_symbol = 0\n" 5001 "\t.end_amd_kernel_code_t\n", xnack_enabled, 5002 /* workitem_private_segment_bytes_size needs to be 5003 one 64th the wave-front stack size. */ 5004 stack_size_opt / 64, 5005 LDS_SIZE, cfun->machine->kernarg_segment_byte_size, 5006 /* Number of scalar registers used by a wavefront. This 5007 includes the special SGPRs for VCC, Flat Scratch (Base, 5008 Size) and XNACK (for GFX8 (VI)+). It does not include the 5009 16 SGPR added if a trap handler is enabled. Must match 5010 compute_pgm_rsrc1.sgprs. */ 5011 sgpr + extra_regs, vgpr, 5012 cfun->machine->kernarg_segment_alignment, 5013 crtl->stack_alignment_needed / 8); 5014 5015 /* This comment is read by mkoffload. */ 5016 if (flag_openacc) 5017 fprintf (file, "\t;; OPENACC-DIMS: %d, %d, %d : %s\n", 5018 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_GANG), 5019 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_WORKER), 5020 oacc_get_fn_dim_size (cfun->decl, GOMP_DIM_VECTOR), name); 5021 } 5022 5023 /* Implement TARGET_ASM_SELECT_SECTION. 5024 5025 Return the section into which EXP should be placed. 
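   Variables in the LDS address space go into the ".lds_bss" section;
   everything else falls through to the default ELF section selection.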
*/ 5026 5027 static section * 5028 gcn_asm_select_section (tree exp, int reloc, unsigned HOST_WIDE_INT align) 5029 { 5030 if (TREE_TYPE (exp) != error_mark_node 5031 && TYPE_ADDR_SPACE (TREE_TYPE (exp)) == ADDR_SPACE_LDS) 5032 { 5033 if (!DECL_P (exp)) 5034 return get_section (".lds_bss", 5035 SECTION_WRITE | SECTION_BSS | SECTION_DEBUG, 5036 NULL); 5037 5038 return get_named_section (exp, ".lds_bss", reloc); 5039 } 5040 5041 return default_elf_select_section (exp, reloc, align); 5042 } 5043 5044 /* Implement TARGET_ASM_FUNCTION_PROLOGUE. 5045 5046 Emits custom text into the assembler file at the head of each function. */ 5047 5048 static void 5049 gcn_target_asm_function_prologue (FILE *file) 5050 { 5051 machine_function *offsets = gcn_compute_frame_offsets (); 5052 5053 asm_fprintf (file, "\t; using %s addressing in function\n", 5054 offsets->use_flat_addressing ? "flat" : "global"); 5055 5056 if (offsets->normal_function) 5057 { 5058 asm_fprintf (file, "\t; frame pointer needed: %s\n", 5059 offsets->need_frame_pointer ? "true" : "false"); 5060 asm_fprintf (file, "\t; lr needs saving: %s\n", 5061 offsets->lr_needs_saving ? "true" : "false"); 5062 asm_fprintf (file, "\t; outgoing args size: %wd\n", 5063 offsets->outgoing_args_size); 5064 asm_fprintf (file, "\t; pretend size: %wd\n", offsets->pretend_size); 5065 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars); 5066 asm_fprintf (file, "\t; callee save size: %wd\n", 5067 offsets->callee_saves); 5068 } 5069 else 5070 { 5071 asm_fprintf (file, "\t; HSA kernel entry point\n"); 5072 asm_fprintf (file, "\t; local vars size: %wd\n", offsets->local_vars); 5073 asm_fprintf (file, "\t; outgoing args size: %wd\n", 5074 offsets->outgoing_args_size); 5075 5076 /* Enable denorms. */ 5077 asm_fprintf (file, "\n\t; Set MODE[FP_DENORM]: allow single and double" 5078 " input and output denorms\n"); 5079 asm_fprintf (file, "\ts_setreg_imm32_b32\thwreg(1, 4, 4), 0xf\n\n"); 5080 } 5081 } 5082 5083 /* Helper function for print_operand and print_operand_address. 5084 5085 Print a register as the assembler requires, according to mode and name. 
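   For example (register numbers illustrative): an SImode value in an SGPR
   prints as "s5", a DImode value in an SGPR pair prints as "s[4:5]", a
   DImode value in a VGPR pair prints as "v[8:9]", and a TImode value prints
   as a group of four, such as "v[8:11]".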
*/ 5086 5087 static void 5088 print_reg (FILE *file, rtx x) 5089 { 5090 machine_mode mode = GET_MODE (x); 5091 if (mode == BImode || mode == QImode || mode == HImode || mode == SImode 5092 || mode == HFmode || mode == SFmode 5093 || mode == V64SFmode || mode == V64SImode 5094 || mode == V64QImode || mode == V64HImode) 5095 fprintf (file, "%s", reg_names[REGNO (x)]); 5096 else if (mode == DImode || mode == V64DImode 5097 || mode == DFmode || mode == V64DFmode) 5098 { 5099 if (SGPR_REGNO_P (REGNO (x))) 5100 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG, 5101 REGNO (x) - FIRST_SGPR_REG + 1); 5102 else if (VGPR_REGNO_P (REGNO (x))) 5103 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG, 5104 REGNO (x) - FIRST_VGPR_REG + 1); 5105 else if (REGNO (x) == FLAT_SCRATCH_REG) 5106 fprintf (file, "flat_scratch"); 5107 else if (REGNO (x) == EXEC_REG) 5108 fprintf (file, "exec"); 5109 else if (REGNO (x) == VCC_LO_REG) 5110 fprintf (file, "vcc"); 5111 else 5112 fprintf (file, "[%s:%s]", 5113 reg_names[REGNO (x)], reg_names[REGNO (x) + 1]); 5114 } 5115 else if (mode == TImode) 5116 { 5117 if (SGPR_REGNO_P (REGNO (x))) 5118 fprintf (file, "s[%i:%i]", REGNO (x) - FIRST_SGPR_REG, 5119 REGNO (x) - FIRST_SGPR_REG + 3); 5120 else if (VGPR_REGNO_P (REGNO (x))) 5121 fprintf (file, "v[%i:%i]", REGNO (x) - FIRST_VGPR_REG, 5122 REGNO (x) - FIRST_VGPR_REG + 3); 5123 else 5124 gcc_unreachable (); 5125 } 5126 else 5127 gcc_unreachable (); 5128 } 5129 5130 /* Implement TARGET_SECTION_TYPE_FLAGS. 5131 5132 Return a set of section attributes for use by TARGET_ASM_NAMED_SECTION. */ 5133 5134 static unsigned int 5135 gcn_section_type_flags (tree decl, const char *name, int reloc) 5136 { 5137 if (strcmp (name, ".lds_bss") == 0) 5138 return SECTION_WRITE | SECTION_BSS | SECTION_DEBUG; 5139 5140 return default_section_type_flags (decl, name, reloc); 5141 } 5142 5143 /* Helper function for gcn_asm_output_symbol_ref. 5144 5145 FIXME: If we want to have propagation blocks allocated separately and 5146 statically like this, it would be better done via symbol refs and the 5147 assembler/linker. This is a temporary hack. */ 5148 5149 static void 5150 gcn_print_lds_decl (FILE *f, tree var) 5151 { 5152 int *offset; 5153 machine_function *machfun = cfun->machine; 5154 5155 if ((offset = machfun->lds_allocs->get (var))) 5156 fprintf (f, "%u", (unsigned) *offset); 5157 else 5158 { 5159 unsigned HOST_WIDE_INT align = DECL_ALIGN_UNIT (var); 5160 tree type = TREE_TYPE (var); 5161 unsigned HOST_WIDE_INT size = tree_to_uhwi (TYPE_SIZE_UNIT (type)); 5162 if (size > align && size > 4 && align < 8) 5163 align = 8; 5164 5165 machfun->lds_allocated = ((machfun->lds_allocated + align - 1) 5166 & ~(align - 1)); 5167 5168 machfun->lds_allocs->put (var, machfun->lds_allocated); 5169 fprintf (f, "%u", machfun->lds_allocated); 5170 machfun->lds_allocated += size; 5171 if (machfun->lds_allocated > LDS_SIZE) 5172 error ("local data-share memory exhausted"); 5173 } 5174 } 5175 5176 /* Implement ASM_OUTPUT_SYMBOL_REF via gcn-hsa.h. */ 5177 5178 void 5179 gcn_asm_output_symbol_ref (FILE *file, rtx x) 5180 { 5181 tree decl; 5182 if ((decl = SYMBOL_REF_DECL (x)) != 0 5183 && TREE_CODE (decl) == VAR_DECL 5184 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl)))) 5185 { 5186 /* LDS symbols (emitted using this hook) are only used at present 5187 to propagate worker values from an active thread to neutered 5188 threads. 
Use the same offset for each such block, but don't 5189 use zero because null pointers are used to identify the active 5190 thread in GOACC_single_copy_start calls. */ 5191 gcn_print_lds_decl (file, decl); 5192 } 5193 else 5194 { 5195 assemble_name (file, XSTR (x, 0)); 5196 /* FIXME: See above -- this condition is unreachable. */ 5197 if ((decl = SYMBOL_REF_DECL (x)) != 0 5198 && TREE_CODE (decl) == VAR_DECL 5199 && AS_LDS_P (TYPE_ADDR_SPACE (TREE_TYPE (decl)))) 5200 fputs ("@abs32", file); 5201 } 5202 } 5203 5204 /* Implement TARGET_CONSTANT_ALIGNMENT. 5205 5206 Returns the alignment in bits of a constant that is being placed in memory. 5207 CONSTANT is the constant and BASIC_ALIGN is the alignment that the object 5208 would ordinarily have. */ 5209 5210 static HOST_WIDE_INT 5211 gcn_constant_alignment (const_tree ARG_UNUSED (constant), 5212 HOST_WIDE_INT basic_align) 5213 { 5214 return basic_align > 128 ? basic_align : 128; 5215 } 5216 5217 /* Implement PRINT_OPERAND_ADDRESS via gcn.h. */ 5218 5219 void 5220 print_operand_address (FILE *file, rtx mem) 5221 { 5222 gcc_assert (MEM_P (mem)); 5223 5224 rtx reg; 5225 rtx offset; 5226 addr_space_t as = MEM_ADDR_SPACE (mem); 5227 rtx addr = XEXP (mem, 0); 5228 gcc_assert (REG_P (addr) || GET_CODE (addr) == PLUS); 5229 5230 if (AS_SCRATCH_P (as)) 5231 switch (GET_CODE (addr)) 5232 { 5233 case REG: 5234 print_reg (file, addr); 5235 break; 5236 5237 case PLUS: 5238 reg = XEXP (addr, 0); 5239 offset = XEXP (addr, 1); 5240 print_reg (file, reg); 5241 if (GET_CODE (offset) == CONST_INT) 5242 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); 5243 else 5244 abort (); 5245 break; 5246 5247 default: 5248 debug_rtx (addr); 5249 abort (); 5250 } 5251 else if (AS_ANY_FLAT_P (as)) 5252 { 5253 if (GET_CODE (addr) == REG) 5254 print_reg (file, addr); 5255 else 5256 { 5257 gcc_assert (TARGET_GCN5_PLUS); 5258 print_reg (file, XEXP (addr, 0)); 5259 } 5260 } 5261 else if (AS_GLOBAL_P (as)) 5262 { 5263 gcc_assert (TARGET_GCN5_PLUS); 5264 5265 rtx base = addr; 5266 rtx vgpr_offset = NULL_RTX; 5267 5268 if (GET_CODE (addr) == PLUS) 5269 { 5270 base = XEXP (addr, 0); 5271 5272 if (GET_CODE (base) == PLUS) 5273 { 5274 /* (SGPR + VGPR) + CONST */ 5275 vgpr_offset = XEXP (base, 1); 5276 base = XEXP (base, 0); 5277 } 5278 else 5279 { 5280 rtx offset = XEXP (addr, 1); 5281 5282 if (REG_P (offset)) 5283 /* SGPR + VGPR */ 5284 vgpr_offset = offset; 5285 else if (CONST_INT_P (offset)) 5286 /* VGPR + CONST or SGPR + CONST */ 5287 ; 5288 else 5289 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address"); 5290 } 5291 } 5292 5293 if (REG_P (base)) 5294 { 5295 if (VGPR_REGNO_P (REGNO (base))) 5296 print_reg (file, base); 5297 else if (SGPR_REGNO_P (REGNO (base))) 5298 { 5299 /* The assembler requires a 64-bit VGPR pair here, even though 5300 the offset should be only 32-bit. */ 5301 if (vgpr_offset == NULL_RTX) 5302 /* In this case, the vector offset is zero, so we use v0, 5303 which is initialized by the kernel prologue to zero. 
*/ 5304 fprintf (file, "v[0:1]"); 5305 else if (REG_P (vgpr_offset) 5306 && VGPR_REGNO_P (REGNO (vgpr_offset))) 5307 { 5308 fprintf (file, "v[%d:%d]", 5309 REGNO (vgpr_offset) - FIRST_VGPR_REG, 5310 REGNO (vgpr_offset) - FIRST_VGPR_REG + 1); 5311 } 5312 else 5313 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address"); 5314 } 5315 } 5316 else 5317 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address"); 5318 } 5319 else if (AS_ANY_DS_P (as)) 5320 switch (GET_CODE (addr)) 5321 { 5322 case REG: 5323 print_reg (file, addr); 5324 break; 5325 5326 case PLUS: 5327 reg = XEXP (addr, 0); 5328 print_reg (file, reg); 5329 break; 5330 5331 default: 5332 debug_rtx (addr); 5333 abort (); 5334 } 5335 else 5336 switch (GET_CODE (addr)) 5337 { 5338 case REG: 5339 print_reg (file, addr); 5340 fprintf (file, ", 0"); 5341 break; 5342 5343 case PLUS: 5344 reg = XEXP (addr, 0); 5345 offset = XEXP (addr, 1); 5346 print_reg (file, reg); 5347 fprintf (file, ", "); 5348 if (GET_CODE (offset) == REG) 5349 print_reg (file, reg); 5350 else if (GET_CODE (offset) == CONST_INT) 5351 fprintf (file, HOST_WIDE_INT_PRINT_DEC, INTVAL (offset)); 5352 else 5353 abort (); 5354 break; 5355 5356 default: 5357 debug_rtx (addr); 5358 abort (); 5359 } 5360 } 5361 5362 /* Implement PRINT_OPERAND via gcn.h. 5363 5364 b - print operand size as untyped operand (b8/b16/b32/b64) 5365 B - print operand size as SI/DI untyped operand (b32/b32/b32/b64) 5366 i - print operand size as untyped operand (i16/b32/i64) 5367 u - print operand size as untyped operand (u16/u32/u64) 5368 o - print operand size as memory access size for loads 5369 (ubyte/ushort/dword/dwordx2/wordx3/dwordx4) 5370 s - print operand size as memory access size for stores 5371 (byte/short/dword/dwordx2/wordx3/dwordx4) 5372 C - print conditional code for s_cbranch (_sccz/_sccnz/_vccz/_vccnz...) 5373 c - print inverse conditional code for s_cbranch 5374 D - print conditional code for s_cmp (eq_u64/lg_u64...) 5375 E - print conditional code for v_cmp (eq_u64/ne_u64...) 5376 A - print address in formatting suitable for given address space. 5377 O - print offset:n for data share operations. 5378 ^ - print "_co" suffix for GCN5 mnemonics 5379 g - print "glc", if appropriate for given MEM 5380 */ 5381 5382 void 5383 print_operand (FILE *file, rtx x, int code) 5384 { 5385 int xcode = x ? GET_CODE (x) : 0; 5386 bool invert = false; 5387 switch (code) 5388 { 5389 /* Instructions have the following suffixes. 5390 If there are two suffixes, the first is the destination type, 5391 and the second is the source type. 5392 5393 B32 Bitfield (untyped data) 32-bit 5394 B64 Bitfield (untyped data) 64-bit 5395 F16 floating-point 16-bit 5396 F32 floating-point 32-bit (IEEE 754 single-precision float) 5397 F64 floating-point 64-bit (IEEE 754 double-precision float) 5398 I16 signed 32-bit integer 5399 I32 signed 32-bit integer 5400 I64 signed 64-bit integer 5401 U16 unsigned 32-bit integer 5402 U32 unsigned 32-bit integer 5403 U64 unsigned 64-bit integer */ 5404 5405 /* Print operand size as untyped suffix. 
*/ 5406 case 'b': 5407 { 5408 const char *s = ""; 5409 machine_mode mode = GET_MODE (x); 5410 if (VECTOR_MODE_P (mode)) 5411 mode = GET_MODE_INNER (mode); 5412 switch (GET_MODE_SIZE (mode)) 5413 { 5414 case 1: 5415 s = "_b8"; 5416 break; 5417 case 2: 5418 s = "_b16"; 5419 break; 5420 case 4: 5421 s = "_b32"; 5422 break; 5423 case 8: 5424 s = "_b64"; 5425 break; 5426 default: 5427 output_operand_lossage ("invalid operand %%xn code"); 5428 return; 5429 } 5430 fputs (s, file); 5431 } 5432 return; 5433 case 'B': 5434 { 5435 const char *s = ""; 5436 machine_mode mode = GET_MODE (x); 5437 if (VECTOR_MODE_P (mode)) 5438 mode = GET_MODE_INNER (mode); 5439 switch (GET_MODE_SIZE (mode)) 5440 { 5441 case 1: 5442 case 2: 5443 case 4: 5444 s = "_b32"; 5445 break; 5446 case 8: 5447 s = "_b64"; 5448 break; 5449 default: 5450 output_operand_lossage ("invalid operand %%xn code"); 5451 return; 5452 } 5453 fputs (s, file); 5454 } 5455 return; 5456 case 'e': 5457 fputs ("sext(", file); 5458 print_operand (file, x, 0); 5459 fputs (")", file); 5460 return; 5461 case 'i': 5462 case 'u': 5463 { 5464 bool signed_p = code == 'i'; 5465 const char *s = ""; 5466 machine_mode mode = GET_MODE (x); 5467 if (VECTOR_MODE_P (mode)) 5468 mode = GET_MODE_INNER (mode); 5469 if (mode == VOIDmode) 5470 switch (GET_CODE (x)) 5471 { 5472 case CONST_INT: 5473 s = signed_p ? "_i32" : "_u32"; 5474 break; 5475 case CONST_DOUBLE: 5476 s = "_f64"; 5477 break; 5478 default: 5479 output_operand_lossage ("invalid operand %%xn code"); 5480 return; 5481 } 5482 else if (FLOAT_MODE_P (mode)) 5483 switch (GET_MODE_SIZE (mode)) 5484 { 5485 case 2: 5486 s = "_f16"; 5487 break; 5488 case 4: 5489 s = "_f32"; 5490 break; 5491 case 8: 5492 s = "_f64"; 5493 break; 5494 default: 5495 output_operand_lossage ("invalid operand %%xn code"); 5496 return; 5497 } 5498 else 5499 switch (GET_MODE_SIZE (mode)) 5500 { 5501 case 1: 5502 s = signed_p ? "_i8" : "_u8"; 5503 break; 5504 case 2: 5505 s = signed_p ? "_i16" : "_u16"; 5506 break; 5507 case 4: 5508 s = signed_p ? "_i32" : "_u32"; 5509 break; 5510 case 8: 5511 s = signed_p ? "_i64" : "_u64"; 5512 break; 5513 default: 5514 output_operand_lossage ("invalid operand %%xn code"); 5515 return; 5516 } 5517 fputs (s, file); 5518 } 5519 return; 5520 /* Print operand size as untyped suffix. */ 5521 case 'o': 5522 { 5523 const char *s = 0; 5524 switch (GET_MODE_SIZE (GET_MODE (x))) 5525 { 5526 case 1: 5527 s = "_ubyte"; 5528 break; 5529 case 2: 5530 s = "_ushort"; 5531 break; 5532 /* The following are full-vector variants. */ 5533 case 64: 5534 s = "_ubyte"; 5535 break; 5536 case 128: 5537 s = "_ushort"; 5538 break; 5539 } 5540 5541 if (s) 5542 { 5543 fputs (s, file); 5544 return; 5545 } 5546 5547 /* Fall-through - the other cases for 'o' are the same as for 's'. */ 5548 gcc_fallthrough(); 5549 } 5550 case 's': 5551 { 5552 const char *s = ""; 5553 switch (GET_MODE_SIZE (GET_MODE (x))) 5554 { 5555 case 1: 5556 s = "_byte"; 5557 break; 5558 case 2: 5559 s = "_short"; 5560 break; 5561 case 4: 5562 s = "_dword"; 5563 break; 5564 case 8: 5565 s = "_dwordx2"; 5566 break; 5567 case 12: 5568 s = "_dwordx3"; 5569 break; 5570 case 16: 5571 s = "_dwordx4"; 5572 break; 5573 case 32: 5574 s = "_dwordx8"; 5575 break; 5576 case 64: 5577 s = VECTOR_MODE_P (GET_MODE (x)) ? "_byte" : "_dwordx16"; 5578 break; 5579 /* The following are full-vector variants. 
*/ 5580 case 128: 5581 s = "_short"; 5582 break; 5583 case 256: 5584 s = "_dword"; 5585 break; 5586 case 512: 5587 s = "_dwordx2"; 5588 break; 5589 default: 5590 output_operand_lossage ("invalid operand %%xn code"); 5591 return; 5592 } 5593 fputs (s, file); 5594 } 5595 return; 5596 case 'A': 5597 if (xcode != MEM) 5598 { 5599 output_operand_lossage ("invalid %%xn code"); 5600 return; 5601 } 5602 print_operand_address (file, x); 5603 return; 5604 case 'O': 5605 { 5606 if (xcode != MEM) 5607 { 5608 output_operand_lossage ("invalid %%xn code"); 5609 return; 5610 } 5611 if (AS_GDS_P (MEM_ADDR_SPACE (x))) 5612 fprintf (file, " gds"); 5613 5614 rtx x0 = XEXP (x, 0); 5615 if (AS_GLOBAL_P (MEM_ADDR_SPACE (x))) 5616 { 5617 gcc_assert (TARGET_GCN5_PLUS); 5618 5619 fprintf (file, ", "); 5620 5621 rtx base = x0; 5622 rtx const_offset = NULL_RTX; 5623 5624 if (GET_CODE (base) == PLUS) 5625 { 5626 rtx offset = XEXP (x0, 1); 5627 base = XEXP (x0, 0); 5628 5629 if (GET_CODE (base) == PLUS) 5630 /* (SGPR + VGPR) + CONST */ 5631 /* Ignore the VGPR offset for this operand. */ 5632 base = XEXP (base, 0); 5633 5634 if (CONST_INT_P (offset)) 5635 const_offset = XEXP (x0, 1); 5636 else if (REG_P (offset)) 5637 /* SGPR + VGPR */ 5638 /* Ignore the VGPR offset for this operand. */ 5639 ; 5640 else 5641 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address"); 5642 } 5643 5644 if (REG_P (base)) 5645 { 5646 if (VGPR_REGNO_P (REGNO (base))) 5647 /* The VGPR address is specified in the %A operand. */ 5648 fprintf (file, "off"); 5649 else if (SGPR_REGNO_P (REGNO (base))) 5650 print_reg (file, base); 5651 else 5652 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address"); 5653 } 5654 else 5655 output_operand_lossage ("bad ADDR_SPACE_GLOBAL address"); 5656 5657 if (const_offset != NULL_RTX) 5658 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, 5659 INTVAL (const_offset)); 5660 5661 return; 5662 } 5663 5664 if (GET_CODE (x0) == REG) 5665 return; 5666 if (GET_CODE (x0) != PLUS) 5667 { 5668 output_operand_lossage ("invalid %%xn code"); 5669 return; 5670 } 5671 rtx val = XEXP (x0, 1); 5672 if (GET_CODE (val) == CONST_VECTOR) 5673 val = CONST_VECTOR_ELT (val, 0); 5674 if (GET_CODE (val) != CONST_INT) 5675 { 5676 output_operand_lossage ("invalid %%xn code"); 5677 return; 5678 } 5679 fprintf (file, " offset:" HOST_WIDE_INT_PRINT_DEC, INTVAL (val)); 5680 5681 } 5682 return; 5683 case 'c': 5684 invert = true; 5685 /* Fall through. */ 5686 case 'C': 5687 { 5688 const char *s; 5689 bool num = false; 5690 if ((xcode != EQ && xcode != NE) || !REG_P (XEXP (x, 0))) 5691 { 5692 output_operand_lossage ("invalid %%xn code"); 5693 return; 5694 } 5695 switch (REGNO (XEXP (x, 0))) 5696 { 5697 case VCC_REG: 5698 case VCCZ_REG: 5699 s = "_vcc"; 5700 break; 5701 case SCC_REG: 5702 /* For some reason llvm-mc insists on scc0 instead of sccz. */ 5703 num = true; 5704 s = "_scc"; 5705 break; 5706 case EXECZ_REG: 5707 s = "_exec"; 5708 break; 5709 default: 5710 output_operand_lossage ("invalid %%xn code"); 5711 return; 5712 } 5713 fputs (s, file); 5714 if (xcode == (invert ? NE : EQ)) 5715 fputc (num ? '0' : 'z', file); 5716 else 5717 fputs (num ? 
"1" : "nz", file); 5718 return; 5719 } 5720 case 'D': 5721 { 5722 const char *s; 5723 bool cmp_signed = false; 5724 switch (xcode) 5725 { 5726 case EQ: 5727 s = "_eq_"; 5728 break; 5729 case NE: 5730 s = "_lg_"; 5731 break; 5732 case LT: 5733 s = "_lt_"; 5734 cmp_signed = true; 5735 break; 5736 case LE: 5737 s = "_le_"; 5738 cmp_signed = true; 5739 break; 5740 case GT: 5741 s = "_gt_"; 5742 cmp_signed = true; 5743 break; 5744 case GE: 5745 s = "_ge_"; 5746 cmp_signed = true; 5747 break; 5748 case LTU: 5749 s = "_lt_"; 5750 break; 5751 case LEU: 5752 s = "_le_"; 5753 break; 5754 case GTU: 5755 s = "_gt_"; 5756 break; 5757 case GEU: 5758 s = "_ge_"; 5759 break; 5760 default: 5761 output_operand_lossage ("invalid %%xn code"); 5762 return; 5763 } 5764 fputs (s, file); 5765 fputc (cmp_signed ? 'i' : 'u', file); 5766 5767 machine_mode mode = GET_MODE (XEXP (x, 0)); 5768 5769 if (mode == VOIDmode) 5770 mode = GET_MODE (XEXP (x, 1)); 5771 5772 /* If both sides are constants, then assume the instruction is in 5773 SImode since s_cmp can only do integer compares. */ 5774 if (mode == VOIDmode) 5775 mode = SImode; 5776 5777 switch (GET_MODE_SIZE (mode)) 5778 { 5779 case 4: 5780 s = "32"; 5781 break; 5782 case 8: 5783 s = "64"; 5784 break; 5785 default: 5786 output_operand_lossage ("invalid operand %%xn code"); 5787 return; 5788 } 5789 fputs (s, file); 5790 return; 5791 } 5792 case 'E': 5793 { 5794 const char *s; 5795 bool cmp_signed = false; 5796 machine_mode mode = GET_MODE (XEXP (x, 0)); 5797 5798 if (mode == VOIDmode) 5799 mode = GET_MODE (XEXP (x, 1)); 5800 5801 /* If both sides are constants, assume the instruction is in SFmode 5802 if either operand is floating point, otherwise assume SImode. */ 5803 if (mode == VOIDmode) 5804 { 5805 if (GET_CODE (XEXP (x, 0)) == CONST_DOUBLE 5806 || GET_CODE (XEXP (x, 1)) == CONST_DOUBLE) 5807 mode = SFmode; 5808 else 5809 mode = SImode; 5810 } 5811 5812 /* Use the same format code for vector comparisons. */ 5813 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT 5814 || GET_MODE_CLASS (mode) == MODE_VECTOR_INT) 5815 mode = GET_MODE_INNER (mode); 5816 5817 bool float_p = GET_MODE_CLASS (mode) == MODE_FLOAT; 5818 5819 switch (xcode) 5820 { 5821 case EQ: 5822 s = "_eq_"; 5823 break; 5824 case NE: 5825 s = float_p ? "_neq_" : "_ne_"; 5826 break; 5827 case LT: 5828 s = "_lt_"; 5829 cmp_signed = true; 5830 break; 5831 case LE: 5832 s = "_le_"; 5833 cmp_signed = true; 5834 break; 5835 case GT: 5836 s = "_gt_"; 5837 cmp_signed = true; 5838 break; 5839 case GE: 5840 s = "_ge_"; 5841 cmp_signed = true; 5842 break; 5843 case LTU: 5844 s = "_lt_"; 5845 break; 5846 case LEU: 5847 s = "_le_"; 5848 break; 5849 case GTU: 5850 s = "_gt_"; 5851 break; 5852 case GEU: 5853 s = "_ge_"; 5854 break; 5855 case ORDERED: 5856 s = "_o_"; 5857 break; 5858 case UNORDERED: 5859 s = "_u_"; 5860 break; 5861 default: 5862 output_operand_lossage ("invalid %%xn code"); 5863 return; 5864 } 5865 fputs (s, file); 5866 fputc (float_p ? 'f' : cmp_signed ? 'i' : 'u', file); 5867 5868 switch (GET_MODE_SIZE (mode)) 5869 { 5870 case 1: 5871 s = "32"; 5872 break; 5873 case 2: 5874 s = float_p ? 
"16" : "32"; 5875 break; 5876 case 4: 5877 s = "32"; 5878 break; 5879 case 8: 5880 s = "64"; 5881 break; 5882 default: 5883 output_operand_lossage ("invalid operand %%xn code"); 5884 return; 5885 } 5886 fputs (s, file); 5887 return; 5888 } 5889 case 'L': 5890 print_operand (file, gcn_operand_part (GET_MODE (x), x, 0), 0); 5891 return; 5892 case 'H': 5893 print_operand (file, gcn_operand_part (GET_MODE (x), x, 1), 0); 5894 return; 5895 case 'R': 5896 /* Print a scalar register number as an integer. Temporary hack. */ 5897 gcc_assert (REG_P (x)); 5898 fprintf (file, "%u", (int) REGNO (x)); 5899 return; 5900 case 'V': 5901 /* Print a vector register number as an integer. Temporary hack. */ 5902 gcc_assert (REG_P (x)); 5903 fprintf (file, "%u", (int) REGNO (x) - FIRST_VGPR_REG); 5904 return; 5905 case 0: 5906 if (xcode == REG) 5907 print_reg (file, x); 5908 else if (xcode == MEM) 5909 output_address (GET_MODE (x), x); 5910 else if (xcode == CONST_INT) 5911 fprintf (file, "%i", (int) INTVAL (x)); 5912 else if (xcode == CONST_VECTOR) 5913 print_operand (file, CONST_VECTOR_ELT (x, 0), code); 5914 else if (xcode == CONST_DOUBLE) 5915 { 5916 const char *str; 5917 switch (gcn_inline_fp_constant_p (x, false)) 5918 { 5919 case 240: 5920 str = "0.5"; 5921 break; 5922 case 241: 5923 str = "-0.5"; 5924 break; 5925 case 242: 5926 str = "1.0"; 5927 break; 5928 case 243: 5929 str = "-1.0"; 5930 break; 5931 case 244: 5932 str = "2.0"; 5933 break; 5934 case 245: 5935 str = "-2.0"; 5936 break; 5937 case 246: 5938 str = "4.0"; 5939 break; 5940 case 247: 5941 str = "-4.0"; 5942 break; 5943 case 248: 5944 str = "1/pi"; 5945 break; 5946 default: 5947 rtx ix = simplify_gen_subreg (GET_MODE (x) == DFmode 5948 ? DImode : SImode, 5949 x, GET_MODE (x), 0); 5950 if (x) 5951 print_operand (file, ix, code); 5952 else 5953 output_operand_lossage ("invalid fp constant"); 5954 return; 5955 break; 5956 } 5957 fprintf (file, str); 5958 return; 5959 } 5960 else 5961 output_addr_const (file, x); 5962 return; 5963 case '^': 5964 if (TARGET_GCN5_PLUS) 5965 fputs ("_co", file); 5966 return; 5967 case 'g': 5968 gcc_assert (xcode == MEM); 5969 if (MEM_VOLATILE_P (x)) 5970 fputs (" glc", file); 5971 return; 5972 default: 5973 output_operand_lossage ("invalid %%xn code"); 5974 } 5975 gcc_unreachable (); 5976 } 5977 5978 /* }}} */ 5979 /* {{{ TARGET hook overrides. 
/* }}} */
/* {{{ TARGET hook overrides.  */

#undef  TARGET_ADDR_SPACE_ADDRESS_MODE
#define TARGET_ADDR_SPACE_ADDRESS_MODE gcn_addr_space_address_mode
#undef  TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
	gcn_addr_space_legitimate_address_p
#undef  TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS gcn_addr_space_legitimize_address
#undef  TARGET_ADDR_SPACE_POINTER_MODE
#define TARGET_ADDR_SPACE_POINTER_MODE gcn_addr_space_pointer_mode
#undef  TARGET_ADDR_SPACE_SUBSET_P
#define TARGET_ADDR_SPACE_SUBSET_P gcn_addr_space_subset_p
#undef  TARGET_ADDR_SPACE_CONVERT
#define TARGET_ADDR_SPACE_CONVERT gcn_addr_space_convert
#undef  TARGET_ARG_PARTIAL_BYTES
#define TARGET_ARG_PARTIAL_BYTES gcn_arg_partial_bytes
#undef  TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.8byte\t"
#undef  TARGET_ASM_CONSTRUCTOR
#define TARGET_ASM_CONSTRUCTOR gcn_disable_constructors
#undef  TARGET_ASM_DESTRUCTOR
#define TARGET_ASM_DESTRUCTOR gcn_disable_constructors
#undef  TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START output_file_start
#undef  TARGET_ASM_FUNCTION_PROLOGUE
#define TARGET_ASM_FUNCTION_PROLOGUE gcn_target_asm_function_prologue
#undef  TARGET_ASM_SELECT_SECTION
#define TARGET_ASM_SELECT_SECTION gcn_asm_select_section
#undef  TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE gcn_asm_trampoline_template
#undef  TARGET_ATTRIBUTE_TABLE
#define TARGET_ATTRIBUTE_TABLE gcn_attribute_table
#undef  TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL gcn_builtin_decl
#undef  TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS gcn_can_change_mode_class
#undef  TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE gcn_can_eliminate_p
#undef  TARGET_CANNOT_COPY_INSN_P
#define TARGET_CANNOT_COPY_INSN_P gcn_cannot_copy_insn_p
#undef  TARGET_CLASS_LIKELY_SPILLED_P
#define TARGET_CLASS_LIKELY_SPILLED_P gcn_class_likely_spilled_p
#undef  TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS gcn_class_max_nregs
#undef  TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE gcn_conditional_register_usage
#undef  TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT gcn_constant_alignment
#undef  TARGET_DEBUG_UNWIND_INFO
#define TARGET_DEBUG_UNWIND_INFO gcn_debug_unwind_info
#undef  TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN gcn_expand_builtin
#undef  TARGET_FUNCTION_ARG
#undef  TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE gcn_function_arg_advance
#define TARGET_FUNCTION_ARG gcn_function_arg
#undef  TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE gcn_function_value
#undef  TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P gcn_function_value_regno_p
#undef  TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR gcn_gimplify_va_arg_expr
#undef  TARGET_GOACC_ADJUST_PROPAGATION_RECORD
#define TARGET_GOACC_ADJUST_PROPAGATION_RECORD \
	gcn_goacc_adjust_propagation_record
#undef  TARGET_GOACC_ADJUST_GANGPRIVATE_DECL
#define TARGET_GOACC_ADJUST_GANGPRIVATE_DECL gcn_goacc_adjust_gangprivate_decl
#undef  TARGET_GOACC_FORK_JOIN
#define TARGET_GOACC_FORK_JOIN gcn_fork_join
#undef  TARGET_GOACC_REDUCTION
#define TARGET_GOACC_REDUCTION gcn_goacc_reduction
#undef  TARGET_GOACC_VALIDATE_DIMS
#define TARGET_GOACC_VALIDATE_DIMS gcn_goacc_validate_dims
#undef  TARGET_GOACC_WORKER_PARTITIONING
#define TARGET_GOACC_WORKER_PARTITIONING true
#undef  TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK gcn_hard_regno_mode_ok
#undef  TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS gcn_hard_regno_nregs
#undef  TARGET_HAVE_SPECULATION_SAFE_VALUE
#define TARGET_HAVE_SPECULATION_SAFE_VALUE speculation_safe_value_not_needed
#undef  TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS gcn_init_builtins
#undef  TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
	gcn_ira_change_pseudo_allocno_class
#undef  TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P gcn_legitimate_constant_p
#undef  TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true
#undef  TARGET_MACHINE_DEPENDENT_REORG
#define TARGET_MACHINE_DEPENDENT_REORG gcn_md_reorg
#undef  TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST gcn_memory_move_cost
#undef  TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P gcn_modes_tieable_p
#undef  TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE gcn_option_override
#undef  TARGET_PRETEND_OUTGOING_VARARGS_NAMED
#define TARGET_PRETEND_OUTGOING_VARARGS_NAMED \
	gcn_pretend_outgoing_varargs_named
#undef  TARGET_PROMOTE_FUNCTION_MODE
#define TARGET_PROMOTE_FUNCTION_MODE gcn_promote_function_mode
#undef  TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST gcn_register_move_cost
#undef  TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY gcn_return_in_memory
#undef  TARGET_RTX_COSTS
#define TARGET_RTX_COSTS gcn_rtx_costs
#undef  TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD gcn_secondary_reload
#undef  TARGET_SECTION_TYPE_FLAGS
#define TARGET_SECTION_TYPE_FLAGS gcn_section_type_flags
#undef  TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P
#define TARGET_SMALL_REGISTER_CLASSES_FOR_MODE_P \
	gcn_small_register_classes_for_mode_p
#undef  TARGET_SPILL_CLASS
#define TARGET_SPILL_CLASS gcn_spill_class
#undef  TARGET_STRICT_ARGUMENT_NAMING
#define TARGET_STRICT_ARGUMENT_NAMING gcn_strict_argument_naming
#undef  TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT gcn_trampoline_init
#undef  TARGET_TRULY_NOOP_TRUNCATION
#define TARGET_TRULY_NOOP_TRUNCATION gcn_truly_noop_truncation
#undef  TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST gcn_vectorization_cost
#undef  TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE gcn_vectorize_get_mask_mode
#undef  TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE gcn_vectorize_preferred_simd_mode
#undef  TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
	gcn_preferred_vector_alignment
#undef  TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
	gcn_vectorize_support_vector_misalignment
#undef  TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST gcn_vectorize_vec_perm_const
#undef  TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
	gcn_vector_alignment_reachable
#undef  TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P gcn_vector_mode_supported_p

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-gcn.h"
/* }}} */