1 /* Machine description for AArch64 architecture. 2 Copyright (C) 2009-2019 Free Software Foundation, Inc. 3 Contributed by ARM Ltd. 4 5 This file is part of GCC. 6 7 GCC is free software; you can redistribute it and/or modify it 8 under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 3, or (at your option) 10 any later version. 11 12 GCC is distributed in the hope that it will be useful, but 13 WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with GCC; see the file COPYING3. If not see 19 <http://www.gnu.org/licenses/>. */ 20 21 #define IN_TARGET_CODE 1 22 23 #include "config.h" 24 #define INCLUDE_STRING 25 #include "system.h" 26 #include "coretypes.h" 27 #include "backend.h" 28 #include "target.h" 29 #include "rtl.h" 30 #include "tree.h" 31 #include "memmodel.h" 32 #include "gimple.h" 33 #include "cfghooks.h" 34 #include "cfgloop.h" 35 #include "df.h" 36 #include "tm_p.h" 37 #include "stringpool.h" 38 #include "attribs.h" 39 #include "optabs.h" 40 #include "regs.h" 41 #include "emit-rtl.h" 42 #include "recog.h" 43 #include "cgraph.h" 44 #include "diagnostic.h" 45 #include "insn-attr.h" 46 #include "alias.h" 47 #include "fold-const.h" 48 #include "stor-layout.h" 49 #include "calls.h" 50 #include "varasm.h" 51 #include "output.h" 52 #include "flags.h" 53 #include "explow.h" 54 #include "expr.h" 55 #include "reload.h" 56 #include "langhooks.h" 57 #include "opts.h" 58 #include "params.h" 59 #include "gimplify.h" 60 #include "dwarf2.h" 61 #include "gimple-iterator.h" 62 #include "tree-vectorizer.h" 63 #include "aarch64-cost-tables.h" 64 #include "dumpfile.h" 65 #include "builtins.h" 66 #include "rtl-iter.h" 67 #include "tm-constrs.h" 68 #include "sched-int.h" 69 #include "target-globals.h" 70 #include "common/common-target.h" 71 #include "cfgrtl.h" 72 #include "selftest.h" 73 #include "selftest-rtl.h" 74 #include "rtx-vector-builder.h" 75 #include "intl.h" 76 77 /* This file should be included last. */ 78 #include "target-def.h" 79 80 /* Defined for convenience. */ 81 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT) 82 83 /* Information about a legitimate vector immediate operand. */ 84 struct simd_immediate_info 85 { 86 enum insn_type { MOV, MVN }; 87 enum modifier_type { LSL, MSL }; 88 89 simd_immediate_info () {} 90 simd_immediate_info (scalar_float_mode, rtx); 91 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT, 92 insn_type = MOV, modifier_type = LSL, 93 unsigned int = 0); 94 simd_immediate_info (scalar_mode, rtx, rtx); 95 96 /* The mode of the elements. */ 97 scalar_mode elt_mode; 98 99 /* The value of each element if all elements are the same, or the 100 first value if the constant is a series. */ 101 rtx value; 102 103 /* The value of the step if the constant is a series, null otherwise. */ 104 rtx step; 105 106 /* The instruction to use to move the immediate into a vector. */ 107 insn_type insn; 108 109 /* The kind of shift modifier to use, and the number of bits to shift. 110 This is (LSL, 0) if no shift is needed. */ 111 modifier_type modifier; 112 unsigned int shift; 113 }; 114 115 /* Construct a floating-point immediate in which each element has mode 116 ELT_MODE_IN and value VALUE_IN. 
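   Such a constant is recorded as a MOV with no shift: the constructor below
   sets INSN to MOV, MODIFIER to LSL and SHIFT to 0, and VALUE_IN (typically a
   CONST_DOUBLE) is the value that every element of the vector duplicates.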
*/ 117 inline simd_immediate_info 118 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in) 119 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV), 120 modifier (LSL), shift (0) 121 {} 122 123 /* Construct an integer immediate in which each element has mode ELT_MODE_IN 124 and value VALUE_IN. The other parameters are as for the structure 125 fields. */ 126 inline simd_immediate_info 127 ::simd_immediate_info (scalar_int_mode elt_mode_in, 128 unsigned HOST_WIDE_INT value_in, 129 insn_type insn_in, modifier_type modifier_in, 130 unsigned int shift_in) 131 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)), 132 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in) 133 {} 134 135 /* Construct an integer immediate in which each element has mode ELT_MODE_IN 136 and where element I is equal to VALUE_IN + I * STEP_IN. */ 137 inline simd_immediate_info 138 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in) 139 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV), 140 modifier (LSL), shift (0) 141 {} 142 143 /* The current code model. */ 144 enum aarch64_code_model aarch64_cmodel; 145 146 /* The number of 64-bit elements in an SVE vector. */ 147 poly_uint16 aarch64_sve_vg; 148 149 #ifdef HAVE_AS_TLS 150 #undef TARGET_HAVE_TLS 151 #define TARGET_HAVE_TLS 1 152 #endif 153 154 static bool aarch64_composite_type_p (const_tree, machine_mode); 155 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode, 156 const_tree, 157 machine_mode *, int *, 158 bool *); 159 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED; 160 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED; 161 static void aarch64_override_options_after_change (void); 162 static bool aarch64_vector_mode_supported_p (machine_mode); 163 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool); 164 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode, 165 const_tree type, 166 int misalignment, 167 bool is_packed); 168 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64); 169 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx, 170 aarch64_addr_query_type); 171 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val); 172 173 /* Major revision number of the ARM Architecture implemented by the target. */ 174 unsigned aarch64_architecture_version; 175 176 /* The processor for which instructions should be scheduled. */ 177 enum aarch64_processor aarch64_tune = cortexa53; 178 179 /* Mask to specify which instruction scheduling options should be used. */ 180 unsigned long aarch64_tune_flags = 0; 181 182 /* Global flag for PC relative loads. */ 183 bool aarch64_pcrelative_literal_loads; 184 185 /* Global flag for whether frame pointer is enabled. */ 186 bool aarch64_use_frame_pointer; 187 188 #define BRANCH_PROTECT_STR_MAX 255 189 char *accepted_branch_protection_string = NULL; 190 191 static enum aarch64_parse_opt_result 192 aarch64_parse_branch_protection (const char*, char**); 193 194 /* Support for command line parsing of boolean flags in the tuning 195 structures. 
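   Each entry pairs the name accepted on the command line (e.g. in the
   "fuse=" and "tune=" strings of the -moverride option) with the internal
   bitmask it selects; the tables below are terminated by a null entry.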
*/ 196 struct aarch64_flag_desc 197 { 198 const char* name; 199 unsigned int flag; 200 }; 201 202 #define AARCH64_FUSION_PAIR(name, internal_name) \ 203 { name, AARCH64_FUSE_##internal_name }, 204 static const struct aarch64_flag_desc aarch64_fusible_pairs[] = 205 { 206 { "none", AARCH64_FUSE_NOTHING }, 207 #include "aarch64-fusion-pairs.def" 208 { "all", AARCH64_FUSE_ALL }, 209 { NULL, AARCH64_FUSE_NOTHING } 210 }; 211 212 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \ 213 { name, AARCH64_EXTRA_TUNE_##internal_name }, 214 static const struct aarch64_flag_desc aarch64_tuning_flags[] = 215 { 216 { "none", AARCH64_EXTRA_TUNE_NONE }, 217 #include "aarch64-tuning-flags.def" 218 { "all", AARCH64_EXTRA_TUNE_ALL }, 219 { NULL, AARCH64_EXTRA_TUNE_NONE } 220 }; 221 222 /* Tuning parameters. */ 223 224 static const struct cpu_addrcost_table generic_addrcost_table = 225 { 226 { 227 1, /* hi */ 228 0, /* si */ 229 0, /* di */ 230 1, /* ti */ 231 }, 232 0, /* pre_modify */ 233 0, /* post_modify */ 234 0, /* register_offset */ 235 0, /* register_sextend */ 236 0, /* register_zextend */ 237 0 /* imm_offset */ 238 }; 239 240 static const struct cpu_addrcost_table exynosm1_addrcost_table = 241 { 242 { 243 0, /* hi */ 244 0, /* si */ 245 0, /* di */ 246 2, /* ti */ 247 }, 248 0, /* pre_modify */ 249 0, /* post_modify */ 250 1, /* register_offset */ 251 1, /* register_sextend */ 252 2, /* register_zextend */ 253 0, /* imm_offset */ 254 }; 255 256 static const struct cpu_addrcost_table xgene1_addrcost_table = 257 { 258 { 259 1, /* hi */ 260 0, /* si */ 261 0, /* di */ 262 1, /* ti */ 263 }, 264 1, /* pre_modify */ 265 1, /* post_modify */ 266 0, /* register_offset */ 267 1, /* register_sextend */ 268 1, /* register_zextend */ 269 0, /* imm_offset */ 270 }; 271 272 static const struct cpu_addrcost_table thunderx2t99_addrcost_table = 273 { 274 { 275 1, /* hi */ 276 1, /* si */ 277 1, /* di */ 278 2, /* ti */ 279 }, 280 0, /* pre_modify */ 281 0, /* post_modify */ 282 2, /* register_offset */ 283 3, /* register_sextend */ 284 3, /* register_zextend */ 285 0, /* imm_offset */ 286 }; 287 288 static const struct cpu_addrcost_table tsv110_addrcost_table = 289 { 290 { 291 1, /* hi */ 292 0, /* si */ 293 0, /* di */ 294 1, /* ti */ 295 }, 296 0, /* pre_modify */ 297 0, /* post_modify */ 298 0, /* register_offset */ 299 1, /* register_sextend */ 300 1, /* register_zextend */ 301 0, /* imm_offset */ 302 }; 303 304 static const struct cpu_addrcost_table qdf24xx_addrcost_table = 305 { 306 { 307 1, /* hi */ 308 1, /* si */ 309 1, /* di */ 310 2, /* ti */ 311 }, 312 1, /* pre_modify */ 313 1, /* post_modify */ 314 3, /* register_offset */ 315 3, /* register_sextend */ 316 3, /* register_zextend */ 317 2, /* imm_offset */ 318 }; 319 320 static const struct cpu_regmove_cost generic_regmove_cost = 321 { 322 1, /* GP2GP */ 323 /* Avoid the use of slow int<->fp moves for spilling by setting 324 their cost higher than memmov_cost. */ 325 5, /* GP2FP */ 326 5, /* FP2GP */ 327 2 /* FP2FP */ 328 }; 329 330 static const struct cpu_regmove_cost cortexa57_regmove_cost = 331 { 332 1, /* GP2GP */ 333 /* Avoid the use of slow int<->fp moves for spilling by setting 334 their cost higher than memmov_cost. */ 335 5, /* GP2FP */ 336 5, /* FP2GP */ 337 2 /* FP2FP */ 338 }; 339 340 static const struct cpu_regmove_cost cortexa53_regmove_cost = 341 { 342 1, /* GP2GP */ 343 /* Avoid the use of slow int<->fp moves for spilling by setting 344 their cost higher than memmov_cost. 
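   (The cores that use this table set memmov_cost to 4, so a cost of 5 is
   just enough to make spilling through memory the cheaper choice.)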
*/ 345 5, /* GP2FP */ 346 5, /* FP2GP */ 347 2 /* FP2FP */ 348 }; 349 350 static const struct cpu_regmove_cost exynosm1_regmove_cost = 351 { 352 1, /* GP2GP */ 353 /* Avoid the use of slow int<->fp moves for spilling by setting 354 their cost higher than memmov_cost (actual, 4 and 9). */ 355 9, /* GP2FP */ 356 9, /* FP2GP */ 357 1 /* FP2FP */ 358 }; 359 360 static const struct cpu_regmove_cost thunderx_regmove_cost = 361 { 362 2, /* GP2GP */ 363 2, /* GP2FP */ 364 6, /* FP2GP */ 365 4 /* FP2FP */ 366 }; 367 368 static const struct cpu_regmove_cost xgene1_regmove_cost = 369 { 370 1, /* GP2GP */ 371 /* Avoid the use of slow int<->fp moves for spilling by setting 372 their cost higher than memmov_cost. */ 373 8, /* GP2FP */ 374 8, /* FP2GP */ 375 2 /* FP2FP */ 376 }; 377 378 static const struct cpu_regmove_cost qdf24xx_regmove_cost = 379 { 380 2, /* GP2GP */ 381 /* Avoid the use of int<->fp moves for spilling. */ 382 6, /* GP2FP */ 383 6, /* FP2GP */ 384 4 /* FP2FP */ 385 }; 386 387 static const struct cpu_regmove_cost thunderx2t99_regmove_cost = 388 { 389 1, /* GP2GP */ 390 /* Avoid the use of int<->fp moves for spilling. */ 391 8, /* GP2FP */ 392 8, /* FP2GP */ 393 4 /* FP2FP */ 394 }; 395 396 static const struct cpu_regmove_cost tsv110_regmove_cost = 397 { 398 1, /* GP2GP */ 399 /* Avoid the use of slow int<->fp moves for spilling by setting 400 their cost higher than memmov_cost. */ 401 2, /* GP2FP */ 402 3, /* FP2GP */ 403 2 /* FP2FP */ 404 }; 405 406 /* Generic costs for vector insn classes. */ 407 static const struct cpu_vector_cost generic_vector_cost = 408 { 409 1, /* scalar_int_stmt_cost */ 410 1, /* scalar_fp_stmt_cost */ 411 1, /* scalar_load_cost */ 412 1, /* scalar_store_cost */ 413 1, /* vec_int_stmt_cost */ 414 1, /* vec_fp_stmt_cost */ 415 2, /* vec_permute_cost */ 416 1, /* vec_to_scalar_cost */ 417 1, /* scalar_to_vec_cost */ 418 1, /* vec_align_load_cost */ 419 1, /* vec_unalign_load_cost */ 420 1, /* vec_unalign_store_cost */ 421 1, /* vec_store_cost */ 422 3, /* cond_taken_branch_cost */ 423 1 /* cond_not_taken_branch_cost */ 424 }; 425 426 /* QDF24XX costs for vector insn classes. */ 427 static const struct cpu_vector_cost qdf24xx_vector_cost = 428 { 429 1, /* scalar_int_stmt_cost */ 430 1, /* scalar_fp_stmt_cost */ 431 1, /* scalar_load_cost */ 432 1, /* scalar_store_cost */ 433 1, /* vec_int_stmt_cost */ 434 3, /* vec_fp_stmt_cost */ 435 2, /* vec_permute_cost */ 436 1, /* vec_to_scalar_cost */ 437 1, /* scalar_to_vec_cost */ 438 1, /* vec_align_load_cost */ 439 1, /* vec_unalign_load_cost */ 440 1, /* vec_unalign_store_cost */ 441 1, /* vec_store_cost */ 442 3, /* cond_taken_branch_cost */ 443 1 /* cond_not_taken_branch_cost */ 444 }; 445 446 /* ThunderX costs for vector insn classes. 
*/ 447 static const struct cpu_vector_cost thunderx_vector_cost = 448 { 449 1, /* scalar_int_stmt_cost */ 450 1, /* scalar_fp_stmt_cost */ 451 3, /* scalar_load_cost */ 452 1, /* scalar_store_cost */ 453 4, /* vec_int_stmt_cost */ 454 1, /* vec_fp_stmt_cost */ 455 4, /* vec_permute_cost */ 456 2, /* vec_to_scalar_cost */ 457 2, /* scalar_to_vec_cost */ 458 3, /* vec_align_load_cost */ 459 5, /* vec_unalign_load_cost */ 460 5, /* vec_unalign_store_cost */ 461 1, /* vec_store_cost */ 462 3, /* cond_taken_branch_cost */ 463 3 /* cond_not_taken_branch_cost */ 464 }; 465 466 static const struct cpu_vector_cost tsv110_vector_cost = 467 { 468 1, /* scalar_int_stmt_cost */ 469 1, /* scalar_fp_stmt_cost */ 470 5, /* scalar_load_cost */ 471 1, /* scalar_store_cost */ 472 2, /* vec_int_stmt_cost */ 473 2, /* vec_fp_stmt_cost */ 474 2, /* vec_permute_cost */ 475 3, /* vec_to_scalar_cost */ 476 2, /* scalar_to_vec_cost */ 477 5, /* vec_align_load_cost */ 478 5, /* vec_unalign_load_cost */ 479 1, /* vec_unalign_store_cost */ 480 1, /* vec_store_cost */ 481 1, /* cond_taken_branch_cost */ 482 1 /* cond_not_taken_branch_cost */ 483 }; 484 485 /* Generic costs for vector insn classes. */ 486 static const struct cpu_vector_cost cortexa57_vector_cost = 487 { 488 1, /* scalar_int_stmt_cost */ 489 1, /* scalar_fp_stmt_cost */ 490 4, /* scalar_load_cost */ 491 1, /* scalar_store_cost */ 492 2, /* vec_int_stmt_cost */ 493 2, /* vec_fp_stmt_cost */ 494 3, /* vec_permute_cost */ 495 8, /* vec_to_scalar_cost */ 496 8, /* scalar_to_vec_cost */ 497 4, /* vec_align_load_cost */ 498 4, /* vec_unalign_load_cost */ 499 1, /* vec_unalign_store_cost */ 500 1, /* vec_store_cost */ 501 1, /* cond_taken_branch_cost */ 502 1 /* cond_not_taken_branch_cost */ 503 }; 504 505 static const struct cpu_vector_cost exynosm1_vector_cost = 506 { 507 1, /* scalar_int_stmt_cost */ 508 1, /* scalar_fp_stmt_cost */ 509 5, /* scalar_load_cost */ 510 1, /* scalar_store_cost */ 511 3, /* vec_int_stmt_cost */ 512 3, /* vec_fp_stmt_cost */ 513 3, /* vec_permute_cost */ 514 3, /* vec_to_scalar_cost */ 515 3, /* scalar_to_vec_cost */ 516 5, /* vec_align_load_cost */ 517 5, /* vec_unalign_load_cost */ 518 1, /* vec_unalign_store_cost */ 519 1, /* vec_store_cost */ 520 1, /* cond_taken_branch_cost */ 521 1 /* cond_not_taken_branch_cost */ 522 }; 523 524 /* Generic costs for vector insn classes. */ 525 static const struct cpu_vector_cost xgene1_vector_cost = 526 { 527 1, /* scalar_int_stmt_cost */ 528 1, /* scalar_fp_stmt_cost */ 529 5, /* scalar_load_cost */ 530 1, /* scalar_store_cost */ 531 2, /* vec_int_stmt_cost */ 532 2, /* vec_fp_stmt_cost */ 533 2, /* vec_permute_cost */ 534 4, /* vec_to_scalar_cost */ 535 4, /* scalar_to_vec_cost */ 536 10, /* vec_align_load_cost */ 537 10, /* vec_unalign_load_cost */ 538 2, /* vec_unalign_store_cost */ 539 2, /* vec_store_cost */ 540 2, /* cond_taken_branch_cost */ 541 1 /* cond_not_taken_branch_cost */ 542 }; 543 544 /* Costs for vector insn classes for Vulcan. 
*/ 545 static const struct cpu_vector_cost thunderx2t99_vector_cost = 546 { 547 1, /* scalar_int_stmt_cost */ 548 6, /* scalar_fp_stmt_cost */ 549 4, /* scalar_load_cost */ 550 1, /* scalar_store_cost */ 551 5, /* vec_int_stmt_cost */ 552 6, /* vec_fp_stmt_cost */ 553 3, /* vec_permute_cost */ 554 6, /* vec_to_scalar_cost */ 555 5, /* scalar_to_vec_cost */ 556 8, /* vec_align_load_cost */ 557 8, /* vec_unalign_load_cost */ 558 4, /* vec_unalign_store_cost */ 559 4, /* vec_store_cost */ 560 2, /* cond_taken_branch_cost */ 561 1 /* cond_not_taken_branch_cost */ 562 }; 563 564 /* Generic costs for branch instructions. */ 565 static const struct cpu_branch_cost generic_branch_cost = 566 { 567 1, /* Predictable. */ 568 3 /* Unpredictable. */ 569 }; 570 571 /* Generic approximation modes. */ 572 static const cpu_approx_modes generic_approx_modes = 573 { 574 AARCH64_APPROX_NONE, /* division */ 575 AARCH64_APPROX_NONE, /* sqrt */ 576 AARCH64_APPROX_NONE /* recip_sqrt */ 577 }; 578 579 /* Approximation modes for Exynos M1. */ 580 static const cpu_approx_modes exynosm1_approx_modes = 581 { 582 AARCH64_APPROX_NONE, /* division */ 583 AARCH64_APPROX_ALL, /* sqrt */ 584 AARCH64_APPROX_ALL /* recip_sqrt */ 585 }; 586 587 /* Approximation modes for X-Gene 1. */ 588 static const cpu_approx_modes xgene1_approx_modes = 589 { 590 AARCH64_APPROX_NONE, /* division */ 591 AARCH64_APPROX_NONE, /* sqrt */ 592 AARCH64_APPROX_ALL /* recip_sqrt */ 593 }; 594 595 /* Generic prefetch settings (which disable prefetch). */ 596 static const cpu_prefetch_tune generic_prefetch_tune = 597 { 598 0, /* num_slots */ 599 -1, /* l1_cache_size */ 600 -1, /* l1_cache_line_size */ 601 -1, /* l2_cache_size */ 602 true, /* prefetch_dynamic_strides */ 603 -1, /* minimum_stride */ 604 -1 /* default_opt_level */ 605 }; 606 607 static const cpu_prefetch_tune exynosm1_prefetch_tune = 608 { 609 0, /* num_slots */ 610 -1, /* l1_cache_size */ 611 64, /* l1_cache_line_size */ 612 -1, /* l2_cache_size */ 613 true, /* prefetch_dynamic_strides */ 614 -1, /* minimum_stride */ 615 -1 /* default_opt_level */ 616 }; 617 618 static const cpu_prefetch_tune qdf24xx_prefetch_tune = 619 { 620 4, /* num_slots */ 621 32, /* l1_cache_size */ 622 64, /* l1_cache_line_size */ 623 512, /* l2_cache_size */ 624 false, /* prefetch_dynamic_strides */ 625 2048, /* minimum_stride */ 626 3 /* default_opt_level */ 627 }; 628 629 static const cpu_prefetch_tune thunderxt88_prefetch_tune = 630 { 631 8, /* num_slots */ 632 32, /* l1_cache_size */ 633 128, /* l1_cache_line_size */ 634 16*1024, /* l2_cache_size */ 635 true, /* prefetch_dynamic_strides */ 636 -1, /* minimum_stride */ 637 3 /* default_opt_level */ 638 }; 639 640 static const cpu_prefetch_tune thunderx_prefetch_tune = 641 { 642 8, /* num_slots */ 643 32, /* l1_cache_size */ 644 128, /* l1_cache_line_size */ 645 -1, /* l2_cache_size */ 646 true, /* prefetch_dynamic_strides */ 647 -1, /* minimum_stride */ 648 -1 /* default_opt_level */ 649 }; 650 651 static const cpu_prefetch_tune thunderx2t99_prefetch_tune = 652 { 653 8, /* num_slots */ 654 32, /* l1_cache_size */ 655 64, /* l1_cache_line_size */ 656 256, /* l2_cache_size */ 657 true, /* prefetch_dynamic_strides */ 658 -1, /* minimum_stride */ 659 -1 /* default_opt_level */ 660 }; 661 662 static const cpu_prefetch_tune tsv110_prefetch_tune = 663 { 664 0, /* num_slots */ 665 64, /* l1_cache_size */ 666 64, /* l1_cache_line_size */ 667 512, /* l2_cache_size */ 668 true, /* prefetch_dynamic_strides */ 669 -1, /* minimum_stride */ 670 -1 /* default_opt_level */ 671 
}; 672 673 static const cpu_prefetch_tune xgene1_prefetch_tune = 674 { 675 8, /* num_slots */ 676 32, /* l1_cache_size */ 677 64, /* l1_cache_line_size */ 678 256, /* l2_cache_size */ 679 true, /* prefetch_dynamic_strides */ 680 -1, /* minimum_stride */ 681 -1 /* default_opt_level */ 682 }; 683 684 static const struct tune_params generic_tunings = 685 { 686 &cortexa57_extra_costs, 687 &generic_addrcost_table, 688 &generic_regmove_cost, 689 &generic_vector_cost, 690 &generic_branch_cost, 691 &generic_approx_modes, 692 SVE_NOT_IMPLEMENTED, /* sve_width */ 693 4, /* memmov_cost */ 694 2, /* issue_rate */ 695 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ 696 "8", /* function_align. */ 697 "4", /* jump_align. */ 698 "8", /* loop_align. */ 699 2, /* int_reassoc_width. */ 700 4, /* fp_reassoc_width. */ 701 1, /* vec_reassoc_width. */ 702 2, /* min_div_recip_mul_sf. */ 703 2, /* min_div_recip_mul_df. */ 704 0, /* max_case_values. */ 705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 706 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 707 &generic_prefetch_tune 708 }; 709 710 static const struct tune_params cortexa35_tunings = 711 { 712 &cortexa53_extra_costs, 713 &generic_addrcost_table, 714 &cortexa53_regmove_cost, 715 &generic_vector_cost, 716 &generic_branch_cost, 717 &generic_approx_modes, 718 SVE_NOT_IMPLEMENTED, /* sve_width */ 719 4, /* memmov_cost */ 720 1, /* issue_rate */ 721 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 722 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ 723 "16", /* function_align. */ 724 "4", /* jump_align. */ 725 "8", /* loop_align. */ 726 2, /* int_reassoc_width. */ 727 4, /* fp_reassoc_width. */ 728 1, /* vec_reassoc_width. */ 729 2, /* min_div_recip_mul_sf. */ 730 2, /* min_div_recip_mul_df. */ 731 0, /* max_case_values. */ 732 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 733 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 734 &generic_prefetch_tune 735 }; 736 737 static const struct tune_params cortexa53_tunings = 738 { 739 &cortexa53_extra_costs, 740 &generic_addrcost_table, 741 &cortexa53_regmove_cost, 742 &generic_vector_cost, 743 &generic_branch_cost, 744 &generic_approx_modes, 745 SVE_NOT_IMPLEMENTED, /* sve_width */ 746 4, /* memmov_cost */ 747 2, /* issue_rate */ 748 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 749 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ 750 "16", /* function_align. */ 751 "4", /* jump_align. */ 752 "8", /* loop_align. */ 753 2, /* int_reassoc_width. */ 754 4, /* fp_reassoc_width. */ 755 1, /* vec_reassoc_width. */ 756 2, /* min_div_recip_mul_sf. */ 757 2, /* min_div_recip_mul_df. */ 758 0, /* max_case_values. */ 759 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 760 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 761 &generic_prefetch_tune 762 }; 763 764 static const struct tune_params cortexa57_tunings = 765 { 766 &cortexa57_extra_costs, 767 &generic_addrcost_table, 768 &cortexa57_regmove_cost, 769 &cortexa57_vector_cost, 770 &generic_branch_cost, 771 &generic_approx_modes, 772 SVE_NOT_IMPLEMENTED, /* sve_width */ 773 4, /* memmov_cost */ 774 3, /* issue_rate */ 775 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 776 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ 777 "16", /* function_align. */ 778 "4", /* jump_align. */ 779 "8", /* loop_align. */ 780 2, /* int_reassoc_width. */ 781 4, /* fp_reassoc_width. */ 782 1, /* vec_reassoc_width. */ 783 2, /* min_div_recip_mul_sf. 
*/ 784 2, /* min_div_recip_mul_df. */ 785 0, /* max_case_values. */ 786 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 787 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */ 788 &generic_prefetch_tune 789 }; 790 791 static const struct tune_params cortexa72_tunings = 792 { 793 &cortexa57_extra_costs, 794 &generic_addrcost_table, 795 &cortexa57_regmove_cost, 796 &cortexa57_vector_cost, 797 &generic_branch_cost, 798 &generic_approx_modes, 799 SVE_NOT_IMPLEMENTED, /* sve_width */ 800 4, /* memmov_cost */ 801 3, /* issue_rate */ 802 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 803 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */ 804 "16", /* function_align. */ 805 "4", /* jump_align. */ 806 "8", /* loop_align. */ 807 2, /* int_reassoc_width. */ 808 4, /* fp_reassoc_width. */ 809 1, /* vec_reassoc_width. */ 810 2, /* min_div_recip_mul_sf. */ 811 2, /* min_div_recip_mul_df. */ 812 0, /* max_case_values. */ 813 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 814 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 815 &generic_prefetch_tune 816 }; 817 818 static const struct tune_params cortexa73_tunings = 819 { 820 &cortexa57_extra_costs, 821 &generic_addrcost_table, 822 &cortexa57_regmove_cost, 823 &cortexa57_vector_cost, 824 &generic_branch_cost, 825 &generic_approx_modes, 826 SVE_NOT_IMPLEMENTED, /* sve_width */ 827 4, /* memmov_cost. */ 828 2, /* issue_rate. */ 829 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 830 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */ 831 "16", /* function_align. */ 832 "4", /* jump_align. */ 833 "8", /* loop_align. */ 834 2, /* int_reassoc_width. */ 835 4, /* fp_reassoc_width. */ 836 1, /* vec_reassoc_width. */ 837 2, /* min_div_recip_mul_sf. */ 838 2, /* min_div_recip_mul_df. */ 839 0, /* max_case_values. */ 840 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 841 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 842 &generic_prefetch_tune 843 }; 844 845 846 847 static const struct tune_params exynosm1_tunings = 848 { 849 &exynosm1_extra_costs, 850 &exynosm1_addrcost_table, 851 &exynosm1_regmove_cost, 852 &exynosm1_vector_cost, 853 &generic_branch_cost, 854 &exynosm1_approx_modes, 855 SVE_NOT_IMPLEMENTED, /* sve_width */ 856 4, /* memmov_cost */ 857 3, /* issue_rate */ 858 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */ 859 "4", /* function_align. */ 860 "4", /* jump_align. */ 861 "4", /* loop_align. */ 862 2, /* int_reassoc_width. */ 863 4, /* fp_reassoc_width. */ 864 1, /* vec_reassoc_width. */ 865 2, /* min_div_recip_mul_sf. */ 866 2, /* min_div_recip_mul_df. */ 867 48, /* max_case_values. */ 868 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 869 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 870 &exynosm1_prefetch_tune 871 }; 872 873 static const struct tune_params thunderxt88_tunings = 874 { 875 &thunderx_extra_costs, 876 &generic_addrcost_table, 877 &thunderx_regmove_cost, 878 &thunderx_vector_cost, 879 &generic_branch_cost, 880 &generic_approx_modes, 881 SVE_NOT_IMPLEMENTED, /* sve_width */ 882 6, /* memmov_cost */ 883 2, /* issue_rate */ 884 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */ 885 "8", /* function_align. */ 886 "8", /* jump_align. */ 887 "8", /* loop_align. */ 888 2, /* int_reassoc_width. */ 889 4, /* fp_reassoc_width. */ 890 1, /* vec_reassoc_width. */ 891 2, /* min_div_recip_mul_sf. */ 892 2, /* min_div_recip_mul_df. */ 893 0, /* max_case_values. */ 894 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. 
*/ 895 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */ 896 &thunderxt88_prefetch_tune 897 }; 898 899 static const struct tune_params thunderx_tunings = 900 { 901 &thunderx_extra_costs, 902 &generic_addrcost_table, 903 &thunderx_regmove_cost, 904 &thunderx_vector_cost, 905 &generic_branch_cost, 906 &generic_approx_modes, 907 SVE_NOT_IMPLEMENTED, /* sve_width */ 908 6, /* memmov_cost */ 909 2, /* issue_rate */ 910 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */ 911 "8", /* function_align. */ 912 "8", /* jump_align. */ 913 "8", /* loop_align. */ 914 2, /* int_reassoc_width. */ 915 4, /* fp_reassoc_width. */ 916 1, /* vec_reassoc_width. */ 917 2, /* min_div_recip_mul_sf. */ 918 2, /* min_div_recip_mul_df. */ 919 0, /* max_case_values. */ 920 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ 921 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW 922 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */ 923 &thunderx_prefetch_tune 924 }; 925 926 static const struct tune_params tsv110_tunings = 927 { 928 &tsv110_extra_costs, 929 &tsv110_addrcost_table, 930 &tsv110_regmove_cost, 931 &tsv110_vector_cost, 932 &generic_branch_cost, 933 &generic_approx_modes, 934 SVE_NOT_IMPLEMENTED, /* sve_width */ 935 4, /* memmov_cost */ 936 4, /* issue_rate */ 937 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH 938 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */ 939 "16", /* function_align. */ 940 "4", /* jump_align. */ 941 "8", /* loop_align. */ 942 2, /* int_reassoc_width. */ 943 4, /* fp_reassoc_width. */ 944 1, /* vec_reassoc_width. */ 945 2, /* min_div_recip_mul_sf. */ 946 2, /* min_div_recip_mul_df. */ 947 0, /* max_case_values. */ 948 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 949 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 950 &tsv110_prefetch_tune 951 }; 952 953 static const struct tune_params xgene1_tunings = 954 { 955 &xgene1_extra_costs, 956 &xgene1_addrcost_table, 957 &xgene1_regmove_cost, 958 &xgene1_vector_cost, 959 &generic_branch_cost, 960 &xgene1_approx_modes, 961 SVE_NOT_IMPLEMENTED, /* sve_width */ 962 6, /* memmov_cost */ 963 4, /* issue_rate */ 964 AARCH64_FUSE_NOTHING, /* fusible_ops */ 965 "16", /* function_align. */ 966 "16", /* jump_align. */ 967 "16", /* loop_align. */ 968 2, /* int_reassoc_width. */ 969 4, /* fp_reassoc_width. */ 970 1, /* vec_reassoc_width. */ 971 2, /* min_div_recip_mul_sf. */ 972 2, /* min_div_recip_mul_df. */ 973 17, /* max_case_values. */ 974 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ 975 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */ 976 &xgene1_prefetch_tune 977 }; 978 979 static const struct tune_params emag_tunings = 980 { 981 &xgene1_extra_costs, 982 &xgene1_addrcost_table, 983 &xgene1_regmove_cost, 984 &xgene1_vector_cost, 985 &generic_branch_cost, 986 &xgene1_approx_modes, 987 SVE_NOT_IMPLEMENTED, 988 6, /* memmov_cost */ 989 4, /* issue_rate */ 990 AARCH64_FUSE_NOTHING, /* fusible_ops */ 991 "16", /* function_align. */ 992 "16", /* jump_align. */ 993 "16", /* loop_align. */ 994 2, /* int_reassoc_width. */ 995 4, /* fp_reassoc_width. */ 996 1, /* vec_reassoc_width. */ 997 2, /* min_div_recip_mul_sf. */ 998 2, /* min_div_recip_mul_df. */ 999 17, /* max_case_values. */ 1000 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */ 1001 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. 
*/ 1002 &xgene1_prefetch_tune 1003 }; 1004 1005 static const struct tune_params qdf24xx_tunings = 1006 { 1007 &qdf24xx_extra_costs, 1008 &qdf24xx_addrcost_table, 1009 &qdf24xx_regmove_cost, 1010 &qdf24xx_vector_cost, 1011 &generic_branch_cost, 1012 &generic_approx_modes, 1013 SVE_NOT_IMPLEMENTED, /* sve_width */ 1014 4, /* memmov_cost */ 1015 4, /* issue_rate */ 1016 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 1017 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */ 1018 "16", /* function_align. */ 1019 "8", /* jump_align. */ 1020 "16", /* loop_align. */ 1021 2, /* int_reassoc_width. */ 1022 4, /* fp_reassoc_width. */ 1023 1, /* vec_reassoc_width. */ 1024 2, /* min_div_recip_mul_sf. */ 1025 2, /* min_div_recip_mul_df. */ 1026 0, /* max_case_values. */ 1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1028 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */ 1029 &qdf24xx_prefetch_tune 1030 }; 1031 1032 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values 1033 for now. */ 1034 static const struct tune_params saphira_tunings = 1035 { 1036 &generic_extra_costs, 1037 &generic_addrcost_table, 1038 &generic_regmove_cost, 1039 &generic_vector_cost, 1040 &generic_branch_cost, 1041 &generic_approx_modes, 1042 SVE_NOT_IMPLEMENTED, /* sve_width */ 1043 4, /* memmov_cost */ 1044 4, /* issue_rate */ 1045 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD 1046 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */ 1047 "16", /* function_align. */ 1048 "8", /* jump_align. */ 1049 "16", /* loop_align. */ 1050 2, /* int_reassoc_width. */ 1051 4, /* fp_reassoc_width. */ 1052 1, /* vec_reassoc_width. */ 1053 2, /* min_div_recip_mul_sf. */ 1054 2, /* min_div_recip_mul_df. */ 1055 0, /* max_case_values. */ 1056 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1057 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1058 &generic_prefetch_tune 1059 }; 1060 1061 static const struct tune_params thunderx2t99_tunings = 1062 { 1063 &thunderx2t99_extra_costs, 1064 &thunderx2t99_addrcost_table, 1065 &thunderx2t99_regmove_cost, 1066 &thunderx2t99_vector_cost, 1067 &generic_branch_cost, 1068 &generic_approx_modes, 1069 SVE_NOT_IMPLEMENTED, /* sve_width */ 1070 4, /* memmov_cost. */ 1071 4, /* issue_rate. */ 1072 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC 1073 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */ 1074 "16", /* function_align. */ 1075 "8", /* jump_align. */ 1076 "16", /* loop_align. */ 1077 3, /* int_reassoc_width. */ 1078 2, /* fp_reassoc_width. */ 1079 2, /* vec_reassoc_width. */ 1080 2, /* min_div_recip_mul_sf. */ 1081 2, /* min_div_recip_mul_df. */ 1082 0, /* max_case_values. */ 1083 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */ 1084 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1085 &thunderx2t99_prefetch_tune 1086 }; 1087 1088 static const struct tune_params neoversen1_tunings = 1089 { 1090 &cortexa57_extra_costs, 1091 &generic_addrcost_table, 1092 &generic_regmove_cost, 1093 &cortexa57_vector_cost, 1094 &generic_branch_cost, 1095 &generic_approx_modes, 1096 SVE_NOT_IMPLEMENTED, /* sve_width */ 1097 4, /* memmov_cost */ 1098 3, /* issue_rate */ 1099 AARCH64_FUSE_AES_AESMC, /* fusible_ops */ 1100 "32:16", /* function_align. */ 1101 "32:16", /* jump_align. */ 1102 "32:16", /* loop_align. */ 1103 2, /* int_reassoc_width. */ 1104 4, /* fp_reassoc_width. */ 1105 2, /* vec_reassoc_width. */ 1106 2, /* min_div_recip_mul_sf. */ 1107 2, /* min_div_recip_mul_df. */ 1108 0, /* max_case_values. */ 1109 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. 
*/ 1110 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */ 1111 &generic_prefetch_tune 1112 }; 1113 1114 /* Support for fine-grained override of the tuning structures. */ 1115 struct aarch64_tuning_override_function 1116 { 1117 const char* name; 1118 void (*parse_override)(const char*, struct tune_params*); 1119 }; 1120 1121 static void aarch64_parse_fuse_string (const char*, struct tune_params*); 1122 static void aarch64_parse_tune_string (const char*, struct tune_params*); 1123 static void aarch64_parse_sve_width_string (const char*, struct tune_params*); 1124 1125 static const struct aarch64_tuning_override_function 1126 aarch64_tuning_override_functions[] = 1127 { 1128 { "fuse", aarch64_parse_fuse_string }, 1129 { "tune", aarch64_parse_tune_string }, 1130 { "sve_width", aarch64_parse_sve_width_string }, 1131 { NULL, NULL } 1132 }; 1133 1134 /* A processor implementing AArch64. */ 1135 struct processor 1136 { 1137 const char *const name; 1138 enum aarch64_processor ident; 1139 enum aarch64_processor sched_core; 1140 enum aarch64_arch arch; 1141 unsigned architecture_version; 1142 const unsigned long flags; 1143 const struct tune_params *const tune; 1144 }; 1145 1146 /* Architectures implementing AArch64. */ 1147 static const struct processor all_architectures[] = 1148 { 1149 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \ 1150 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL}, 1151 #include "aarch64-arches.def" 1152 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL} 1153 }; 1154 1155 /* Processor cores implementing AArch64. */ 1156 static const struct processor all_cores[] = 1157 { 1158 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \ 1159 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \ 1160 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \ 1161 FLAGS, &COSTS##_tunings}, 1162 #include "aarch64-cores.def" 1163 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8, 1164 AARCH64_FL_FOR_ARCH8, &generic_tunings}, 1165 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL} 1166 }; 1167 1168 1169 /* Target specification. These are populated by the -march, -mtune, -mcpu 1170 handling code or by target attributes. */ 1171 static const struct processor *selected_arch; 1172 static const struct processor *selected_cpu; 1173 static const struct processor *selected_tune; 1174 1175 /* The current tuning set. */ 1176 struct tune_params aarch64_tune_params = generic_tunings; 1177 1178 /* Table of machine attributes. */ 1179 static const struct attribute_spec aarch64_attribute_table[] = 1180 { 1181 /* { name, min_len, max_len, decl_req, type_req, fn_type_req, 1182 affects_type_identity, handler, exclude } */ 1183 { "aarch64_vector_pcs", 0, 0, false, true, true, true, NULL, NULL }, 1184 { NULL, 0, 0, false, false, false, false, NULL, NULL } 1185 }; 1186 1187 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0) 1188 1189 /* An ISA extension in the co-processor and main instruction set space. 
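   FLAGS_ON holds the feature bits that the extension enables and FLAGS_OFF
   the bits removed by its "no" form (for example "+crypto" versus
   "+nocrypto").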
*/

struct aarch64_option_extension
{
  const char *const name;
  const unsigned long flags_on;
  const unsigned long flags_off;
};

typedef enum aarch64_cond_code
{
  AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
  AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
  AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
}
aarch64_cc;

#define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))

struct aarch64_branch_protect_type
{
  /* The type's name that the user passes to the branch-protection option
     string.  */
  const char* name;
  /* Function to handle the protection type and set global variables.
     First argument is the string token corresponding with this type and the
     second argument is the next token in the option string.
     Return values:
     * AARCH64_PARSE_OK: Handling was successful.
     * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the
       caller should print an error.
     * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler
       prints its own error.  */
  enum aarch64_parse_opt_result (*handler)(char*, char*);
  /* A list of types that can follow this type in the option string.  */
  const aarch64_branch_protect_type* subtypes;
  unsigned int num_subtypes;
};

static enum aarch64_parse_opt_result
aarch64_handle_no_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
  aarch64_enable_bti = 0;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_standard_branch_protection (char* str, char* rest)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  aarch64_enable_bti = 1;
  if (rest)
    {
      error ("unexpected %<%s%> after %<%s%>", rest, str);
      return AARCH64_PARSE_INVALID_FEATURE;
    }
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
                                   char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
                             char* rest ATTRIBUTE_UNUSED)
{
  aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
  return AARCH64_PARSE_OK;
}

static enum aarch64_parse_opt_result
aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
                               char* rest ATTRIBUTE_UNUSED)
{
  aarch64_enable_bti = 1;
  return AARCH64_PARSE_OK;
}

static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
  { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
  { "none", aarch64_handle_no_branch_protection, NULL, 0 },
  { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
  { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
    ARRAY_SIZE (aarch64_pac_ret_subtypes) },
  { "bti", aarch64_handle_bti_protection, NULL, 0 },
  { NULL, NULL, NULL, 0 }
};

/* The condition codes of the processor, and the inverse function.
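   The table below is indexed by aarch64_cond_code;
   AARCH64_INVERSE_CONDITION_CODE simply flips the low bit, which works
   because each code is stored next to its inverse (EQ/NE, CS/CC, MI/PL and
   so on).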
*/ 1292 static const char * const aarch64_condition_codes[] = 1293 { 1294 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc", 1295 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv" 1296 }; 1297 1298 /* Generate code to enable conditional branches in functions over 1 MiB. */ 1299 const char * 1300 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest, 1301 const char * branch_format) 1302 { 1303 rtx_code_label * tmp_label = gen_label_rtx (); 1304 char label_buf[256]; 1305 char buffer[128]; 1306 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest, 1307 CODE_LABEL_NUMBER (tmp_label)); 1308 const char *label_ptr = targetm.strip_name_encoding (label_buf); 1309 rtx dest_label = operands[pos_label]; 1310 operands[pos_label] = tmp_label; 1311 1312 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr); 1313 output_asm_insn (buffer, operands); 1314 1315 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr); 1316 operands[pos_label] = dest_label; 1317 output_asm_insn (buffer, operands); 1318 return ""; 1319 } 1320 1321 void 1322 aarch64_err_no_fpadvsimd (machine_mode mode) 1323 { 1324 if (TARGET_GENERAL_REGS_ONLY) 1325 if (FLOAT_MODE_P (mode)) 1326 error ("%qs is incompatible with the use of floating-point types", 1327 "-mgeneral-regs-only"); 1328 else 1329 error ("%qs is incompatible with the use of vector types", 1330 "-mgeneral-regs-only"); 1331 else 1332 if (FLOAT_MODE_P (mode)) 1333 error ("%qs feature modifier is incompatible with the use of" 1334 " floating-point types", "+nofp"); 1335 else 1336 error ("%qs feature modifier is incompatible with the use of" 1337 " vector types", "+nofp"); 1338 } 1339 1340 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS. 1341 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and 1342 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much 1343 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS 1344 and GENERAL_REGS is lower than the memory cost (in this case the best class 1345 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its 1346 cost results in bad allocations with many redundant int<->FP moves which 1347 are expensive on various cores. 1348 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but 1349 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class 1350 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't 1351 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode. 1352 The result of this is that it is no longer inefficient to have a higher 1353 memory move cost than the register move cost. 1354 */ 1355 1356 static reg_class_t 1357 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class, 1358 reg_class_t best_class) 1359 { 1360 machine_mode mode; 1361 1362 if (!reg_class_subset_p (GENERAL_REGS, allocno_class) 1363 || !reg_class_subset_p (FP_REGS, allocno_class)) 1364 return allocno_class; 1365 1366 if (!reg_class_subset_p (GENERAL_REGS, best_class) 1367 || !reg_class_subset_p (FP_REGS, best_class)) 1368 return best_class; 1369 1370 mode = PSEUDO_REGNO_MODE (regno); 1371 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? 
FP_REGS : GENERAL_REGS; 1372 } 1373 1374 static unsigned int 1375 aarch64_min_divisions_for_recip_mul (machine_mode mode) 1376 { 1377 if (GET_MODE_UNIT_SIZE (mode) == 4) 1378 return aarch64_tune_params.min_div_recip_mul_sf; 1379 return aarch64_tune_params.min_div_recip_mul_df; 1380 } 1381 1382 /* Return the reassociation width of treeop OPC with mode MODE. */ 1383 static int 1384 aarch64_reassociation_width (unsigned opc, machine_mode mode) 1385 { 1386 if (VECTOR_MODE_P (mode)) 1387 return aarch64_tune_params.vec_reassoc_width; 1388 if (INTEGRAL_MODE_P (mode)) 1389 return aarch64_tune_params.int_reassoc_width; 1390 /* Avoid reassociating floating point addition so we emit more FMAs. */ 1391 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR) 1392 return aarch64_tune_params.fp_reassoc_width; 1393 return 1; 1394 } 1395 1396 /* Provide a mapping from gcc register numbers to dwarf register numbers. */ 1397 unsigned 1398 aarch64_dbx_register_number (unsigned regno) 1399 { 1400 if (GP_REGNUM_P (regno)) 1401 return AARCH64_DWARF_R0 + regno - R0_REGNUM; 1402 else if (regno == SP_REGNUM) 1403 return AARCH64_DWARF_SP; 1404 else if (FP_REGNUM_P (regno)) 1405 return AARCH64_DWARF_V0 + regno - V0_REGNUM; 1406 else if (PR_REGNUM_P (regno)) 1407 return AARCH64_DWARF_P0 + regno - P0_REGNUM; 1408 else if (regno == VG_REGNUM) 1409 return AARCH64_DWARF_VG; 1410 1411 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no 1412 equivalent DWARF register. */ 1413 return DWARF_FRAME_REGISTERS; 1414 } 1415 1416 /* Return true if MODE is any of the Advanced SIMD structure modes. */ 1417 static bool 1418 aarch64_advsimd_struct_mode_p (machine_mode mode) 1419 { 1420 return (TARGET_SIMD 1421 && (mode == OImode || mode == CImode || mode == XImode)); 1422 } 1423 1424 /* Return true if MODE is an SVE predicate mode. */ 1425 static bool 1426 aarch64_sve_pred_mode_p (machine_mode mode) 1427 { 1428 return (TARGET_SVE 1429 && (mode == VNx16BImode 1430 || mode == VNx8BImode 1431 || mode == VNx4BImode 1432 || mode == VNx2BImode)); 1433 } 1434 1435 /* Three mutually-exclusive flags describing a vector or predicate type. */ 1436 const unsigned int VEC_ADVSIMD = 1; 1437 const unsigned int VEC_SVE_DATA = 2; 1438 const unsigned int VEC_SVE_PRED = 4; 1439 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate 1440 a structure of 2, 3 or 4 vectors. */ 1441 const unsigned int VEC_STRUCT = 8; 1442 /* Useful combinations of the above. */ 1443 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED; 1444 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA; 1445 1446 /* Return a set of flags describing the vector properties of mode MODE. 1447 Ignore modes that are not supported by the current target. 
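   For example, OImode classifies as VEC_ADVSIMD | VEC_STRUCT, VNx4BImode as
   VEC_SVE_PRED and VNx16QImode as VEC_SVE_DATA (assuming the corresponding
   target features are enabled), while modes the target cannot handle
   classify as 0.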
*/ 1448 static unsigned int 1449 aarch64_classify_vector_mode (machine_mode mode) 1450 { 1451 if (aarch64_advsimd_struct_mode_p (mode)) 1452 return VEC_ADVSIMD | VEC_STRUCT; 1453 1454 if (aarch64_sve_pred_mode_p (mode)) 1455 return VEC_SVE_PRED; 1456 1457 scalar_mode inner = GET_MODE_INNER (mode); 1458 if (VECTOR_MODE_P (mode) 1459 && (inner == QImode 1460 || inner == HImode 1461 || inner == HFmode 1462 || inner == SImode 1463 || inner == SFmode 1464 || inner == DImode 1465 || inner == DFmode)) 1466 { 1467 if (TARGET_SVE) 1468 { 1469 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR)) 1470 return VEC_SVE_DATA; 1471 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2) 1472 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3) 1473 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4)) 1474 return VEC_SVE_DATA | VEC_STRUCT; 1475 } 1476 1477 /* This includes V1DF but not V1DI (which doesn't exist). */ 1478 if (TARGET_SIMD 1479 && (known_eq (GET_MODE_BITSIZE (mode), 64) 1480 || known_eq (GET_MODE_BITSIZE (mode), 128))) 1481 return VEC_ADVSIMD; 1482 } 1483 1484 return 0; 1485 } 1486 1487 /* Return true if MODE is any of the data vector modes, including 1488 structure modes. */ 1489 static bool 1490 aarch64_vector_data_mode_p (machine_mode mode) 1491 { 1492 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA; 1493 } 1494 1495 /* Return true if MODE is an SVE data vector mode; either a single vector 1496 or a structure of vectors. */ 1497 static bool 1498 aarch64_sve_data_mode_p (machine_mode mode) 1499 { 1500 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA; 1501 } 1502 1503 /* Implement target hook TARGET_ARRAY_MODE. */ 1504 static opt_machine_mode 1505 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems) 1506 { 1507 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA 1508 && IN_RANGE (nelems, 2, 4)) 1509 return mode_for_vector (GET_MODE_INNER (mode), 1510 GET_MODE_NUNITS (mode) * nelems); 1511 1512 return opt_machine_mode (); 1513 } 1514 1515 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */ 1516 static bool 1517 aarch64_array_mode_supported_p (machine_mode mode, 1518 unsigned HOST_WIDE_INT nelems) 1519 { 1520 if (TARGET_SIMD 1521 && (AARCH64_VALID_SIMD_QREG_MODE (mode) 1522 || AARCH64_VALID_SIMD_DREG_MODE (mode)) 1523 && (nelems >= 2 && nelems <= 4)) 1524 return true; 1525 1526 return false; 1527 } 1528 1529 /* Return the SVE predicate mode to use for elements that have 1530 ELEM_NBYTES bytes, if such a mode exists. */ 1531 1532 opt_machine_mode 1533 aarch64_sve_pred_mode (unsigned int elem_nbytes) 1534 { 1535 if (TARGET_SVE) 1536 { 1537 if (elem_nbytes == 1) 1538 return VNx16BImode; 1539 if (elem_nbytes == 2) 1540 return VNx8BImode; 1541 if (elem_nbytes == 4) 1542 return VNx4BImode; 1543 if (elem_nbytes == 8) 1544 return VNx2BImode; 1545 } 1546 return opt_machine_mode (); 1547 } 1548 1549 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */ 1550 1551 static opt_machine_mode 1552 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes) 1553 { 1554 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR)) 1555 { 1556 unsigned int elem_nbytes = vector_element_size (nbytes, nunits); 1557 machine_mode pred_mode; 1558 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode)) 1559 return pred_mode; 1560 } 1561 1562 return default_get_mask_mode (nunits, nbytes); 1563 } 1564 1565 /* Implement TARGET_PREFERRED_ELSE_VALUE. 
For binary operations, 1566 prefer to use the first arithmetic operand as the else value if 1567 the else value doesn't matter, since that exactly matches the SVE 1568 destructive merging form. For ternary operations we could either 1569 pick the first operand and use FMAD-like instructions or the last 1570 operand and use FMLA-like instructions; the latter seems more 1571 natural. */ 1572 1573 static tree 1574 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops) 1575 { 1576 return nops == 3 ? ops[2] : ops[0]; 1577 } 1578 1579 /* Implement TARGET_HARD_REGNO_NREGS. */ 1580 1581 static unsigned int 1582 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode) 1583 { 1584 /* ??? Logically we should only need to provide a value when 1585 HARD_REGNO_MODE_OK says that the combination is valid, 1586 but at the moment we need to handle all modes. Just ignore 1587 any runtime parts for registers that can't store them. */ 1588 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode)); 1589 switch (aarch64_regno_regclass (regno)) 1590 { 1591 case FP_REGS: 1592 case FP_LO_REGS: 1593 if (aarch64_sve_data_mode_p (mode)) 1594 return exact_div (GET_MODE_SIZE (mode), 1595 BYTES_PER_SVE_VECTOR).to_constant (); 1596 return CEIL (lowest_size, UNITS_PER_VREG); 1597 case PR_REGS: 1598 case PR_LO_REGS: 1599 case PR_HI_REGS: 1600 return 1; 1601 default: 1602 return CEIL (lowest_size, UNITS_PER_WORD); 1603 } 1604 gcc_unreachable (); 1605 } 1606 1607 /* Implement TARGET_HARD_REGNO_MODE_OK. */ 1608 1609 static bool 1610 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode) 1611 { 1612 if (GET_MODE_CLASS (mode) == MODE_CC) 1613 return regno == CC_REGNUM; 1614 1615 if (regno == VG_REGNUM) 1616 /* This must have the same size as _Unwind_Word. */ 1617 return mode == DImode; 1618 1619 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 1620 if (vec_flags & VEC_SVE_PRED) 1621 return PR_REGNUM_P (regno); 1622 1623 if (PR_REGNUM_P (regno)) 1624 return 0; 1625 1626 if (regno == SP_REGNUM) 1627 /* The purpose of comparing with ptr_mode is to support the 1628 global register variable associated with the stack pointer 1629 register via the syntax of asm ("wsp") in ILP32. */ 1630 return mode == Pmode || mode == ptr_mode; 1631 1632 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM) 1633 return mode == Pmode; 1634 1635 if (GP_REGNUM_P (regno)) 1636 { 1637 if (known_le (GET_MODE_SIZE (mode), 8)) 1638 return true; 1639 else if (known_le (GET_MODE_SIZE (mode), 16)) 1640 return (regno & 1) == 0; 1641 } 1642 else if (FP_REGNUM_P (regno)) 1643 { 1644 if (vec_flags & VEC_STRUCT) 1645 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM; 1646 else 1647 return !VECTOR_MODE_P (mode) || vec_flags != 0; 1648 } 1649 1650 return false; 1651 } 1652 1653 /* Return true if this is a definition of a vectorized simd function. */ 1654 1655 static bool 1656 aarch64_simd_decl_p (tree fndecl) 1657 { 1658 tree fntype; 1659 1660 if (fndecl == NULL) 1661 return false; 1662 fntype = TREE_TYPE (fndecl); 1663 if (fntype == NULL) 1664 return false; 1665 1666 /* Functions with the aarch64_vector_pcs attribute use the simd ABI. */ 1667 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)) != NULL) 1668 return true; 1669 1670 return false; 1671 } 1672 1673 /* Return the mode a register save/restore should use. 
DImode for integer 1674 registers, DFmode for FP registers in non-SIMD functions (they only save 1675 the bottom half of a 128 bit register), or TFmode for FP registers in 1676 SIMD functions. */ 1677 1678 static machine_mode 1679 aarch64_reg_save_mode (tree fndecl, unsigned regno) 1680 { 1681 return GP_REGNUM_P (regno) 1682 ? E_DImode 1683 : (aarch64_simd_decl_p (fndecl) ? E_TFmode : E_DFmode); 1684 } 1685 1686 /* Return true if the instruction is a call to a SIMD function, false 1687 if it is not a SIMD function or if we do not know anything about 1688 the function. */ 1689 1690 static bool 1691 aarch64_simd_call_p (rtx_insn *insn) 1692 { 1693 rtx symbol; 1694 rtx call; 1695 tree fndecl; 1696 1697 gcc_assert (CALL_P (insn)); 1698 call = get_call_rtx_from (insn); 1699 symbol = XEXP (XEXP (call, 0), 0); 1700 if (GET_CODE (symbol) != SYMBOL_REF) 1701 return false; 1702 fndecl = SYMBOL_REF_DECL (symbol); 1703 if (!fndecl) 1704 return false; 1705 1706 return aarch64_simd_decl_p (fndecl); 1707 } 1708 1709 /* Implement TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS. If INSN calls 1710 a function that uses the SIMD ABI, take advantage of the extra 1711 call-preserved registers that the ABI provides. */ 1712 1713 void 1714 aarch64_remove_extra_call_preserved_regs (rtx_insn *insn, 1715 HARD_REG_SET *return_set) 1716 { 1717 if (aarch64_simd_call_p (insn)) 1718 { 1719 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++) 1720 if (FP_SIMD_SAVED_REGNUM_P (regno)) 1721 CLEAR_HARD_REG_BIT (*return_set, regno); 1722 } 1723 } 1724 1725 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves 1726 the lower 64 bits of a 128-bit register. Tell the compiler the callee 1727 clobbers the top 64 bits when restoring the bottom 64 bits. */ 1728 1729 static bool 1730 aarch64_hard_regno_call_part_clobbered (rtx_insn *insn, unsigned int regno, 1731 machine_mode mode) 1732 { 1733 bool simd_p = insn && CALL_P (insn) && aarch64_simd_call_p (insn); 1734 return FP_REGNUM_P (regno) 1735 && maybe_gt (GET_MODE_SIZE (mode), simd_p ? 16 : 8); 1736 } 1737 1738 /* Implement TARGET_RETURN_CALL_WITH_MAX_CLOBBERS. */ 1739 1740 rtx_insn * 1741 aarch64_return_call_with_max_clobbers (rtx_insn *call_1, rtx_insn *call_2) 1742 { 1743 gcc_assert (CALL_P (call_1) && CALL_P (call_2)); 1744 1745 if (!aarch64_simd_call_p (call_1) || aarch64_simd_call_p (call_2)) 1746 return call_1; 1747 else 1748 return call_2; 1749 } 1750 1751 /* Implement REGMODE_NATURAL_SIZE. */ 1752 poly_uint64 1753 aarch64_regmode_natural_size (machine_mode mode) 1754 { 1755 /* The natural size for SVE data modes is one SVE data vector, 1756 and similarly for predicates. We can't independently modify 1757 anything smaller than that. */ 1758 /* ??? For now, only do this for variable-width SVE registers. 1759 Doing it for constant-sized registers breaks lower-subreg.c. */ 1760 /* ??? And once that's fixed, we should probably have similar 1761 code for Advanced SIMD. */ 1762 if (!aarch64_sve_vg.is_constant ()) 1763 { 1764 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 1765 if (vec_flags & VEC_SVE_PRED) 1766 return BYTES_PER_SVE_PRED; 1767 if (vec_flags & VEC_SVE_DATA) 1768 return BYTES_PER_SVE_VECTOR; 1769 } 1770 return UNITS_PER_WORD; 1771 } 1772 1773 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */ 1774 machine_mode 1775 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned, 1776 machine_mode mode) 1777 { 1778 /* The predicate mode determines which bits are significant and 1779 which are "don't care". 
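   (Each predicate bit corresponds to one byte of an SVE data vector, so the
   mode fixes how many of those bits actually matter.)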
   Decreasing the number of lanes would lose data while increasing the
   number of lanes would make bits unnecessarily significant.  */
  if (PR_REGNUM_P (regno))
    return mode;
  if (known_ge (GET_MODE_SIZE (mode), 4))
    return mode;
  else
    return SImode;
}

/* Return true if I's bits are consecutive ones from the MSB.  */
bool
aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
{
  return exact_log2 (-i) != HOST_WIDE_INT_M1;
}

/* Implement TARGET_CONSTANT_ALIGNMENT.  Make strings word-aligned so
   that strcpy from constants will be faster.  */

static HOST_WIDE_INT
aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
{
  if (TREE_CODE (exp) == STRING_CST && !optimize_size)
    return MAX (align, BITS_PER_WORD);
  return align;
}

/* Return true if calls to DECL should be treated as
   long-calls (i.e. called via a register).  */
static bool
aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
{
  return false;
}

/* Return true if calls to symbol-ref SYM should be treated as
   long-calls (i.e. called via a register).  */
bool
aarch64_is_long_call_p (rtx sym)
{
  return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
}

/* Return true if calls to symbol-ref SYM should not go through
   PLT stubs.  */

bool
aarch64_is_noplt_call_p (rtx sym)
{
  const_tree decl = SYMBOL_REF_DECL (sym);

  if (flag_pic
      && decl
      && (!flag_plt
          || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
      && !targetm.binds_local_p (decl))
    return true;

  return false;
}

/* Return true if the offsets to a zero/sign-extract operation
   represent an expression that matches an extend operation.  The
   operands represent the parameters from

     (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
bool
aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
                                rtx extract_imm)
{
  HOST_WIDE_INT mult_val, extract_val;

  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
    return false;

  mult_val = INTVAL (mult_imm);
  extract_val = INTVAL (extract_imm);

  if (extract_val > 8
      && extract_val < GET_MODE_BITSIZE (mode)
      && exact_log2 (extract_val & ~7) > 0
      && (extract_val & 7) <= 4
      && mult_val == (1 << (extract_val & 7)))
    return true;

  return false;
}

/* Emit an insn that's a simple single-set.  Both the operands must be
   known to be valid.  */
inline static rtx_insn *
emit_set_insn (rtx x, rtx y)
{
  return emit_insn (gen_rtx_SET (x, y));
}

/* X and Y are two things to compare using CODE.  Emit the compare insn and
   return the rtx for the CC register in the proper mode.  */
rtx
aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
{
  machine_mode mode = SELECT_CC_MODE (code, x, y);
  rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);

  emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
  return cc_reg;
}

/* Similarly, but maybe zero-extend Y if Y_MODE < SImode.
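   For a QImode or HImode Y we either mask a constant down to Y_MODE or emit
   a swapped comparison of (zero_extend:SI Y) against X in CC_SWPmode, so that
   stray upper bits in Y cannot affect the result.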
*/ 1890 1891 static rtx 1892 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y, 1893 machine_mode y_mode) 1894 { 1895 if (y_mode == E_QImode || y_mode == E_HImode) 1896 { 1897 if (CONST_INT_P (y)) 1898 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode)); 1899 else 1900 { 1901 rtx t, cc_reg; 1902 machine_mode cc_mode; 1903 1904 t = gen_rtx_ZERO_EXTEND (SImode, y); 1905 t = gen_rtx_COMPARE (CC_SWPmode, t, x); 1906 cc_mode = CC_SWPmode; 1907 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM); 1908 emit_set_insn (cc_reg, t); 1909 return cc_reg; 1910 } 1911 } 1912 1913 if (!aarch64_plus_operand (y, y_mode)) 1914 y = force_reg (y_mode, y); 1915 1916 return aarch64_gen_compare_reg (code, x, y); 1917 } 1918 1919 /* Build the SYMBOL_REF for __tls_get_addr. */ 1920 1921 static GTY(()) rtx tls_get_addr_libfunc; 1922 1923 rtx 1924 aarch64_tls_get_addr (void) 1925 { 1926 if (!tls_get_addr_libfunc) 1927 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr"); 1928 return tls_get_addr_libfunc; 1929 } 1930 1931 /* Return the TLS model to use for ADDR. */ 1932 1933 static enum tls_model 1934 tls_symbolic_operand_type (rtx addr) 1935 { 1936 enum tls_model tls_kind = TLS_MODEL_NONE; 1937 if (GET_CODE (addr) == CONST) 1938 { 1939 poly_int64 addend; 1940 rtx sym = strip_offset (addr, &addend); 1941 if (GET_CODE (sym) == SYMBOL_REF) 1942 tls_kind = SYMBOL_REF_TLS_MODEL (sym); 1943 } 1944 else if (GET_CODE (addr) == SYMBOL_REF) 1945 tls_kind = SYMBOL_REF_TLS_MODEL (addr); 1946 1947 return tls_kind; 1948 } 1949 1950 /* We'll allow lo_sum's in addresses in our legitimate addresses 1951 so that combine would take care of combining addresses where 1952 necessary, but for generation purposes, we'll generate the address 1953 as : 1954 RTL Absolute 1955 tmp = hi (symbol_ref); adrp x1, foo 1956 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo 1957 nop 1958 1959 PIC TLS 1960 adrp x1, :got:foo adrp tmp, :tlsgd:foo 1961 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo 1962 bl __tls_get_addr 1963 nop 1964 1965 Load TLS symbol, depending on TLS mechanism and TLS access model. 1966 1967 Global Dynamic - Traditional TLS: 1968 adrp tmp, :tlsgd:imm 1969 add dest, tmp, #:tlsgd_lo12:imm 1970 bl __tls_get_addr 1971 1972 Global Dynamic - TLS Descriptors: 1973 adrp dest, :tlsdesc:imm 1974 ldr tmp, [dest, #:tlsdesc_lo12:imm] 1975 add dest, dest, #:tlsdesc_lo12:imm 1976 blr tmp 1977 mrs tp, tpidr_el0 1978 add dest, dest, tp 1979 1980 Initial Exec: 1981 mrs tp, tpidr_el0 1982 adrp tmp, :gottprel:imm 1983 ldr dest, [tmp, #:gottprel_lo12:imm] 1984 add dest, dest, tp 1985 1986 Local Exec: 1987 mrs tp, tpidr_el0 1988 add t0, tp, #:tprel_hi12:imm, lsl #12 1989 add t0, t0, #:tprel_lo12_nc:imm 1990 */ 1991 1992 static void 1993 aarch64_load_symref_appropriately (rtx dest, rtx imm, 1994 enum aarch64_symbol_type type) 1995 { 1996 switch (type) 1997 { 1998 case SYMBOL_SMALL_ABSOLUTE: 1999 { 2000 /* In ILP32, the mode of dest can be either SImode or DImode. 
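Either way, the address is formed with the absolute-address sequence shown above (ADRP of the page followed by an ADD of the low 12 bits), with TMP_REG holding the intermediate ADRP result.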
*/
	rtx tmp_reg = dest;
	machine_mode mode = GET_MODE (dest);

	gcc_assert (mode == Pmode || mode == ptr_mode);

	if (can_create_pseudo_p ())
	  tmp_reg = gen_reg_rtx (mode);

	emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
	emit_insn (gen_add_losym (dest, tmp_reg, imm));
	return;
      }

    case SYMBOL_TINY_ABSOLUTE:
      emit_insn (gen_rtx_SET (dest, imm));
      return;

    case SYMBOL_SMALL_GOT_28K:
      {
	machine_mode mode = GET_MODE (dest);
	rtx gp_rtx = pic_offset_table_rtx;
	rtx insn;
	rtx mem;

	/* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
	   here before RTL expansion.  Tree IVOPTS generates RTL patterns to
	   compute rtx costs, in which case pic_offset_table_rtx is not
	   initialized.  In that case there is no need to generate the first
	   adrp instruction, as the final cost for global variable access is
	   one instruction.  */
	if (gp_rtx != NULL)
	  {
	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
	       we use the page base as the GOT base, the first page may be
	       wasted; in the worst case only 28K of GOT space is left).

	       The generated instruction sequence for accessing a global
	       variable is:

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]

	       Only one instruction is needed.  But we must initialize
	       pic_offset_table_rtx properly.  We generate the initialization
	       insn for every global access, and allow CSE to remove all
	       redundant initializations.

	       The final instruction sequence will look like the following
	       for multiple global variable accesses:

		 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_

		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
		 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
		 ...  */

	    rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
	    crtl->uses_pic_offset_table = 1;
	    emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));

	    if (mode != GET_MODE (gp_rtx))
	      gp_rtx = gen_lowpart (mode, gp_rtx);

	  }

	if (mode == ptr_mode)
	  {
	    if (mode == DImode)
	      insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
	    else
	      insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);

	    mem = XVECEXP (SET_SRC (insn), 0, 0);
	  }
	else
	  {
	    gcc_assert (mode == Pmode);

	    insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
	    mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
	  }

	/* The operand is expected to be a MEM.  Whenever the related insn
	   pattern is changed, the above code which calculates MEM should be
	   updated.  */
	gcc_assert (GET_CODE (mem) == MEM);
	MEM_READONLY_P (mem) = 1;
	MEM_NOTRAP_P (mem) = 1;
	emit_insn (insn);
	return;
      }

    case SYMBOL_SMALL_GOT_4G:
      {
	/* In ILP32, the mode of dest can be either SImode or DImode,
	   while the GOT entry is always of SImode size.  The mode of
	   dest depends on how dest is used: if dest is assigned to a
	   pointer (e.g. in the memory), it has SImode; it may have
	   DImode if dest is dereferenced to access the memory.
	   This is why we have to handle three different ldr_got_small
	   patterns here (two patterns for ILP32).
*/ 2101 2102 rtx insn; 2103 rtx mem; 2104 rtx tmp_reg = dest; 2105 machine_mode mode = GET_MODE (dest); 2106 2107 if (can_create_pseudo_p ()) 2108 tmp_reg = gen_reg_rtx (mode); 2109 2110 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm)); 2111 if (mode == ptr_mode) 2112 { 2113 if (mode == DImode) 2114 insn = gen_ldr_got_small_di (dest, tmp_reg, imm); 2115 else 2116 insn = gen_ldr_got_small_si (dest, tmp_reg, imm); 2117 2118 mem = XVECEXP (SET_SRC (insn), 0, 0); 2119 } 2120 else 2121 { 2122 gcc_assert (mode == Pmode); 2123 2124 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm); 2125 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0); 2126 } 2127 2128 gcc_assert (GET_CODE (mem) == MEM); 2129 MEM_READONLY_P (mem) = 1; 2130 MEM_NOTRAP_P (mem) = 1; 2131 emit_insn (insn); 2132 return; 2133 } 2134 2135 case SYMBOL_SMALL_TLSGD: 2136 { 2137 rtx_insn *insns; 2138 machine_mode mode = GET_MODE (dest); 2139 rtx result = gen_rtx_REG (mode, R0_REGNUM); 2140 2141 start_sequence (); 2142 if (TARGET_ILP32) 2143 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm)); 2144 else 2145 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm)); 2146 insns = get_insns (); 2147 end_sequence (); 2148 2149 RTL_CONST_CALL_P (insns) = 1; 2150 emit_libcall_block (insns, dest, result, imm); 2151 return; 2152 } 2153 2154 case SYMBOL_SMALL_TLSDESC: 2155 { 2156 machine_mode mode = GET_MODE (dest); 2157 rtx x0 = gen_rtx_REG (mode, R0_REGNUM); 2158 rtx tp; 2159 2160 gcc_assert (mode == Pmode || mode == ptr_mode); 2161 2162 /* In ILP32, the got entry is always of SImode size. Unlike 2163 small GOT, the dest is fixed at reg 0. */ 2164 if (TARGET_ILP32) 2165 emit_insn (gen_tlsdesc_small_si (imm)); 2166 else 2167 emit_insn (gen_tlsdesc_small_di (imm)); 2168 tp = aarch64_load_tp (NULL); 2169 2170 if (mode != Pmode) 2171 tp = gen_lowpart (mode, tp); 2172 2173 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0))); 2174 if (REG_P (dest)) 2175 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm); 2176 return; 2177 } 2178 2179 case SYMBOL_SMALL_TLSIE: 2180 { 2181 /* In ILP32, the mode of dest can be either SImode or DImode, 2182 while the got entry is always of SImode size. The mode of 2183 dest depends on how dest is used: if dest is assigned to a 2184 pointer (e.g. in the memory), it has SImode; it may have 2185 DImode if dest is dereferenced to access the memeory. 2186 This is why we have to handle three different tlsie_small 2187 patterns here (two patterns for ILP32). */ 2188 machine_mode mode = GET_MODE (dest); 2189 rtx tmp_reg = gen_reg_rtx (mode); 2190 rtx tp = aarch64_load_tp (NULL); 2191 2192 if (mode == ptr_mode) 2193 { 2194 if (mode == DImode) 2195 emit_insn (gen_tlsie_small_di (tmp_reg, imm)); 2196 else 2197 { 2198 emit_insn (gen_tlsie_small_si (tmp_reg, imm)); 2199 tp = gen_lowpart (mode, tp); 2200 } 2201 } 2202 else 2203 { 2204 gcc_assert (mode == Pmode); 2205 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm)); 2206 } 2207 2208 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg))); 2209 if (REG_P (dest)) 2210 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm); 2211 return; 2212 } 2213 2214 case SYMBOL_TLSLE12: 2215 case SYMBOL_TLSLE24: 2216 case SYMBOL_TLSLE32: 2217 case SYMBOL_TLSLE48: 2218 { 2219 machine_mode mode = GET_MODE (dest); 2220 rtx tp = aarch64_load_tp (NULL); 2221 2222 if (mode != Pmode) 2223 tp = gen_lowpart (mode, tp); 2224 2225 switch (type) 2226 { 2227 case SYMBOL_TLSLE12: 2228 emit_insn ((mode == DImode ? 
gen_tlsle12_di : gen_tlsle12_si) 2229 (dest, tp, imm)); 2230 break; 2231 case SYMBOL_TLSLE24: 2232 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si) 2233 (dest, tp, imm)); 2234 break; 2235 case SYMBOL_TLSLE32: 2236 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si) 2237 (dest, imm)); 2238 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3) 2239 (dest, dest, tp)); 2240 break; 2241 case SYMBOL_TLSLE48: 2242 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si) 2243 (dest, imm)); 2244 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3) 2245 (dest, dest, tp)); 2246 break; 2247 default: 2248 gcc_unreachable (); 2249 } 2250 2251 if (REG_P (dest)) 2252 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm); 2253 return; 2254 } 2255 2256 case SYMBOL_TINY_GOT: 2257 emit_insn (gen_ldr_got_tiny (dest, imm)); 2258 return; 2259 2260 case SYMBOL_TINY_TLSIE: 2261 { 2262 machine_mode mode = GET_MODE (dest); 2263 rtx tp = aarch64_load_tp (NULL); 2264 2265 if (mode == ptr_mode) 2266 { 2267 if (mode == DImode) 2268 emit_insn (gen_tlsie_tiny_di (dest, imm, tp)); 2269 else 2270 { 2271 tp = gen_lowpart (mode, tp); 2272 emit_insn (gen_tlsie_tiny_si (dest, imm, tp)); 2273 } 2274 } 2275 else 2276 { 2277 gcc_assert (mode == Pmode); 2278 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp)); 2279 } 2280 2281 if (REG_P (dest)) 2282 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm); 2283 return; 2284 } 2285 2286 default: 2287 gcc_unreachable (); 2288 } 2289 } 2290 2291 /* Emit a move from SRC to DEST. Assume that the move expanders can 2292 handle all moves if !can_create_pseudo_p (). The distinction is 2293 important because, unlike emit_move_insn, the move expanders know 2294 how to force Pmode objects into the constant pool even when the 2295 constant pool address is not itself legitimate. */ 2296 static rtx 2297 aarch64_emit_move (rtx dest, rtx src) 2298 { 2299 return (can_create_pseudo_p () 2300 ? emit_move_insn (dest, src) 2301 : emit_move_insn_1 (dest, src)); 2302 } 2303 2304 /* Apply UNOPTAB to OP and store the result in DEST. */ 2305 2306 static void 2307 aarch64_emit_unop (rtx dest, optab unoptab, rtx op) 2308 { 2309 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0); 2310 if (dest != tmp) 2311 emit_move_insn (dest, tmp); 2312 } 2313 2314 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */ 2315 2316 static void 2317 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1) 2318 { 2319 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0, 2320 OPTAB_DIRECT); 2321 if (dest != tmp) 2322 emit_move_insn (dest, tmp); 2323 } 2324 2325 /* Split a 128-bit move operation into two 64-bit move operations, 2326 taking care to handle partial overlap of register to register 2327 copies. Special cases are needed when moving between GP regs and 2328 FP regs. SRC can be a register, constant or memory; DST a register 2329 or memory. If either operand is memory it must not have any side 2330 effects. */ 2331 void 2332 aarch64_split_128bit_move (rtx dst, rtx src) 2333 { 2334 rtx dst_lo, dst_hi; 2335 rtx src_lo, src_hi; 2336 2337 machine_mode mode = GET_MODE (dst); 2338 2339 gcc_assert (mode == TImode || mode == TFmode); 2340 gcc_assert (!(side_effects_p (src) || side_effects_p (dst))); 2341 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode); 2342 2343 if (REG_P (dst) && REG_P (src)) 2344 { 2345 int src_regno = REGNO (src); 2346 int dst_regno = REGNO (dst); 2347 2348 /* Handle FP <-> GP regs. 
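For example, a TImode copy from a GP register pair into an FP register is done as two 64-bit moves, one targeting the low half and one the high half of the FP register; the reverse direction is handled symmetrically below.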
*/ 2349 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno)) 2350 { 2351 src_lo = gen_lowpart (word_mode, src); 2352 src_hi = gen_highpart (word_mode, src); 2353 2354 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo)); 2355 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi)); 2356 return; 2357 } 2358 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno)) 2359 { 2360 dst_lo = gen_lowpart (word_mode, dst); 2361 dst_hi = gen_highpart (word_mode, dst); 2362 2363 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src)); 2364 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src)); 2365 return; 2366 } 2367 } 2368 2369 dst_lo = gen_lowpart (word_mode, dst); 2370 dst_hi = gen_highpart (word_mode, dst); 2371 src_lo = gen_lowpart (word_mode, src); 2372 src_hi = gen_highpart_mode (word_mode, mode, src); 2373 2374 /* At most one pairing may overlap. */ 2375 if (reg_overlap_mentioned_p (dst_lo, src_hi)) 2376 { 2377 aarch64_emit_move (dst_hi, src_hi); 2378 aarch64_emit_move (dst_lo, src_lo); 2379 } 2380 else 2381 { 2382 aarch64_emit_move (dst_lo, src_lo); 2383 aarch64_emit_move (dst_hi, src_hi); 2384 } 2385 } 2386 2387 bool 2388 aarch64_split_128bit_move_p (rtx dst, rtx src) 2389 { 2390 return (! REG_P (src) 2391 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src)))); 2392 } 2393 2394 /* Split a complex SIMD combine. */ 2395 2396 void 2397 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2) 2398 { 2399 machine_mode src_mode = GET_MODE (src1); 2400 machine_mode dst_mode = GET_MODE (dst); 2401 2402 gcc_assert (VECTOR_MODE_P (dst_mode)); 2403 gcc_assert (register_operand (dst, dst_mode) 2404 && register_operand (src1, src_mode) 2405 && register_operand (src2, src_mode)); 2406 2407 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2)); 2408 return; 2409 } 2410 2411 /* Split a complex SIMD move. */ 2412 2413 void 2414 aarch64_split_simd_move (rtx dst, rtx src) 2415 { 2416 machine_mode src_mode = GET_MODE (src); 2417 machine_mode dst_mode = GET_MODE (dst); 2418 2419 gcc_assert (VECTOR_MODE_P (dst_mode)); 2420 2421 if (REG_P (dst) && REG_P (src)) 2422 { 2423 gcc_assert (VECTOR_MODE_P (src_mode)); 2424 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src)); 2425 } 2426 } 2427 2428 bool 2429 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x, 2430 machine_mode ymode, rtx y) 2431 { 2432 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode); 2433 gcc_assert (r != NULL); 2434 return rtx_equal_p (x, r); 2435 } 2436 2437 2438 static rtx 2439 aarch64_force_temporary (machine_mode mode, rtx x, rtx value) 2440 { 2441 if (can_create_pseudo_p ()) 2442 return force_reg (mode, value); 2443 else 2444 { 2445 gcc_assert (x); 2446 aarch64_emit_move (x, value); 2447 return x; 2448 } 2449 } 2450 2451 /* Return true if we can move VALUE into a register using a single 2452 CNT[BHWD] instruction. */ 2453 2454 static bool 2455 aarch64_sve_cnt_immediate_p (poly_int64 value) 2456 { 2457 HOST_WIDE_INT factor = value.coeffs[0]; 2458 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */ 2459 return (value.coeffs[1] == factor 2460 && IN_RANGE (factor, 2, 16 * 16) 2461 && (factor & 1) == 0 2462 && factor <= 16 * (factor & -factor)); 2463 } 2464 2465 /* Likewise for rtx X. 
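For example, an X with poly_int64 value (2, 2), the result of CNTD, or (16, 16), the result of CNTB, is accepted, since both are of the form [1, 16] * {2, 4, 8, 16}.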
*/

bool
aarch64_sve_cnt_immediate_p (rtx x)
{
  poly_int64 value;
  return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  FACTOR is the number of quadwords.
   NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
   If it is zero, we can use any element size.  */

static char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  unsigned int factor,
				  unsigned int nelts_per_vq)
{
  static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];

  if (nelts_per_vq == 0)
    /* There is some overlap in the ranges of the four CNT instructions.
       Here we always use the smallest possible element size, so that the
       multiplier is 1 wherever possible.  */
    nelts_per_vq = factor & -factor;
  int shift = std::min (exact_log2 (nelts_per_vq), 4);
  gcc_assert (IN_RANGE (shift, 1, 4));
  char suffix = "dwhb"[shift - 1];

  factor >>= shift;
  unsigned int written;
  if (factor == 1)
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
			prefix, suffix, operands);
  else
    written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
			prefix, suffix, operands, factor);
  gcc_assert (written < sizeof (buffer));
  return buffer;
}

/* Return the asm string for an instruction with a CNT-like vector size
   operand (a vector pattern followed by a multiplier in the range [1, 16]).
   PREFIX is the mnemonic without the size suffix and OPERANDS is the
   first part of the operands template (the part that comes before the
   vector size itself).  X is the value of the vector size operand,
   as a polynomial integer rtx.  */

char *
aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
				  rtx x)
{
  poly_int64 value = rtx_to_poly_int64 (x);
  gcc_assert (aarch64_sve_cnt_immediate_p (value));
  return aarch64_output_sve_cnt_immediate (prefix, operands,
					   value.coeffs[1], 0);
}

/* Return true if we can add VALUE to a register using a single ADDVL
   or ADDPL instruction.  */

static bool
aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
{
  HOST_WIDE_INT factor = value.coeffs[0];
  if (factor == 0 || value.coeffs[1] != factor)
    return false;
  /* FACTOR counts VG / 2, so a value of 2 is one predicate width
     and a value of 16 is one vector width.  */
  return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
	  || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
}

/* Likewise for rtx X.  */

bool
aarch64_sve_addvl_addpl_immediate_p (rtx x)
{
  poly_int64 value;
  return (poly_int_rtx_p (x, &value)
	  && aarch64_sve_addvl_addpl_immediate_p (value));
}

/* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
   and storing the result in operand 0.
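   For example, an X of (16, 16), one full vector width, gives "addvl\t%x0, %x1, #1", and an X of (2, 2), one predicate width, gives "addpl\t%x0, %x1, #1".  When operands 0 and 1 are the same GP register and X is also a valid CNT immediate, the shorter INC/DEC form is used instead.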
*/ 2554 2555 char * 2556 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset) 2557 { 2558 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)]; 2559 poly_int64 offset_value = rtx_to_poly_int64 (offset); 2560 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value)); 2561 2562 /* Use INC or DEC if possible. */ 2563 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest))) 2564 { 2565 if (aarch64_sve_cnt_immediate_p (offset_value)) 2566 return aarch64_output_sve_cnt_immediate ("inc", "%x0", 2567 offset_value.coeffs[1], 0); 2568 if (aarch64_sve_cnt_immediate_p (-offset_value)) 2569 return aarch64_output_sve_cnt_immediate ("dec", "%x0", 2570 -offset_value.coeffs[1], 0); 2571 } 2572 2573 int factor = offset_value.coeffs[1]; 2574 if ((factor & 15) == 0) 2575 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16); 2576 else 2577 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2); 2578 return buffer; 2579 } 2580 2581 /* Return true if X is a valid immediate for an SVE vector INC or DEC 2582 instruction. If it is, store the number of elements in each vector 2583 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication 2584 factor in *FACTOR_OUT (if nonnull). */ 2585 2586 bool 2587 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out, 2588 unsigned int *nelts_per_vq_out) 2589 { 2590 rtx elt; 2591 poly_int64 value; 2592 2593 if (!const_vec_duplicate_p (x, &elt) 2594 || !poly_int_rtx_p (elt, &value)) 2595 return false; 2596 2597 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x)); 2598 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2) 2599 /* There's no vector INCB. */ 2600 return false; 2601 2602 HOST_WIDE_INT factor = value.coeffs[0]; 2603 if (value.coeffs[1] != factor) 2604 return false; 2605 2606 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */ 2607 if ((factor % nelts_per_vq) != 0 2608 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq)) 2609 return false; 2610 2611 if (factor_out) 2612 *factor_out = factor; 2613 if (nelts_per_vq_out) 2614 *nelts_per_vq_out = nelts_per_vq; 2615 return true; 2616 } 2617 2618 /* Return true if X is a valid immediate for an SVE vector INC or DEC 2619 instruction. */ 2620 2621 bool 2622 aarch64_sve_inc_dec_immediate_p (rtx x) 2623 { 2624 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL); 2625 } 2626 2627 /* Return the asm template for an SVE vector INC or DEC instruction. 2628 OPERANDS gives the operands before the vector count and X is the 2629 value of the vector count operand itself. 
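For example, incrementing by one vector's worth of VNx8HI elements corresponds to an X of (8, 8) and gives "inch\t" followed by OPERANDS, while (16, 16) in the same mode gives the "all, mul #2" form.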
*/ 2630 2631 char * 2632 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x) 2633 { 2634 int factor; 2635 unsigned int nelts_per_vq; 2636 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq)) 2637 gcc_unreachable (); 2638 if (factor < 0) 2639 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor, 2640 nelts_per_vq); 2641 else 2642 return aarch64_output_sve_cnt_immediate ("inc", operands, factor, 2643 nelts_per_vq); 2644 } 2645 2646 static int 2647 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate, 2648 scalar_int_mode mode) 2649 { 2650 int i; 2651 unsigned HOST_WIDE_INT val, val2, mask; 2652 int one_match, zero_match; 2653 int num_insns; 2654 2655 val = INTVAL (imm); 2656 2657 if (aarch64_move_imm (val, mode)) 2658 { 2659 if (generate) 2660 emit_insn (gen_rtx_SET (dest, imm)); 2661 return 1; 2662 } 2663 2664 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff 2665 (with XXXX non-zero). In that case check to see if the move can be done in 2666 a smaller mode. */ 2667 val2 = val & 0xffffffff; 2668 if (mode == DImode 2669 && aarch64_move_imm (val2, SImode) 2670 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0)) 2671 { 2672 if (generate) 2673 emit_insn (gen_rtx_SET (dest, GEN_INT (val2))); 2674 2675 /* Check if we have to emit a second instruction by checking to see 2676 if any of the upper 32 bits of the original DI mode value is set. */ 2677 if (val == val2) 2678 return 1; 2679 2680 i = (val >> 48) ? 48 : 32; 2681 2682 if (generate) 2683 emit_insn (gen_insv_immdi (dest, GEN_INT (i), 2684 GEN_INT ((val >> i) & 0xffff))); 2685 2686 return 2; 2687 } 2688 2689 if ((val >> 32) == 0 || mode == SImode) 2690 { 2691 if (generate) 2692 { 2693 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff))); 2694 if (mode == SImode) 2695 emit_insn (gen_insv_immsi (dest, GEN_INT (16), 2696 GEN_INT ((val >> 16) & 0xffff))); 2697 else 2698 emit_insn (gen_insv_immdi (dest, GEN_INT (16), 2699 GEN_INT ((val >> 16) & 0xffff))); 2700 } 2701 return 2; 2702 } 2703 2704 /* Remaining cases are all for DImode. */ 2705 2706 mask = 0xffff; 2707 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) + 2708 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0); 2709 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) + 2710 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0); 2711 2712 if (zero_match != 2 && one_match != 2) 2713 { 2714 /* Try emitting a bitmask immediate with a movk replacing 16 bits. 2715 For a 64-bit bitmask try whether changing 16 bits to all ones or 2716 zeroes creates a valid bitmask. To check any repeated bitmask, 2717 try using 16 bits from the other 32-bit half of val. */ 2718 2719 for (i = 0; i < 64; i += 16, mask <<= 16) 2720 { 2721 val2 = val & ~mask; 2722 if (val2 != val && aarch64_bitmask_imm (val2, mode)) 2723 break; 2724 val2 = val | mask; 2725 if (val2 != val && aarch64_bitmask_imm (val2, mode)) 2726 break; 2727 val2 = val2 & ~mask; 2728 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask); 2729 if (val2 != val && aarch64_bitmask_imm (val2, mode)) 2730 break; 2731 } 2732 if (i != 64) 2733 { 2734 if (generate) 2735 { 2736 emit_insn (gen_rtx_SET (dest, GEN_INT (val2))); 2737 emit_insn (gen_insv_immdi (dest, GEN_INT (i), 2738 GEN_INT ((val >> i) & 0xffff))); 2739 } 2740 return 2; 2741 } 2742 } 2743 2744 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which 2745 are emitted by the initial mov. If one_match > zero_match, skip set bits, 2746 otherwise skip zero bits. 
*/ 2747 2748 num_insns = 1; 2749 mask = 0xffff; 2750 val2 = one_match > zero_match ? ~val : val; 2751 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32; 2752 2753 if (generate) 2754 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match 2755 ? (val | ~(mask << i)) 2756 : (val & (mask << i))))); 2757 for (i += 16; i < 64; i += 16) 2758 { 2759 if ((val2 & (mask << i)) == 0) 2760 continue; 2761 if (generate) 2762 emit_insn (gen_insv_immdi (dest, GEN_INT (i), 2763 GEN_INT ((val >> i) & 0xffff))); 2764 num_insns ++; 2765 } 2766 2767 return num_insns; 2768 } 2769 2770 /* Return whether imm is a 128-bit immediate which is simple enough to 2771 expand inline. */ 2772 bool 2773 aarch64_mov128_immediate (rtx imm) 2774 { 2775 if (GET_CODE (imm) == CONST_INT) 2776 return true; 2777 2778 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2); 2779 2780 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0)); 2781 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1)); 2782 2783 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode) 2784 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4; 2785 } 2786 2787 2788 /* Return the number of temporary registers that aarch64_add_offset_1 2789 would need to add OFFSET to a register. */ 2790 2791 static unsigned int 2792 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset) 2793 { 2794 return abs_hwi (offset) < 0x1000000 ? 0 : 1; 2795 } 2796 2797 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for 2798 a non-polynomial OFFSET. MODE is the mode of the addition. 2799 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should 2800 be set and CFA adjustments added to the generated instructions. 2801 2802 TEMP1, if nonnull, is a register of mode MODE that can be used as a 2803 temporary if register allocation is already complete. This temporary 2804 register may overlap DEST but must not overlap SRC. If TEMP1 is known 2805 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting 2806 the immediate again. 2807 2808 Since this function may be used to adjust the stack pointer, we must 2809 ensure that it cannot cause transient stack deallocation (for example 2810 by first incrementing SP and then decrementing when adjusting by a 2811 large immediate). */ 2812 2813 static void 2814 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest, 2815 rtx src, HOST_WIDE_INT offset, rtx temp1, 2816 bool frame_related_p, bool emit_move_imm) 2817 { 2818 gcc_assert (emit_move_imm || temp1 != NULL_RTX); 2819 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src)); 2820 2821 HOST_WIDE_INT moffset = abs_hwi (offset); 2822 rtx_insn *insn; 2823 2824 if (!moffset) 2825 { 2826 if (!rtx_equal_p (dest, src)) 2827 { 2828 insn = emit_insn (gen_rtx_SET (dest, src)); 2829 RTX_FRAME_RELATED_P (insn) = frame_related_p; 2830 } 2831 return; 2832 } 2833 2834 /* Single instruction adjustment. */ 2835 if (aarch64_uimm12_shift (moffset)) 2836 { 2837 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset))); 2838 RTX_FRAME_RELATED_P (insn) = frame_related_p; 2839 return; 2840 } 2841 2842 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits 2843 and either: 2844 2845 a) the offset cannot be loaded by a 16-bit move or 2846 b) there is no spare register into which we can move it. */ 2847 if (moffset < 0x1000000 2848 && ((!temp1 && !can_create_pseudo_p ()) 2849 || !aarch64_move_imm (moffset, mode))) 2850 { 2851 HOST_WIDE_INT low_off = moffset & 0xfff; 2852 2853 low_off = offset < 0 ? 
-low_off : low_off; 2854 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off))); 2855 RTX_FRAME_RELATED_P (insn) = frame_related_p; 2856 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off))); 2857 RTX_FRAME_RELATED_P (insn) = frame_related_p; 2858 return; 2859 } 2860 2861 /* Emit a move immediate if required and an addition/subtraction. */ 2862 if (emit_move_imm) 2863 { 2864 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ()); 2865 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset)); 2866 } 2867 insn = emit_insn (offset < 0 2868 ? gen_sub3_insn (dest, src, temp1) 2869 : gen_add3_insn (dest, src, temp1)); 2870 if (frame_related_p) 2871 { 2872 RTX_FRAME_RELATED_P (insn) = frame_related_p; 2873 rtx adj = plus_constant (mode, src, offset); 2874 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj)); 2875 } 2876 } 2877 2878 /* Return the number of temporary registers that aarch64_add_offset 2879 would need to move OFFSET into a register or add OFFSET to a register; 2880 ADD_P is true if we want the latter rather than the former. */ 2881 2882 static unsigned int 2883 aarch64_offset_temporaries (bool add_p, poly_int64 offset) 2884 { 2885 /* This follows the same structure as aarch64_add_offset. */ 2886 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset)) 2887 return 0; 2888 2889 unsigned int count = 0; 2890 HOST_WIDE_INT factor = offset.coeffs[1]; 2891 HOST_WIDE_INT constant = offset.coeffs[0] - factor; 2892 poly_int64 poly_offset (factor, factor); 2893 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset)) 2894 /* Need one register for the ADDVL/ADDPL result. */ 2895 count += 1; 2896 else if (factor != 0) 2897 { 2898 factor = abs (factor); 2899 if (factor > 16 * (factor & -factor)) 2900 /* Need one register for the CNT result and one for the multiplication 2901 factor. If necessary, the second temporary can be reused for the 2902 constant part of the offset. */ 2903 return 2; 2904 /* Need one register for the CNT result (which might then 2905 be shifted). */ 2906 count += 1; 2907 } 2908 return count + aarch64_add_offset_1_temporaries (constant); 2909 } 2910 2911 /* If X can be represented as a poly_int64, return the number 2912 of temporaries that are required to add it to a register. 2913 Return -1 otherwise. */ 2914 2915 int 2916 aarch64_add_offset_temporaries (rtx x) 2917 { 2918 poly_int64 offset; 2919 if (!poly_int_rtx_p (x, &offset)) 2920 return -1; 2921 return aarch64_offset_temporaries (true, offset); 2922 } 2923 2924 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition. 2925 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should 2926 be set and CFA adjustments added to the generated instructions. 2927 2928 TEMP1, if nonnull, is a register of mode MODE that can be used as a 2929 temporary if register allocation is already complete. This temporary 2930 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC. 2931 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to 2932 false to avoid emitting the immediate again. 2933 2934 TEMP2, if nonnull, is a second temporary register that doesn't 2935 overlap either DEST or REG. 2936 2937 Since this function may be used to adjust the stack pointer, we must 2938 ensure that it cannot cause transient stack deallocation (for example 2939 by first incrementing SP and then decrementing when adjusting by a 2940 large immediate). 
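For example, an OFFSET equal to the size in bytes of one SVE vector, poly_int64 (16, 16), needs no temporaries when SRC is a register, since it can be added with a single ADDVL.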
*/ 2941 2942 static void 2943 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src, 2944 poly_int64 offset, rtx temp1, rtx temp2, 2945 bool frame_related_p, bool emit_move_imm = true) 2946 { 2947 gcc_assert (emit_move_imm || temp1 != NULL_RTX); 2948 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src)); 2949 gcc_assert (temp1 == NULL_RTX 2950 || !frame_related_p 2951 || !reg_overlap_mentioned_p (temp1, dest)); 2952 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2)); 2953 2954 /* Try using ADDVL or ADDPL to add the whole value. */ 2955 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset)) 2956 { 2957 rtx offset_rtx = gen_int_mode (offset, mode); 2958 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx)); 2959 RTX_FRAME_RELATED_P (insn) = frame_related_p; 2960 return; 2961 } 2962 2963 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an 2964 SVE vector register, over and above the minimum size of 128 bits. 2965 This is equivalent to half the value returned by CNTD with a 2966 vector shape of ALL. */ 2967 HOST_WIDE_INT factor = offset.coeffs[1]; 2968 HOST_WIDE_INT constant = offset.coeffs[0] - factor; 2969 2970 /* Try using ADDVL or ADDPL to add the VG-based part. */ 2971 poly_int64 poly_offset (factor, factor); 2972 if (src != const0_rtx 2973 && aarch64_sve_addvl_addpl_immediate_p (poly_offset)) 2974 { 2975 rtx offset_rtx = gen_int_mode (poly_offset, mode); 2976 if (frame_related_p) 2977 { 2978 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx)); 2979 RTX_FRAME_RELATED_P (insn) = true; 2980 src = dest; 2981 } 2982 else 2983 { 2984 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx); 2985 src = aarch64_force_temporary (mode, temp1, addr); 2986 temp1 = temp2; 2987 temp2 = NULL_RTX; 2988 } 2989 } 2990 /* Otherwise use a CNT-based sequence. */ 2991 else if (factor != 0) 2992 { 2993 /* Use a subtraction if we have a negative factor. */ 2994 rtx_code code = PLUS; 2995 if (factor < 0) 2996 { 2997 factor = -factor; 2998 code = MINUS; 2999 } 3000 3001 /* Calculate CNTD * FACTOR / 2. First try to fold the division 3002 into the multiplication. */ 3003 rtx val; 3004 int shift = 0; 3005 if (factor & 1) 3006 /* Use a right shift by 1. */ 3007 shift = -1; 3008 else 3009 factor /= 2; 3010 HOST_WIDE_INT low_bit = factor & -factor; 3011 if (factor <= 16 * low_bit) 3012 { 3013 if (factor > 16 * 8) 3014 { 3015 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate 3016 the value with the minimum multiplier and shift it into 3017 position. */ 3018 int extra_shift = exact_log2 (low_bit); 3019 shift += extra_shift; 3020 factor >>= extra_shift; 3021 } 3022 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode); 3023 } 3024 else 3025 { 3026 /* Use CNTD, then multiply it by FACTOR. */ 3027 val = gen_int_mode (poly_int64 (2, 2), mode); 3028 val = aarch64_force_temporary (mode, temp1, val); 3029 3030 /* Go back to using a negative multiplication factor if we have 3031 no register from which to subtract. */ 3032 if (code == MINUS && src == const0_rtx) 3033 { 3034 factor = -factor; 3035 code = PLUS; 3036 } 3037 rtx coeff1 = gen_int_mode (factor, mode); 3038 coeff1 = aarch64_force_temporary (mode, temp2, coeff1); 3039 val = gen_rtx_MULT (mode, val, coeff1); 3040 } 3041 3042 if (shift > 0) 3043 { 3044 /* Multiply by 1 << SHIFT. */ 3045 val = aarch64_force_temporary (mode, temp1, val); 3046 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift)); 3047 } 3048 else if (shift == -1) 3049 { 3050 /* Divide by 2. 
*/ 3051 val = aarch64_force_temporary (mode, temp1, val); 3052 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx); 3053 } 3054 3055 /* Calculate SRC +/- CNTD * FACTOR / 2. */ 3056 if (src != const0_rtx) 3057 { 3058 val = aarch64_force_temporary (mode, temp1, val); 3059 val = gen_rtx_fmt_ee (code, mode, src, val); 3060 } 3061 else if (code == MINUS) 3062 { 3063 val = aarch64_force_temporary (mode, temp1, val); 3064 val = gen_rtx_NEG (mode, val); 3065 } 3066 3067 if (constant == 0 || frame_related_p) 3068 { 3069 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val)); 3070 if (frame_related_p) 3071 { 3072 RTX_FRAME_RELATED_P (insn) = true; 3073 add_reg_note (insn, REG_CFA_ADJUST_CFA, 3074 gen_rtx_SET (dest, plus_constant (Pmode, src, 3075 poly_offset))); 3076 } 3077 src = dest; 3078 if (constant == 0) 3079 return; 3080 } 3081 else 3082 { 3083 src = aarch64_force_temporary (mode, temp1, val); 3084 temp1 = temp2; 3085 temp2 = NULL_RTX; 3086 } 3087 3088 emit_move_imm = true; 3089 } 3090 3091 aarch64_add_offset_1 (mode, dest, src, constant, temp1, 3092 frame_related_p, emit_move_imm); 3093 } 3094 3095 /* Like aarch64_add_offset, but the offset is given as an rtx rather 3096 than a poly_int64. */ 3097 3098 void 3099 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src, 3100 rtx offset_rtx, rtx temp1, rtx temp2) 3101 { 3102 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx), 3103 temp1, temp2, false); 3104 } 3105 3106 /* Add DELTA to the stack pointer, marking the instructions frame-related. 3107 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false 3108 if TEMP1 already contains abs (DELTA). */ 3109 3110 static inline void 3111 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm) 3112 { 3113 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta, 3114 temp1, temp2, true, emit_move_imm); 3115 } 3116 3117 /* Subtract DELTA from the stack pointer, marking the instructions 3118 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary 3119 if nonnull. */ 3120 3121 static inline void 3122 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p, 3123 bool emit_move_imm = true) 3124 { 3125 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta, 3126 temp1, temp2, frame_related_p, emit_move_imm); 3127 } 3128 3129 /* Set DEST to (vec_series BASE STEP). */ 3130 3131 static void 3132 aarch64_expand_vec_series (rtx dest, rtx base, rtx step) 3133 { 3134 machine_mode mode = GET_MODE (dest); 3135 scalar_mode inner = GET_MODE_INNER (mode); 3136 3137 /* Each operand can be a register or an immediate in the range [-16, 15]. */ 3138 if (!aarch64_sve_index_immediate_p (base)) 3139 base = force_reg (inner, base); 3140 if (!aarch64_sve_index_immediate_p (step)) 3141 step = force_reg (inner, step); 3142 3143 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step)); 3144 } 3145 3146 /* Try to duplicate SRC into SVE register DEST, given that SRC is an 3147 integer of mode INT_MODE. Return true on success. */ 3148 3149 static bool 3150 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode, 3151 rtx src) 3152 { 3153 /* If the constant is smaller than 128 bits, we can do the move 3154 using a vector of SRC_MODEs. 
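For example, a 32-bit SRC being duplicated into a VNx16QI DEST can instead be moved as a VNx4SI duplicate of SRC into the same register.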
*/ 3155 if (src_mode != TImode) 3156 { 3157 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)), 3158 GET_MODE_SIZE (src_mode)); 3159 machine_mode dup_mode = mode_for_vector (src_mode, count).require (); 3160 emit_move_insn (gen_lowpart (dup_mode, dest), 3161 gen_const_vec_duplicate (dup_mode, src)); 3162 return true; 3163 } 3164 3165 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */ 3166 src = force_const_mem (src_mode, src); 3167 if (!src) 3168 return false; 3169 3170 /* Make sure that the address is legitimate. */ 3171 if (!aarch64_sve_ld1r_operand_p (src)) 3172 { 3173 rtx addr = force_reg (Pmode, XEXP (src, 0)); 3174 src = replace_equiv_address (src, addr); 3175 } 3176 3177 machine_mode mode = GET_MODE (dest); 3178 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode); 3179 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require (); 3180 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); 3181 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ); 3182 emit_insn (gen_rtx_SET (dest, src)); 3183 return true; 3184 } 3185 3186 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it 3187 isn't a simple duplicate or series. */ 3188 3189 static void 3190 aarch64_expand_sve_const_vector (rtx dest, rtx src) 3191 { 3192 machine_mode mode = GET_MODE (src); 3193 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src); 3194 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src); 3195 gcc_assert (npatterns > 1); 3196 3197 if (nelts_per_pattern == 1) 3198 { 3199 /* The constant is a repeating seqeuence of at least two elements, 3200 where the repeating elements occupy no more than 128 bits. 3201 Get an integer representation of the replicated value. */ 3202 scalar_int_mode int_mode; 3203 if (BYTES_BIG_ENDIAN) 3204 /* For now, always use LD1RQ to load the value on big-endian 3205 targets, since the handling of smaller integers includes a 3206 subreg that is semantically an element reverse. */ 3207 int_mode = TImode; 3208 else 3209 { 3210 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns; 3211 gcc_assert (int_bits <= 128); 3212 int_mode = int_mode_for_size (int_bits, 0).require (); 3213 } 3214 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0); 3215 if (int_value 3216 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value)) 3217 return; 3218 } 3219 3220 /* Expand each pattern individually. */ 3221 rtx_vector_builder builder; 3222 auto_vec<rtx, 16> vectors (npatterns); 3223 for (unsigned int i = 0; i < npatterns; ++i) 3224 { 3225 builder.new_vector (mode, 1, nelts_per_pattern); 3226 for (unsigned int j = 0; j < nelts_per_pattern; ++j) 3227 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns)); 3228 vectors.quick_push (force_reg (mode, builder.build ())); 3229 } 3230 3231 /* Use permutes to interleave the separate vectors. */ 3232 while (npatterns > 1) 3233 { 3234 npatterns /= 2; 3235 for (unsigned int i = 0; i < npatterns; ++i) 3236 { 3237 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode)); 3238 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]); 3239 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1)); 3240 vectors[i] = tmp; 3241 } 3242 } 3243 gcc_assert (vectors[0] == dest); 3244 } 3245 3246 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE 3247 is a pattern that can be used to set DEST to a replicated scalar 3248 element. 
*/ 3249 3250 void 3251 aarch64_expand_mov_immediate (rtx dest, rtx imm, 3252 rtx (*gen_vec_duplicate) (rtx, rtx)) 3253 { 3254 machine_mode mode = GET_MODE (dest); 3255 3256 /* Check on what type of symbol it is. */ 3257 scalar_int_mode int_mode; 3258 if ((GET_CODE (imm) == SYMBOL_REF 3259 || GET_CODE (imm) == LABEL_REF 3260 || GET_CODE (imm) == CONST 3261 || GET_CODE (imm) == CONST_POLY_INT) 3262 && is_a <scalar_int_mode> (mode, &int_mode)) 3263 { 3264 rtx mem; 3265 poly_int64 offset; 3266 HOST_WIDE_INT const_offset; 3267 enum aarch64_symbol_type sty; 3268 3269 /* If we have (const (plus symbol offset)), separate out the offset 3270 before we start classifying the symbol. */ 3271 rtx base = strip_offset (imm, &offset); 3272 3273 /* We must always add an offset involving VL separately, rather than 3274 folding it into the relocation. */ 3275 if (!offset.is_constant (&const_offset)) 3276 { 3277 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset)) 3278 emit_insn (gen_rtx_SET (dest, imm)); 3279 else 3280 { 3281 /* Do arithmetic on 32-bit values if the result is smaller 3282 than that. */ 3283 if (partial_subreg_p (int_mode, SImode)) 3284 { 3285 /* It is invalid to do symbol calculations in modes 3286 narrower than SImode. */ 3287 gcc_assert (base == const0_rtx); 3288 dest = gen_lowpart (SImode, dest); 3289 int_mode = SImode; 3290 } 3291 if (base != const0_rtx) 3292 { 3293 base = aarch64_force_temporary (int_mode, dest, base); 3294 aarch64_add_offset (int_mode, dest, base, offset, 3295 NULL_RTX, NULL_RTX, false); 3296 } 3297 else 3298 aarch64_add_offset (int_mode, dest, base, offset, 3299 dest, NULL_RTX, false); 3300 } 3301 return; 3302 } 3303 3304 sty = aarch64_classify_symbol (base, const_offset); 3305 switch (sty) 3306 { 3307 case SYMBOL_FORCE_TO_MEM: 3308 if (const_offset != 0 3309 && targetm.cannot_force_const_mem (int_mode, imm)) 3310 { 3311 gcc_assert (can_create_pseudo_p ()); 3312 base = aarch64_force_temporary (int_mode, dest, base); 3313 aarch64_add_offset (int_mode, dest, base, const_offset, 3314 NULL_RTX, NULL_RTX, false); 3315 return; 3316 } 3317 3318 mem = force_const_mem (ptr_mode, imm); 3319 gcc_assert (mem); 3320 3321 /* If we aren't generating PC relative literals, then 3322 we need to expand the literal pool access carefully. 3323 This is something that needs to be done in a number 3324 of places, so could well live as a separate function. 
*/ 3325 if (!aarch64_pcrelative_literal_loads) 3326 { 3327 gcc_assert (can_create_pseudo_p ()); 3328 base = gen_reg_rtx (ptr_mode); 3329 aarch64_expand_mov_immediate (base, XEXP (mem, 0)); 3330 if (ptr_mode != Pmode) 3331 base = convert_memory_address (Pmode, base); 3332 mem = gen_rtx_MEM (ptr_mode, base); 3333 } 3334 3335 if (int_mode != ptr_mode) 3336 mem = gen_rtx_ZERO_EXTEND (int_mode, mem); 3337 3338 emit_insn (gen_rtx_SET (dest, mem)); 3339 3340 return; 3341 3342 case SYMBOL_SMALL_TLSGD: 3343 case SYMBOL_SMALL_TLSDESC: 3344 case SYMBOL_SMALL_TLSIE: 3345 case SYMBOL_SMALL_GOT_28K: 3346 case SYMBOL_SMALL_GOT_4G: 3347 case SYMBOL_TINY_GOT: 3348 case SYMBOL_TINY_TLSIE: 3349 if (const_offset != 0) 3350 { 3351 gcc_assert(can_create_pseudo_p ()); 3352 base = aarch64_force_temporary (int_mode, dest, base); 3353 aarch64_add_offset (int_mode, dest, base, const_offset, 3354 NULL_RTX, NULL_RTX, false); 3355 return; 3356 } 3357 /* FALLTHRU */ 3358 3359 case SYMBOL_SMALL_ABSOLUTE: 3360 case SYMBOL_TINY_ABSOLUTE: 3361 case SYMBOL_TLSLE12: 3362 case SYMBOL_TLSLE24: 3363 case SYMBOL_TLSLE32: 3364 case SYMBOL_TLSLE48: 3365 aarch64_load_symref_appropriately (dest, imm, sty); 3366 return; 3367 3368 default: 3369 gcc_unreachable (); 3370 } 3371 } 3372 3373 if (!CONST_INT_P (imm)) 3374 { 3375 rtx base, step, value; 3376 if (GET_CODE (imm) == HIGH 3377 || aarch64_simd_valid_immediate (imm, NULL)) 3378 emit_insn (gen_rtx_SET (dest, imm)); 3379 else if (const_vec_series_p (imm, &base, &step)) 3380 aarch64_expand_vec_series (dest, base, step); 3381 else if (const_vec_duplicate_p (imm, &value)) 3382 { 3383 /* If the constant is out of range of an SVE vector move, 3384 load it from memory if we can, otherwise move it into 3385 a register and use a DUP. */ 3386 scalar_mode inner_mode = GET_MODE_INNER (mode); 3387 rtx op = force_const_mem (inner_mode, value); 3388 if (!op) 3389 op = force_reg (inner_mode, value); 3390 else if (!aarch64_sve_ld1r_operand_p (op)) 3391 { 3392 rtx addr = force_reg (Pmode, XEXP (op, 0)); 3393 op = replace_equiv_address (op, addr); 3394 } 3395 emit_insn (gen_vec_duplicate (dest, op)); 3396 } 3397 else if (GET_CODE (imm) == CONST_VECTOR 3398 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ()) 3399 aarch64_expand_sve_const_vector (dest, imm); 3400 else 3401 { 3402 rtx mem = force_const_mem (mode, imm); 3403 gcc_assert (mem); 3404 emit_move_insn (dest, mem); 3405 } 3406 3407 return; 3408 } 3409 3410 aarch64_internal_mov_immediate (dest, imm, true, 3411 as_a <scalar_int_mode> (mode)); 3412 } 3413 3414 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate 3415 that is known to contain PTRUE. */ 3416 3417 void 3418 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src) 3419 { 3420 expand_operand ops[3]; 3421 machine_mode mode = GET_MODE (dest); 3422 create_output_operand (&ops[0], dest, mode); 3423 create_input_operand (&ops[1], pred, GET_MODE(pred)); 3424 create_input_operand (&ops[2], src, mode); 3425 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops); 3426 } 3427 3428 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one 3429 operand is in memory. In this case we need to use the predicated LD1 3430 and ST1 instead of LDR and STR, both for correctness on big-endian 3431 targets and because LD1 and ST1 support a wider range of addressing modes. 3432 PRED_MODE is the mode of the predicate. 3433 3434 See the comment at the head of aarch64-sve.md for details about the 3435 big-endian handling. 
*/ 3436 3437 void 3438 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode) 3439 { 3440 machine_mode mode = GET_MODE (dest); 3441 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); 3442 if (!register_operand (src, mode) 3443 && !register_operand (dest, mode)) 3444 { 3445 rtx tmp = gen_reg_rtx (mode); 3446 if (MEM_P (src)) 3447 aarch64_emit_sve_pred_move (tmp, ptrue, src); 3448 else 3449 emit_move_insn (tmp, src); 3450 src = tmp; 3451 } 3452 aarch64_emit_sve_pred_move (dest, ptrue, src); 3453 } 3454 3455 /* Called only on big-endian targets. See whether an SVE vector move 3456 from SRC to DEST is effectively a REV[BHW] instruction, because at 3457 least one operand is a subreg of an SVE vector that has wider or 3458 narrower elements. Return true and emit the instruction if so. 3459 3460 For example: 3461 3462 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0)) 3463 3464 represents a VIEW_CONVERT between the following vectors, viewed 3465 in memory order: 3466 3467 R2: { [0].high, [0].low, [1].high, [1].low, ... } 3468 R1: { [0], [1], [2], [3], ... } 3469 3470 The high part of lane X in R2 should therefore correspond to lane X*2 3471 of R1, but the register representations are: 3472 3473 msb lsb 3474 R2: ...... [1].high [1].low [0].high [0].low 3475 R1: ...... [3] [2] [1] [0] 3476 3477 where the low part of lane X in R2 corresponds to lane X*2 in R1. 3478 We therefore need a reverse operation to swap the high and low values 3479 around. 3480 3481 This is purely an optimization. Without it we would spill the 3482 subreg operand to the stack in one mode and reload it in the 3483 other mode, which has the same effect as the REV. */ 3484 3485 bool 3486 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src) 3487 { 3488 gcc_assert (BYTES_BIG_ENDIAN); 3489 if (GET_CODE (dest) == SUBREG) 3490 dest = SUBREG_REG (dest); 3491 if (GET_CODE (src) == SUBREG) 3492 src = SUBREG_REG (src); 3493 3494 /* The optimization handles two single SVE REGs with different element 3495 sizes. */ 3496 if (!REG_P (dest) 3497 || !REG_P (src) 3498 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA 3499 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA 3500 || (GET_MODE_UNIT_SIZE (GET_MODE (dest)) 3501 == GET_MODE_UNIT_SIZE (GET_MODE (src)))) 3502 return false; 3503 3504 /* Generate *aarch64_sve_mov<mode>_subreg_be. */ 3505 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode)); 3506 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src), 3507 UNSPEC_REV_SUBREG); 3508 emit_insn (gen_rtx_SET (dest, unspec)); 3509 return true; 3510 } 3511 3512 /* Return a copy of X with mode MODE, without changing its other 3513 attributes. Unlike gen_lowpart, this doesn't care whether the 3514 mode change is valid. */ 3515 3516 static rtx 3517 aarch64_replace_reg_mode (rtx x, machine_mode mode) 3518 { 3519 if (GET_MODE (x) == mode) 3520 return x; 3521 3522 x = shallow_copy_rtx (x); 3523 set_mode_and_regno (x, mode, REGNO (x)); 3524 return x; 3525 } 3526 3527 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given 3528 operands. */ 3529 3530 void 3531 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src) 3532 { 3533 /* Decide which REV operation we need. The mode with narrower elements 3534 determines the mode of the operands and the mode with the wider 3535 elements determines the reverse width. 
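For example, a move between VNx8HI and VNx16QI registers uses VNx16QI operands and a 16-bit reverse (UNSPEC_REV16), since the wider element size is two bytes.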
*/ 3536 machine_mode mode_with_wider_elts = GET_MODE (dest); 3537 machine_mode mode_with_narrower_elts = GET_MODE (src); 3538 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts) 3539 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts)) 3540 std::swap (mode_with_wider_elts, mode_with_narrower_elts); 3541 3542 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts); 3543 unsigned int unspec; 3544 if (wider_bytes == 8) 3545 unspec = UNSPEC_REV64; 3546 else if (wider_bytes == 4) 3547 unspec = UNSPEC_REV32; 3548 else if (wider_bytes == 2) 3549 unspec = UNSPEC_REV16; 3550 else 3551 gcc_unreachable (); 3552 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require (); 3553 3554 /* Emit: 3555 3556 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)] 3557 UNSPEC_MERGE_PTRUE)) 3558 3559 with the appropriate modes. */ 3560 ptrue = gen_lowpart (pred_mode, ptrue); 3561 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts); 3562 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts); 3563 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec); 3564 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src), 3565 UNSPEC_MERGE_PTRUE); 3566 emit_insn (gen_rtx_SET (dest, src)); 3567 } 3568 3569 static bool 3570 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED, 3571 tree exp ATTRIBUTE_UNUSED) 3572 { 3573 if (aarch64_simd_decl_p (cfun->decl) != aarch64_simd_decl_p (decl)) 3574 return false; 3575 3576 return true; 3577 } 3578 3579 /* Implement TARGET_PASS_BY_REFERENCE. */ 3580 3581 static bool 3582 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED, 3583 machine_mode mode, 3584 const_tree type, 3585 bool named ATTRIBUTE_UNUSED) 3586 { 3587 HOST_WIDE_INT size; 3588 machine_mode dummymode; 3589 int nregs; 3590 3591 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */ 3592 if (mode == BLKmode && type) 3593 size = int_size_in_bytes (type); 3594 else 3595 /* No frontends can create types with variable-sized modes, so we 3596 shouldn't be asked to pass or return them. */ 3597 size = GET_MODE_SIZE (mode).to_constant (); 3598 3599 /* Aggregates are passed by reference based on their size. */ 3600 if (type && AGGREGATE_TYPE_P (type)) 3601 { 3602 size = int_size_in_bytes (type); 3603 } 3604 3605 /* Variable sized arguments are always returned by reference. */ 3606 if (size < 0) 3607 return true; 3608 3609 /* Can this be a candidate to be passed in fp/simd register(s)? */ 3610 if (aarch64_vfp_is_call_or_return_candidate (mode, type, 3611 &dummymode, &nregs, 3612 NULL)) 3613 return false; 3614 3615 /* Arguments which are variable sized or larger than 2 registers are 3616 passed by reference unless they are a homogenous floating point 3617 aggregate. */ 3618 return size > 2 * UNITS_PER_WORD; 3619 } 3620 3621 /* Return TRUE if VALTYPE is padded to its least significant bits. */ 3622 static bool 3623 aarch64_return_in_msb (const_tree valtype) 3624 { 3625 machine_mode dummy_mode; 3626 int dummy_int; 3627 3628 /* Never happens in little-endian mode. */ 3629 if (!BYTES_BIG_ENDIAN) 3630 return false; 3631 3632 /* Only composite types smaller than or equal to 16 bytes can 3633 be potentially returned in registers. 
*/ 3634 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype)) 3635 || int_size_in_bytes (valtype) <= 0 3636 || int_size_in_bytes (valtype) > 16) 3637 return false; 3638 3639 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate) 3640 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite 3641 is always passed/returned in the least significant bits of fp/simd 3642 register(s). */ 3643 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype, 3644 &dummy_mode, &dummy_int, NULL)) 3645 return false; 3646 3647 return true; 3648 } 3649 3650 /* Implement TARGET_FUNCTION_VALUE. 3651 Define how to find the value returned by a function. */ 3652 3653 static rtx 3654 aarch64_function_value (const_tree type, const_tree func, 3655 bool outgoing ATTRIBUTE_UNUSED) 3656 { 3657 machine_mode mode; 3658 int unsignedp; 3659 int count; 3660 machine_mode ag_mode; 3661 3662 mode = TYPE_MODE (type); 3663 if (INTEGRAL_TYPE_P (type)) 3664 mode = promote_function_mode (type, mode, &unsignedp, func, 1); 3665 3666 if (aarch64_return_in_msb (type)) 3667 { 3668 HOST_WIDE_INT size = int_size_in_bytes (type); 3669 3670 if (size % UNITS_PER_WORD != 0) 3671 { 3672 size += UNITS_PER_WORD - size % UNITS_PER_WORD; 3673 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require (); 3674 } 3675 } 3676 3677 if (aarch64_vfp_is_call_or_return_candidate (mode, type, 3678 &ag_mode, &count, NULL)) 3679 { 3680 if (!aarch64_composite_type_p (type, mode)) 3681 { 3682 gcc_assert (count == 1 && mode == ag_mode); 3683 return gen_rtx_REG (mode, V0_REGNUM); 3684 } 3685 else 3686 { 3687 int i; 3688 rtx par; 3689 3690 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count)); 3691 for (i = 0; i < count; i++) 3692 { 3693 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i); 3694 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode); 3695 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset); 3696 XVECEXP (par, 0, i) = tmp; 3697 } 3698 return par; 3699 } 3700 } 3701 else 3702 return gen_rtx_REG (mode, R0_REGNUM); 3703 } 3704 3705 /* Implements TARGET_FUNCTION_VALUE_REGNO_P. 3706 Return true if REGNO is the number of a hard register in which the values 3707 of called function may come back. */ 3708 3709 static bool 3710 aarch64_function_value_regno_p (const unsigned int regno) 3711 { 3712 /* Maximum of 16 bytes can be returned in the general registers. Examples 3713 of 16-byte return values are: 128-bit integers and 16-byte small 3714 structures (excluding homogeneous floating-point aggregates). */ 3715 if (regno == R0_REGNUM || regno == R1_REGNUM) 3716 return true; 3717 3718 /* Up to four fp/simd registers can return a function value, e.g. a 3719 homogeneous floating-point aggregate having four members. */ 3720 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS) 3721 return TARGET_FLOAT; 3722 3723 return false; 3724 } 3725 3726 /* Implement TARGET_RETURN_IN_MEMORY. 3727 3728 If the type T of the result of a function is such that 3729 void func (T arg) 3730 would require that arg be passed as a value in a register (or set of 3731 registers) according to the parameter passing rules, then the result 3732 is returned in the same registers as would be used for such an 3733 argument. 
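For example, a structure of two 64-bit integers is returned in registers while a structure of three is returned in memory; an HFA of up to four doubles is likewise returned in FP/SIMD registers rather than memory.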
*/ 3734 3735 static bool 3736 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED) 3737 { 3738 HOST_WIDE_INT size; 3739 machine_mode ag_mode; 3740 int count; 3741 3742 if (!AGGREGATE_TYPE_P (type) 3743 && TREE_CODE (type) != COMPLEX_TYPE 3744 && TREE_CODE (type) != VECTOR_TYPE) 3745 /* Simple scalar types always returned in registers. */ 3746 return false; 3747 3748 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), 3749 type, 3750 &ag_mode, 3751 &count, 3752 NULL)) 3753 return false; 3754 3755 /* Types larger than 2 registers returned in memory. */ 3756 size = int_size_in_bytes (type); 3757 return (size < 0 || size > 2 * UNITS_PER_WORD); 3758 } 3759 3760 static bool 3761 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode, 3762 const_tree type, int *nregs) 3763 { 3764 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); 3765 return aarch64_vfp_is_call_or_return_candidate (mode, 3766 type, 3767 &pcum->aapcs_vfp_rmode, 3768 nregs, 3769 NULL); 3770 } 3771 3772 /* Given MODE and TYPE of a function argument, return the alignment in 3773 bits. The idea is to suppress any stronger alignment requested by 3774 the user and opt for the natural alignment (specified in AAPCS64 \S 3775 4.1). ABI_BREAK is set to true if the alignment was incorrectly 3776 calculated in versions of GCC prior to GCC-9. This is a helper 3777 function for local use only. */ 3778 3779 static unsigned int 3780 aarch64_function_arg_alignment (machine_mode mode, const_tree type, 3781 bool *abi_break) 3782 { 3783 *abi_break = false; 3784 if (!type) 3785 return GET_MODE_ALIGNMENT (mode); 3786 3787 if (integer_zerop (TYPE_SIZE (type))) 3788 return 0; 3789 3790 gcc_assert (TYPE_MODE (type) == mode); 3791 3792 if (!AGGREGATE_TYPE_P (type)) 3793 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type)); 3794 3795 if (TREE_CODE (type) == ARRAY_TYPE) 3796 return TYPE_ALIGN (TREE_TYPE (type)); 3797 3798 unsigned int alignment = 0; 3799 unsigned int bitfield_alignment = 0; 3800 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) 3801 if (TREE_CODE (field) == FIELD_DECL) 3802 { 3803 alignment = std::max (alignment, DECL_ALIGN (field)); 3804 if (DECL_BIT_FIELD_TYPE (field)) 3805 bitfield_alignment 3806 = std::max (bitfield_alignment, 3807 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field))); 3808 } 3809 3810 if (bitfield_alignment > alignment) 3811 { 3812 *abi_break = true; 3813 return bitfield_alignment; 3814 } 3815 3816 return alignment; 3817 } 3818 3819 /* Layout a function argument according to the AAPCS64 rules. The rule 3820 numbers refer to the rule numbers in the AAPCS64. */ 3821 3822 static void 3823 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode, 3824 const_tree type, 3825 bool named ATTRIBUTE_UNUSED) 3826 { 3827 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); 3828 int ncrn, nvrn, nregs; 3829 bool allocate_ncrn, allocate_nvrn; 3830 HOST_WIDE_INT size; 3831 bool abi_break; 3832 3833 /* We need to do this once per argument. */ 3834 if (pcum->aapcs_arg_processed) 3835 return; 3836 3837 pcum->aapcs_arg_processed = true; 3838 3839 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */ 3840 if (type) 3841 size = int_size_in_bytes (type); 3842 else 3843 /* No frontends can create types with variable-sized modes, so we 3844 shouldn't be asked to pass or return them. */ 3845 size = GET_MODE_SIZE (mode).to_constant (); 3846 size = ROUND_UP (size, UNITS_PER_WORD); 3847 3848 allocate_ncrn = (type) ? 
!(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode); 3849 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v, 3850 mode, 3851 type, 3852 &nregs); 3853 3854 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable. 3855 The following code thus handles passing by SIMD/FP registers first. */ 3856 3857 nvrn = pcum->aapcs_nvrn; 3858 3859 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA) 3860 and homogenous short-vector aggregates (HVA). */ 3861 if (allocate_nvrn) 3862 { 3863 if (!TARGET_FLOAT) 3864 aarch64_err_no_fpadvsimd (mode); 3865 3866 if (nvrn + nregs <= NUM_FP_ARG_REGS) 3867 { 3868 pcum->aapcs_nextnvrn = nvrn + nregs; 3869 if (!aarch64_composite_type_p (type, mode)) 3870 { 3871 gcc_assert (nregs == 1); 3872 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn); 3873 } 3874 else 3875 { 3876 rtx par; 3877 int i; 3878 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs)); 3879 for (i = 0; i < nregs; i++) 3880 { 3881 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode, 3882 V0_REGNUM + nvrn + i); 3883 rtx offset = gen_int_mode 3884 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode); 3885 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset); 3886 XVECEXP (par, 0, i) = tmp; 3887 } 3888 pcum->aapcs_reg = par; 3889 } 3890 return; 3891 } 3892 else 3893 { 3894 /* C.3 NSRN is set to 8. */ 3895 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS; 3896 goto on_stack; 3897 } 3898 } 3899 3900 ncrn = pcum->aapcs_ncrn; 3901 nregs = size / UNITS_PER_WORD; 3902 3903 /* C6 - C9. though the sign and zero extension semantics are 3904 handled elsewhere. This is the case where the argument fits 3905 entirely general registers. */ 3906 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS)) 3907 { 3908 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2); 3909 3910 /* C.8 if the argument has an alignment of 16 then the NGRN is 3911 rounded up to the next even number. */ 3912 if (nregs == 2 3913 && ncrn % 2 3914 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT 3915 comparison is there because for > 16 * BITS_PER_UNIT 3916 alignment nregs should be > 2 and therefore it should be 3917 passed by reference rather than value. */ 3918 && (aarch64_function_arg_alignment (mode, type, &abi_break) 3919 == 16 * BITS_PER_UNIT)) 3920 { 3921 if (abi_break && warn_psabi && currently_expanding_gimple_stmt) 3922 inform (input_location, "parameter passing for argument of type " 3923 "%qT changed in GCC 9.1", type); 3924 ++ncrn; 3925 gcc_assert (ncrn + nregs <= NUM_ARG_REGS); 3926 } 3927 3928 /* NREGS can be 0 when e.g. an empty structure is to be passed. 3929 A reg is still generated for it, but the caller should be smart 3930 enough not to use it. */ 3931 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT) 3932 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn); 3933 else 3934 { 3935 rtx par; 3936 int i; 3937 3938 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs)); 3939 for (i = 0; i < nregs; i++) 3940 { 3941 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i); 3942 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, 3943 GEN_INT (i * UNITS_PER_WORD)); 3944 XVECEXP (par, 0, i) = tmp; 3945 } 3946 pcum->aapcs_reg = par; 3947 } 3948 3949 pcum->aapcs_nextncrn = ncrn + nregs; 3950 return; 3951 } 3952 3953 /* C.11 */ 3954 pcum->aapcs_nextncrn = NUM_ARG_REGS; 3955 3956 /* The argument is passed on stack; record the needed number of words for 3957 this argument and align the total size if necessary. 
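   As an illustrative example only: once the core registers are
   exhausted, a 24-byte structure contributes three stack words here,
   and if aarch64_function_arg_alignment reports 16-byte alignment for
   it, the accumulated stack size is rounded up below so that the
   argument itself starts at a 16-byte aligned NSAA.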
*/ 3958 on_stack: 3959 pcum->aapcs_stack_words = size / UNITS_PER_WORD; 3960 3961 if (aarch64_function_arg_alignment (mode, type, &abi_break) 3962 == 16 * BITS_PER_UNIT) 3963 { 3964 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD); 3965 if (pcum->aapcs_stack_size != new_size) 3966 { 3967 if (abi_break && warn_psabi && currently_expanding_gimple_stmt) 3968 inform (input_location, "parameter passing for argument of type " 3969 "%qT changed in GCC 9.1", type); 3970 pcum->aapcs_stack_size = new_size; 3971 } 3972 } 3973 return; 3974 } 3975 3976 /* Implement TARGET_FUNCTION_ARG. */ 3977 3978 static rtx 3979 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode, 3980 const_tree type, bool named) 3981 { 3982 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); 3983 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64); 3984 3985 if (mode == VOIDmode) 3986 return NULL_RTX; 3987 3988 aarch64_layout_arg (pcum_v, mode, type, named); 3989 return pcum->aapcs_reg; 3990 } 3991 3992 void 3993 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum, 3994 const_tree fntype ATTRIBUTE_UNUSED, 3995 rtx libname ATTRIBUTE_UNUSED, 3996 const_tree fndecl ATTRIBUTE_UNUSED, 3997 unsigned n_named ATTRIBUTE_UNUSED) 3998 { 3999 pcum->aapcs_ncrn = 0; 4000 pcum->aapcs_nvrn = 0; 4001 pcum->aapcs_nextncrn = 0; 4002 pcum->aapcs_nextnvrn = 0; 4003 pcum->pcs_variant = ARM_PCS_AAPCS64; 4004 pcum->aapcs_reg = NULL_RTX; 4005 pcum->aapcs_arg_processed = false; 4006 pcum->aapcs_stack_words = 0; 4007 pcum->aapcs_stack_size = 0; 4008 4009 if (!TARGET_FLOAT 4010 && fndecl && TREE_PUBLIC (fndecl) 4011 && fntype && fntype != error_mark_node) 4012 { 4013 const_tree type = TREE_TYPE (fntype); 4014 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */ 4015 int nregs ATTRIBUTE_UNUSED; /* Likewise. */ 4016 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type, 4017 &mode, &nregs, NULL)) 4018 aarch64_err_no_fpadvsimd (TYPE_MODE (type)); 4019 } 4020 return; 4021 } 4022 4023 static void 4024 aarch64_function_arg_advance (cumulative_args_t pcum_v, 4025 machine_mode mode, 4026 const_tree type, 4027 bool named) 4028 { 4029 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v); 4030 if (pcum->pcs_variant == ARM_PCS_AAPCS64) 4031 { 4032 aarch64_layout_arg (pcum_v, mode, type, named); 4033 gcc_assert ((pcum->aapcs_reg != NULL_RTX) 4034 != (pcum->aapcs_stack_words != 0)); 4035 pcum->aapcs_arg_processed = false; 4036 pcum->aapcs_ncrn = pcum->aapcs_nextncrn; 4037 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn; 4038 pcum->aapcs_stack_size += pcum->aapcs_stack_words; 4039 pcum->aapcs_stack_words = 0; 4040 pcum->aapcs_reg = NULL_RTX; 4041 } 4042 } 4043 4044 bool 4045 aarch64_function_arg_regno_p (unsigned regno) 4046 { 4047 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS) 4048 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)); 4049 } 4050 4051 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least 4052 PARM_BOUNDARY bits of alignment, but will be given anything up 4053 to STACK_BOUNDARY bits if the type requires it. This makes sure 4054 that both before and after the layout of each argument, the Next 4055 Stacked Argument Address (NSAA) will have a minimum alignment of 4056 8 bytes. 
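   For example (illustration only): a char argument is still given the
   full PARM_BOUNDARY (8-byte) slot alignment, and no argument is ever
   reported as needing more than STACK_BOUNDARY (16-byte) alignment,
   however strongly the type itself is aligned.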
*/ 4057 4058 static unsigned int 4059 aarch64_function_arg_boundary (machine_mode mode, const_tree type) 4060 { 4061 bool abi_break; 4062 unsigned int alignment = aarch64_function_arg_alignment (mode, type, 4063 &abi_break); 4064 if (abi_break & warn_psabi) 4065 inform (input_location, "parameter passing for argument of type " 4066 "%qT changed in GCC 9.1", type); 4067 4068 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY); 4069 } 4070 4071 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */ 4072 4073 static fixed_size_mode 4074 aarch64_get_reg_raw_mode (int regno) 4075 { 4076 if (TARGET_SVE && FP_REGNUM_P (regno)) 4077 /* Don't use the SVE part of the register for __builtin_apply and 4078 __builtin_return. The SVE registers aren't used by the normal PCS, 4079 so using them there would be a waste of time. The PCS extensions 4080 for SVE types are fundamentally incompatible with the 4081 __builtin_return/__builtin_apply interface. */ 4082 return as_a <fixed_size_mode> (V16QImode); 4083 return default_get_reg_raw_mode (regno); 4084 } 4085 4086 /* Implement TARGET_FUNCTION_ARG_PADDING. 4087 4088 Small aggregate types are placed in the lowest memory address. 4089 4090 The related parameter passing rules are B.4, C.3, C.5 and C.14. */ 4091 4092 static pad_direction 4093 aarch64_function_arg_padding (machine_mode mode, const_tree type) 4094 { 4095 /* On little-endian targets, the least significant byte of every stack 4096 argument is passed at the lowest byte address of the stack slot. */ 4097 if (!BYTES_BIG_ENDIAN) 4098 return PAD_UPWARD; 4099 4100 /* Otherwise, integral, floating-point and pointer types are padded downward: 4101 the least significant byte of a stack argument is passed at the highest 4102 byte address of the stack slot. */ 4103 if (type 4104 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type) 4105 || POINTER_TYPE_P (type)) 4106 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode))) 4107 return PAD_DOWNWARD; 4108 4109 /* Everything else padded upward, i.e. data in first byte of stack slot. */ 4110 return PAD_UPWARD; 4111 } 4112 4113 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST). 4114 4115 It specifies padding for the last (may also be the only) 4116 element of a block move between registers and memory. If 4117 assuming the block is in the memory, padding upward means that 4118 the last element is padded after its highest significant byte, 4119 while in downward padding, the last element is padded at the 4120 its least significant byte side. 4121 4122 Small aggregates and small complex types are always padded 4123 upwards. 4124 4125 We don't need to worry about homogeneous floating-point or 4126 short-vector aggregates; their move is not affected by the 4127 padding direction determined here. Regardless of endianness, 4128 each element of such an aggregate is put in the least 4129 significant bits of a fp/simd register. 4130 4131 Return !BYTES_BIG_ENDIAN if the least significant byte of the 4132 register has useful data, and return the opposite if the most 4133 significant byte does. */ 4134 4135 bool 4136 aarch64_pad_reg_upward (machine_mode mode, const_tree type, 4137 bool first ATTRIBUTE_UNUSED) 4138 { 4139 4140 /* Small composite types are always padded upward. 
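   For instance (illustrative only), on big-endian a 12-byte structure
   moved between x0/x1 and memory is treated as padded upward by the
   code below, whereas a scalar on the same target falls through to the
   !BYTES_BIG_ENDIAN default and is padded downward.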
*/ 4141 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode)) 4142 { 4143 HOST_WIDE_INT size; 4144 if (type) 4145 size = int_size_in_bytes (type); 4146 else 4147 /* No frontends can create types with variable-sized modes, so we 4148 shouldn't be asked to pass or return them. */ 4149 size = GET_MODE_SIZE (mode).to_constant (); 4150 if (size < 2 * UNITS_PER_WORD) 4151 return true; 4152 } 4153 4154 /* Otherwise, use the default padding. */ 4155 return !BYTES_BIG_ENDIAN; 4156 } 4157 4158 static scalar_int_mode 4159 aarch64_libgcc_cmp_return_mode (void) 4160 { 4161 return SImode; 4162 } 4163 4164 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP) 4165 4166 /* We use the 12-bit shifted immediate arithmetic instructions so values 4167 must be multiple of (1 << 12), i.e. 4096. */ 4168 #define ARITH_FACTOR 4096 4169 4170 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0 4171 #error Cannot use simple address calculation for stack probing 4172 #endif 4173 4174 /* The pair of scratch registers used for stack probing. */ 4175 #define PROBE_STACK_FIRST_REG R9_REGNUM 4176 #define PROBE_STACK_SECOND_REG R10_REGNUM 4177 4178 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE, 4179 inclusive. These are offsets from the current stack pointer. */ 4180 4181 static void 4182 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size) 4183 { 4184 HOST_WIDE_INT size; 4185 if (!poly_size.is_constant (&size)) 4186 { 4187 sorry ("stack probes for SVE frames"); 4188 return; 4189 } 4190 4191 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG); 4192 4193 /* See the same assertion on PROBE_INTERVAL above. */ 4194 gcc_assert ((first % ARITH_FACTOR) == 0); 4195 4196 /* See if we have a constant small number of probes to generate. If so, 4197 that's the easy case. */ 4198 if (size <= PROBE_INTERVAL) 4199 { 4200 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR); 4201 4202 emit_set_insn (reg1, 4203 plus_constant (Pmode, 4204 stack_pointer_rtx, -(first + base))); 4205 emit_stack_probe (plus_constant (Pmode, reg1, base - size)); 4206 } 4207 4208 /* The run-time loop is made up of 8 insns in the generic case while the 4209 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */ 4210 else if (size <= 4 * PROBE_INTERVAL) 4211 { 4212 HOST_WIDE_INT i, rem; 4213 4214 emit_set_insn (reg1, 4215 plus_constant (Pmode, 4216 stack_pointer_rtx, 4217 -(first + PROBE_INTERVAL))); 4218 emit_stack_probe (reg1); 4219 4220 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until 4221 it exceeds SIZE. If only two probes are needed, this will not 4222 generate any code. Then probe at FIRST + SIZE. */ 4223 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL) 4224 { 4225 emit_set_insn (reg1, 4226 plus_constant (Pmode, reg1, -PROBE_INTERVAL)); 4227 emit_stack_probe (reg1); 4228 } 4229 4230 rem = size - (i - PROBE_INTERVAL); 4231 if (rem > 256) 4232 { 4233 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR); 4234 4235 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base)); 4236 emit_stack_probe (plus_constant (Pmode, reg1, base - rem)); 4237 } 4238 else 4239 emit_stack_probe (plus_constant (Pmode, reg1, -rem)); 4240 } 4241 4242 /* Otherwise, do the same as above, but in a loop. 
Note that we must be 4243 extra careful with variables wrapping around because we might be at 4244 the very top (or the very bottom) of the address space and we have 4245 to be able to handle this case properly; in particular, we use an 4246 equality test for the loop condition. */ 4247 else 4248 { 4249 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG); 4250 4251 /* Step 1: round SIZE to the previous multiple of the interval. */ 4252 4253 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL; 4254 4255 4256 /* Step 2: compute initial and final value of the loop counter. */ 4257 4258 /* TEST_ADDR = SP + FIRST. */ 4259 emit_set_insn (reg1, 4260 plus_constant (Pmode, stack_pointer_rtx, -first)); 4261 4262 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */ 4263 HOST_WIDE_INT adjustment = - (first + rounded_size); 4264 if (! aarch64_uimm12_shift (adjustment)) 4265 { 4266 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment), 4267 true, Pmode); 4268 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2)); 4269 } 4270 else 4271 emit_set_insn (reg2, 4272 plus_constant (Pmode, stack_pointer_rtx, adjustment)); 4273 4274 /* Step 3: the loop 4275 4276 do 4277 { 4278 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL 4279 probe at TEST_ADDR 4280 } 4281 while (TEST_ADDR != LAST_ADDR) 4282 4283 probes at FIRST + N * PROBE_INTERVAL for values of N from 1 4284 until it is equal to ROUNDED_SIZE. */ 4285 4286 emit_insn (gen_probe_stack_range (reg1, reg1, reg2)); 4287 4288 4289 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time 4290 that SIZE is equal to ROUNDED_SIZE. */ 4291 4292 if (size != rounded_size) 4293 { 4294 HOST_WIDE_INT rem = size - rounded_size; 4295 4296 if (rem > 256) 4297 { 4298 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR); 4299 4300 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base)); 4301 emit_stack_probe (plus_constant (Pmode, reg2, base - rem)); 4302 } 4303 else 4304 emit_stack_probe (plus_constant (Pmode, reg2, -rem)); 4305 } 4306 } 4307 4308 /* Make sure nothing is scheduled before we are done. */ 4309 emit_insn (gen_blockage ()); 4310 } 4311 4312 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are 4313 absolute addresses. */ 4314 4315 const char * 4316 aarch64_output_probe_stack_range (rtx reg1, rtx reg2) 4317 { 4318 static int labelno = 0; 4319 char loop_lab[32]; 4320 rtx xops[2]; 4321 4322 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++); 4323 4324 /* Loop. */ 4325 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab); 4326 4327 HOST_WIDE_INT stack_clash_probe_interval 4328 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); 4329 4330 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */ 4331 xops[0] = reg1; 4332 HOST_WIDE_INT interval; 4333 if (flag_stack_clash_protection) 4334 interval = stack_clash_probe_interval; 4335 else 4336 interval = PROBE_INTERVAL; 4337 4338 gcc_assert (aarch64_uimm12_shift (interval)); 4339 xops[1] = GEN_INT (interval); 4340 4341 output_asm_insn ("sub\t%0, %0, %1", xops); 4342 4343 /* If doing stack clash protection then we probe up by the ABI specified 4344 amount. We do this because we're dropping full pages at a time in the 4345 loop. But if we're doing non-stack clash probing, probe at SP 0. */ 4346 if (flag_stack_clash_protection) 4347 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD); 4348 else 4349 xops[1] = CONST0_RTX (GET_MODE (xops[1])); 4350 4351 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe 4352 by this amount for each iteration. 
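   Purely for illustration, assuming a PROBE_INTERVAL of 4096 and no
   stack-clash protection, the complete loop emitted by this function
   looks roughly like:

     .LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0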
*/ 4353 output_asm_insn ("str\txzr, [%0, %1]", xops); 4354 4355 /* Test if TEST_ADDR == LAST_ADDR. */ 4356 xops[1] = reg2; 4357 output_asm_insn ("cmp\t%0, %1", xops); 4358 4359 /* Branch. */ 4360 fputs ("\tb.ne\t", asm_out_file); 4361 assemble_name_raw (asm_out_file, loop_lab); 4362 fputc ('\n', asm_out_file); 4363 4364 return ""; 4365 } 4366 4367 /* Emit the probe loop for doing stack clash probes and stack adjustments for 4368 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size 4369 of GUARD_SIZE. When a probe is emitted it is done at most 4370 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of 4371 at most MIN_PROBE_THRESHOLD. By the end of this function 4372 BASE = BASE - ADJUSTMENT. */ 4373 4374 const char * 4375 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment, 4376 rtx min_probe_threshold, rtx guard_size) 4377 { 4378 /* This function is not allowed to use any instruction generation function 4379 like gen_ and friends. If you do you'll likely ICE during CFG validation, 4380 so instead emit the code you want using output_asm_insn. */ 4381 gcc_assert (flag_stack_clash_protection); 4382 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size)); 4383 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold)); 4384 4385 /* The minimum required allocation before the residual requires probing. */ 4386 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold); 4387 4388 /* Clamp the value down to the nearest value that can be used with a cmp. */ 4389 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard); 4390 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode); 4391 4392 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard); 4393 gcc_assert (aarch64_uimm12_shift (residual_probe_guard)); 4394 4395 static int labelno = 0; 4396 char loop_start_lab[32]; 4397 char loop_end_lab[32]; 4398 rtx xops[2]; 4399 4400 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno); 4401 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++); 4402 4403 /* Emit loop start label. */ 4404 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab); 4405 4406 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */ 4407 xops[0] = adjustment; 4408 xops[1] = probe_offset_value_rtx; 4409 output_asm_insn ("cmp\t%0, %1", xops); 4410 4411 /* Branch to end if not enough adjustment to probe. */ 4412 fputs ("\tb.lt\t", asm_out_file); 4413 assemble_name_raw (asm_out_file, loop_end_lab); 4414 fputc ('\n', asm_out_file); 4415 4416 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */ 4417 xops[0] = base; 4418 xops[1] = probe_offset_value_rtx; 4419 output_asm_insn ("sub\t%0, %0, %1", xops); 4420 4421 /* Probe at BASE. */ 4422 xops[1] = const0_rtx; 4423 output_asm_insn ("str\txzr, [%0, %1]", xops); 4424 4425 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */ 4426 xops[0] = adjustment; 4427 xops[1] = probe_offset_value_rtx; 4428 output_asm_insn ("sub\t%0, %0, %1", xops); 4429 4430 /* Branch to start if still more bytes to allocate. */ 4431 fputs ("\tb\t", asm_out_file); 4432 assemble_name_raw (asm_out_file, loop_start_lab); 4433 fputc ('\n', asm_out_file); 4434 4435 /* No probe leave. */ 4436 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab); 4437 4438 /* BASE = BASE - ADJUSTMENT. */ 4439 xops[0] = base; 4440 xops[1] = adjustment; 4441 output_asm_insn ("sub\t%0, %0, %1", xops); 4442 return ""; 4443 } 4444 4445 /* Determine whether a frame chain needs to be generated. 
*/ 4446 static bool 4447 aarch64_needs_frame_chain (void) 4448 { 4449 /* Force a frame chain for EH returns so the return address is at FP+8. */ 4450 if (frame_pointer_needed || crtl->calls_eh_return) 4451 return true; 4452 4453 /* A leaf function cannot have calls or write LR. */ 4454 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM); 4455 4456 /* Don't use a frame chain in leaf functions if leaf frame pointers 4457 are disabled. */ 4458 if (flag_omit_leaf_frame_pointer && is_leaf) 4459 return false; 4460 4461 return aarch64_use_frame_pointer; 4462 } 4463 4464 /* Mark the registers that need to be saved by the callee and calculate 4465 the size of the callee-saved registers area and frame record (both FP 4466 and LR may be omitted). */ 4467 static void 4468 aarch64_layout_frame (void) 4469 { 4470 HOST_WIDE_INT offset = 0; 4471 int regno, last_fp_reg = INVALID_REGNUM; 4472 bool simd_function = aarch64_simd_decl_p (cfun->decl); 4473 4474 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain (); 4475 4476 /* Adjust the outgoing arguments size if required. Keep it in sync with what 4477 the mid-end is doing. */ 4478 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun); 4479 4480 #define SLOT_NOT_REQUIRED (-2) 4481 #define SLOT_REQUIRED (-1) 4482 4483 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM; 4484 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM; 4485 4486 /* If this is a non-leaf simd function with calls we assume that 4487 at least one of those calls is to a non-simd function and thus 4488 we must save V8 to V23 in the prologue. */ 4489 4490 if (simd_function && !crtl->is_leaf) 4491 { 4492 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) 4493 if (FP_SIMD_SAVED_REGNUM_P (regno)) 4494 df_set_regs_ever_live (regno, true); 4495 } 4496 4497 /* First mark all the registers that really need to be saved... */ 4498 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) 4499 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED; 4500 4501 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) 4502 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED; 4503 4504 /* ... that includes the eh data registers (if needed)... */ 4505 if (crtl->calls_eh_return) 4506 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++) 4507 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] 4508 = SLOT_REQUIRED; 4509 4510 /* ... and any callee saved register that dataflow says is live. */ 4511 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) 4512 if (df_regs_ever_live_p (regno) 4513 && (regno == R30_REGNUM 4514 || !call_used_regs[regno])) 4515 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED; 4516 4517 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) 4518 if (df_regs_ever_live_p (regno) 4519 && (!call_used_regs[regno] 4520 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno)))) 4521 { 4522 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED; 4523 last_fp_reg = regno; 4524 } 4525 4526 if (cfun->machine->frame.emit_frame_chain) 4527 { 4528 /* FP and LR are placed in the linkage record. */ 4529 cfun->machine->frame.reg_offset[R29_REGNUM] = 0; 4530 cfun->machine->frame.wb_candidate1 = R29_REGNUM; 4531 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD; 4532 cfun->machine->frame.wb_candidate2 = R30_REGNUM; 4533 offset = 2 * UNITS_PER_WORD; 4534 } 4535 4536 /* With stack-clash, LR must be saved in non-leaf functions. 
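   Briefly, the reason is that the stack-clash code in
   aarch64_allocate_and_probe_stack_space below treats the save of FP/LR
   as an implicit probe and folds LR's save offset into its probing
   threshold, so the LR slot must really be allocated.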
*/ 4537 gcc_assert (crtl->is_leaf 4538 || (cfun->machine->frame.reg_offset[R30_REGNUM] 4539 != SLOT_NOT_REQUIRED)); 4540 4541 /* Now assign stack slots for them. */ 4542 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++) 4543 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) 4544 { 4545 cfun->machine->frame.reg_offset[regno] = offset; 4546 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) 4547 cfun->machine->frame.wb_candidate1 = regno; 4548 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM) 4549 cfun->machine->frame.wb_candidate2 = regno; 4550 offset += UNITS_PER_WORD; 4551 } 4552 4553 HOST_WIDE_INT max_int_offset = offset; 4554 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); 4555 bool has_align_gap = offset != max_int_offset; 4556 4557 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++) 4558 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED) 4559 { 4560 /* If there is an alignment gap between integer and fp callee-saves, 4561 allocate the last fp register to it if possible. */ 4562 if (regno == last_fp_reg 4563 && has_align_gap 4564 && !simd_function 4565 && (offset & 8) == 0) 4566 { 4567 cfun->machine->frame.reg_offset[regno] = max_int_offset; 4568 break; 4569 } 4570 4571 cfun->machine->frame.reg_offset[regno] = offset; 4572 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM) 4573 cfun->machine->frame.wb_candidate1 = regno; 4574 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM 4575 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM) 4576 cfun->machine->frame.wb_candidate2 = regno; 4577 offset += simd_function ? UNITS_PER_VREG : UNITS_PER_WORD; 4578 } 4579 4580 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT); 4581 4582 cfun->machine->frame.saved_regs_size = offset; 4583 4584 HOST_WIDE_INT varargs_and_saved_regs_size 4585 = offset + cfun->machine->frame.saved_varargs_size; 4586 4587 cfun->machine->frame.hard_fp_offset 4588 = aligned_upper_bound (varargs_and_saved_regs_size 4589 + get_frame_size (), 4590 STACK_BOUNDARY / BITS_PER_UNIT); 4591 4592 /* Both these values are already aligned. */ 4593 gcc_assert (multiple_p (crtl->outgoing_args_size, 4594 STACK_BOUNDARY / BITS_PER_UNIT)); 4595 cfun->machine->frame.frame_size 4596 = (cfun->machine->frame.hard_fp_offset 4597 + crtl->outgoing_args_size); 4598 4599 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size; 4600 4601 cfun->machine->frame.initial_adjust = 0; 4602 cfun->machine->frame.final_adjust = 0; 4603 cfun->machine->frame.callee_adjust = 0; 4604 cfun->machine->frame.callee_offset = 0; 4605 4606 HOST_WIDE_INT max_push_offset = 0; 4607 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM) 4608 max_push_offset = 512; 4609 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM) 4610 max_push_offset = 256; 4611 4612 HOST_WIDE_INT const_size, const_fp_offset; 4613 if (cfun->machine->frame.frame_size.is_constant (&const_size) 4614 && const_size < max_push_offset 4615 && known_eq (crtl->outgoing_args_size, 0)) 4616 { 4617 /* Simple, small frame with no outgoing arguments: 4618 stp reg1, reg2, [sp, -frame_size]! 
4619 stp reg3, reg4, [sp, 16] */ 4620 cfun->machine->frame.callee_adjust = const_size; 4621 } 4622 else if (known_lt (crtl->outgoing_args_size 4623 + cfun->machine->frame.saved_regs_size, 512) 4624 && !(cfun->calls_alloca 4625 && known_lt (cfun->machine->frame.hard_fp_offset, 4626 max_push_offset))) 4627 { 4628 /* Frame with small outgoing arguments: 4629 sub sp, sp, frame_size 4630 stp reg1, reg2, [sp, outgoing_args_size] 4631 stp reg3, reg4, [sp, outgoing_args_size + 16] */ 4632 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size; 4633 cfun->machine->frame.callee_offset 4634 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset; 4635 } 4636 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset) 4637 && const_fp_offset < max_push_offset) 4638 { 4639 /* Frame with large outgoing arguments but a small local area: 4640 stp reg1, reg2, [sp, -hard_fp_offset]! 4641 stp reg3, reg4, [sp, 16] 4642 sub sp, sp, outgoing_args_size */ 4643 cfun->machine->frame.callee_adjust = const_fp_offset; 4644 cfun->machine->frame.final_adjust 4645 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust; 4646 } 4647 else 4648 { 4649 /* Frame with large local area and outgoing arguments using frame pointer: 4650 sub sp, sp, hard_fp_offset 4651 stp x29, x30, [sp, 0] 4652 add x29, sp, 0 4653 stp reg3, reg4, [sp, 16] 4654 sub sp, sp, outgoing_args_size */ 4655 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset; 4656 cfun->machine->frame.final_adjust 4657 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust; 4658 } 4659 4660 cfun->machine->frame.laid_out = true; 4661 } 4662 4663 /* Return true if the register REGNO is saved on entry to 4664 the current function. */ 4665 4666 static bool 4667 aarch64_register_saved_on_entry (int regno) 4668 { 4669 return cfun->machine->frame.reg_offset[regno] >= 0; 4670 } 4671 4672 /* Return the next register up from REGNO up to LIMIT for the callee 4673 to save. */ 4674 4675 static unsigned 4676 aarch64_next_callee_save (unsigned regno, unsigned limit) 4677 { 4678 while (regno <= limit && !aarch64_register_saved_on_entry (regno)) 4679 regno ++; 4680 return regno; 4681 } 4682 4683 /* Push the register number REGNO of mode MODE to the stack with write-back 4684 adjusting the stack by ADJUSTMENT. */ 4685 4686 static void 4687 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno, 4688 HOST_WIDE_INT adjustment) 4689 { 4690 rtx base_rtx = stack_pointer_rtx; 4691 rtx insn, reg, mem; 4692 4693 reg = gen_rtx_REG (mode, regno); 4694 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx, 4695 plus_constant (Pmode, base_rtx, -adjustment)); 4696 mem = gen_frame_mem (mode, mem); 4697 4698 insn = emit_move_insn (mem, reg); 4699 RTX_FRAME_RELATED_P (insn) = 1; 4700 } 4701 4702 /* Generate and return an instruction to store the pair of registers 4703 REG and REG2 of mode MODE to location BASE with write-back adjusting 4704 the stack location BASE by ADJUSTMENT. 
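   For DImode and an adjustment of, say, 32 bytes the resulting insn
   corresponds to something like

     stp	x29, x30, [sp, -32]!

   (registers and adjustment chosen purely for illustration).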
*/ 4705 4706 static rtx 4707 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, 4708 HOST_WIDE_INT adjustment) 4709 { 4710 switch (mode) 4711 { 4712 case E_DImode: 4713 return gen_storewb_pairdi_di (base, base, reg, reg2, 4714 GEN_INT (-adjustment), 4715 GEN_INT (UNITS_PER_WORD - adjustment)); 4716 case E_DFmode: 4717 return gen_storewb_pairdf_di (base, base, reg, reg2, 4718 GEN_INT (-adjustment), 4719 GEN_INT (UNITS_PER_WORD - adjustment)); 4720 case E_TFmode: 4721 return gen_storewb_pairtf_di (base, base, reg, reg2, 4722 GEN_INT (-adjustment), 4723 GEN_INT (UNITS_PER_VREG - adjustment)); 4724 default: 4725 gcc_unreachable (); 4726 } 4727 } 4728 4729 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the 4730 stack pointer by ADJUSTMENT. */ 4731 4732 static void 4733 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment) 4734 { 4735 rtx_insn *insn; 4736 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); 4737 4738 if (regno2 == INVALID_REGNUM) 4739 return aarch64_pushwb_single_reg (mode, regno1, adjustment); 4740 4741 rtx reg1 = gen_rtx_REG (mode, regno1); 4742 rtx reg2 = gen_rtx_REG (mode, regno2); 4743 4744 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1, 4745 reg2, adjustment)); 4746 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1; 4747 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1; 4748 RTX_FRAME_RELATED_P (insn) = 1; 4749 } 4750 4751 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE, 4752 adjusting it by ADJUSTMENT afterwards. */ 4753 4754 static rtx 4755 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2, 4756 HOST_WIDE_INT adjustment) 4757 { 4758 switch (mode) 4759 { 4760 case E_DImode: 4761 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment), 4762 GEN_INT (UNITS_PER_WORD)); 4763 case E_DFmode: 4764 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment), 4765 GEN_INT (UNITS_PER_WORD)); 4766 case E_TFmode: 4767 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment), 4768 GEN_INT (UNITS_PER_VREG)); 4769 default: 4770 gcc_unreachable (); 4771 } 4772 } 4773 4774 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it 4775 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes 4776 into CFI_OPS. */ 4777 4778 static void 4779 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment, 4780 rtx *cfi_ops) 4781 { 4782 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno1); 4783 rtx reg1 = gen_rtx_REG (mode, regno1); 4784 4785 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops); 4786 4787 if (regno2 == INVALID_REGNUM) 4788 { 4789 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment); 4790 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem); 4791 emit_move_insn (reg1, gen_frame_mem (mode, mem)); 4792 } 4793 else 4794 { 4795 rtx reg2 = gen_rtx_REG (mode, regno2); 4796 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops); 4797 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1, 4798 reg2, adjustment)); 4799 } 4800 } 4801 4802 /* Generate and return a store pair instruction of mode MODE to store 4803 register REG1 to MEM1 and register REG2 to MEM2. 
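   E.g. (illustration only) for DImode with MEM1 at [sp, 16] this
   corresponds to an insn of the form

     stp	x19, x20, [sp, 16]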
*/ 4804
4805 static rtx
4806 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4807 rtx reg2)
4808 {
4809 switch (mode)
4810 {
4811 case E_DImode:
4812 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4813
4814 case E_DFmode:
4815 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4816
4817 case E_TFmode:
4818 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
4819
4820 default:
4821 gcc_unreachable ();
4822 }
4823 }
4824
4825 /* Generate and return a load pair instruction of mode MODE to load register
4826 REG1 from MEM1 and register REG2 from MEM2. */
4827
4828 static rtx
4829 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4830 rtx mem2)
4831 {
4832 switch (mode)
4833 {
4834 case E_DImode:
4835 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4836
4837 case E_DFmode:
4838 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4839
4840 case E_TFmode:
4841 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
4842
4843 default:
4844 gcc_unreachable ();
4845 }
4846 }
4847
4848 /* Return TRUE if return address signing should be enabled for the current
4849 function, otherwise return FALSE. */
4850
4851 bool
4852 aarch64_return_address_signing_enabled (void)
4853 {
4854 /* This function should only be called after the frame is laid out. */
4855 gcc_assert (cfun->machine->frame.laid_out);
4856
4857 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4858 if its LR is pushed onto the stack. */
4859 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4860 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4861 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4862 }
4863
4864 /* Return TRUE if the Branch Target Identification Mechanism is enabled. */
4865 bool
4866 aarch64_bti_enabled (void)
4867 {
4868 return (aarch64_enable_bti == 1);
4869 }
4870
4871 /* Emit code to save the callee-saved registers from register number START
4872 to LIMIT to the stack at the location starting at offset START_OFFSET,
4873 skipping any write-back candidates if SKIP_WB is true.
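   Where two saved registers of the same class occupy adjacent slots, the
   loop below merges their saves into a single store pair; as a
   hypothetical example, x19 at offset 16 and x20 at offset 24 become one

     stp	x19, x20, [sp, 16]

   rather than two separate str instructions.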
*/ 4874 4875 static void 4876 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset, 4877 unsigned start, unsigned limit, bool skip_wb) 4878 { 4879 rtx_insn *insn; 4880 unsigned regno; 4881 unsigned regno2; 4882 4883 for (regno = aarch64_next_callee_save (start, limit); 4884 regno <= limit; 4885 regno = aarch64_next_callee_save (regno + 1, limit)) 4886 { 4887 rtx reg, mem; 4888 poly_int64 offset; 4889 int offset_diff; 4890 4891 if (skip_wb 4892 && (regno == cfun->machine->frame.wb_candidate1 4893 || regno == cfun->machine->frame.wb_candidate2)) 4894 continue; 4895 4896 if (cfun->machine->reg_is_wrapped_separately[regno]) 4897 continue; 4898 4899 reg = gen_rtx_REG (mode, regno); 4900 offset = start_offset + cfun->machine->frame.reg_offset[regno]; 4901 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx, 4902 offset)); 4903 4904 regno2 = aarch64_next_callee_save (regno + 1, limit); 4905 offset_diff = cfun->machine->frame.reg_offset[regno2] 4906 - cfun->machine->frame.reg_offset[regno]; 4907 4908 if (regno2 <= limit 4909 && !cfun->machine->reg_is_wrapped_separately[regno2] 4910 && known_eq (GET_MODE_SIZE (mode), offset_diff)) 4911 { 4912 rtx reg2 = gen_rtx_REG (mode, regno2); 4913 rtx mem2; 4914 4915 offset = start_offset + cfun->machine->frame.reg_offset[regno2]; 4916 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx, 4917 offset)); 4918 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, 4919 reg2)); 4920 4921 /* The first part of a frame-related parallel insn is 4922 always assumed to be relevant to the frame 4923 calculations; subsequent parts, are only 4924 frame-related if explicitly marked. */ 4925 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1; 4926 regno = regno2; 4927 } 4928 else 4929 insn = emit_move_insn (mem, reg); 4930 4931 RTX_FRAME_RELATED_P (insn) = 1; 4932 } 4933 } 4934 4935 /* Emit code to restore the callee registers of mode MODE from register 4936 number START up to and including LIMIT. Restore from the stack offset 4937 START_OFFSET, skipping any write-back candidates if SKIP_WB is true. 4938 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. 
*/ 4939 4940 static void 4941 aarch64_restore_callee_saves (machine_mode mode, 4942 poly_int64 start_offset, unsigned start, 4943 unsigned limit, bool skip_wb, rtx *cfi_ops) 4944 { 4945 rtx base_rtx = stack_pointer_rtx; 4946 unsigned regno; 4947 unsigned regno2; 4948 poly_int64 offset; 4949 4950 for (regno = aarch64_next_callee_save (start, limit); 4951 regno <= limit; 4952 regno = aarch64_next_callee_save (regno + 1, limit)) 4953 { 4954 if (cfun->machine->reg_is_wrapped_separately[regno]) 4955 continue; 4956 4957 rtx reg, mem; 4958 int offset_diff; 4959 4960 if (skip_wb 4961 && (regno == cfun->machine->frame.wb_candidate1 4962 || regno == cfun->machine->frame.wb_candidate2)) 4963 continue; 4964 4965 reg = gen_rtx_REG (mode, regno); 4966 offset = start_offset + cfun->machine->frame.reg_offset[regno]; 4967 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); 4968 4969 regno2 = aarch64_next_callee_save (regno + 1, limit); 4970 offset_diff = cfun->machine->frame.reg_offset[regno2] 4971 - cfun->machine->frame.reg_offset[regno]; 4972 4973 if (regno2 <= limit 4974 && !cfun->machine->reg_is_wrapped_separately[regno2] 4975 && known_eq (GET_MODE_SIZE (mode), offset_diff)) 4976 { 4977 rtx reg2 = gen_rtx_REG (mode, regno2); 4978 rtx mem2; 4979 4980 offset = start_offset + cfun->machine->frame.reg_offset[regno2]; 4981 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset)); 4982 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2)); 4983 4984 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops); 4985 regno = regno2; 4986 } 4987 else 4988 emit_move_insn (reg, mem); 4989 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops); 4990 } 4991 } 4992 4993 /* Return true if OFFSET is a signed 4-bit value multiplied by the size 4994 of MODE. */ 4995 4996 static inline bool 4997 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset) 4998 { 4999 HOST_WIDE_INT multiple; 5000 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 5001 && IN_RANGE (multiple, -8, 7)); 5002 } 5003 5004 /* Return true if OFFSET is a unsigned 6-bit value multiplied by the size 5005 of MODE. */ 5006 5007 static inline bool 5008 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset) 5009 { 5010 HOST_WIDE_INT multiple; 5011 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 5012 && IN_RANGE (multiple, 0, 63)); 5013 } 5014 5015 /* Return true if OFFSET is a signed 7-bit value multiplied by the size 5016 of MODE. */ 5017 5018 bool 5019 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset) 5020 { 5021 HOST_WIDE_INT multiple; 5022 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 5023 && IN_RANGE (multiple, -64, 63)); 5024 } 5025 5026 /* Return true if OFFSET is a signed 9-bit value. */ 5027 5028 bool 5029 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED, 5030 poly_int64 offset) 5031 { 5032 HOST_WIDE_INT const_offset; 5033 return (offset.is_constant (&const_offset) 5034 && IN_RANGE (const_offset, -256, 255)); 5035 } 5036 5037 /* Return true if OFFSET is a signed 9-bit value multiplied by the size 5038 of MODE. */ 5039 5040 static inline bool 5041 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset) 5042 { 5043 HOST_WIDE_INT multiple; 5044 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 5045 && IN_RANGE (multiple, -256, 255)); 5046 } 5047 5048 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size 5049 of MODE. 
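   For DImode, for instance, this accepts the byte offsets 0, 8, 16, ...,
   32760 that a scaled unsigned 12-bit LDR/STR immediate can encode.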
*/ 5050 5051 static inline bool 5052 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset) 5053 { 5054 HOST_WIDE_INT multiple; 5055 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple) 5056 && IN_RANGE (multiple, 0, 4095)); 5057 } 5058 5059 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */ 5060 5061 static sbitmap 5062 aarch64_get_separate_components (void) 5063 { 5064 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); 5065 bitmap_clear (components); 5066 5067 /* The registers we need saved to the frame. */ 5068 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) 5069 if (aarch64_register_saved_on_entry (regno)) 5070 { 5071 poly_int64 offset = cfun->machine->frame.reg_offset[regno]; 5072 if (!frame_pointer_needed) 5073 offset += cfun->machine->frame.frame_size 5074 - cfun->machine->frame.hard_fp_offset; 5075 /* Check that we can access the stack slot of the register with one 5076 direct load with no adjustments needed. */ 5077 if (offset_12bit_unsigned_scaled_p (DImode, offset)) 5078 bitmap_set_bit (components, regno); 5079 } 5080 5081 /* Don't mess with the hard frame pointer. */ 5082 if (frame_pointer_needed) 5083 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM); 5084 5085 unsigned reg1 = cfun->machine->frame.wb_candidate1; 5086 unsigned reg2 = cfun->machine->frame.wb_candidate2; 5087 /* If registers have been chosen to be stored/restored with 5088 writeback don't interfere with them to avoid having to output explicit 5089 stack adjustment instructions. */ 5090 if (reg2 != INVALID_REGNUM) 5091 bitmap_clear_bit (components, reg2); 5092 if (reg1 != INVALID_REGNUM) 5093 bitmap_clear_bit (components, reg1); 5094 5095 bitmap_clear_bit (components, LR_REGNUM); 5096 bitmap_clear_bit (components, SP_REGNUM); 5097 5098 return components; 5099 } 5100 5101 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */ 5102 5103 static sbitmap 5104 aarch64_components_for_bb (basic_block bb) 5105 { 5106 bitmap in = DF_LIVE_IN (bb); 5107 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen; 5108 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill; 5109 bool simd_function = aarch64_simd_decl_p (cfun->decl); 5110 5111 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1); 5112 bitmap_clear (components); 5113 5114 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */ 5115 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) 5116 if ((!call_used_regs[regno] 5117 || (simd_function && FP_SIMD_SAVED_REGNUM_P (regno))) 5118 && (bitmap_bit_p (in, regno) 5119 || bitmap_bit_p (gen, regno) 5120 || bitmap_bit_p (kill, regno))) 5121 { 5122 unsigned regno2, offset, offset2; 5123 bitmap_set_bit (components, regno); 5124 5125 /* If there is a callee-save at an adjacent offset, add it too 5126 to increase the use of LDP/STP. */ 5127 offset = cfun->machine->frame.reg_offset[regno]; 5128 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1; 5129 5130 if (regno2 <= LAST_SAVED_REGNUM) 5131 { 5132 offset2 = cfun->machine->frame.reg_offset[regno2]; 5133 if ((offset & ~8) == (offset2 & ~8)) 5134 bitmap_set_bit (components, regno2); 5135 } 5136 } 5137 5138 return components; 5139 } 5140 5141 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS. 5142 Nothing to do for aarch64. */ 5143 5144 static void 5145 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool) 5146 { 5147 } 5148 5149 /* Return the next set bit in BMP from START onwards. Return the total number 5150 of bits in BMP if no set bit is found at or after START. 
*/ 5151 5152 static unsigned int 5153 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start) 5154 { 5155 unsigned int nbits = SBITMAP_SIZE (bmp); 5156 if (start == nbits) 5157 return start; 5158 5159 gcc_assert (start < nbits); 5160 for (unsigned int i = start; i < nbits; i++) 5161 if (bitmap_bit_p (bmp, i)) 5162 return i; 5163 5164 return nbits; 5165 } 5166 5167 /* Do the work for aarch64_emit_prologue_components and 5168 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers 5169 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence 5170 for these components or the epilogue sequence. That is, it determines 5171 whether we should emit stores or loads and what kind of CFA notes to attach 5172 to the insns. Otherwise the logic for the two sequences is very 5173 similar. */ 5174 5175 static void 5176 aarch64_process_components (sbitmap components, bool prologue_p) 5177 { 5178 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed 5179 ? HARD_FRAME_POINTER_REGNUM 5180 : STACK_POINTER_REGNUM); 5181 5182 unsigned last_regno = SBITMAP_SIZE (components); 5183 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM); 5184 rtx_insn *insn = NULL; 5185 5186 while (regno != last_regno) 5187 { 5188 /* AAPCS64 section 5.1.2 requires only the low 64 bits to be saved 5189 so DFmode for the vector registers is enough. For simd functions 5190 we want to save the low 128 bits. */ 5191 machine_mode mode = aarch64_reg_save_mode (cfun->decl, regno); 5192 5193 rtx reg = gen_rtx_REG (mode, regno); 5194 poly_int64 offset = cfun->machine->frame.reg_offset[regno]; 5195 if (!frame_pointer_needed) 5196 offset += cfun->machine->frame.frame_size 5197 - cfun->machine->frame.hard_fp_offset; 5198 rtx addr = plus_constant (Pmode, ptr_reg, offset); 5199 rtx mem = gen_frame_mem (mode, addr); 5200 5201 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem); 5202 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1); 5203 /* No more registers to handle after REGNO. 5204 Emit a single save/restore and exit. */ 5205 if (regno2 == last_regno) 5206 { 5207 insn = emit_insn (set); 5208 RTX_FRAME_RELATED_P (insn) = 1; 5209 if (prologue_p) 5210 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); 5211 else 5212 add_reg_note (insn, REG_CFA_RESTORE, reg); 5213 break; 5214 } 5215 5216 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2]; 5217 /* The next register is not of the same class or its offset is not 5218 mergeable with the current one into a pair. */ 5219 if (!satisfies_constraint_Ump (mem) 5220 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2) 5221 || (aarch64_simd_decl_p (cfun->decl) && FP_REGNUM_P (regno)) 5222 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]), 5223 GET_MODE_SIZE (mode))) 5224 { 5225 insn = emit_insn (set); 5226 RTX_FRAME_RELATED_P (insn) = 1; 5227 if (prologue_p) 5228 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set)); 5229 else 5230 add_reg_note (insn, REG_CFA_RESTORE, reg); 5231 5232 regno = regno2; 5233 continue; 5234 } 5235 5236 /* REGNO2 can be saved/restored in a pair with REGNO. */ 5237 rtx reg2 = gen_rtx_REG (mode, regno2); 5238 if (!frame_pointer_needed) 5239 offset2 += cfun->machine->frame.frame_size 5240 - cfun->machine->frame.hard_fp_offset; 5241 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2); 5242 rtx mem2 = gen_frame_mem (mode, addr2); 5243 rtx set2 = prologue_p ? 
gen_rtx_SET (mem2, reg2) 5244 : gen_rtx_SET (reg2, mem2); 5245 5246 if (prologue_p) 5247 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2)); 5248 else 5249 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2)); 5250 5251 RTX_FRAME_RELATED_P (insn) = 1; 5252 if (prologue_p) 5253 { 5254 add_reg_note (insn, REG_CFA_OFFSET, set); 5255 add_reg_note (insn, REG_CFA_OFFSET, set2); 5256 } 5257 else 5258 { 5259 add_reg_note (insn, REG_CFA_RESTORE, reg); 5260 add_reg_note (insn, REG_CFA_RESTORE, reg2); 5261 } 5262 5263 regno = aarch64_get_next_set_bit (components, regno2 + 1); 5264 } 5265 } 5266 5267 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */ 5268 5269 static void 5270 aarch64_emit_prologue_components (sbitmap components) 5271 { 5272 aarch64_process_components (components, true); 5273 } 5274 5275 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */ 5276 5277 static void 5278 aarch64_emit_epilogue_components (sbitmap components) 5279 { 5280 aarch64_process_components (components, false); 5281 } 5282 5283 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */ 5284 5285 static void 5286 aarch64_set_handled_components (sbitmap components) 5287 { 5288 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++) 5289 if (bitmap_bit_p (components, regno)) 5290 cfun->machine->reg_is_wrapped_separately[regno] = true; 5291 } 5292 5293 /* On AArch64 we have an ABI defined safe buffer. This constant is used to 5294 determining the probe offset for alloca. */ 5295 5296 static HOST_WIDE_INT 5297 aarch64_stack_clash_protection_alloca_probe_range (void) 5298 { 5299 return STACK_CLASH_CALLER_GUARD; 5300 } 5301 5302 5303 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch 5304 registers. If POLY_SIZE is not large enough to require a probe this function 5305 will only adjust the stack. When allocating the stack space 5306 FRAME_RELATED_P is then used to indicate if the allocation is frame related. 5307 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing 5308 arguments. If we are then we ensure that any allocation larger than the ABI 5309 defined buffer needs a probe so that the invariant of having a 1KB buffer is 5310 maintained. 5311 5312 We emit barriers after each stack adjustment to prevent optimizations from 5313 breaking the invariant that we never drop the stack more than a page. This 5314 invariant is needed to make it easier to correctly handle asynchronous 5315 events, e.g. if we were to allow the stack to be dropped by more than a page 5316 and then have multiple probes up and we take a signal somewhere in between 5317 then the signal handler doesn't know the state of the stack and can make no 5318 assumptions about which pages have been probed. */ 5319 5320 static void 5321 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2, 5322 poly_int64 poly_size, 5323 bool frame_related_p, 5324 bool final_adjustment_p) 5325 { 5326 HOST_WIDE_INT guard_size 5327 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); 5328 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; 5329 /* When doing the final adjustment for the outgoing argument size we can't 5330 assume that LR was saved at position 0. So subtract it's offset from the 5331 ABI safe buffer so that we don't accidentally allow an adjustment that 5332 would result in an allocation larger than the ABI buffer without 5333 probing. */ 5334 HOST_WIDE_INT min_probe_threshold 5335 = final_adjustment_p 5336 ? 
guard_used_by_caller - cfun->machine->frame.reg_offset[LR_REGNUM] 5337 : guard_size - guard_used_by_caller; 5338 5339 poly_int64 frame_size = cfun->machine->frame.frame_size; 5340 5341 /* We should always have a positive probe threshold. */ 5342 gcc_assert (min_probe_threshold > 0); 5343 5344 if (flag_stack_clash_protection && !final_adjustment_p) 5345 { 5346 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; 5347 poly_int64 final_adjust = cfun->machine->frame.final_adjust; 5348 5349 if (known_eq (frame_size, 0)) 5350 { 5351 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false); 5352 } 5353 else if (known_lt (initial_adjust, guard_size - guard_used_by_caller) 5354 && known_lt (final_adjust, guard_used_by_caller)) 5355 { 5356 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true); 5357 } 5358 } 5359 5360 /* If SIZE is not large enough to require probing, just adjust the stack and 5361 exit. */ 5362 if (known_lt (poly_size, min_probe_threshold) 5363 || !flag_stack_clash_protection) 5364 { 5365 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p); 5366 return; 5367 } 5368 5369 HOST_WIDE_INT size; 5370 /* Handle the SVE non-constant case first. */ 5371 if (!poly_size.is_constant (&size)) 5372 { 5373 if (dump_file) 5374 { 5375 fprintf (dump_file, "Stack clash SVE prologue: "); 5376 print_dec (poly_size, dump_file); 5377 fprintf (dump_file, " bytes, dynamic probing will be required.\n"); 5378 } 5379 5380 /* First calculate the amount of bytes we're actually spilling. */ 5381 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode), 5382 poly_size, temp1, temp2, false, true); 5383 5384 rtx_insn *insn = get_last_insn (); 5385 5386 if (frame_related_p) 5387 { 5388 /* This is done to provide unwinding information for the stack 5389 adjustments we're about to do, however to prevent the optimizers 5390 from removing the R11 move and leaving the CFA note (which would be 5391 very wrong) we tie the old and new stack pointer together. 5392 The tie will expand to nothing but the optimizers will not touch 5393 the instruction. */ 5394 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM); 5395 emit_move_insn (stack_ptr_copy, stack_pointer_rtx); 5396 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx)); 5397 5398 /* We want the CFA independent of the stack pointer for the 5399 duration of the loop. */ 5400 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy); 5401 RTX_FRAME_RELATED_P (insn) = 1; 5402 } 5403 5404 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode); 5405 rtx guard_const = gen_int_mode (guard_size, Pmode); 5406 5407 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx, 5408 stack_pointer_rtx, temp1, 5409 probe_const, guard_const)); 5410 5411 /* Now reset the CFA register if needed. */ 5412 if (frame_related_p) 5413 { 5414 add_reg_note (insn, REG_CFA_DEF_CFA, 5415 gen_rtx_PLUS (Pmode, stack_pointer_rtx, 5416 gen_int_mode (poly_size, Pmode))); 5417 RTX_FRAME_RELATED_P (insn) = 1; 5418 } 5419 5420 return; 5421 } 5422 5423 if (dump_file) 5424 fprintf (dump_file, 5425 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC 5426 " bytes, probing will be required.\n", size); 5427 5428 /* Round size to the nearest multiple of guard_size, and calculate the 5429 residual as the difference between the original size and the rounded 5430 size. */ 5431 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size); 5432 HOST_WIDE_INT residual = size - rounded_size; 5433 5434 /* We can handle a small number of allocations/probes inline. 
Otherwise
5435 punt to a loop. */
5436 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
5437 {
5438 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
5439 {
5440 aarch64_sub_sp (NULL, temp2, guard_size, true);
5441 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
5442 guard_used_by_caller));
5443 emit_insn (gen_blockage ());
5444 }
5445 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
5446 }
5447 else
5448 {
5449 /* Compute the ending address. */
5450 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
5451 temp1, NULL, false, true);
5452 rtx_insn *insn = get_last_insn ();
5453
5454 /* For the initial allocation, we don't have a frame pointer
5455 set up, so we always need CFI notes. If we're doing the
5456 final allocation, then we may have a frame pointer, in which
5457 case it is the CFA, otherwise we need CFI notes.
5458
5459 We can determine which allocation we are doing by looking at
5460 the value of FRAME_RELATED_P since the final allocations are not
5461 frame related. */
5462 if (frame_related_p)
5463 {
5464 /* We want the CFA independent of the stack pointer for the
5465 duration of the loop. */
5466 add_reg_note (insn, REG_CFA_DEF_CFA,
5467 plus_constant (Pmode, temp1, rounded_size));
5468 RTX_FRAME_RELATED_P (insn) = 1;
5469 }
5470
5471 /* This allocates and probes the stack. Note that this re-uses some of
5472 the existing Ada stack protection code. However we are guaranteed not
5473 to enter the non-loop or residual branches of that code.
5474
5475 The non-loop part won't be entered because if our allocation amount
5476 doesn't require a loop, the case above would handle it.
5477
5478 The residual amount won't be entered because TEMP1 is a multiple of
5479 the allocation size. The residual will always be 0. As such, the only
5480 part we are actually using from that code is the loop setup. The
5481 actual probing is done in aarch64_output_probe_stack_range. */
5482 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
5483 stack_pointer_rtx, temp1));
5484
5485 /* Now reset the CFA register if needed. */
5486 if (frame_related_p)
5487 {
5488 add_reg_note (insn, REG_CFA_DEF_CFA,
5489 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
5490 RTX_FRAME_RELATED_P (insn) = 1;
5491 }
5492
5493 emit_insn (gen_blockage ());
5494 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
5495 }
5496
5497 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
5498 be probed. This maintains the requirement that each page is probed at
5499 least once. For initial probing we probe only if the allocation is
5500 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
5501 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
5502 GUARD_SIZE. This ensures that any allocation large enough to trigger
5503 a probe here gets at least one, while any allocation that is not large
5504 enough for this code to emit anything will have its page probed by the
5505 saving of FP/LR, either by this function or by any callees. If we
5506 don't have any callees then we won't have more stack adjustments and
5507 so are still safe. */
5508 if (residual)
5509 {
5510 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
5511 /* If we're doing final adjustments and we've done any full-page
5512 allocations, then any residual needs to be probed.
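In that case MIN_PROBE_THRESHOLD is dropped to zero just below, so the
probe is emitted regardless of how small the residual is.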
*/ 5513 if (final_adjustment_p && rounded_size != 0) 5514 min_probe_threshold = 0; 5515 /* If doing a small final adjustment, we always probe at offset 0. 5516 This is done to avoid issues when LR is not at position 0 or when 5517 the final adjustment is smaller than the probing offset. */ 5518 else if (final_adjustment_p && rounded_size == 0) 5519 residual_probe_offset = 0; 5520 5521 aarch64_sub_sp (temp1, temp2, residual, frame_related_p); 5522 if (residual >= min_probe_threshold) 5523 { 5524 if (dump_file) 5525 fprintf (dump_file, 5526 "Stack clash AArch64 prologue residuals: " 5527 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required." 5528 "\n", residual); 5529 5530 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, 5531 residual_probe_offset)); 5532 emit_insn (gen_blockage ()); 5533 } 5534 } 5535 } 5536 5537 /* Return 1 if the register is used by the epilogue. We need to say the 5538 return register is used, but only after epilogue generation is complete. 5539 Note that in the case of sibcalls, the values "used by the epilogue" are 5540 considered live at the start of the called function. 5541 5542 For SIMD functions we need to return 1 for FP registers that are saved and 5543 restored by a function but are not zero in call_used_regs. If we do not do 5544 this optimizations may remove the restore of the register. */ 5545 5546 int 5547 aarch64_epilogue_uses (int regno) 5548 { 5549 if (epilogue_completed) 5550 { 5551 if (regno == LR_REGNUM) 5552 return 1; 5553 if (aarch64_simd_decl_p (cfun->decl) && FP_SIMD_SAVED_REGNUM_P (regno)) 5554 return 1; 5555 } 5556 return 0; 5557 } 5558 5559 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG 5560 is saved at BASE + OFFSET. */ 5561 5562 static void 5563 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg, 5564 rtx base, poly_int64 offset) 5565 { 5566 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset)); 5567 add_reg_note (insn, REG_CFA_EXPRESSION, 5568 gen_rtx_SET (mem, regno_reg_rtx[reg])); 5569 } 5570 5571 /* AArch64 stack frames generated by this compiler look like: 5572 5573 +-------------------------------+ 5574 | | 5575 | incoming stack arguments | 5576 | | 5577 +-------------------------------+ 5578 | | <-- incoming stack pointer (aligned) 5579 | callee-allocated save area | 5580 | for register varargs | 5581 | | 5582 +-------------------------------+ 5583 | local variables | <-- frame_pointer_rtx 5584 | | 5585 +-------------------------------+ 5586 | padding | \ 5587 +-------------------------------+ | 5588 | callee-saved registers | | frame.saved_regs_size 5589 +-------------------------------+ | 5590 | LR' | | 5591 +-------------------------------+ | 5592 | FP' | / <- hard_frame_pointer_rtx (aligned) 5593 +-------------------------------+ 5594 | dynamic allocation | 5595 +-------------------------------+ 5596 | padding | 5597 +-------------------------------+ 5598 | outgoing stack arguments | <-- arg_pointer 5599 | | 5600 +-------------------------------+ 5601 | | <-- stack_pointer_rtx (aligned) 5602 5603 Dynamic stack allocations via alloca() decrease stack_pointer_rtx 5604 but leave frame_pointer_rtx and hard_frame_pointer_rtx 5605 unchanged. 5606 5607 By default for stack-clash we assume the guard is at least 64KB, but this 5608 value is configurable to either 4KB or 64KB. We also force the guard size to 5609 be the same as the probing interval and both values are kept in sync. 
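(So, for example, with the 4KB setting the prologue must touch the stack
at least once in every 4KB that it allocates.)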
5610 5611 With those assumptions the callee can allocate up to 63KB (or 3KB depending 5612 on the guard size) of stack space without probing. 5613 5614 When probing is needed, we emit a probe at the start of the prologue 5615 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter. 5616 5617 We have to track how much space has been allocated and the only stores 5618 to the stack we track as implicit probes are the FP/LR stores. 5619 5620 For outgoing arguments we probe if the size is larger than 1KB, such that 5621 the ABI specified buffer is maintained for the next callee. 5622 5623 The following registers are reserved during frame layout and should not be 5624 used for any other purpose: 5625 5626 - r11: Used by stack clash protection when SVE is enabled. 5627 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment. 5628 - r14 and r15: Used for speculation tracking. 5629 - r16(IP0), r17(IP1): Used by indirect tailcalls. 5630 - r30(LR), r29(FP): Used by standard frame layout. 5631 5632 These registers must be avoided in frame layout related code unless the 5633 explicit intention is to interact with one of the features listed above. */ 5634 5635 /* Generate the prologue instructions for entry into a function. 5636 Establish the stack frame by decreasing the stack pointer with a 5637 properly calculated size and, if necessary, create a frame record 5638 filled with the values of LR and previous frame pointer. The 5639 current FP is also set up if it is in use. */ 5640 5641 void 5642 aarch64_expand_prologue (void) 5643 { 5644 poly_int64 frame_size = cfun->machine->frame.frame_size; 5645 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; 5646 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; 5647 poly_int64 final_adjust = cfun->machine->frame.final_adjust; 5648 poly_int64 callee_offset = cfun->machine->frame.callee_offset; 5649 unsigned reg1 = cfun->machine->frame.wb_candidate1; 5650 unsigned reg2 = cfun->machine->frame.wb_candidate2; 5651 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain; 5652 rtx_insn *insn; 5653 5654 /* Sign return address for functions. */ 5655 if (aarch64_return_address_signing_enabled ()) 5656 { 5657 insn = emit_insn (gen_pacisp ()); 5658 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); 5659 RTX_FRAME_RELATED_P (insn) = 1; 5660 } 5661 5662 if (flag_stack_usage_info) 5663 current_function_static_stack_size = constant_lower_bound (frame_size); 5664 5665 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK) 5666 { 5667 if (crtl->is_leaf && !cfun->calls_alloca) 5668 { 5669 if (maybe_gt (frame_size, PROBE_INTERVAL) 5670 && maybe_gt (frame_size, get_stack_check_protect ())) 5671 aarch64_emit_probe_stack_range (get_stack_check_protect (), 5672 (frame_size 5673 - get_stack_check_protect ())); 5674 } 5675 else if (maybe_gt (frame_size, 0)) 5676 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size); 5677 } 5678 5679 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM); 5680 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM); 5681 5682 /* In theory we should never have both an initial adjustment 5683 and a callee save adjustment. Verify that is the case since the 5684 code below does not handle it for -fstack-clash-protection. */ 5685 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0); 5686 5687 /* Will only probe if the initial adjustment is larger than the guard 5688 less the amount of the guard reserved for use by the caller's 5689 outgoing args. 
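With the default 64KB guard that threshold is 63KB; anything smaller
falls through to a plain, unprobed stack adjustment.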
*/
5690 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
5691 true, false);
5692
5693 if (callee_adjust != 0)
5694 aarch64_push_regs (reg1, reg2, callee_adjust);
5695
5696 if (emit_frame_chain)
5697 {
5698 poly_int64 reg_offset = callee_adjust;
5699 if (callee_adjust == 0)
5700 {
5701 reg1 = R29_REGNUM;
5702 reg2 = R30_REGNUM;
5703 reg_offset = callee_offset;
5704 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
5705 }
5706 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
5707 stack_pointer_rtx, callee_offset,
5708 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
5709 if (frame_pointer_needed && !frame_size.is_constant ())
5710 {
5711 /* Variable-sized frames need to describe the save slot
5712 address using DW_CFA_expression rather than DW_CFA_offset.
5713 This means that, without taking further action, the
5714 locations of the registers that we've already saved would
5715 remain based on the stack pointer even after we redefine
5716 the CFA based on the frame pointer. We therefore need new
5717 DW_CFA_expressions to re-express the save slots with addresses
5718 based on the frame pointer. */
5719 rtx_insn *insn = get_last_insn ();
5720 gcc_assert (RTX_FRAME_RELATED_P (insn));
5721
5722 /* Add an explicit CFA definition if this was previously
5723 implicit. */
5724 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
5725 {
5726 rtx src = plus_constant (Pmode, stack_pointer_rtx,
5727 callee_offset);
5728 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5729 gen_rtx_SET (hard_frame_pointer_rtx, src));
5730 }
5731
5732 /* Change the save slot expressions for the registers that
5733 we've already saved. */
5734 reg_offset -= callee_offset;
5735 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
5736 reg_offset + UNITS_PER_WORD);
5737 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
5738 reg_offset);
5739 }
5740 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
5741 }
5742
5743 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5744 callee_adjust != 0 || emit_frame_chain);
5745 if (aarch64_simd_decl_p (cfun->decl))
5746 aarch64_save_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5747 callee_adjust != 0 || emit_frame_chain);
5748 else
5749 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5750 callee_adjust != 0 || emit_frame_chain);
5751
5752 /* We may need to probe the final adjustment if it is larger than the guard
5753 that is assumed by the callee. */
5754 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
5755 !frame_pointer_needed, true);
5756 }
5757
5758 /* Return TRUE if we can use a simple_return insn.
5759
5760 This function checks whether the callee-saved stack area is empty, which
5761 means no restore actions are needed. The pro_and_epilogue pass will use
5762 this to check whether the shrink-wrapping optimization is feasible. */
5763
5764 bool
5765 aarch64_use_return_insn_p (void)
5766 {
5767 if (!reload_completed)
5768 return false;
5769
5770 if (crtl->profile)
5771 return false;
5772
5773 return known_eq (cfun->machine->frame.frame_size, 0);
5774 }
5775
5776 /* Return false for non-leaf SIMD functions in order to avoid
5777 shrink-wrapping them, since doing so would lose the necessary
5778 save/restore of FP registers.
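(SIMD functions here means functions using the aarch64_vector_pcs calling
convention, which preserves more of the FP/SIMD register file than the
base PCS does.)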
*/ 5779 5780 bool 5781 aarch64_use_simple_return_insn_p (void) 5782 { 5783 if (aarch64_simd_decl_p (cfun->decl) && !crtl->is_leaf) 5784 return false; 5785 5786 return true; 5787 } 5788 5789 /* Generate the epilogue instructions for returning from a function. 5790 This is almost exactly the reverse of the prolog sequence, except 5791 that we need to insert barriers to avoid scheduling loads that read 5792 from a deallocated stack, and we optimize the unwind records by 5793 emitting them all together if possible. */ 5794 void 5795 aarch64_expand_epilogue (bool for_sibcall) 5796 { 5797 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust; 5798 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust; 5799 poly_int64 final_adjust = cfun->machine->frame.final_adjust; 5800 poly_int64 callee_offset = cfun->machine->frame.callee_offset; 5801 unsigned reg1 = cfun->machine->frame.wb_candidate1; 5802 unsigned reg2 = cfun->machine->frame.wb_candidate2; 5803 rtx cfi_ops = NULL; 5804 rtx_insn *insn; 5805 /* A stack clash protection prologue may not have left EP0_REGNUM or 5806 EP1_REGNUM in a usable state. The same is true for allocations 5807 with an SVE component, since we then need both temporary registers 5808 for each allocation. For stack clash we are in a usable state if 5809 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */ 5810 HOST_WIDE_INT guard_size 5811 = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); 5812 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD; 5813 5814 /* We can re-use the registers when the allocation amount is smaller than 5815 guard_size - guard_used_by_caller because we won't be doing any probes 5816 then. In such situations the register should remain live with the correct 5817 value. */ 5818 bool can_inherit_p = (initial_adjust.is_constant () 5819 && final_adjust.is_constant ()) 5820 && (!flag_stack_clash_protection 5821 || known_lt (initial_adjust, 5822 guard_size - guard_used_by_caller)); 5823 5824 /* We need to add memory barrier to prevent read from deallocated stack. */ 5825 bool need_barrier_p 5826 = maybe_ne (get_frame_size () 5827 + cfun->machine->frame.saved_varargs_size, 0); 5828 5829 /* Emit a barrier to prevent loads from a deallocated stack. */ 5830 if (maybe_gt (final_adjust, crtl->outgoing_args_size) 5831 || cfun->calls_alloca 5832 || crtl->calls_eh_return) 5833 { 5834 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); 5835 need_barrier_p = false; 5836 } 5837 5838 /* Restore the stack pointer from the frame pointer if it may not 5839 be the same as the stack pointer. */ 5840 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM); 5841 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM); 5842 if (frame_pointer_needed 5843 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca)) 5844 /* If writeback is used when restoring callee-saves, the CFA 5845 is restored on the instruction doing the writeback. */ 5846 aarch64_add_offset (Pmode, stack_pointer_rtx, 5847 hard_frame_pointer_rtx, -callee_offset, 5848 tmp1_rtx, tmp0_rtx, callee_adjust == 0); 5849 else 5850 /* The case where we need to re-use the register here is very rare, so 5851 avoid the complicated condition and just always emit a move if the 5852 immediate doesn't fit. 
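(Passing true for the emit_move_imm argument below is what forces a fresh
move of the immediate rather than reusing a value the prologue may have
left in the temporary.)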
*/ 5853 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true); 5854 5855 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM, 5856 callee_adjust != 0, &cfi_ops); 5857 if (aarch64_simd_decl_p (cfun->decl)) 5858 aarch64_restore_callee_saves (TFmode, callee_offset, V0_REGNUM, V31_REGNUM, 5859 callee_adjust != 0, &cfi_ops); 5860 else 5861 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM, 5862 callee_adjust != 0, &cfi_ops); 5863 5864 if (need_barrier_p) 5865 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx)); 5866 5867 if (callee_adjust != 0) 5868 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops); 5869 5870 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)) 5871 { 5872 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */ 5873 insn = get_last_insn (); 5874 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust); 5875 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops); 5876 RTX_FRAME_RELATED_P (insn) = 1; 5877 cfi_ops = NULL; 5878 } 5879 5880 /* Liveness of EP0_REGNUM can not be trusted across function calls either, so 5881 add restriction on emit_move optimization to leaf functions. */ 5882 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, 5883 (!can_inherit_p || !crtl->is_leaf 5884 || df_regs_ever_live_p (EP0_REGNUM))); 5885 5886 if (cfi_ops) 5887 { 5888 /* Emit delayed restores and reset the CFA to be SP. */ 5889 insn = get_last_insn (); 5890 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops); 5891 REG_NOTES (insn) = cfi_ops; 5892 RTX_FRAME_RELATED_P (insn) = 1; 5893 } 5894 5895 /* We prefer to emit the combined return/authenticate instruction RETAA, 5896 however there are three cases in which we must instead emit an explicit 5897 authentication instruction. 5898 5899 1) Sibcalls don't return in a normal way, so if we're about to call one 5900 we must authenticate. 5901 5902 2) The RETAA instruction is not available before ARMv8.3-A, so if we are 5903 generating code for !TARGET_ARMV8_3 we can't use it and must 5904 explicitly authenticate. 5905 5906 3) On an eh_return path we make extra stack adjustments to update the 5907 canonical frame address to be the exception handler's CFA. We want 5908 to authenticate using the CFA of the function which calls eh_return. 5909 */ 5910 if (aarch64_return_address_signing_enabled () 5911 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return)) 5912 { 5913 insn = emit_insn (gen_autisp ()); 5914 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx); 5915 RTX_FRAME_RELATED_P (insn) = 1; 5916 } 5917 5918 /* Stack adjustment for exception handler. */ 5919 if (crtl->calls_eh_return) 5920 { 5921 /* We need to unwind the stack by the offset computed by 5922 EH_RETURN_STACKADJ_RTX. We have already reset the CFA 5923 to be SP; letting the CFA move during this adjustment 5924 is just as correct as retaining the CFA from the body 5925 of the function. Therefore, do nothing special. */ 5926 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX)); 5927 } 5928 5929 emit_use (gen_rtx_REG (DImode, LR_REGNUM)); 5930 if (!for_sibcall) 5931 emit_jump_insn (ret_rtx); 5932 } 5933 5934 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return 5935 normally or return to a previous frame after unwinding. 5936 5937 An EH return uses a single shared return sequence. 
The epilogue is
5938 exactly like a normal epilogue except that it has an extra input
5939 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5940 that must be applied after the frame has been destroyed. An extra label
5941 is inserted before the epilogue which initializes this register to zero,
5942 and this is the entry point for a normal return.
5943
5944 An actual EH return updates the return address, initializes the stack
5945 adjustment and jumps directly into the epilogue (bypassing the zeroing
5946 of the adjustment). Since the return address is typically saved on the
5947 stack when a function makes a call, the saved LR must be updated outside
5948 the epilogue.
5949
5950 This poses problems as the store is generated well before the epilogue,
5951 so the offset of LR is not known yet. Also, optimizations will remove the
5952 store as it appears dead, even after the epilogue is generated (as the
5953 base or offset for loading LR is different in many cases).
5954
5955 To avoid these problems, this implementation forces the frame pointer
5956 in eh_return functions so that the location of LR is fixed and known early.
5957 It also marks the store volatile, so no optimization is permitted to
5958 remove the store. */
5959 rtx
5960 aarch64_eh_return_handler_rtx (void)
5961 {
5962 rtx tmp = gen_frame_mem (Pmode,
5963 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5964
5965 /* Mark the store volatile, so no optimization is permitted to remove it. */
5966 MEM_VOLATILE_P (tmp) = true;
5967 return tmp;
5968 }
5969
5970 /* Output code to add DELTA to the first argument, and then jump
5971 to FUNCTION. Used for C++ multiple inheritance. */
5972 static void
5973 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5974 HOST_WIDE_INT delta,
5975 HOST_WIDE_INT vcall_offset,
5976 tree function)
5977 {
5978 /* The this pointer is always in x0. Note that this differs from
5979 Arm where the this pointer may be bumped to r1 if r0 is required
5980 to return a pointer to an aggregate. On AArch64 a result value
5981 pointer will be in x8.
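In practice the thunk therefore ends up as something like an
"add x0, x0, #delta" (or an equivalent immediate sequence) followed by a
tail call to FUNCTION, with an extra load through the vtable when
VCALL_OFFSET is nonzero.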
*/ 5982 int this_regno = R0_REGNUM; 5983 rtx this_rtx, temp0, temp1, addr, funexp; 5984 rtx_insn *insn; 5985 5986 if (aarch64_bti_enabled ()) 5987 emit_insn (gen_bti_c()); 5988 5989 reload_completed = 1; 5990 emit_note (NOTE_INSN_PROLOGUE_END); 5991 5992 this_rtx = gen_rtx_REG (Pmode, this_regno); 5993 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM); 5994 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM); 5995 5996 if (vcall_offset == 0) 5997 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false); 5998 else 5999 { 6000 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0); 6001 6002 addr = this_rtx; 6003 if (delta != 0) 6004 { 6005 if (delta >= -256 && delta < 256) 6006 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx, 6007 plus_constant (Pmode, this_rtx, delta)); 6008 else 6009 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, 6010 temp1, temp0, false); 6011 } 6012 6013 if (Pmode == ptr_mode) 6014 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr)); 6015 else 6016 aarch64_emit_move (temp0, 6017 gen_rtx_ZERO_EXTEND (Pmode, 6018 gen_rtx_MEM (ptr_mode, addr))); 6019 6020 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES) 6021 addr = plus_constant (Pmode, temp0, vcall_offset); 6022 else 6023 { 6024 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true, 6025 Pmode); 6026 addr = gen_rtx_PLUS (Pmode, temp0, temp1); 6027 } 6028 6029 if (Pmode == ptr_mode) 6030 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr)); 6031 else 6032 aarch64_emit_move (temp1, 6033 gen_rtx_SIGN_EXTEND (Pmode, 6034 gen_rtx_MEM (ptr_mode, addr))); 6035 6036 emit_insn (gen_add2_insn (this_rtx, temp1)); 6037 } 6038 6039 /* Generate a tail call to the target function. */ 6040 if (!TREE_USED (function)) 6041 { 6042 assemble_external (function); 6043 TREE_USED (function) = 1; 6044 } 6045 funexp = XEXP (DECL_RTL (function), 0); 6046 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp); 6047 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX)); 6048 SIBLING_CALL_P (insn) = 1; 6049 6050 insn = get_insns (); 6051 shorten_branches (insn); 6052 final_start_function (insn, file, 1); 6053 final (insn, file, 1); 6054 final_end_function (); 6055 6056 /* Stop pretending to be a post-reload pass. */ 6057 reload_completed = 0; 6058 } 6059 6060 static bool 6061 aarch64_tls_referenced_p (rtx x) 6062 { 6063 if (!TARGET_HAVE_TLS) 6064 return false; 6065 subrtx_iterator::array_type array; 6066 FOR_EACH_SUBRTX (iter, array, x, ALL) 6067 { 6068 const_rtx x = *iter; 6069 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0) 6070 return true; 6071 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are 6072 TLS offsets, not real symbol references. */ 6073 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS) 6074 iter.skip_subrtxes (); 6075 } 6076 return false; 6077 } 6078 6079 6080 /* Return true if val can be encoded as a 12-bit unsigned immediate with 6081 a left shift of 0 or 12 bits. */ 6082 bool 6083 aarch64_uimm12_shift (HOST_WIDE_INT val) 6084 { 6085 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val 6086 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val 6087 ); 6088 } 6089 6090 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate 6091 that can be created with a left shift of 0 or 12. */ 6092 static HOST_WIDE_INT 6093 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val) 6094 { 6095 /* Check to see if the value fits in 24 bits, as that is the maximum we can 6096 handle correctly. 
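For example, 0x123456 is clamped to 0x123000, which ADD/SUB can encode
as the immediate 0x123 shifted left by 12.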
*/ 6097 gcc_assert ((val & 0xffffff) == val); 6098 6099 if (((val & 0xfff) << 0) == val) 6100 return val; 6101 6102 return val & (0xfff << 12); 6103 } 6104 6105 /* Return true if val is an immediate that can be loaded into a 6106 register by a MOVZ instruction. */ 6107 static bool 6108 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode) 6109 { 6110 if (GET_MODE_SIZE (mode) > 4) 6111 { 6112 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val 6113 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val) 6114 return 1; 6115 } 6116 else 6117 { 6118 /* Ignore sign extension. */ 6119 val &= (HOST_WIDE_INT) 0xffffffff; 6120 } 6121 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val 6122 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val); 6123 } 6124 6125 /* VAL is a value with the inner mode of MODE. Replicate it to fill a 6126 64-bit (DImode) integer. */ 6127 6128 static unsigned HOST_WIDE_INT 6129 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode) 6130 { 6131 unsigned int size = GET_MODE_UNIT_PRECISION (mode); 6132 while (size < 64) 6133 { 6134 val &= (HOST_WIDE_INT_1U << size) - 1; 6135 val |= val << size; 6136 size *= 2; 6137 } 6138 return val; 6139 } 6140 6141 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */ 6142 6143 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] = 6144 { 6145 0x0000000100000001ull, 6146 0x0001000100010001ull, 6147 0x0101010101010101ull, 6148 0x1111111111111111ull, 6149 0x5555555555555555ull, 6150 }; 6151 6152 6153 /* Return true if val is a valid bitmask immediate. */ 6154 6155 bool 6156 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode) 6157 { 6158 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one; 6159 int bits; 6160 6161 /* Check for a single sequence of one bits and return quickly if so. 6162 The special cases of all ones and all zeroes returns false. */ 6163 val = aarch64_replicate_bitmask_imm (val_in, mode); 6164 tmp = val + (val & -val); 6165 6166 if (tmp == (tmp & -tmp)) 6167 return (val + 1) > 1; 6168 6169 /* Replicate 32-bit immediates so we can treat them as 64-bit. */ 6170 if (mode == SImode) 6171 val = (val << 32) | (val & 0xffffffff); 6172 6173 /* Invert if the immediate doesn't start with a zero bit - this means we 6174 only need to search for sequences of one bits. */ 6175 if (val & 1) 6176 val = ~val; 6177 6178 /* Find the first set bit and set tmp to val with the first sequence of one 6179 bits removed. Return success if there is a single sequence of ones. */ 6180 first_one = val & -val; 6181 tmp = val & (val + first_one); 6182 6183 if (tmp == 0) 6184 return true; 6185 6186 /* Find the next set bit and compute the difference in bit position. */ 6187 next_one = tmp & -tmp; 6188 bits = clz_hwi (first_one) - clz_hwi (next_one); 6189 mask = val ^ tmp; 6190 6191 /* Check the bit position difference is a power of 2, and that the first 6192 sequence of one bits fits within 'bits' bits. */ 6193 if ((mask >> bits) != 0 || bits != (bits & -bits)) 6194 return false; 6195 6196 /* Check the sequence of one bits is repeated 64/bits times. */ 6197 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26]; 6198 } 6199 6200 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN. 6201 Assumed precondition: VAL_IN Is not zero. 
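For example, 0x00f000f0 gives 0x00fffff0.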
*/ 6202 6203 unsigned HOST_WIDE_INT 6204 aarch64_and_split_imm1 (HOST_WIDE_INT val_in) 6205 { 6206 int lowest_bit_set = ctz_hwi (val_in); 6207 int highest_bit_set = floor_log2 (val_in); 6208 gcc_assert (val_in != 0); 6209 6210 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) - 6211 (HOST_WIDE_INT_1U << lowest_bit_set)); 6212 } 6213 6214 /* Create constant where bits outside of lowest bit set to highest bit set 6215 are set to 1. */ 6216 6217 unsigned HOST_WIDE_INT 6218 aarch64_and_split_imm2 (HOST_WIDE_INT val_in) 6219 { 6220 return val_in | ~aarch64_and_split_imm1 (val_in); 6221 } 6222 6223 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */ 6224 6225 bool 6226 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode) 6227 { 6228 scalar_int_mode int_mode; 6229 if (!is_a <scalar_int_mode> (mode, &int_mode)) 6230 return false; 6231 6232 if (aarch64_bitmask_imm (val_in, int_mode)) 6233 return false; 6234 6235 if (aarch64_move_imm (val_in, int_mode)) 6236 return false; 6237 6238 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in); 6239 6240 return aarch64_bitmask_imm (imm2, int_mode); 6241 } 6242 6243 /* Return true if val is an immediate that can be loaded into a 6244 register in a single instruction. */ 6245 bool 6246 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode) 6247 { 6248 scalar_int_mode int_mode; 6249 if (!is_a <scalar_int_mode> (mode, &int_mode)) 6250 return false; 6251 6252 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode)) 6253 return 1; 6254 return aarch64_bitmask_imm (val, int_mode); 6255 } 6256 6257 static bool 6258 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x) 6259 { 6260 rtx base, offset; 6261 6262 if (GET_CODE (x) == HIGH) 6263 return true; 6264 6265 /* There's no way to calculate VL-based values using relocations. */ 6266 subrtx_iterator::array_type array; 6267 FOR_EACH_SUBRTX (iter, array, x, ALL) 6268 if (GET_CODE (*iter) == CONST_POLY_INT) 6269 return true; 6270 6271 split_const (x, &base, &offset); 6272 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF) 6273 { 6274 if (aarch64_classify_symbol (base, INTVAL (offset)) 6275 != SYMBOL_FORCE_TO_MEM) 6276 return true; 6277 else 6278 /* Avoid generating a 64-bit relocation in ILP32; leave 6279 to aarch64_expand_mov_immediate to handle it properly. */ 6280 return mode != ptr_mode; 6281 } 6282 6283 return aarch64_tls_referenced_p (x); 6284 } 6285 6286 /* Implement TARGET_CASE_VALUES_THRESHOLD. 6287 The expansion for a table switch is quite expensive due to the number 6288 of instructions, the table lookup and hard to predict indirect jump. 6289 When optimizing for speed, and -O3 enabled, use the per-core tuning if 6290 set, otherwise use tables for > 16 cases as a tradeoff between size and 6291 performance. When optimizing for size, use the default setting. */ 6292 6293 static unsigned int 6294 aarch64_case_values_threshold (void) 6295 { 6296 /* Use the specified limit for the number of cases before using jump 6297 tables at higher optimization levels. */ 6298 if (optimize > 2 6299 && selected_cpu->tune->max_case_values != 0) 6300 return selected_cpu->tune->max_case_values; 6301 else 6302 return optimize_size ? default_case_values_threshold () : 17; 6303 } 6304 6305 /* Return true if register REGNO is a valid index register. 6306 STRICT_P is true if REG_OK_STRICT is in effect. 
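Before register allocation any pseudo register is accepted; with strict
checking the pseudo must already have been allocated to a general register.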
*/ 6307 6308 bool 6309 aarch64_regno_ok_for_index_p (int regno, bool strict_p) 6310 { 6311 if (!HARD_REGISTER_NUM_P (regno)) 6312 { 6313 if (!strict_p) 6314 return true; 6315 6316 if (!reg_renumber) 6317 return false; 6318 6319 regno = reg_renumber[regno]; 6320 } 6321 return GP_REGNUM_P (regno); 6322 } 6323 6324 /* Return true if register REGNO is a valid base register for mode MODE. 6325 STRICT_P is true if REG_OK_STRICT is in effect. */ 6326 6327 bool 6328 aarch64_regno_ok_for_base_p (int regno, bool strict_p) 6329 { 6330 if (!HARD_REGISTER_NUM_P (regno)) 6331 { 6332 if (!strict_p) 6333 return true; 6334 6335 if (!reg_renumber) 6336 return false; 6337 6338 regno = reg_renumber[regno]; 6339 } 6340 6341 /* The fake registers will be eliminated to either the stack or 6342 hard frame pointer, both of which are usually valid base registers. 6343 Reload deals with the cases where the eliminated form isn't valid. */ 6344 return (GP_REGNUM_P (regno) 6345 || regno == SP_REGNUM 6346 || regno == FRAME_POINTER_REGNUM 6347 || regno == ARG_POINTER_REGNUM); 6348 } 6349 6350 /* Return true if X is a valid base register for mode MODE. 6351 STRICT_P is true if REG_OK_STRICT is in effect. */ 6352 6353 static bool 6354 aarch64_base_register_rtx_p (rtx x, bool strict_p) 6355 { 6356 if (!strict_p 6357 && GET_CODE (x) == SUBREG 6358 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))]) 6359 x = SUBREG_REG (x); 6360 6361 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p)); 6362 } 6363 6364 /* Return true if address offset is a valid index. If it is, fill in INFO 6365 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */ 6366 6367 static bool 6368 aarch64_classify_index (struct aarch64_address_info *info, rtx x, 6369 machine_mode mode, bool strict_p) 6370 { 6371 enum aarch64_address_type type; 6372 rtx index; 6373 int shift; 6374 6375 /* (reg:P) */ 6376 if ((REG_P (x) || GET_CODE (x) == SUBREG) 6377 && GET_MODE (x) == Pmode) 6378 { 6379 type = ADDRESS_REG_REG; 6380 index = x; 6381 shift = 0; 6382 } 6383 /* (sign_extend:DI (reg:SI)) */ 6384 else if ((GET_CODE (x) == SIGN_EXTEND 6385 || GET_CODE (x) == ZERO_EXTEND) 6386 && GET_MODE (x) == DImode 6387 && GET_MODE (XEXP (x, 0)) == SImode) 6388 { 6389 type = (GET_CODE (x) == SIGN_EXTEND) 6390 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 6391 index = XEXP (x, 0); 6392 shift = 0; 6393 } 6394 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */ 6395 else if (GET_CODE (x) == MULT 6396 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND 6397 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND) 6398 && GET_MODE (XEXP (x, 0)) == DImode 6399 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode 6400 && CONST_INT_P (XEXP (x, 1))) 6401 { 6402 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND) 6403 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 6404 index = XEXP (XEXP (x, 0), 0); 6405 shift = exact_log2 (INTVAL (XEXP (x, 1))); 6406 } 6407 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */ 6408 else if (GET_CODE (x) == ASHIFT 6409 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND 6410 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND) 6411 && GET_MODE (XEXP (x, 0)) == DImode 6412 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode 6413 && CONST_INT_P (XEXP (x, 1))) 6414 { 6415 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND) 6416 ? 
ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 6417 index = XEXP (XEXP (x, 0), 0); 6418 shift = INTVAL (XEXP (x, 1)); 6419 } 6420 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */ 6421 else if ((GET_CODE (x) == SIGN_EXTRACT 6422 || GET_CODE (x) == ZERO_EXTRACT) 6423 && GET_MODE (x) == DImode 6424 && GET_CODE (XEXP (x, 0)) == MULT 6425 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode 6426 && CONST_INT_P (XEXP (XEXP (x, 0), 1))) 6427 { 6428 type = (GET_CODE (x) == SIGN_EXTRACT) 6429 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 6430 index = XEXP (XEXP (x, 0), 0); 6431 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1))); 6432 if (INTVAL (XEXP (x, 1)) != 32 + shift 6433 || INTVAL (XEXP (x, 2)) != 0) 6434 shift = -1; 6435 } 6436 /* (and:DI (mult:DI (reg:DI) (const_int scale)) 6437 (const_int 0xffffffff<<shift)) */ 6438 else if (GET_CODE (x) == AND 6439 && GET_MODE (x) == DImode 6440 && GET_CODE (XEXP (x, 0)) == MULT 6441 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode 6442 && CONST_INT_P (XEXP (XEXP (x, 0), 1)) 6443 && CONST_INT_P (XEXP (x, 1))) 6444 { 6445 type = ADDRESS_REG_UXTW; 6446 index = XEXP (XEXP (x, 0), 0); 6447 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1))); 6448 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift) 6449 shift = -1; 6450 } 6451 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */ 6452 else if ((GET_CODE (x) == SIGN_EXTRACT 6453 || GET_CODE (x) == ZERO_EXTRACT) 6454 && GET_MODE (x) == DImode 6455 && GET_CODE (XEXP (x, 0)) == ASHIFT 6456 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode 6457 && CONST_INT_P (XEXP (XEXP (x, 0), 1))) 6458 { 6459 type = (GET_CODE (x) == SIGN_EXTRACT) 6460 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW; 6461 index = XEXP (XEXP (x, 0), 0); 6462 shift = INTVAL (XEXP (XEXP (x, 0), 1)); 6463 if (INTVAL (XEXP (x, 1)) != 32 + shift 6464 || INTVAL (XEXP (x, 2)) != 0) 6465 shift = -1; 6466 } 6467 /* (and:DI (ashift:DI (reg:DI) (const_int shift)) 6468 (const_int 0xffffffff<<shift)) */ 6469 else if (GET_CODE (x) == AND 6470 && GET_MODE (x) == DImode 6471 && GET_CODE (XEXP (x, 0)) == ASHIFT 6472 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode 6473 && CONST_INT_P (XEXP (XEXP (x, 0), 1)) 6474 && CONST_INT_P (XEXP (x, 1))) 6475 { 6476 type = ADDRESS_REG_UXTW; 6477 index = XEXP (XEXP (x, 0), 0); 6478 shift = INTVAL (XEXP (XEXP (x, 0), 1)); 6479 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift) 6480 shift = -1; 6481 } 6482 /* (mult:P (reg:P) (const_int scale)) */ 6483 else if (GET_CODE (x) == MULT 6484 && GET_MODE (x) == Pmode 6485 && GET_MODE (XEXP (x, 0)) == Pmode 6486 && CONST_INT_P (XEXP (x, 1))) 6487 { 6488 type = ADDRESS_REG_REG; 6489 index = XEXP (x, 0); 6490 shift = exact_log2 (INTVAL (XEXP (x, 1))); 6491 } 6492 /* (ashift:P (reg:P) (const_int shift)) */ 6493 else if (GET_CODE (x) == ASHIFT 6494 && GET_MODE (x) == Pmode 6495 && GET_MODE (XEXP (x, 0)) == Pmode 6496 && CONST_INT_P (XEXP (x, 1))) 6497 { 6498 type = ADDRESS_REG_REG; 6499 index = XEXP (x, 0); 6500 shift = INTVAL (XEXP (x, 1)); 6501 } 6502 else 6503 return false; 6504 6505 if (!strict_p 6506 && GET_CODE (index) == SUBREG 6507 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))]) 6508 index = SUBREG_REG (index); 6509 6510 if (aarch64_sve_data_mode_p (mode)) 6511 { 6512 if (type != ADDRESS_REG_REG 6513 || (1 << shift) != GET_MODE_UNIT_SIZE (mode)) 6514 return false; 6515 } 6516 else 6517 { 6518 if (shift != 0 6519 && !(IN_RANGE (shift, 1, 3) 6520 && known_eq (1 << shift, GET_MODE_SIZE (mode)))) 6521 return false; 6522 } 6523 6524 if (REG_P 
(index) 6525 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p)) 6526 { 6527 info->type = type; 6528 info->offset = index; 6529 info->shift = shift; 6530 return true; 6531 } 6532 6533 return false; 6534 } 6535 6536 /* Return true if MODE is one of the modes for which we 6537 support LDP/STP operations. */ 6538 6539 static bool 6540 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode) 6541 { 6542 return mode == SImode || mode == DImode 6543 || mode == SFmode || mode == DFmode 6544 || (aarch64_vector_mode_supported_p (mode) 6545 && (known_eq (GET_MODE_SIZE (mode), 8) 6546 || (known_eq (GET_MODE_SIZE (mode), 16) 6547 && (aarch64_tune_params.extra_tuning_flags 6548 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0))); 6549 } 6550 6551 /* Return true if REGNO is a virtual pointer register, or an eliminable 6552 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't 6553 include stack_pointer or hard_frame_pointer. */ 6554 static bool 6555 virt_or_elim_regno_p (unsigned regno) 6556 { 6557 return ((regno >= FIRST_VIRTUAL_REGISTER 6558 && regno <= LAST_VIRTUAL_POINTER_REGISTER) 6559 || regno == FRAME_POINTER_REGNUM 6560 || regno == ARG_POINTER_REGNUM); 6561 } 6562 6563 /* Return true if X is a valid address of type TYPE for machine mode MODE. 6564 If it is, fill in INFO appropriately. STRICT_P is true if 6565 REG_OK_STRICT is in effect. */ 6566 6567 bool 6568 aarch64_classify_address (struct aarch64_address_info *info, 6569 rtx x, machine_mode mode, bool strict_p, 6570 aarch64_addr_query_type type) 6571 { 6572 enum rtx_code code = GET_CODE (x); 6573 rtx op0, op1; 6574 poly_int64 offset; 6575 6576 HOST_WIDE_INT const_size; 6577 6578 /* On BE, we use load/store pair for all large int mode load/stores. 6579 TI/TFmode may also use a load/store pair. */ 6580 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 6581 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT)); 6582 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP 6583 || type == ADDR_QUERY_LDP_STP_N 6584 || mode == TImode 6585 || mode == TFmode 6586 || (BYTES_BIG_ENDIAN && advsimd_struct_p)); 6587 6588 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode 6589 corresponds to the actual size of the memory being loaded/stored and the 6590 mode of the corresponding addressing mode is half of that. */ 6591 if (type == ADDR_QUERY_LDP_STP_N 6592 && known_eq (GET_MODE_SIZE (mode), 16)) 6593 mode = DFmode; 6594 6595 bool allow_reg_index_p = (!load_store_pair_p 6596 && (known_lt (GET_MODE_SIZE (mode), 16) 6597 || vec_flags == VEC_ADVSIMD 6598 || vec_flags == VEC_SVE_DATA)); 6599 6600 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and 6601 [Rn, #offset, MUL VL]. */ 6602 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0 6603 && (code != REG && code != PLUS)) 6604 return false; 6605 6606 /* On LE, for AdvSIMD, don't support anything other than POST_INC or 6607 REG addressing. */ 6608 if (advsimd_struct_p 6609 && !BYTES_BIG_ENDIAN 6610 && (code != POST_INC && code != REG)) 6611 return false; 6612 6613 gcc_checking_assert (GET_MODE (x) == VOIDmode 6614 || SCALAR_INT_MODE_P (GET_MODE (x))); 6615 6616 switch (code) 6617 { 6618 case REG: 6619 case SUBREG: 6620 info->type = ADDRESS_REG_IMM; 6621 info->base = x; 6622 info->offset = const0_rtx; 6623 info->const_offset = 0; 6624 return aarch64_base_register_rtx_p (x, strict_p); 6625 6626 case PLUS: 6627 op0 = XEXP (x, 0); 6628 op1 = XEXP (x, 1); 6629 6630 if (! 
strict_p 6631 && REG_P (op0) 6632 && virt_or_elim_regno_p (REGNO (op0)) 6633 && poly_int_rtx_p (op1, &offset)) 6634 { 6635 info->type = ADDRESS_REG_IMM; 6636 info->base = op0; 6637 info->offset = op1; 6638 info->const_offset = offset; 6639 6640 return true; 6641 } 6642 6643 if (maybe_ne (GET_MODE_SIZE (mode), 0) 6644 && aarch64_base_register_rtx_p (op0, strict_p) 6645 && poly_int_rtx_p (op1, &offset)) 6646 { 6647 info->type = ADDRESS_REG_IMM; 6648 info->base = op0; 6649 info->offset = op1; 6650 info->const_offset = offset; 6651 6652 /* TImode and TFmode values are allowed in both pairs of X 6653 registers and individual Q registers. The available 6654 address modes are: 6655 X,X: 7-bit signed scaled offset 6656 Q: 9-bit signed offset 6657 We conservatively require an offset representable in either mode. 6658 When performing the check for pairs of X registers i.e. LDP/STP 6659 pass down DImode since that is the natural size of the LDP/STP 6660 instruction memory accesses. */ 6661 if (mode == TImode || mode == TFmode) 6662 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset) 6663 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset) 6664 || offset_12bit_unsigned_scaled_p (mode, offset))); 6665 6666 /* A 7bit offset check because OImode will emit a ldp/stp 6667 instruction (only big endian will get here). 6668 For ldp/stp instructions, the offset is scaled for the size of a 6669 single element of the pair. */ 6670 if (mode == OImode) 6671 return aarch64_offset_7bit_signed_scaled_p (TImode, offset); 6672 6673 /* Three 9/12 bit offsets checks because CImode will emit three 6674 ldr/str instructions (only big endian will get here). */ 6675 if (mode == CImode) 6676 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset) 6677 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode, 6678 offset + 32) 6679 || offset_12bit_unsigned_scaled_p (V16QImode, 6680 offset + 32))); 6681 6682 /* Two 7bit offsets checks because XImode will emit two ldp/stp 6683 instructions (only big endian will get here). */ 6684 if (mode == XImode) 6685 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset) 6686 && aarch64_offset_7bit_signed_scaled_p (TImode, 6687 offset + 32)); 6688 6689 /* Make "m" use the LD1 offset range for SVE data modes, so 6690 that pre-RTL optimizers like ivopts will work to that 6691 instead of the wider LDR/STR range. */ 6692 if (vec_flags == VEC_SVE_DATA) 6693 return (type == ADDR_QUERY_M 6694 ? offset_4bit_signed_scaled_p (mode, offset) 6695 : offset_9bit_signed_scaled_p (mode, offset)); 6696 6697 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT)) 6698 { 6699 poly_int64 end_offset = (offset 6700 + GET_MODE_SIZE (mode) 6701 - BYTES_PER_SVE_VECTOR); 6702 return (type == ADDR_QUERY_M 6703 ? offset_4bit_signed_scaled_p (mode, offset) 6704 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset) 6705 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE, 6706 end_offset))); 6707 } 6708 6709 if (vec_flags == VEC_SVE_PRED) 6710 return offset_9bit_signed_scaled_p (mode, offset); 6711 6712 if (load_store_pair_p) 6713 return ((known_eq (GET_MODE_SIZE (mode), 4) 6714 || known_eq (GET_MODE_SIZE (mode), 8) 6715 || known_eq (GET_MODE_SIZE (mode), 16)) 6716 && aarch64_offset_7bit_signed_scaled_p (mode, offset)); 6717 else 6718 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset) 6719 || offset_12bit_unsigned_scaled_p (mode, offset)); 6720 } 6721 6722 if (allow_reg_index_p) 6723 { 6724 /* Look for base + (scaled/extended) index register. 
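e.g. (plus:DI (reg:DI x1) (ashift:DI (reg:DI x2) (const_int 3))) matches
an LDR Xt, [x1, x2, lsl #3] style access; both operand orders are tried
below.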
*/ 6725 if (aarch64_base_register_rtx_p (op0, strict_p) 6726 && aarch64_classify_index (info, op1, mode, strict_p)) 6727 { 6728 info->base = op0; 6729 return true; 6730 } 6731 if (aarch64_base_register_rtx_p (op1, strict_p) 6732 && aarch64_classify_index (info, op0, mode, strict_p)) 6733 { 6734 info->base = op1; 6735 return true; 6736 } 6737 } 6738 6739 return false; 6740 6741 case POST_INC: 6742 case POST_DEC: 6743 case PRE_INC: 6744 case PRE_DEC: 6745 info->type = ADDRESS_REG_WB; 6746 info->base = XEXP (x, 0); 6747 info->offset = NULL_RTX; 6748 return aarch64_base_register_rtx_p (info->base, strict_p); 6749 6750 case POST_MODIFY: 6751 case PRE_MODIFY: 6752 info->type = ADDRESS_REG_WB; 6753 info->base = XEXP (x, 0); 6754 if (GET_CODE (XEXP (x, 1)) == PLUS 6755 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset) 6756 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base) 6757 && aarch64_base_register_rtx_p (info->base, strict_p)) 6758 { 6759 info->offset = XEXP (XEXP (x, 1), 1); 6760 info->const_offset = offset; 6761 6762 /* TImode and TFmode values are allowed in both pairs of X 6763 registers and individual Q registers. The available 6764 address modes are: 6765 X,X: 7-bit signed scaled offset 6766 Q: 9-bit signed offset 6767 We conservatively require an offset representable in either mode. 6768 */ 6769 if (mode == TImode || mode == TFmode) 6770 return (aarch64_offset_7bit_signed_scaled_p (mode, offset) 6771 && aarch64_offset_9bit_signed_unscaled_p (mode, offset)); 6772 6773 if (load_store_pair_p) 6774 return ((known_eq (GET_MODE_SIZE (mode), 4) 6775 || known_eq (GET_MODE_SIZE (mode), 8) 6776 || known_eq (GET_MODE_SIZE (mode), 16)) 6777 && aarch64_offset_7bit_signed_scaled_p (mode, offset)); 6778 else 6779 return aarch64_offset_9bit_signed_unscaled_p (mode, offset); 6780 } 6781 return false; 6782 6783 case CONST: 6784 case SYMBOL_REF: 6785 case LABEL_REF: 6786 /* load literal: pc-relative constant pool entry. Only supported 6787 for SI mode or larger. */ 6788 info->type = ADDRESS_SYMBOLIC; 6789 6790 if (!load_store_pair_p 6791 && GET_MODE_SIZE (mode).is_constant (&const_size) 6792 && const_size >= 4) 6793 { 6794 rtx sym, addend; 6795 6796 split_const (x, &sym, &addend); 6797 return ((GET_CODE (sym) == LABEL_REF 6798 || (GET_CODE (sym) == SYMBOL_REF 6799 && CONSTANT_POOL_ADDRESS_P (sym) 6800 && aarch64_pcrelative_literal_loads))); 6801 } 6802 return false; 6803 6804 case LO_SUM: 6805 info->type = ADDRESS_LO_SUM; 6806 info->base = XEXP (x, 0); 6807 info->offset = XEXP (x, 1); 6808 if (allow_reg_index_p 6809 && aarch64_base_register_rtx_p (info->base, strict_p)) 6810 { 6811 rtx sym, offs; 6812 split_const (info->offset, &sym, &offs); 6813 if (GET_CODE (sym) == SYMBOL_REF 6814 && (aarch64_classify_symbol (sym, INTVAL (offs)) 6815 == SYMBOL_SMALL_ABSOLUTE)) 6816 { 6817 /* The symbol and offset must be aligned to the access size. 
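Otherwise the low twelve bits consumed by the :lo12: relocation might not
be a multiple of the access size, which the scaled LDR/STR offset field
requires.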
*/ 6818 unsigned int align; 6819 6820 if (CONSTANT_POOL_ADDRESS_P (sym)) 6821 align = GET_MODE_ALIGNMENT (get_pool_mode (sym)); 6822 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym)) 6823 { 6824 tree exp = SYMBOL_REF_DECL (sym); 6825 align = TYPE_ALIGN (TREE_TYPE (exp)); 6826 align = aarch64_constant_alignment (exp, align); 6827 } 6828 else if (SYMBOL_REF_DECL (sym)) 6829 align = DECL_ALIGN (SYMBOL_REF_DECL (sym)); 6830 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym) 6831 && SYMBOL_REF_BLOCK (sym) != NULL) 6832 align = SYMBOL_REF_BLOCK (sym)->alignment; 6833 else 6834 align = BITS_PER_UNIT; 6835 6836 poly_int64 ref_size = GET_MODE_SIZE (mode); 6837 if (known_eq (ref_size, 0)) 6838 ref_size = GET_MODE_SIZE (DImode); 6839 6840 return (multiple_p (INTVAL (offs), ref_size) 6841 && multiple_p (align / BITS_PER_UNIT, ref_size)); 6842 } 6843 } 6844 return false; 6845 6846 default: 6847 return false; 6848 } 6849 } 6850 6851 /* Return true if the address X is valid for a PRFM instruction. 6852 STRICT_P is true if we should do strict checking with 6853 aarch64_classify_address. */ 6854 6855 bool 6856 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p) 6857 { 6858 struct aarch64_address_info addr; 6859 6860 /* PRFM accepts the same addresses as DImode... */ 6861 bool res = aarch64_classify_address (&addr, x, DImode, strict_p); 6862 if (!res) 6863 return false; 6864 6865 /* ... except writeback forms. */ 6866 return addr.type != ADDRESS_REG_WB; 6867 } 6868 6869 bool 6870 aarch64_symbolic_address_p (rtx x) 6871 { 6872 rtx offset; 6873 6874 split_const (x, &x, &offset); 6875 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF; 6876 } 6877 6878 /* Classify the base of symbolic expression X. */ 6879 6880 enum aarch64_symbol_type 6881 aarch64_classify_symbolic_expression (rtx x) 6882 { 6883 rtx offset; 6884 6885 split_const (x, &x, &offset); 6886 return aarch64_classify_symbol (x, INTVAL (offset)); 6887 } 6888 6889 6890 /* Return TRUE if X is a legitimate address for accessing memory in 6891 mode MODE. */ 6892 static bool 6893 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p) 6894 { 6895 struct aarch64_address_info addr; 6896 6897 return aarch64_classify_address (&addr, x, mode, strict_p); 6898 } 6899 6900 /* Return TRUE if X is a legitimate address of type TYPE for accessing 6901 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */ 6902 bool 6903 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p, 6904 aarch64_addr_query_type type) 6905 { 6906 struct aarch64_address_info addr; 6907 6908 return aarch64_classify_address (&addr, x, mode, strict_p, type); 6909 } 6910 6911 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */ 6912 6913 static bool 6914 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2, 6915 poly_int64 orig_offset, 6916 machine_mode mode) 6917 { 6918 HOST_WIDE_INT size; 6919 if (GET_MODE_SIZE (mode).is_constant (&size)) 6920 { 6921 HOST_WIDE_INT const_offset, second_offset; 6922 6923 /* A general SVE offset is A * VQ + B. Remove the A component from 6924 coefficient 0 in order to get the constant B. */ 6925 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1]; 6926 6927 /* Split an out-of-range address displacement into a base and 6928 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB 6929 range otherwise to increase opportunities for sharing the base 6930 address of different sizes. 
Unaligned accesses use the signed 6931 9-bit range, TImode/TFmode use the intersection of signed 6932 scaled 7-bit and signed 9-bit offset. */ 6933 if (mode == TImode || mode == TFmode) 6934 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100; 6935 else if ((const_offset & (size - 1)) != 0) 6936 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100; 6937 else 6938 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc); 6939 6940 if (second_offset == 0 || known_eq (orig_offset, second_offset)) 6941 return false; 6942 6943 /* Split the offset into second_offset and the rest. */ 6944 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode); 6945 *offset2 = gen_int_mode (second_offset, Pmode); 6946 return true; 6947 } 6948 else 6949 { 6950 /* Get the mode we should use as the basis of the range. For structure 6951 modes this is the mode of one vector. */ 6952 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 6953 machine_mode step_mode 6954 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode; 6955 6956 /* Get the "mul vl" multiplier we'd like to use. */ 6957 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1]; 6958 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor; 6959 if (vec_flags & VEC_SVE_DATA) 6960 /* LDR supports a 9-bit range, but the move patterns for 6961 structure modes require all vectors to be in range of the 6962 same base. The simplest way of accomodating that while still 6963 promoting reuse of anchor points between different modes is 6964 to use an 8-bit range unconditionally. */ 6965 vnum = ((vnum + 128) & 255) - 128; 6966 else 6967 /* Predicates are only handled singly, so we might as well use 6968 the full range. */ 6969 vnum = ((vnum + 256) & 511) - 256; 6970 if (vnum == 0) 6971 return false; 6972 6973 /* Convert the "mul vl" multiplier into a byte offset. */ 6974 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum; 6975 if (known_eq (second_offset, orig_offset)) 6976 return false; 6977 6978 /* Split the offset into second_offset and the rest. */ 6979 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode); 6980 *offset2 = gen_int_mode (second_offset, Pmode); 6981 return true; 6982 } 6983 } 6984 6985 /* Return the binary representation of floating point constant VALUE in INTVAL. 6986 If the value cannot be converted, return false without setting INTVAL. 6987 The conversion is done in the given MODE. */ 6988 bool 6989 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval) 6990 { 6991 6992 /* We make a general exception for 0. */ 6993 if (aarch64_float_const_zero_rtx_p (value)) 6994 { 6995 *intval = 0; 6996 return true; 6997 } 6998 6999 scalar_float_mode mode; 7000 if (GET_CODE (value) != CONST_DOUBLE 7001 || !is_a <scalar_float_mode> (GET_MODE (value), &mode) 7002 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT 7003 /* Only support up to DF mode. */ 7004 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode)) 7005 return false; 7006 7007 unsigned HOST_WIDE_INT ival = 0; 7008 7009 long res[2]; 7010 real_to_target (res, 7011 CONST_DOUBLE_REAL_VALUE (value), 7012 REAL_MODE_FORMAT (mode)); 7013 7014 if (mode == DFmode) 7015 { 7016 int order = BYTES_BIG_ENDIAN ? 1 : 0; 7017 ival = zext_hwi (res[order], 32); 7018 ival |= (zext_hwi (res[1 - order], 32) << 32); 7019 } 7020 else 7021 ival = zext_hwi (res[0], 32); 7022 7023 *intval = ival; 7024 return true; 7025 } 7026 7027 /* Return TRUE if rtx X is an immediate constant that can be moved using a 7028 single MOV(+MOVK) followed by an FMOV. 
*/ 7029 bool 7030 aarch64_float_const_rtx_p (rtx x) 7031 { 7032 machine_mode mode = GET_MODE (x); 7033 if (mode == VOIDmode) 7034 return false; 7035 7036 /* Determine whether it's cheaper to write float constants as 7037 mov/movk pairs over ldr/adrp pairs. */ 7038 unsigned HOST_WIDE_INT ival; 7039 7040 if (GET_CODE (x) == CONST_DOUBLE 7041 && SCALAR_FLOAT_MODE_P (mode) 7042 && aarch64_reinterpret_float_as_int (x, &ival)) 7043 { 7044 scalar_int_mode imode = (mode == HFmode 7045 ? SImode 7046 : int_mode_for_mode (mode).require ()); 7047 int num_instr = aarch64_internal_mov_immediate 7048 (NULL_RTX, gen_int_mode (ival, imode), false, imode); 7049 return num_instr < 3; 7050 } 7051 7052 return false; 7053 } 7054 7055 /* Return TRUE if rtx X is immediate constant 0.0 */ 7056 bool 7057 aarch64_float_const_zero_rtx_p (rtx x) 7058 { 7059 if (GET_MODE (x) == VOIDmode) 7060 return false; 7061 7062 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x))) 7063 return !HONOR_SIGNED_ZEROS (GET_MODE (x)); 7064 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0); 7065 } 7066 7067 /* Return TRUE if rtx X is immediate constant that fits in a single 7068 MOVI immediate operation. */ 7069 bool 7070 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode) 7071 { 7072 if (!TARGET_SIMD) 7073 return false; 7074 7075 machine_mode vmode; 7076 scalar_int_mode imode; 7077 unsigned HOST_WIDE_INT ival; 7078 7079 if (GET_CODE (x) == CONST_DOUBLE 7080 && SCALAR_FLOAT_MODE_P (mode)) 7081 { 7082 if (!aarch64_reinterpret_float_as_int (x, &ival)) 7083 return false; 7084 7085 /* We make a general exception for 0. */ 7086 if (aarch64_float_const_zero_rtx_p (x)) 7087 return true; 7088 7089 imode = int_mode_for_mode (mode).require (); 7090 } 7091 else if (GET_CODE (x) == CONST_INT 7092 && is_a <scalar_int_mode> (mode, &imode)) 7093 ival = INTVAL (x); 7094 else 7095 return false; 7096 7097 /* use a 64 bit mode for everything except for DI/DF mode, where we use 7098 a 128 bit vector mode. */ 7099 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64; 7100 7101 vmode = aarch64_simd_container_mode (imode, width); 7102 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival); 7103 7104 return aarch64_simd_valid_immediate (v_op, NULL); 7105 } 7106 7107 7108 /* Return the fixed registers used for condition codes. */ 7109 7110 static bool 7111 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2) 7112 { 7113 *p1 = CC_REGNUM; 7114 *p2 = INVALID_REGNUM; 7115 return true; 7116 } 7117 7118 /* This function is used by the call expanders of the machine description. 7119 RESULT is the register in which the result is returned. It's NULL for 7120 "call" and "sibcall". 7121 MEM is the location of the function call. 7122 SIBCALL indicates whether this function call is normal call or sibling call. 7123 It will generate different pattern accordingly. */ 7124 7125 void 7126 aarch64_expand_call (rtx result, rtx mem, bool sibcall) 7127 { 7128 rtx call, callee, tmp; 7129 rtvec vec; 7130 machine_mode mode; 7131 7132 gcc_assert (MEM_P (mem)); 7133 callee = XEXP (mem, 0); 7134 mode = GET_MODE (callee); 7135 gcc_assert (mode == Pmode); 7136 7137 /* Decide if we should generate indirect calls by loading the 7138 address of the callee into a register before performing 7139 the branch-and-link. */ 7140 if (SYMBOL_REF_P (callee) 7141 ? 
(aarch64_is_long_call_p (callee) 7142 || aarch64_is_noplt_call_p (callee)) 7143 : !REG_P (callee)) 7144 XEXP (mem, 0) = force_reg (mode, callee); 7145 7146 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx); 7147 7148 if (result != NULL_RTX) 7149 call = gen_rtx_SET (result, call); 7150 7151 if (sibcall) 7152 tmp = ret_rtx; 7153 else 7154 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM)); 7155 7156 vec = gen_rtvec (2, call, tmp); 7157 call = gen_rtx_PARALLEL (VOIDmode, vec); 7158 7159 aarch64_emit_call_insn (call); 7160 } 7161 7162 /* Emit call insn with PAT and do aarch64-specific handling. */ 7163 7164 void 7165 aarch64_emit_call_insn (rtx pat) 7166 { 7167 rtx insn = emit_call_insn (pat); 7168 7169 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn); 7170 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM)); 7171 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM)); 7172 } 7173 7174 machine_mode 7175 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y) 7176 { 7177 machine_mode mode_x = GET_MODE (x); 7178 rtx_code code_x = GET_CODE (x); 7179 7180 /* All floating point compares return CCFP if it is an equality 7181 comparison, and CCFPE otherwise. */ 7182 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT) 7183 { 7184 switch (code) 7185 { 7186 case EQ: 7187 case NE: 7188 case UNORDERED: 7189 case ORDERED: 7190 case UNLT: 7191 case UNLE: 7192 case UNGT: 7193 case UNGE: 7194 case UNEQ: 7195 return CCFPmode; 7196 7197 case LT: 7198 case LE: 7199 case GT: 7200 case GE: 7201 case LTGT: 7202 return CCFPEmode; 7203 7204 default: 7205 gcc_unreachable (); 7206 } 7207 } 7208 7209 /* Equality comparisons of short modes against zero can be performed 7210 using the TST instruction with the appropriate bitmask. */ 7211 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x)) 7212 && (code == EQ || code == NE) 7213 && (mode_x == HImode || mode_x == QImode)) 7214 return CC_NZmode; 7215 7216 /* Similarly, comparisons of zero_extends from shorter modes can 7217 be performed using an ANDS with an immediate mask. */ 7218 if (y == const0_rtx && code_x == ZERO_EXTEND 7219 && (mode_x == SImode || mode_x == DImode) 7220 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode) 7221 && (code == EQ || code == NE)) 7222 return CC_NZmode; 7223 7224 if ((mode_x == SImode || mode_x == DImode) 7225 && y == const0_rtx 7226 && (code == EQ || code == NE || code == LT || code == GE) 7227 && (code_x == PLUS || code_x == MINUS || code_x == AND 7228 || code_x == NEG 7229 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1)) 7230 && CONST_INT_P (XEXP (x, 2))))) 7231 return CC_NZmode; 7232 7233 /* A compare with a shifted operand. Because of canonicalization, 7234 the comparison will have to be swapped when we emit the assembly 7235 code. */ 7236 if ((mode_x == SImode || mode_x == DImode) 7237 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx) 7238 && (code_x == ASHIFT || code_x == ASHIFTRT 7239 || code_x == LSHIFTRT 7240 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND)) 7241 return CC_SWPmode; 7242 7243 /* Similarly for a negated operand, but we can only do this for 7244 equalities. */ 7245 if ((mode_x == SImode || mode_x == DImode) 7246 && (REG_P (y) || GET_CODE (y) == SUBREG) 7247 && (code == EQ || code == NE) 7248 && code_x == NEG) 7249 return CC_Zmode; 7250 7251 /* A test for unsigned overflow from an addition. 
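The comparison has the form (LTU (plus X Y) X) or (GEU (plus X Y) X); an unsigned addition overflows exactly when the truncated sum is smaller than one of the addends.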
*/ 7252 if ((mode_x == DImode || mode_x == TImode) 7253 && (code == LTU || code == GEU) 7254 && code_x == PLUS 7255 && rtx_equal_p (XEXP (x, 0), y)) 7256 return CC_Cmode; 7257 7258 /* A test for unsigned overflow from an add with carry. */ 7259 if ((mode_x == DImode || mode_x == TImode) 7260 && (code == LTU || code == GEU) 7261 && code_x == PLUS 7262 && CONST_SCALAR_INT_P (y) 7263 && (rtx_mode_t (y, mode_x) 7264 == (wi::shwi (1, mode_x) 7265 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2)))) 7266 return CC_ADCmode; 7267 7268 /* A test for signed overflow. */ 7269 if ((mode_x == DImode || mode_x == TImode) 7270 && code == NE 7271 && code_x == PLUS 7272 && GET_CODE (y) == SIGN_EXTEND) 7273 return CC_Vmode; 7274 7275 /* For everything else, return CCmode. */ 7276 return CCmode; 7277 } 7278 7279 static int 7280 aarch64_get_condition_code_1 (machine_mode, enum rtx_code); 7281 7282 int 7283 aarch64_get_condition_code (rtx x) 7284 { 7285 machine_mode mode = GET_MODE (XEXP (x, 0)); 7286 enum rtx_code comp_code = GET_CODE (x); 7287 7288 if (GET_MODE_CLASS (mode) != MODE_CC) 7289 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1)); 7290 return aarch64_get_condition_code_1 (mode, comp_code); 7291 } 7292 7293 static int 7294 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code) 7295 { 7296 switch (mode) 7297 { 7298 case E_CCFPmode: 7299 case E_CCFPEmode: 7300 switch (comp_code) 7301 { 7302 case GE: return AARCH64_GE; 7303 case GT: return AARCH64_GT; 7304 case LE: return AARCH64_LS; 7305 case LT: return AARCH64_MI; 7306 case NE: return AARCH64_NE; 7307 case EQ: return AARCH64_EQ; 7308 case ORDERED: return AARCH64_VC; 7309 case UNORDERED: return AARCH64_VS; 7310 case UNLT: return AARCH64_LT; 7311 case UNLE: return AARCH64_LE; 7312 case UNGT: return AARCH64_HI; 7313 case UNGE: return AARCH64_PL; 7314 default: return -1; 7315 } 7316 break; 7317 7318 case E_CCmode: 7319 switch (comp_code) 7320 { 7321 case NE: return AARCH64_NE; 7322 case EQ: return AARCH64_EQ; 7323 case GE: return AARCH64_GE; 7324 case GT: return AARCH64_GT; 7325 case LE: return AARCH64_LE; 7326 case LT: return AARCH64_LT; 7327 case GEU: return AARCH64_CS; 7328 case GTU: return AARCH64_HI; 7329 case LEU: return AARCH64_LS; 7330 case LTU: return AARCH64_CC; 7331 default: return -1; 7332 } 7333 break; 7334 7335 case E_CC_SWPmode: 7336 switch (comp_code) 7337 { 7338 case NE: return AARCH64_NE; 7339 case EQ: return AARCH64_EQ; 7340 case GE: return AARCH64_LE; 7341 case GT: return AARCH64_LT; 7342 case LE: return AARCH64_GE; 7343 case LT: return AARCH64_GT; 7344 case GEU: return AARCH64_LS; 7345 case GTU: return AARCH64_CC; 7346 case LEU: return AARCH64_CS; 7347 case LTU: return AARCH64_HI; 7348 default: return -1; 7349 } 7350 break; 7351 7352 case E_CC_NZmode: 7353 switch (comp_code) 7354 { 7355 case NE: return AARCH64_NE; 7356 case EQ: return AARCH64_EQ; 7357 case GE: return AARCH64_PL; 7358 case LT: return AARCH64_MI; 7359 default: return -1; 7360 } 7361 break; 7362 7363 case E_CC_Zmode: 7364 switch (comp_code) 7365 { 7366 case NE: return AARCH64_NE; 7367 case EQ: return AARCH64_EQ; 7368 default: return -1; 7369 } 7370 break; 7371 7372 case E_CC_Cmode: 7373 switch (comp_code) 7374 { 7375 case LTU: return AARCH64_CS; 7376 case GEU: return AARCH64_CC; 7377 default: return -1; 7378 } 7379 break; 7380 7381 case E_CC_ADCmode: 7382 switch (comp_code) 7383 { 7384 case GEU: return AARCH64_CS; 7385 case LTU: return AARCH64_CC; 7386 default: return -1; 7387 } 7388 break; 7389 7390 case E_CC_Vmode: 7391 switch (comp_code) 
7392 { 7393 case NE: return AARCH64_VS; 7394 case EQ: return AARCH64_VC; 7395 default: return -1; 7396 } 7397 break; 7398 7399 default: 7400 return -1; 7401 } 7402 7403 return -1; 7404 } 7405 7406 bool 7407 aarch64_const_vec_all_same_in_range_p (rtx x, 7408 HOST_WIDE_INT minval, 7409 HOST_WIDE_INT maxval) 7410 { 7411 rtx elt; 7412 return (const_vec_duplicate_p (x, &elt) 7413 && CONST_INT_P (elt) 7414 && IN_RANGE (INTVAL (elt), minval, maxval)); 7415 } 7416 7417 bool 7418 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val) 7419 { 7420 return aarch64_const_vec_all_same_in_range_p (x, val, val); 7421 } 7422 7423 /* Return true if VEC is a constant in which every element is in the range 7424 [MINVAL, MAXVAL]. The elements do not need to have the same value. */ 7425 7426 static bool 7427 aarch64_const_vec_all_in_range_p (rtx vec, 7428 HOST_WIDE_INT minval, 7429 HOST_WIDE_INT maxval) 7430 { 7431 if (GET_CODE (vec) != CONST_VECTOR 7432 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT) 7433 return false; 7434 7435 int nunits; 7436 if (!CONST_VECTOR_STEPPED_P (vec)) 7437 nunits = const_vector_encoded_nelts (vec); 7438 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits)) 7439 return false; 7440 7441 for (int i = 0; i < nunits; i++) 7442 { 7443 rtx vec_elem = CONST_VECTOR_ELT (vec, i); 7444 if (!CONST_INT_P (vec_elem) 7445 || !IN_RANGE (INTVAL (vec_elem), minval, maxval)) 7446 return false; 7447 } 7448 return true; 7449 } 7450 7451 /* N Z C V. */ 7452 #define AARCH64_CC_V 1 7453 #define AARCH64_CC_C (1 << 1) 7454 #define AARCH64_CC_Z (1 << 2) 7455 #define AARCH64_CC_N (1 << 3) 7456 7457 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */ 7458 static const int aarch64_nzcv_codes[] = 7459 { 7460 0, /* EQ, Z == 1. */ 7461 AARCH64_CC_Z, /* NE, Z == 0. */ 7462 0, /* CS, C == 1. */ 7463 AARCH64_CC_C, /* CC, C == 0. */ 7464 0, /* MI, N == 1. */ 7465 AARCH64_CC_N, /* PL, N == 0. */ 7466 0, /* VS, V == 1. */ 7467 AARCH64_CC_V, /* VC, V == 0. */ 7468 0, /* HI, C ==1 && Z == 0. */ 7469 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */ 7470 AARCH64_CC_V, /* GE, N == V. */ 7471 0, /* LT, N != V. */ 7472 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */ 7473 0, /* LE, !(Z == 0 && N == V). */ 7474 0, /* AL, Any. */ 7475 0 /* NV, Any. */ 7476 }; 7477 7478 /* Print floating-point vector immediate operand X to F, negating it 7479 first if NEGATE is true. Return true on success, false if it isn't 7480 a constant we can handle. */ 7481 7482 static bool 7483 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate) 7484 { 7485 rtx elt; 7486 7487 if (!const_vec_duplicate_p (x, &elt)) 7488 return false; 7489 7490 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt); 7491 if (negate) 7492 r = real_value_negate (&r); 7493 7494 /* We only handle the SVE single-bit immediates here. */ 7495 if (real_equal (&r, &dconst0)) 7496 asm_fprintf (f, "0.0"); 7497 else if (real_equal (&r, &dconst1)) 7498 asm_fprintf (f, "1.0"); 7499 else if (real_equal (&r, &dconsthalf)) 7500 asm_fprintf (f, "0.5"); 7501 else 7502 return false; 7503 7504 return true; 7505 } 7506 7507 /* Return the equivalent letter for size. */ 7508 static char 7509 sizetochar (int size) 7510 { 7511 switch (size) 7512 { 7513 case 64: return 'd'; 7514 case 32: return 's'; 7515 case 16: return 'h'; 7516 case 8 : return 'b'; 7517 default: gcc_unreachable (); 7518 } 7519 } 7520 7521 /* Print operand X to file F in a target specific manner according to CODE. 
7522 The acceptable formatting commands given by CODE are: 7523 'c': An integer or symbol address without a preceding # 7524 sign. 7525 'C': Take the duplicated element in a vector constant 7526 and print it in hex. 7527 'D': Take the duplicated element in a vector constant 7528 and print it as an unsigned integer, in decimal. 7529 'e': Print the sign/zero-extend size as a character 8->b, 7530 16->h, 32->w. 7531 'p': Prints N such that 2^N == X (X must be power of 2 and 7532 const int). 7533 'P': Print the number of non-zero bits in X (a const_int). 7534 'H': Print the higher numbered register of a pair (TImode) 7535 of regs. 7536 'm': Print a condition (eq, ne, etc). 7537 'M': Same as 'm', but invert condition. 7538 'N': Take the duplicated element in a vector constant 7539 and print the negative of it in decimal. 7540 'b/h/s/d/q': Print a scalar FP/SIMD register name. 7541 'S/T/U/V': Print a FP/SIMD register name for a register list. 7542 The register printed is the FP/SIMD register name 7543 of X + 0/1/2/3 for S/T/U/V. 7544 'R': Print a scalar FP/SIMD register name + 1. 7545 'X': Print bottom 16 bits of integer constant in hex. 7546 'w/x': Print a general register name or the zero register 7547 (32-bit or 64-bit). 7548 '0': Print a normal operand, if it's a general register, 7549 then we assume DImode. 7550 'k': Print NZCV for conditional compare instructions. 7551 'A': Output address constant representing the first 7552 argument of X, specifying a relocation offset 7553 if appropriate. 7554 'L': Output constant address specified by X 7555 with a relocation offset if appropriate. 7556 'G': Prints address of X, specifying a PC relative 7557 relocation mode if appropriate. 7558 'y': Output address of LDP or STP - this is used for 7559 some LDP/STPs which don't use a PARALLEL in their 7560 pattern (so the mode needs to be adjusted). 7561 'z': Output address of a typical LDP or STP. */ 7562 7563 static void 7564 aarch64_print_operand (FILE *f, rtx x, int code) 7565 { 7566 rtx elt; 7567 switch (code) 7568 { 7569 case 'c': 7570 switch (GET_CODE (x)) 7571 { 7572 case CONST_INT: 7573 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x)); 7574 break; 7575 7576 case SYMBOL_REF: 7577 output_addr_const (f, x); 7578 break; 7579 7580 case CONST: 7581 if (GET_CODE (XEXP (x, 0)) == PLUS 7582 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF) 7583 { 7584 output_addr_const (f, x); 7585 break; 7586 } 7587 /* Fall through. 
*/ 7588 7589 default: 7590 output_operand_lossage ("unsupported operand for code '%c'", code); 7591 } 7592 break; 7593 7594 case 'e': 7595 { 7596 int n; 7597 7598 if (!CONST_INT_P (x) 7599 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0) 7600 { 7601 output_operand_lossage ("invalid operand for '%%%c'", code); 7602 return; 7603 } 7604 7605 switch (n) 7606 { 7607 case 3: 7608 fputc ('b', f); 7609 break; 7610 case 4: 7611 fputc ('h', f); 7612 break; 7613 case 5: 7614 fputc ('w', f); 7615 break; 7616 default: 7617 output_operand_lossage ("invalid operand for '%%%c'", code); 7618 return; 7619 } 7620 } 7621 break; 7622 7623 case 'p': 7624 { 7625 int n; 7626 7627 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0) 7628 { 7629 output_operand_lossage ("invalid operand for '%%%c'", code); 7630 return; 7631 } 7632 7633 asm_fprintf (f, "%d", n); 7634 } 7635 break; 7636 7637 case 'P': 7638 if (!CONST_INT_P (x)) 7639 { 7640 output_operand_lossage ("invalid operand for '%%%c'", code); 7641 return; 7642 } 7643 7644 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x))); 7645 break; 7646 7647 case 'H': 7648 if (x == const0_rtx) 7649 { 7650 asm_fprintf (f, "xzr"); 7651 break; 7652 } 7653 7654 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1)) 7655 { 7656 output_operand_lossage ("invalid operand for '%%%c'", code); 7657 return; 7658 } 7659 7660 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]); 7661 break; 7662 7663 case 'M': 7664 case 'm': 7665 { 7666 int cond_code; 7667 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */ 7668 if (x == const_true_rtx) 7669 { 7670 if (code == 'M') 7671 fputs ("nv", f); 7672 return; 7673 } 7674 7675 if (!COMPARISON_P (x)) 7676 { 7677 output_operand_lossage ("invalid operand for '%%%c'", code); 7678 return; 7679 } 7680 7681 cond_code = aarch64_get_condition_code (x); 7682 gcc_assert (cond_code >= 0); 7683 if (code == 'M') 7684 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code); 7685 fputs (aarch64_condition_codes[cond_code], f); 7686 } 7687 break; 7688 7689 case 'N': 7690 if (!const_vec_duplicate_p (x, &elt)) 7691 { 7692 output_operand_lossage ("invalid vector constant"); 7693 return; 7694 } 7695 7696 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT) 7697 asm_fprintf (f, "%wd", -INTVAL (elt)); 7698 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT 7699 && aarch64_print_vector_float_operand (f, x, true)) 7700 ; 7701 else 7702 { 7703 output_operand_lossage ("invalid vector constant"); 7704 return; 7705 } 7706 break; 7707 7708 case 'b': 7709 case 'h': 7710 case 's': 7711 case 'd': 7712 case 'q': 7713 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x))) 7714 { 7715 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code); 7716 return; 7717 } 7718 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM); 7719 break; 7720 7721 case 'S': 7722 case 'T': 7723 case 'U': 7724 case 'V': 7725 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x))) 7726 { 7727 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code); 7728 return; 7729 } 7730 asm_fprintf (f, "%c%d", 7731 aarch64_sve_data_mode_p (GET_MODE (x)) ? 
'z' : 'v', 7732 REGNO (x) - V0_REGNUM + (code - 'S')); 7733 break; 7734 7735 case 'R': 7736 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x))) 7737 { 7738 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code); 7739 return; 7740 } 7741 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1); 7742 break; 7743 7744 case 'X': 7745 if (!CONST_INT_P (x)) 7746 { 7747 output_operand_lossage ("invalid operand for '%%%c'", code); 7748 return; 7749 } 7750 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff); 7751 break; 7752 7753 case 'C': 7754 { 7755 /* Print a replicated constant in hex. */ 7756 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt)) 7757 { 7758 output_operand_lossage ("invalid operand for '%%%c'", code); 7759 return; 7760 } 7761 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x)); 7762 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode)); 7763 } 7764 break; 7765 7766 case 'D': 7767 { 7768 /* Print a replicated constant in decimal, treating it as 7769 unsigned. */ 7770 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt)) 7771 { 7772 output_operand_lossage ("invalid operand for '%%%c'", code); 7773 return; 7774 } 7775 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x)); 7776 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode)); 7777 } 7778 break; 7779 7780 case 'w': 7781 case 'x': 7782 if (x == const0_rtx 7783 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x))) 7784 { 7785 asm_fprintf (f, "%czr", code); 7786 break; 7787 } 7788 7789 if (REG_P (x) && GP_REGNUM_P (REGNO (x))) 7790 { 7791 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM); 7792 break; 7793 } 7794 7795 if (REG_P (x) && REGNO (x) == SP_REGNUM) 7796 { 7797 asm_fprintf (f, "%ssp", code == 'w' ? "w" : ""); 7798 break; 7799 } 7800 7801 /* Fall through */ 7802 7803 case 0: 7804 if (x == NULL) 7805 { 7806 output_operand_lossage ("missing operand"); 7807 return; 7808 } 7809 7810 switch (GET_CODE (x)) 7811 { 7812 case REG: 7813 if (aarch64_sve_data_mode_p (GET_MODE (x))) 7814 { 7815 if (REG_NREGS (x) == 1) 7816 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM); 7817 else 7818 { 7819 char suffix 7820 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x))); 7821 asm_fprintf (f, "{z%d.%c - z%d.%c}", 7822 REGNO (x) - V0_REGNUM, suffix, 7823 END_REGNO (x) - V0_REGNUM - 1, suffix); 7824 } 7825 } 7826 else 7827 asm_fprintf (f, "%s", reg_names [REGNO (x)]); 7828 break; 7829 7830 case MEM: 7831 output_address (GET_MODE (x), XEXP (x, 0)); 7832 break; 7833 7834 case LABEL_REF: 7835 case SYMBOL_REF: 7836 output_addr_const (asm_out_file, x); 7837 break; 7838 7839 case CONST_INT: 7840 asm_fprintf (f, "%wd", INTVAL (x)); 7841 break; 7842 7843 case CONST: 7844 if (!VECTOR_MODE_P (GET_MODE (x))) 7845 { 7846 output_addr_const (asm_out_file, x); 7847 break; 7848 } 7849 /* fall through */ 7850 7851 case CONST_VECTOR: 7852 if (!const_vec_duplicate_p (x, &elt)) 7853 { 7854 output_operand_lossage ("invalid vector constant"); 7855 return; 7856 } 7857 7858 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT) 7859 asm_fprintf (f, "%wd", INTVAL (elt)); 7860 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT 7861 && aarch64_print_vector_float_operand (f, x, false)) 7862 ; 7863 else 7864 { 7865 output_operand_lossage ("invalid vector constant"); 7866 return; 7867 } 7868 break; 7869 7870 case CONST_DOUBLE: 7871 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever 7872 be getting CONST_DOUBLEs holding integers. 
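An integer CONST_DOUBLE would have VOIDmode, which the assert below rejects.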
*/ 7873 gcc_assert (GET_MODE (x) != VOIDmode); 7874 if (aarch64_float_const_zero_rtx_p (x)) 7875 { 7876 fputc ('0', f); 7877 break; 7878 } 7879 else if (aarch64_float_const_representable_p (x)) 7880 { 7881 #define buf_size 20 7882 char float_buf[buf_size] = {'\0'}; 7883 real_to_decimal_for_mode (float_buf, 7884 CONST_DOUBLE_REAL_VALUE (x), 7885 buf_size, buf_size, 7886 1, GET_MODE (x)); 7887 asm_fprintf (asm_out_file, "%s", float_buf); 7888 break; 7889 #undef buf_size 7890 } 7891 output_operand_lossage ("invalid constant"); 7892 return; 7893 default: 7894 output_operand_lossage ("invalid operand"); 7895 return; 7896 } 7897 break; 7898 7899 case 'A': 7900 if (GET_CODE (x) == HIGH) 7901 x = XEXP (x, 0); 7902 7903 switch (aarch64_classify_symbolic_expression (x)) 7904 { 7905 case SYMBOL_SMALL_GOT_4G: 7906 asm_fprintf (asm_out_file, ":got:"); 7907 break; 7908 7909 case SYMBOL_SMALL_TLSGD: 7910 asm_fprintf (asm_out_file, ":tlsgd:"); 7911 break; 7912 7913 case SYMBOL_SMALL_TLSDESC: 7914 asm_fprintf (asm_out_file, ":tlsdesc:"); 7915 break; 7916 7917 case SYMBOL_SMALL_TLSIE: 7918 asm_fprintf (asm_out_file, ":gottprel:"); 7919 break; 7920 7921 case SYMBOL_TLSLE24: 7922 asm_fprintf (asm_out_file, ":tprel:"); 7923 break; 7924 7925 case SYMBOL_TINY_GOT: 7926 gcc_unreachable (); 7927 break; 7928 7929 default: 7930 break; 7931 } 7932 output_addr_const (asm_out_file, x); 7933 break; 7934 7935 case 'L': 7936 switch (aarch64_classify_symbolic_expression (x)) 7937 { 7938 case SYMBOL_SMALL_GOT_4G: 7939 asm_fprintf (asm_out_file, ":lo12:"); 7940 break; 7941 7942 case SYMBOL_SMALL_TLSGD: 7943 asm_fprintf (asm_out_file, ":tlsgd_lo12:"); 7944 break; 7945 7946 case SYMBOL_SMALL_TLSDESC: 7947 asm_fprintf (asm_out_file, ":tlsdesc_lo12:"); 7948 break; 7949 7950 case SYMBOL_SMALL_TLSIE: 7951 asm_fprintf (asm_out_file, ":gottprel_lo12:"); 7952 break; 7953 7954 case SYMBOL_TLSLE12: 7955 asm_fprintf (asm_out_file, ":tprel_lo12:"); 7956 break; 7957 7958 case SYMBOL_TLSLE24: 7959 asm_fprintf (asm_out_file, ":tprel_lo12_nc:"); 7960 break; 7961 7962 case SYMBOL_TINY_GOT: 7963 asm_fprintf (asm_out_file, ":got:"); 7964 break; 7965 7966 case SYMBOL_TINY_TLSIE: 7967 asm_fprintf (asm_out_file, ":gottprel:"); 7968 break; 7969 7970 default: 7971 break; 7972 } 7973 output_addr_const (asm_out_file, x); 7974 break; 7975 7976 case 'G': 7977 switch (aarch64_classify_symbolic_expression (x)) 7978 { 7979 case SYMBOL_TLSLE24: 7980 asm_fprintf (asm_out_file, ":tprel_hi12:"); 7981 break; 7982 default: 7983 break; 7984 } 7985 output_addr_const (asm_out_file, x); 7986 break; 7987 7988 case 'k': 7989 { 7990 HOST_WIDE_INT cond_code; 7991 7992 if (!CONST_INT_P (x)) 7993 { 7994 output_operand_lossage ("invalid operand for '%%%c'", code); 7995 return; 7996 } 7997 7998 cond_code = INTVAL (x); 7999 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV); 8000 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]); 8001 } 8002 break; 8003 8004 case 'y': 8005 case 'z': 8006 { 8007 machine_mode mode = GET_MODE (x); 8008 8009 if (GET_CODE (x) != MEM 8010 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16))) 8011 { 8012 output_operand_lossage ("invalid operand for '%%%c'", code); 8013 return; 8014 } 8015 8016 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0), 8017 code == 'y' 8018 ? 
ADDR_QUERY_LDP_STP_N 8019 : ADDR_QUERY_LDP_STP)) 8020 output_operand_lossage ("invalid operand prefix '%%%c'", code); 8021 } 8022 break; 8023 8024 default: 8025 output_operand_lossage ("invalid operand prefix '%%%c'", code); 8026 return; 8027 } 8028 } 8029 8030 /* Print address 'x' of a memory access with mode 'mode'. 8031 TYPE is the aarch64_addr_query_type that gives aarch64_classify_address 8032 the context of the access, e.g. a plain memory access or an LDP/STP operand. */ 8033 static bool 8034 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x, 8035 aarch64_addr_query_type type) 8036 { 8037 struct aarch64_address_info addr; 8038 unsigned int size; 8039 8040 /* Check all addresses are Pmode - including ILP32. */ 8041 if (GET_MODE (x) != Pmode 8042 && (!CONST_INT_P (x) 8043 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x))) 8044 { 8045 output_operand_lossage ("invalid address mode"); 8046 return false; 8047 } 8048 8049 if (aarch64_classify_address (&addr, x, mode, true, type)) 8050 switch (addr.type) 8051 { 8052 case ADDRESS_REG_IMM: 8053 if (known_eq (addr.const_offset, 0)) 8054 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]); 8055 else if (aarch64_sve_data_mode_p (mode)) 8056 { 8057 HOST_WIDE_INT vnum 8058 = exact_div (addr.const_offset, 8059 BYTES_PER_SVE_VECTOR).to_constant (); 8060 asm_fprintf (f, "[%s, #%wd, mul vl]", 8061 reg_names[REGNO (addr.base)], vnum); 8062 } 8063 else if (aarch64_sve_pred_mode_p (mode)) 8064 { 8065 HOST_WIDE_INT vnum 8066 = exact_div (addr.const_offset, 8067 BYTES_PER_SVE_PRED).to_constant (); 8068 asm_fprintf (f, "[%s, #%wd, mul vl]", 8069 reg_names[REGNO (addr.base)], vnum); 8070 } 8071 else 8072 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)], 8073 INTVAL (addr.offset)); 8074 return true; 8075 8076 case ADDRESS_REG_REG: 8077 if (addr.shift == 0) 8078 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)], 8079 reg_names [REGNO (addr.offset)]); 8080 else 8081 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)], 8082 reg_names [REGNO (addr.offset)], addr.shift); 8083 return true; 8084 8085 case ADDRESS_REG_UXTW: 8086 if (addr.shift == 0) 8087 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)], 8088 REGNO (addr.offset) - R0_REGNUM); 8089 else 8090 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)], 8091 REGNO (addr.offset) - R0_REGNUM, addr.shift); 8092 return true; 8093 8094 case ADDRESS_REG_SXTW: 8095 if (addr.shift == 0) 8096 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)], 8097 REGNO (addr.offset) - R0_REGNUM); 8098 else 8099 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)], 8100 REGNO (addr.offset) - R0_REGNUM, addr.shift); 8101 return true; 8102 8103 case ADDRESS_REG_WB: 8104 /* Writeback is only supported for fixed-width modes.
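The size is therefore known at compile time and the conversion below cannot fail.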
*/ 8105 size = GET_MODE_SIZE (mode).to_constant (); 8106 switch (GET_CODE (x)) 8107 { 8108 case PRE_INC: 8109 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size); 8110 return true; 8111 case POST_INC: 8112 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size); 8113 return true; 8114 case PRE_DEC: 8115 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size); 8116 return true; 8117 case POST_DEC: 8118 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size); 8119 return true; 8120 case PRE_MODIFY: 8121 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)], 8122 INTVAL (addr.offset)); 8123 return true; 8124 case POST_MODIFY: 8125 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)], 8126 INTVAL (addr.offset)); 8127 return true; 8128 default: 8129 break; 8130 } 8131 break; 8132 8133 case ADDRESS_LO_SUM: 8134 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]); 8135 output_addr_const (f, addr.offset); 8136 asm_fprintf (f, "]"); 8137 return true; 8138 8139 case ADDRESS_SYMBOLIC: 8140 output_addr_const (f, x); 8141 return true; 8142 } 8143 8144 return false; 8145 } 8146 8147 /* Print address 'x' of a memory access with mode 'mode'. */ 8148 static void 8149 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x) 8150 { 8151 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY)) 8152 output_addr_const (f, x); 8153 } 8154 8155 bool 8156 aarch64_label_mentioned_p (rtx x) 8157 { 8158 const char *fmt; 8159 int i; 8160 8161 if (GET_CODE (x) == LABEL_REF) 8162 return true; 8163 8164 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the 8165 referencing instruction, but they are constant offsets, not 8166 symbols. */ 8167 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS) 8168 return false; 8169 8170 fmt = GET_RTX_FORMAT (GET_CODE (x)); 8171 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--) 8172 { 8173 if (fmt[i] == 'E') 8174 { 8175 int j; 8176 8177 for (j = XVECLEN (x, i) - 1; j >= 0; j--) 8178 if (aarch64_label_mentioned_p (XVECEXP (x, i, j))) 8179 return 1; 8180 } 8181 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i))) 8182 return 1; 8183 } 8184 8185 return 0; 8186 } 8187 8188 /* Implement REGNO_REG_CLASS. */ 8189 8190 enum reg_class 8191 aarch64_regno_regclass (unsigned regno) 8192 { 8193 if (GP_REGNUM_P (regno)) 8194 return GENERAL_REGS; 8195 8196 if (regno == SP_REGNUM) 8197 return STACK_REG; 8198 8199 if (regno == FRAME_POINTER_REGNUM 8200 || regno == ARG_POINTER_REGNUM) 8201 return POINTER_REGS; 8202 8203 if (FP_REGNUM_P (regno)) 8204 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS; 8205 8206 if (PR_REGNUM_P (regno)) 8207 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS; 8208 8209 return NO_REGS; 8210 } 8211 8212 /* OFFSET is an address offset for mode MODE, which has SIZE bytes. 8213 If OFFSET is out of range, return an offset of an anchor point 8214 that is in range. Return 0 otherwise. */ 8215 8216 static HOST_WIDE_INT 8217 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size, 8218 machine_mode mode) 8219 { 8220 /* Does it look like we'll need a 16-byte load/store-pair operation? */ 8221 if (size > 16) 8222 return (offset + 0x400) & ~0x7f0; 8223 8224 /* For offsets that aren't a multiple of the access size, the limit is 8225 -256...255. */ 8226 if (offset & (size - 1)) 8227 { 8228 /* BLKmode typically uses LDP of X-registers. 
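The anchor chosen below leaves a residual offset in the range [-512, 511].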
*/ 8229 if (mode == BLKmode) 8230 return (offset + 512) & ~0x3ff; 8231 return (offset + 0x100) & ~0x1ff; 8232 } 8233 8234 /* Small negative offsets are supported. */ 8235 if (IN_RANGE (offset, -256, 0)) 8236 return 0; 8237 8238 if (mode == TImode || mode == TFmode) 8239 return (offset + 0x100) & ~0x1ff; 8240 8241 /* Use 12-bit offset by access size. */ 8242 return offset & (~0xfff * size); 8243 } 8244 8245 static rtx 8246 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode) 8247 { 8248 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask), 8249 where mask is selected by alignment and size of the offset. 8250 We try to pick as large a range for the offset as possible to 8251 maximize the chance of a CSE. However, for aligned addresses 8252 we limit the range to 4k so that structures with different sized 8253 elements are likely to use the same base. We need to be careful 8254 not to split a CONST for some forms of address expression, otherwise 8255 it will generate sub-optimal code. */ 8256 8257 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1))) 8258 { 8259 rtx base = XEXP (x, 0); 8260 rtx offset_rtx = XEXP (x, 1); 8261 HOST_WIDE_INT offset = INTVAL (offset_rtx); 8262 8263 if (GET_CODE (base) == PLUS) 8264 { 8265 rtx op0 = XEXP (base, 0); 8266 rtx op1 = XEXP (base, 1); 8267 8268 /* Force any scaling into a temp for CSE. */ 8269 op0 = force_reg (Pmode, op0); 8270 op1 = force_reg (Pmode, op1); 8271 8272 /* Let the pointer register be in op0. */ 8273 if (REG_POINTER (op1)) 8274 std::swap (op0, op1); 8275 8276 /* If the pointer is virtual or frame related, then we know that 8277 virtual register instantiation or register elimination is going 8278 to apply a second constant. We want the two constants folded 8279 together easily. Therefore, emit as (OP0 + CONST) + OP1. */ 8280 if (virt_or_elim_regno_p (REGNO (op0))) 8281 { 8282 base = expand_binop (Pmode, add_optab, op0, offset_rtx, 8283 NULL_RTX, true, OPTAB_DIRECT); 8284 return gen_rtx_PLUS (Pmode, base, op1); 8285 } 8286 8287 /* Otherwise, in order to encourage CSE (and thence loop strength 8288 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */ 8289 base = expand_binop (Pmode, add_optab, op0, op1, 8290 NULL_RTX, true, OPTAB_DIRECT); 8291 x = gen_rtx_PLUS (Pmode, base, offset_rtx); 8292 } 8293 8294 HOST_WIDE_INT size; 8295 if (GET_MODE_SIZE (mode).is_constant (&size)) 8296 { 8297 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size, 8298 mode); 8299 if (base_offset != 0) 8300 { 8301 base = plus_constant (Pmode, base, base_offset); 8302 base = force_operand (base, NULL_RTX); 8303 return plus_constant (Pmode, base, offset - base_offset); 8304 } 8305 } 8306 } 8307 8308 return x; 8309 } 8310 8311 static reg_class_t 8312 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x, 8313 reg_class_t rclass, 8314 machine_mode mode, 8315 secondary_reload_info *sri) 8316 { 8317 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled 8318 directly by the *aarch64_sve_mov<mode>_be move pattern. See the 8319 comment at the head of aarch64-sve.md for more details about the 8320 big-endian handling. 
*/ 8321 if (BYTES_BIG_ENDIAN 8322 && reg_class_subset_p (rclass, FP_REGS) 8323 && !((REG_P (x) && HARD_REGISTER_P (x)) 8324 || aarch64_simd_valid_immediate (x, NULL)) 8325 && aarch64_sve_data_mode_p (mode)) 8326 { 8327 sri->icode = CODE_FOR_aarch64_sve_reload_be; 8328 return NO_REGS; 8329 } 8330 8331 /* If we have to disable direct literal pool loads and stores because the 8332 function is too big, then we need a scratch register. */ 8333 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x) 8334 && (SCALAR_FLOAT_MODE_P (GET_MODE (x)) 8335 || targetm.vector_mode_supported_p (GET_MODE (x))) 8336 && !aarch64_pcrelative_literal_loads) 8337 { 8338 sri->icode = code_for_aarch64_reload_movcp (mode, DImode); 8339 return NO_REGS; 8340 } 8341 8342 /* Without the TARGET_SIMD instructions we cannot move a Q register 8343 to a Q register directly. We need a scratch. */ 8344 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x) 8345 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD 8346 && reg_class_subset_p (rclass, FP_REGS)) 8347 { 8348 sri->icode = code_for_aarch64_reload_mov (mode); 8349 return NO_REGS; 8350 } 8351 8352 /* A TFmode or TImode memory access should be handled via an FP_REGS 8353 because AArch64 has richer addressing modes for LDR/STR instructions 8354 than LDP/STP instructions. */ 8355 if (TARGET_FLOAT && rclass == GENERAL_REGS 8356 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x)) 8357 return FP_REGS; 8358 8359 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x)) 8360 return GENERAL_REGS; 8361 8362 return NO_REGS; 8363 } 8364 8365 static bool 8366 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to) 8367 { 8368 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM); 8369 8370 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM 8371 can only eliminate to HARD_FRAME_POINTER_REGNUM. */ 8372 if (frame_pointer_needed) 8373 return to == HARD_FRAME_POINTER_REGNUM; 8374 return true; 8375 } 8376 8377 poly_int64 8378 aarch64_initial_elimination_offset (unsigned from, unsigned to) 8379 { 8380 if (to == HARD_FRAME_POINTER_REGNUM) 8381 { 8382 if (from == ARG_POINTER_REGNUM) 8383 return cfun->machine->frame.hard_fp_offset; 8384 8385 if (from == FRAME_POINTER_REGNUM) 8386 return cfun->machine->frame.hard_fp_offset 8387 - cfun->machine->frame.locals_offset; 8388 } 8389 8390 if (to == STACK_POINTER_REGNUM) 8391 { 8392 if (from == FRAME_POINTER_REGNUM) 8393 return cfun->machine->frame.frame_size 8394 - cfun->machine->frame.locals_offset; 8395 } 8396 8397 return cfun->machine->frame.frame_size; 8398 } 8399 8400 /* Implement RETURN_ADDR_RTX. We do not support moving back to a 8401 previous frame. 
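A nonzero COUNT therefore yields a constant zero, while COUNT == 0 returns the value that LR had on entry to the function.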
*/ 8402 8403 rtx 8404 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED) 8405 { 8406 if (count != 0) 8407 return const0_rtx; 8408 return get_hard_reg_initial_val (Pmode, LR_REGNUM); 8409 } 8410 8411 8412 static void 8413 aarch64_asm_trampoline_template (FILE *f) 8414 { 8415 int offset1 = 16; 8416 int offset2 = 20; 8417 8418 if (aarch64_bti_enabled ()) 8419 { 8420 asm_fprintf (f, "\thint\t34 // bti c\n"); 8421 offset1 -= 4; 8422 offset2 -= 4; 8423 } 8424 8425 if (TARGET_ILP32) 8426 { 8427 asm_fprintf (f, "\tldr\tw%d, .+%d\n", IP1_REGNUM - R0_REGNUM, offset1); 8428 asm_fprintf (f, "\tldr\tw%d, .+%d\n", STATIC_CHAIN_REGNUM - R0_REGNUM, 8429 offset1); 8430 } 8431 else 8432 { 8433 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [IP1_REGNUM], offset1); 8434 asm_fprintf (f, "\tldr\t%s, .+%d\n", reg_names [STATIC_CHAIN_REGNUM], 8435 offset2); 8436 } 8437 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]); 8438 8439 /* The trampoline needs an extra padding instruction. In case if BTI is 8440 enabled the padding instruction is replaced by the BTI instruction at 8441 the beginning. */ 8442 if (!aarch64_bti_enabled ()) 8443 assemble_aligned_integer (4, const0_rtx); 8444 8445 assemble_aligned_integer (POINTER_BYTES, const0_rtx); 8446 assemble_aligned_integer (POINTER_BYTES, const0_rtx); 8447 } 8448 8449 static void 8450 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value) 8451 { 8452 rtx fnaddr, mem, a_tramp; 8453 const int tramp_code_sz = 16; 8454 8455 /* Don't need to copy the trailing D-words, we fill those in below. */ 8456 emit_block_move (m_tramp, assemble_trampoline_template (), 8457 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL); 8458 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz); 8459 fnaddr = XEXP (DECL_RTL (fndecl), 0); 8460 if (GET_MODE (fnaddr) != ptr_mode) 8461 fnaddr = convert_memory_address (ptr_mode, fnaddr); 8462 emit_move_insn (mem, fnaddr); 8463 8464 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES); 8465 emit_move_insn (mem, chain_value); 8466 8467 /* XXX We should really define a "clear_cache" pattern and use 8468 gen_clear_cache(). */ 8469 a_tramp = XEXP (m_tramp, 0); 8470 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"), 8471 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode, 8472 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE), 8473 ptr_mode); 8474 } 8475 8476 static unsigned char 8477 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode) 8478 { 8479 /* ??? Logically we should only need to provide a value when 8480 HARD_REGNO_MODE_OK says that at least one register in REGCLASS 8481 can hold MODE, but at the moment we need to handle all modes. 8482 Just ignore any runtime parts for registers that can't store them. */ 8483 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode)); 8484 unsigned int nregs; 8485 switch (regclass) 8486 { 8487 case TAILCALL_ADDR_REGS: 8488 case POINTER_REGS: 8489 case GENERAL_REGS: 8490 case ALL_REGS: 8491 case POINTER_AND_FP_REGS: 8492 case FP_REGS: 8493 case FP_LO_REGS: 8494 if (aarch64_sve_data_mode_p (mode) 8495 && constant_multiple_p (GET_MODE_SIZE (mode), 8496 BYTES_PER_SVE_VECTOR, &nregs)) 8497 return nregs; 8498 return (aarch64_vector_data_mode_p (mode) 8499 ? 
CEIL (lowest_size, UNITS_PER_VREG) 8500 : CEIL (lowest_size, UNITS_PER_WORD)); 8501 case STACK_REG: 8502 case PR_REGS: 8503 case PR_LO_REGS: 8504 case PR_HI_REGS: 8505 return 1; 8506 8507 case NO_REGS: 8508 return 0; 8509 8510 default: 8511 break; 8512 } 8513 gcc_unreachable (); 8514 } 8515 8516 static reg_class_t 8517 aarch64_preferred_reload_class (rtx x, reg_class_t regclass) 8518 { 8519 if (regclass == POINTER_REGS) 8520 return GENERAL_REGS; 8521 8522 if (regclass == STACK_REG) 8523 { 8524 if (REG_P(x) 8525 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS)) 8526 return regclass; 8527 8528 return NO_REGS; 8529 } 8530 8531 /* Register eliminiation can result in a request for 8532 SP+constant->FP_REGS. We cannot support such operations which 8533 use SP as source and an FP_REG as destination, so reject out 8534 right now. */ 8535 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS) 8536 { 8537 rtx lhs = XEXP (x, 0); 8538 8539 /* Look through a possible SUBREG introduced by ILP32. */ 8540 if (GET_CODE (lhs) == SUBREG) 8541 lhs = SUBREG_REG (lhs); 8542 8543 gcc_assert (REG_P (lhs)); 8544 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)), 8545 POINTER_REGS)); 8546 return NO_REGS; 8547 } 8548 8549 return regclass; 8550 } 8551 8552 void 8553 aarch64_asm_output_labelref (FILE* f, const char *name) 8554 { 8555 asm_fprintf (f, "%U%s", name); 8556 } 8557 8558 static void 8559 aarch64_elf_asm_constructor (rtx symbol, int priority) 8560 { 8561 if (priority == DEFAULT_INIT_PRIORITY) 8562 default_ctor_section_asm_out_constructor (symbol, priority); 8563 else 8564 { 8565 section *s; 8566 /* While priority is known to be in range [0, 65535], so 18 bytes 8567 would be enough, the compiler might not know that. To avoid 8568 -Wformat-truncation false positive, use a larger size. */ 8569 char buf[23]; 8570 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority); 8571 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL); 8572 switch_to_section (s); 8573 assemble_align (POINTER_SIZE); 8574 assemble_aligned_integer (POINTER_BYTES, symbol); 8575 } 8576 } 8577 8578 static void 8579 aarch64_elf_asm_destructor (rtx symbol, int priority) 8580 { 8581 if (priority == DEFAULT_INIT_PRIORITY) 8582 default_dtor_section_asm_out_destructor (symbol, priority); 8583 else 8584 { 8585 section *s; 8586 /* While priority is known to be in range [0, 65535], so 18 bytes 8587 would be enough, the compiler might not know that. To avoid 8588 -Wformat-truncation false positive, use a larger size. */ 8589 char buf[23]; 8590 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority); 8591 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL); 8592 switch_to_section (s); 8593 assemble_align (POINTER_SIZE); 8594 assemble_aligned_integer (POINTER_BYTES, symbol); 8595 } 8596 } 8597 8598 const char* 8599 aarch64_output_casesi (rtx *operands) 8600 { 8601 char buf[100]; 8602 char label[100]; 8603 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2]))); 8604 int index; 8605 static const char *const patterns[4][2] = 8606 { 8607 { 8608 "ldrb\t%w3, [%0,%w1,uxtw]", 8609 "add\t%3, %4, %w3, sxtb #2" 8610 }, 8611 { 8612 "ldrh\t%w3, [%0,%w1,uxtw #1]", 8613 "add\t%3, %4, %w3, sxth #2" 8614 }, 8615 { 8616 "ldr\t%w3, [%0,%w1,uxtw #2]", 8617 "add\t%3, %4, %w3, sxtw #2" 8618 }, 8619 /* We assume that DImode is only generated when not optimizing and 8620 that we don't really need 64-bit address offsets. That would 8621 imply an object file with 8GB of code in a single function! 
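The DImode entry therefore simply reuses the SImode sequence.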
*/ 8622 { 8623 "ldr\t%w3, [%0,%w1,uxtw #2]", 8624 "add\t%3, %4, %w3, sxtw #2" 8625 } 8626 }; 8627 8628 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC); 8629 8630 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec)); 8631 index = exact_log2 (GET_MODE_SIZE (mode)); 8632 8633 gcc_assert (index >= 0 && index <= 3); 8634 8635 /* Need to implement table size reduction, by chaning the code below. */ 8636 output_asm_insn (patterns[index][0], operands); 8637 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2])); 8638 snprintf (buf, sizeof (buf), 8639 "adr\t%%4, %s", targetm.strip_name_encoding (label)); 8640 output_asm_insn (buf, operands); 8641 output_asm_insn (patterns[index][1], operands); 8642 output_asm_insn ("br\t%3", operands); 8643 assemble_label (asm_out_file, label); 8644 return ""; 8645 } 8646 8647 8648 /* Return size in bits of an arithmetic operand which is shifted/scaled and 8649 masked such that it is suitable for a UXTB, UXTH, or UXTW extend 8650 operator. */ 8651 8652 int 8653 aarch64_uxt_size (int shift, HOST_WIDE_INT mask) 8654 { 8655 if (shift >= 0 && shift <= 3) 8656 { 8657 int size; 8658 for (size = 8; size <= 32; size *= 2) 8659 { 8660 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1; 8661 if (mask == bits << shift) 8662 return size; 8663 } 8664 } 8665 return 0; 8666 } 8667 8668 /* Constant pools are per function only when PC relative 8669 literal loads are true or we are in the large memory 8670 model. */ 8671 8672 static inline bool 8673 aarch64_can_use_per_function_literal_pools_p (void) 8674 { 8675 return (aarch64_pcrelative_literal_loads 8676 || aarch64_cmodel == AARCH64_CMODEL_LARGE); 8677 } 8678 8679 static bool 8680 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx) 8681 { 8682 /* We can't use blocks for constants when we're using a per-function 8683 constant pool. */ 8684 return !aarch64_can_use_per_function_literal_pools_p (); 8685 } 8686 8687 /* Select appropriate section for constants depending 8688 on where we place literal pools. */ 8689 8690 static section * 8691 aarch64_select_rtx_section (machine_mode mode, 8692 rtx x, 8693 unsigned HOST_WIDE_INT align) 8694 { 8695 if (aarch64_can_use_per_function_literal_pools_p ()) 8696 return function_section (current_function_decl); 8697 8698 return default_elf_select_rtx_section (mode, x, align); 8699 } 8700 8701 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */ 8702 void 8703 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree, 8704 HOST_WIDE_INT offset) 8705 { 8706 /* When using per-function literal pools, we must ensure that any code 8707 section is aligned to the minimal instruction length, lest we get 8708 errors from the assembler re "unaligned instructions". */ 8709 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ()) 8710 ASM_OUTPUT_ALIGN (f, 2); 8711 } 8712 8713 /* Costs. */ 8714 8715 /* Helper function for rtx cost calculation. Strip a shift expression 8716 from X. Returns the inner operand if successful, or the original 8717 expression on failure. */ 8718 static rtx 8719 aarch64_strip_shift (rtx x) 8720 { 8721 rtx op = x; 8722 8723 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant 8724 we can convert both to ROR during final output. 
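A multiplication by a constant power of two is likewise treated as a shift by the second test below.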
*/ 8725 if ((GET_CODE (op) == ASHIFT 8726 || GET_CODE (op) == ASHIFTRT 8727 || GET_CODE (op) == LSHIFTRT 8728 || GET_CODE (op) == ROTATERT 8729 || GET_CODE (op) == ROTATE) 8730 && CONST_INT_P (XEXP (op, 1))) 8731 return XEXP (op, 0); 8732 8733 if (GET_CODE (op) == MULT 8734 && CONST_INT_P (XEXP (op, 1)) 8735 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64) 8736 return XEXP (op, 0); 8737 8738 return x; 8739 } 8740 8741 /* Helper function for rtx cost calculation. Strip an extend 8742 expression from X. Returns the inner operand if successful, or the 8743 original expression on failure. We deal with a number of possible 8744 canonicalization variations here. If STRIP_SHIFT is true, then 8745 we can strip off a shift also. */ 8746 static rtx 8747 aarch64_strip_extend (rtx x, bool strip_shift) 8748 { 8749 scalar_int_mode mode; 8750 rtx op = x; 8751 8752 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode)) 8753 return op; 8754 8755 /* Zero and sign extraction of a widened value. */ 8756 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT) 8757 && XEXP (op, 2) == const0_rtx 8758 && GET_CODE (XEXP (op, 0)) == MULT 8759 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1), 8760 XEXP (op, 1))) 8761 return XEXP (XEXP (op, 0), 0); 8762 8763 /* It can also be represented (for zero-extend) as an AND with an 8764 immediate. */ 8765 if (GET_CODE (op) == AND 8766 && GET_CODE (XEXP (op, 0)) == MULT 8767 && CONST_INT_P (XEXP (XEXP (op, 0), 1)) 8768 && CONST_INT_P (XEXP (op, 1)) 8769 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))), 8770 INTVAL (XEXP (op, 1))) != 0) 8771 return XEXP (XEXP (op, 0), 0); 8772 8773 /* Now handle extended register, as this may also have an optional 8774 left shift by 1..4. */ 8775 if (strip_shift 8776 && GET_CODE (op) == ASHIFT 8777 && CONST_INT_P (XEXP (op, 1)) 8778 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4) 8779 op = XEXP (op, 0); 8780 8781 if (GET_CODE (op) == ZERO_EXTEND 8782 || GET_CODE (op) == SIGN_EXTEND) 8783 op = XEXP (op, 0); 8784 8785 if (op != x) 8786 return op; 8787 8788 return x; 8789 } 8790 8791 /* Return true iff CODE is a shift supported in combination 8792 with arithmetic instructions. */ 8793 8794 static bool 8795 aarch64_shift_p (enum rtx_code code) 8796 { 8797 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT; 8798 } 8799 8800 8801 /* Return true iff X is a cheap shift without a sign extend. */ 8802 8803 static bool 8804 aarch64_cheap_mult_shift_p (rtx x) 8805 { 8806 rtx op0, op1; 8807 8808 op0 = XEXP (x, 0); 8809 op1 = XEXP (x, 1); 8810 8811 if (!(aarch64_tune_params.extra_tuning_flags 8812 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND)) 8813 return false; 8814 8815 if (GET_CODE (op0) == SIGN_EXTEND) 8816 return false; 8817 8818 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1) 8819 && UINTVAL (op1) <= 4) 8820 return true; 8821 8822 if (GET_CODE (x) != MULT || !CONST_INT_P (op1)) 8823 return false; 8824 8825 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1)); 8826 8827 if (l2 > 0 && l2 <= 4) 8828 return true; 8829 8830 return false; 8831 } 8832 8833 /* Helper function for rtx cost calculation. Calculate the cost of 8834 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx. 8835 Return the calculated cost of the expression, recursing manually in to 8836 operands where needed. 
*/ 8837 8838 static int 8839 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed) 8840 { 8841 rtx op0, op1; 8842 const struct cpu_cost_table *extra_cost 8843 = aarch64_tune_params.insn_extra_cost; 8844 int cost = 0; 8845 bool compound_p = (outer == PLUS || outer == MINUS); 8846 machine_mode mode = GET_MODE (x); 8847 8848 gcc_checking_assert (code == MULT); 8849 8850 op0 = XEXP (x, 0); 8851 op1 = XEXP (x, 1); 8852 8853 if (VECTOR_MODE_P (mode)) 8854 mode = GET_MODE_INNER (mode); 8855 8856 /* Integer multiply/fma. */ 8857 if (GET_MODE_CLASS (mode) == MODE_INT) 8858 { 8859 /* The multiply will be canonicalized as a shift, cost it as such. */ 8860 if (aarch64_shift_p (GET_CODE (x)) 8861 || (CONST_INT_P (op1) 8862 && exact_log2 (INTVAL (op1)) > 0)) 8863 { 8864 bool is_extend = GET_CODE (op0) == ZERO_EXTEND 8865 || GET_CODE (op0) == SIGN_EXTEND; 8866 if (speed) 8867 { 8868 if (compound_p) 8869 { 8870 /* If the shift is considered cheap, 8871 then don't add any cost. */ 8872 if (aarch64_cheap_mult_shift_p (x)) 8873 ; 8874 else if (REG_P (op1)) 8875 /* ARITH + shift-by-register. */ 8876 cost += extra_cost->alu.arith_shift_reg; 8877 else if (is_extend) 8878 /* ARITH + extended register. We don't have a cost field 8879 for ARITH+EXTEND+SHIFT, so use extend_arith here. */ 8880 cost += extra_cost->alu.extend_arith; 8881 else 8882 /* ARITH + shift-by-immediate. */ 8883 cost += extra_cost->alu.arith_shift; 8884 } 8885 else 8886 /* LSL (immediate). */ 8887 cost += extra_cost->alu.shift; 8888 8889 } 8890 /* Strip extends as we will have costed them in the case above. */ 8891 if (is_extend) 8892 op0 = aarch64_strip_extend (op0, true); 8893 8894 cost += rtx_cost (op0, VOIDmode, code, 0, speed); 8895 8896 return cost; 8897 } 8898 8899 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a 8900 compound and let the below cases handle it. After all, MNEG is a 8901 special-case alias of MSUB. */ 8902 if (GET_CODE (op0) == NEG) 8903 { 8904 op0 = XEXP (op0, 0); 8905 compound_p = true; 8906 } 8907 8908 /* Integer multiplies or FMAs have zero/sign extending variants. */ 8909 if ((GET_CODE (op0) == ZERO_EXTEND 8910 && GET_CODE (op1) == ZERO_EXTEND) 8911 || (GET_CODE (op0) == SIGN_EXTEND 8912 && GET_CODE (op1) == SIGN_EXTEND)) 8913 { 8914 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed); 8915 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed); 8916 8917 if (speed) 8918 { 8919 if (compound_p) 8920 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */ 8921 cost += extra_cost->mult[0].extend_add; 8922 else 8923 /* MUL/SMULL/UMULL. */ 8924 cost += extra_cost->mult[0].extend; 8925 } 8926 8927 return cost; 8928 } 8929 8930 /* This is either an integer multiply or a MADD. In both cases 8931 we want to recurse and cost the operands. */ 8932 cost += rtx_cost (op0, mode, MULT, 0, speed); 8933 cost += rtx_cost (op1, mode, MULT, 1, speed); 8934 8935 if (speed) 8936 { 8937 if (compound_p) 8938 /* MADD/MSUB. */ 8939 cost += extra_cost->mult[mode == DImode].add; 8940 else 8941 /* MUL. */ 8942 cost += extra_cost->mult[mode == DImode].simple; 8943 } 8944 8945 return cost; 8946 } 8947 else 8948 { 8949 if (speed) 8950 { 8951 /* Floating-point FMA/FMUL can also support negations of the 8952 operands, unless the rounding mode is upward or downward in 8953 which case FNMUL is different than FMUL with operand negation. 
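When both operands are negated the negations cancel, so a plain FMUL gives the same result in every rounding mode and the NEGs can be stripped regardless.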
*/ 8954 bool neg0 = GET_CODE (op0) == NEG; 8955 bool neg1 = GET_CODE (op1) == NEG; 8956 if (compound_p || !flag_rounding_math || (neg0 && neg1)) 8957 { 8958 if (neg0) 8959 op0 = XEXP (op0, 0); 8960 if (neg1) 8961 op1 = XEXP (op1, 0); 8962 } 8963 8964 if (compound_p) 8965 /* FMADD/FNMADD/FNMSUB/FMSUB. */ 8966 cost += extra_cost->fp[mode == DFmode].fma; 8967 else 8968 /* FMUL/FNMUL. */ 8969 cost += extra_cost->fp[mode == DFmode].mult; 8970 } 8971 8972 cost += rtx_cost (op0, mode, MULT, 0, speed); 8973 cost += rtx_cost (op1, mode, MULT, 1, speed); 8974 return cost; 8975 } 8976 } 8977 8978 static int 8979 aarch64_address_cost (rtx x, 8980 machine_mode mode, 8981 addr_space_t as ATTRIBUTE_UNUSED, 8982 bool speed) 8983 { 8984 enum rtx_code c = GET_CODE (x); 8985 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost; 8986 struct aarch64_address_info info; 8987 int cost = 0; 8988 info.shift = 0; 8989 8990 if (!aarch64_classify_address (&info, x, mode, false)) 8991 { 8992 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF) 8993 { 8994 /* This is a CONST or SYMBOL ref which will be split 8995 in a different way depending on the code model in use. 8996 Cost it through the generic infrastructure. */ 8997 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed); 8998 /* Divide through by the cost of one instruction to 8999 bring it to the same units as the address costs. */ 9000 cost_symbol_ref /= COSTS_N_INSNS (1); 9001 /* The cost is then the cost of preparing the address, 9002 followed by an immediate (possibly 0) offset. */ 9003 return cost_symbol_ref + addr_cost->imm_offset; 9004 } 9005 else 9006 { 9007 /* This is most likely a jump table from a case 9008 statement. */ 9009 return addr_cost->register_offset; 9010 } 9011 } 9012 9013 switch (info.type) 9014 { 9015 case ADDRESS_LO_SUM: 9016 case ADDRESS_SYMBOLIC: 9017 case ADDRESS_REG_IMM: 9018 cost += addr_cost->imm_offset; 9019 break; 9020 9021 case ADDRESS_REG_WB: 9022 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY) 9023 cost += addr_cost->pre_modify; 9024 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY) 9025 cost += addr_cost->post_modify; 9026 else 9027 gcc_unreachable (); 9028 9029 break; 9030 9031 case ADDRESS_REG_REG: 9032 cost += addr_cost->register_offset; 9033 break; 9034 9035 case ADDRESS_REG_SXTW: 9036 cost += addr_cost->register_sextend; 9037 break; 9038 9039 case ADDRESS_REG_UXTW: 9040 cost += addr_cost->register_zextend; 9041 break; 9042 9043 default: 9044 gcc_unreachable (); 9045 } 9046 9047 9048 if (info.shift > 0) 9049 { 9050 /* For the sake of calculating the cost of the shifted register 9051 component, we can treat same sized modes in the same way. */ 9052 if (known_eq (GET_MODE_BITSIZE (mode), 16)) 9053 cost += addr_cost->addr_scale_costs.hi; 9054 else if (known_eq (GET_MODE_BITSIZE (mode), 32)) 9055 cost += addr_cost->addr_scale_costs.si; 9056 else if (known_eq (GET_MODE_BITSIZE (mode), 64)) 9057 cost += addr_cost->addr_scale_costs.di; 9058 else 9059 /* We can't tell, or this is a 128-bit vector. */ 9060 cost += addr_cost->addr_scale_costs.ti; 9061 } 9062 9063 return cost; 9064 } 9065 9066 /* Return the cost of a branch. If SPEED_P is true then the compiler is 9067 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted 9068 to be taken. */ 9069 9070 int 9071 aarch64_branch_cost (bool speed_p, bool predictable_p) 9072 { 9073 /* When optimizing for speed, use the cost of unpredictable branches. 
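Branches predicted to be taken, and all branches when optimizing for size, use the cheaper predictable cost.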
*/ 9074 const struct cpu_branch_cost *branch_costs = 9075 aarch64_tune_params.branch_costs; 9076 9077 if (!speed_p || predictable_p) 9078 return branch_costs->predictable; 9079 else 9080 return branch_costs->unpredictable; 9081 } 9082 9083 /* Return true if the RTX X in mode MODE is a zero or sign extract 9084 usable in an ADD or SUB (extended register) instruction. */ 9085 static bool 9086 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode) 9087 { 9088 /* Catch add with a sign extract. 9089 This is add_<optab><mode>_multp2. */ 9090 if (GET_CODE (x) == SIGN_EXTRACT 9091 || GET_CODE (x) == ZERO_EXTRACT) 9092 { 9093 rtx op0 = XEXP (x, 0); 9094 rtx op1 = XEXP (x, 1); 9095 rtx op2 = XEXP (x, 2); 9096 9097 if (GET_CODE (op0) == MULT 9098 && CONST_INT_P (op1) 9099 && op2 == const0_rtx 9100 && CONST_INT_P (XEXP (op0, 1)) 9101 && aarch64_is_extend_from_extract (mode, 9102 XEXP (op0, 1), 9103 op1)) 9104 { 9105 return true; 9106 } 9107 } 9108 /* The simple case <ARITH>, XD, XN, XM, [us]xt. 9109 No shift. */ 9110 else if (GET_CODE (x) == SIGN_EXTEND 9111 || GET_CODE (x) == ZERO_EXTEND) 9112 return REG_P (XEXP (x, 0)); 9113 9114 return false; 9115 } 9116 9117 static bool 9118 aarch64_frint_unspec_p (unsigned int u) 9119 { 9120 switch (u) 9121 { 9122 case UNSPEC_FRINTZ: 9123 case UNSPEC_FRINTP: 9124 case UNSPEC_FRINTM: 9125 case UNSPEC_FRINTA: 9126 case UNSPEC_FRINTN: 9127 case UNSPEC_FRINTX: 9128 case UNSPEC_FRINTI: 9129 return true; 9130 9131 default: 9132 return false; 9133 } 9134 } 9135 9136 /* Return true iff X is an rtx that will match an extr instruction 9137 i.e. as described in the *extr<mode>5_insn family of patterns. 9138 OP0 and OP1 will be set to the operands of the shifts involved 9139 on success and will be NULL_RTX otherwise. */ 9140 9141 static bool 9142 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1) 9143 { 9144 rtx op0, op1; 9145 scalar_int_mode mode; 9146 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode)) 9147 return false; 9148 9149 *res_op0 = NULL_RTX; 9150 *res_op1 = NULL_RTX; 9151 9152 if (GET_CODE (x) != IOR) 9153 return false; 9154 9155 op0 = XEXP (x, 0); 9156 op1 = XEXP (x, 1); 9157 9158 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT) 9159 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT)) 9160 { 9161 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */ 9162 if (GET_CODE (op1) == ASHIFT) 9163 std::swap (op0, op1); 9164 9165 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1))) 9166 return false; 9167 9168 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1)); 9169 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1)); 9170 9171 if (shft_amnt_0 < GET_MODE_BITSIZE (mode) 9172 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode)) 9173 { 9174 *res_op0 = XEXP (op0, 0); 9175 *res_op1 = XEXP (op1, 0); 9176 return true; 9177 } 9178 } 9179 9180 return false; 9181 } 9182 9183 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)), 9184 storing it in *COST. Result is true if the total cost of the operation 9185 has now been calculated. 
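A false result means the generic costing code should recurse into the operands instead.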
*/ 9186 static bool 9187 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed) 9188 { 9189 rtx inner; 9190 rtx comparator; 9191 enum rtx_code cmpcode; 9192 9193 if (COMPARISON_P (op0)) 9194 { 9195 inner = XEXP (op0, 0); 9196 comparator = XEXP (op0, 1); 9197 cmpcode = GET_CODE (op0); 9198 } 9199 else 9200 { 9201 inner = op0; 9202 comparator = const0_rtx; 9203 cmpcode = NE; 9204 } 9205 9206 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC) 9207 { 9208 /* Conditional branch. */ 9209 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC) 9210 return true; 9211 else 9212 { 9213 if (cmpcode == NE || cmpcode == EQ) 9214 { 9215 if (comparator == const0_rtx) 9216 { 9217 /* TBZ/TBNZ/CBZ/CBNZ. */ 9218 if (GET_CODE (inner) == ZERO_EXTRACT) 9219 /* TBZ/TBNZ. */ 9220 *cost += rtx_cost (XEXP (inner, 0), VOIDmode, 9221 ZERO_EXTRACT, 0, speed); 9222 else 9223 /* CBZ/CBNZ. */ 9224 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed); 9225 9226 return true; 9227 } 9228 } 9229 else if (cmpcode == LT || cmpcode == GE) 9230 { 9231 /* TBZ/TBNZ. */ 9232 if (comparator == const0_rtx) 9233 return true; 9234 } 9235 } 9236 } 9237 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC) 9238 { 9239 /* CCMP. */ 9240 if (GET_CODE (op1) == COMPARE) 9241 { 9242 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */ 9243 if (XEXP (op1, 1) == const0_rtx) 9244 *cost += 1; 9245 if (speed) 9246 { 9247 machine_mode mode = GET_MODE (XEXP (op1, 0)); 9248 const struct cpu_cost_table *extra_cost 9249 = aarch64_tune_params.insn_extra_cost; 9250 9251 if (GET_MODE_CLASS (mode) == MODE_INT) 9252 *cost += extra_cost->alu.arith; 9253 else 9254 *cost += extra_cost->fp[mode == DFmode].compare; 9255 } 9256 return true; 9257 } 9258 9259 /* It's a conditional operation based on the status flags, 9260 so it must be some flavor of CSEL. */ 9261 9262 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */ 9263 if (GET_CODE (op1) == NEG 9264 || GET_CODE (op1) == NOT 9265 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx)) 9266 op1 = XEXP (op1, 0); 9267 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND) 9268 { 9269 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */ 9270 op1 = XEXP (op1, 0); 9271 op2 = XEXP (op2, 0); 9272 } 9273 9274 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed); 9275 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed); 9276 return true; 9277 } 9278 9279 /* We don't know what this is, cost all operands. */ 9280 return false; 9281 } 9282 9283 /* Check whether X is a bitfield operation of the form shift + extend that 9284 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the 9285 operand to which the bitfield operation is applied. Otherwise return 9286 NULL_RTX. 
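For example, (zero_extend:SI (lshiftrt:HI X (const_int N))) returns X, since the shift and extend together map to a single UBFX.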
*/ 9287 9288 static rtx 9289 aarch64_extend_bitfield_pattern_p (rtx x) 9290 { 9291 rtx_code outer_code = GET_CODE (x); 9292 machine_mode outer_mode = GET_MODE (x); 9293 9294 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND 9295 && outer_mode != SImode && outer_mode != DImode) 9296 return NULL_RTX; 9297 9298 rtx inner = XEXP (x, 0); 9299 rtx_code inner_code = GET_CODE (inner); 9300 machine_mode inner_mode = GET_MODE (inner); 9301 rtx op = NULL_RTX; 9302 9303 switch (inner_code) 9304 { 9305 case ASHIFT: 9306 if (CONST_INT_P (XEXP (inner, 1)) 9307 && (inner_mode == QImode || inner_mode == HImode)) 9308 op = XEXP (inner, 0); 9309 break; 9310 case LSHIFTRT: 9311 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1)) 9312 && (inner_mode == QImode || inner_mode == HImode)) 9313 op = XEXP (inner, 0); 9314 break; 9315 case ASHIFTRT: 9316 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1)) 9317 && (inner_mode == QImode || inner_mode == HImode)) 9318 op = XEXP (inner, 0); 9319 break; 9320 default: 9321 break; 9322 } 9323 9324 return op; 9325 } 9326 9327 /* Return true if the mask and a shift amount from an RTX of the form 9328 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of 9329 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */ 9330 9331 bool 9332 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask, 9333 rtx shft_amnt) 9334 { 9335 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt) 9336 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode) 9337 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0 9338 && (INTVAL (mask) 9339 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0; 9340 } 9341 9342 /* Return true if the masks and a shift amount from an RTX of the form 9343 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into 9344 a BFI instruction of mode MODE. See *arch64_bfi patterns. */ 9345 9346 bool 9347 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode, 9348 unsigned HOST_WIDE_INT mask1, 9349 unsigned HOST_WIDE_INT shft_amnt, 9350 unsigned HOST_WIDE_INT mask2) 9351 { 9352 unsigned HOST_WIDE_INT t; 9353 9354 /* Verify that there is no overlap in what bits are set in the two masks. */ 9355 if (mask1 != ~mask2) 9356 return false; 9357 9358 /* Verify that mask2 is not all zeros or ones. */ 9359 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U) 9360 return false; 9361 9362 /* The shift amount should always be less than the mode size. */ 9363 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode)); 9364 9365 /* Verify that the mask being shifted is contiguous and would be in the 9366 least significant bits after shifting by shft_amnt. */ 9367 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt); 9368 return (t == (t & -t)); 9369 } 9370 9371 /* Calculate the cost of calculating X, storing it in *COST. Result 9372 is true if the total cost of the operation has now been calculated. */ 9373 static bool 9374 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED, 9375 int param ATTRIBUTE_UNUSED, int *cost, bool speed) 9376 { 9377 rtx op0, op1, op2; 9378 const struct cpu_cost_table *extra_cost 9379 = aarch64_tune_params.insn_extra_cost; 9380 int code = GET_CODE (x); 9381 scalar_int_mode int_mode; 9382 9383 /* By default, assume that everything has equivalent cost to the 9384 cheapest instruction. Any additional costs are applied as a delta 9385 above this default. */ 9386 *cost = COSTS_N_INSNS (1); 9387 9388 switch (code) 9389 { 9390 case SET: 9391 /* The cost depends entirely on the operands to SET. 
*/ 9392 *cost = 0; 9393 op0 = SET_DEST (x); 9394 op1 = SET_SRC (x); 9395 9396 switch (GET_CODE (op0)) 9397 { 9398 case MEM: 9399 if (speed) 9400 { 9401 rtx address = XEXP (op0, 0); 9402 if (VECTOR_MODE_P (mode)) 9403 *cost += extra_cost->ldst.storev; 9404 else if (GET_MODE_CLASS (mode) == MODE_INT) 9405 *cost += extra_cost->ldst.store; 9406 else if (mode == SFmode) 9407 *cost += extra_cost->ldst.storef; 9408 else if (mode == DFmode) 9409 *cost += extra_cost->ldst.stored; 9410 9411 *cost += 9412 COSTS_N_INSNS (aarch64_address_cost (address, mode, 9413 0, speed)); 9414 } 9415 9416 *cost += rtx_cost (op1, mode, SET, 1, speed); 9417 return true; 9418 9419 case SUBREG: 9420 if (! REG_P (SUBREG_REG (op0))) 9421 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed); 9422 9423 /* Fall through. */ 9424 case REG: 9425 /* The cost is one per vector-register copied. */ 9426 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1)) 9427 { 9428 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0)); 9429 *cost = COSTS_N_INSNS (nregs); 9430 } 9431 /* const0_rtx is in general free, but we will use an 9432 instruction to set a register to 0. */ 9433 else if (REG_P (op1) || op1 == const0_rtx) 9434 { 9435 /* The cost is 1 per register copied. */ 9436 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0)); 9437 *cost = COSTS_N_INSNS (nregs); 9438 } 9439 else 9440 /* Cost is just the cost of the RHS of the set. */ 9441 *cost += rtx_cost (op1, mode, SET, 1, speed); 9442 return true; 9443 9444 case ZERO_EXTRACT: 9445 case SIGN_EXTRACT: 9446 /* Bit-field insertion. Strip any redundant widening of 9447 the RHS to meet the width of the target. */ 9448 if (GET_CODE (op1) == SUBREG) 9449 op1 = SUBREG_REG (op1); 9450 if ((GET_CODE (op1) == ZERO_EXTEND 9451 || GET_CODE (op1) == SIGN_EXTEND) 9452 && CONST_INT_P (XEXP (op0, 1)) 9453 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode) 9454 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1))) 9455 op1 = XEXP (op1, 0); 9456 9457 if (CONST_INT_P (op1)) 9458 { 9459 /* MOV immediate is assumed to always be cheap. */ 9460 *cost = COSTS_N_INSNS (1); 9461 } 9462 else 9463 { 9464 /* BFM. */ 9465 if (speed) 9466 *cost += extra_cost->alu.bfi; 9467 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed); 9468 } 9469 9470 return true; 9471 9472 default: 9473 /* We can't make sense of this, assume default cost. */ 9474 *cost = COSTS_N_INSNS (1); 9475 return false; 9476 } 9477 return false; 9478 9479 case CONST_INT: 9480 /* If an instruction can incorporate a constant within the 9481 instruction, the instruction's expression avoids calling 9482 rtx_cost() on the constant. If rtx_cost() is called on a 9483 constant, then it is usually because the constant must be 9484 moved into a register by one or more instructions. 9485 9486 The exception is constant 0, which can be expressed 9487 as XZR/WZR and is therefore free. The exception to this is 9488 if we have (set (reg) (const0_rtx)) in which case we must cost 9489 the move. However, we can catch that when we cost the SET, so 9490 we don't need to consider that here. */ 9491 if (x == const0_rtx) 9492 *cost = 0; 9493 else 9494 { 9495 /* To an approximation, building any other constant is 9496 proportionally expensive to the number of instructions 9497 required to build that constant. This is true whether we 9498 are compiling for SPEED or otherwise. 
*/ 9499 if (!is_a <scalar_int_mode> (mode, &int_mode)) 9500 int_mode = word_mode; 9501 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate 9502 (NULL_RTX, x, false, int_mode)); 9503 } 9504 return true; 9505 9506 case CONST_DOUBLE: 9507 9508 /* First determine number of instructions to do the move 9509 as an integer constant. */ 9510 if (!aarch64_float_const_representable_p (x) 9511 && !aarch64_can_const_movi_rtx_p (x, mode) 9512 && aarch64_float_const_rtx_p (x)) 9513 { 9514 unsigned HOST_WIDE_INT ival; 9515 bool succeed = aarch64_reinterpret_float_as_int (x, &ival); 9516 gcc_assert (succeed); 9517 9518 scalar_int_mode imode = (mode == HFmode 9519 ? SImode 9520 : int_mode_for_mode (mode).require ()); 9521 int ncost = aarch64_internal_mov_immediate 9522 (NULL_RTX, gen_int_mode (ival, imode), false, imode); 9523 *cost += COSTS_N_INSNS (ncost); 9524 return true; 9525 } 9526 9527 if (speed) 9528 { 9529 /* mov[df,sf]_aarch64. */ 9530 if (aarch64_float_const_representable_p (x)) 9531 /* FMOV (scalar immediate). */ 9532 *cost += extra_cost->fp[mode == DFmode].fpconst; 9533 else if (!aarch64_float_const_zero_rtx_p (x)) 9534 { 9535 /* This will be a load from memory. */ 9536 if (mode == DFmode) 9537 *cost += extra_cost->ldst.loadd; 9538 else 9539 *cost += extra_cost->ldst.loadf; 9540 } 9541 else 9542 /* Otherwise this is +0.0. We get this using MOVI d0, #0 9543 or MOV v0.s[0], wzr - neither of which are modeled by the 9544 cost tables. Just use the default cost. */ 9545 { 9546 } 9547 } 9548 9549 return true; 9550 9551 case MEM: 9552 if (speed) 9553 { 9554 /* For loads we want the base cost of a load, plus an 9555 approximation for the additional cost of the addressing 9556 mode. */ 9557 rtx address = XEXP (x, 0); 9558 if (VECTOR_MODE_P (mode)) 9559 *cost += extra_cost->ldst.loadv; 9560 else if (GET_MODE_CLASS (mode) == MODE_INT) 9561 *cost += extra_cost->ldst.load; 9562 else if (mode == SFmode) 9563 *cost += extra_cost->ldst.loadf; 9564 else if (mode == DFmode) 9565 *cost += extra_cost->ldst.loadd; 9566 9567 *cost += 9568 COSTS_N_INSNS (aarch64_address_cost (address, mode, 9569 0, speed)); 9570 } 9571 9572 return true; 9573 9574 case NEG: 9575 op0 = XEXP (x, 0); 9576 9577 if (VECTOR_MODE_P (mode)) 9578 { 9579 if (speed) 9580 { 9581 /* FNEG. */ 9582 *cost += extra_cost->vect.alu; 9583 } 9584 return false; 9585 } 9586 9587 if (GET_MODE_CLASS (mode) == MODE_INT) 9588 { 9589 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE 9590 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE) 9591 { 9592 /* CSETM. */ 9593 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed); 9594 return true; 9595 } 9596 9597 /* Cost this as SUB wzr, X. */ 9598 op0 = CONST0_RTX (mode); 9599 op1 = XEXP (x, 0); 9600 goto cost_minus; 9601 } 9602 9603 if (GET_MODE_CLASS (mode) == MODE_FLOAT) 9604 { 9605 /* Support (neg(fma...)) as a single instruction only if 9606 sign of zeros is unimportant. This matches the decision 9607 making in aarch64.md. */ 9608 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0))) 9609 { 9610 /* FNMADD. */ 9611 *cost = rtx_cost (op0, mode, NEG, 0, speed); 9612 return true; 9613 } 9614 if (GET_CODE (op0) == MULT) 9615 { 9616 /* FNMUL. */ 9617 *cost = rtx_cost (op0, mode, NEG, 0, speed); 9618 return true; 9619 } 9620 if (speed) 9621 /* FNEG. 
*/ 9622 *cost += extra_cost->fp[mode == DFmode].neg; 9623 return false; 9624 } 9625 9626 return false; 9627 9628 case CLRSB: 9629 case CLZ: 9630 if (speed) 9631 { 9632 if (VECTOR_MODE_P (mode)) 9633 *cost += extra_cost->vect.alu; 9634 else 9635 *cost += extra_cost->alu.clz; 9636 } 9637 9638 return false; 9639 9640 case COMPARE: 9641 op0 = XEXP (x, 0); 9642 op1 = XEXP (x, 1); 9643 9644 if (op1 == const0_rtx 9645 && GET_CODE (op0) == AND) 9646 { 9647 x = op0; 9648 mode = GET_MODE (op0); 9649 goto cost_logic; 9650 } 9651 9652 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT) 9653 { 9654 /* TODO: A write to the CC flags possibly costs extra, this 9655 needs encoding in the cost tables. */ 9656 9657 mode = GET_MODE (op0); 9658 /* ANDS. */ 9659 if (GET_CODE (op0) == AND) 9660 { 9661 x = op0; 9662 goto cost_logic; 9663 } 9664 9665 if (GET_CODE (op0) == PLUS) 9666 { 9667 /* ADDS (and CMN alias). */ 9668 x = op0; 9669 goto cost_plus; 9670 } 9671 9672 if (GET_CODE (op0) == MINUS) 9673 { 9674 /* SUBS. */ 9675 x = op0; 9676 goto cost_minus; 9677 } 9678 9679 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx 9680 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1)) 9681 && CONST_INT_P (XEXP (op0, 2))) 9682 { 9683 /* COMPARE of ZERO_EXTRACT form of TST-immediate. 9684 Handle it here directly rather than going to cost_logic 9685 since we know the immediate generated for the TST is valid 9686 so we can avoid creating an intermediate rtx for it only 9687 for costing purposes. */ 9688 if (speed) 9689 *cost += extra_cost->alu.logical; 9690 9691 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0), 9692 ZERO_EXTRACT, 0, speed); 9693 return true; 9694 } 9695 9696 if (GET_CODE (op1) == NEG) 9697 { 9698 /* CMN. */ 9699 if (speed) 9700 *cost += extra_cost->alu.arith; 9701 9702 *cost += rtx_cost (op0, mode, COMPARE, 0, speed); 9703 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed); 9704 return true; 9705 } 9706 9707 /* CMP. 9708 9709 Compare can freely swap the order of operands, and 9710 canonicalization puts the more complex operation first. 9711 But the integer MINUS logic expects the shift/extend 9712 operation in op1. */ 9713 if (! (REG_P (op0) 9714 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0))))) 9715 { 9716 op0 = XEXP (x, 1); 9717 op1 = XEXP (x, 0); 9718 } 9719 goto cost_minus; 9720 } 9721 9722 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT) 9723 { 9724 /* FCMP. */ 9725 if (speed) 9726 *cost += extra_cost->fp[mode == DFmode].compare; 9727 9728 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1)) 9729 { 9730 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed); 9731 /* FCMP supports constant 0.0 for no extra cost. */ 9732 return true; 9733 } 9734 return false; 9735 } 9736 9737 if (VECTOR_MODE_P (mode)) 9738 { 9739 /* Vector compare. */ 9740 if (speed) 9741 *cost += extra_cost->vect.alu; 9742 9743 if (aarch64_float_const_zero_rtx_p (op1)) 9744 { 9745 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra 9746 cost. */ 9747 return true; 9748 } 9749 return false; 9750 } 9751 return false; 9752 9753 case MINUS: 9754 { 9755 op0 = XEXP (x, 0); 9756 op1 = XEXP (x, 1); 9757 9758 cost_minus: 9759 *cost += rtx_cost (op0, mode, MINUS, 0, speed); 9760 9761 /* Detect valid immediates. */ 9762 if ((GET_MODE_CLASS (mode) == MODE_INT 9763 || (GET_MODE_CLASS (mode) == MODE_CC 9764 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)) 9765 && CONST_INT_P (op1) 9766 && aarch64_uimm12_shift (INTVAL (op1))) 9767 { 9768 if (speed) 9769 /* SUB(S) (immediate). 
*/ 9770 *cost += extra_cost->alu.arith; 9771 return true; 9772 } 9773 9774 /* Look for SUB (extended register). */ 9775 if (is_a <scalar_int_mode> (mode, &int_mode) 9776 && aarch64_rtx_arith_op_extract_p (op1, int_mode)) 9777 { 9778 if (speed) 9779 *cost += extra_cost->alu.extend_arith; 9780 9781 op1 = aarch64_strip_extend (op1, true); 9782 *cost += rtx_cost (op1, VOIDmode, 9783 (enum rtx_code) GET_CODE (op1), 0, speed); 9784 return true; 9785 } 9786 9787 rtx new_op1 = aarch64_strip_extend (op1, false); 9788 9789 /* Cost this as an FMA-alike operation. */ 9790 if ((GET_CODE (new_op1) == MULT 9791 || aarch64_shift_p (GET_CODE (new_op1))) 9792 && code != COMPARE) 9793 { 9794 *cost += aarch64_rtx_mult_cost (new_op1, MULT, 9795 (enum rtx_code) code, 9796 speed); 9797 return true; 9798 } 9799 9800 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed); 9801 9802 if (speed) 9803 { 9804 if (VECTOR_MODE_P (mode)) 9805 { 9806 /* Vector SUB. */ 9807 *cost += extra_cost->vect.alu; 9808 } 9809 else if (GET_MODE_CLASS (mode) == MODE_INT) 9810 { 9811 /* SUB(S). */ 9812 *cost += extra_cost->alu.arith; 9813 } 9814 else if (GET_MODE_CLASS (mode) == MODE_FLOAT) 9815 { 9816 /* FSUB. */ 9817 *cost += extra_cost->fp[mode == DFmode].addsub; 9818 } 9819 } 9820 return true; 9821 } 9822 9823 case PLUS: 9824 { 9825 rtx new_op0; 9826 9827 op0 = XEXP (x, 0); 9828 op1 = XEXP (x, 1); 9829 9830 cost_plus: 9831 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE 9832 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE) 9833 { 9834 /* CSINC. */ 9835 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed); 9836 *cost += rtx_cost (op1, mode, PLUS, 1, speed); 9837 return true; 9838 } 9839 9840 if (GET_MODE_CLASS (mode) == MODE_INT 9841 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1))) 9842 || aarch64_sve_addvl_addpl_immediate (op1, mode))) 9843 { 9844 *cost += rtx_cost (op0, mode, PLUS, 0, speed); 9845 9846 if (speed) 9847 /* ADD (immediate). */ 9848 *cost += extra_cost->alu.arith; 9849 return true; 9850 } 9851 9852 *cost += rtx_cost (op1, mode, PLUS, 1, speed); 9853 9854 /* Look for ADD (extended register). */ 9855 if (is_a <scalar_int_mode> (mode, &int_mode) 9856 && aarch64_rtx_arith_op_extract_p (op0, int_mode)) 9857 { 9858 if (speed) 9859 *cost += extra_cost->alu.extend_arith; 9860 9861 op0 = aarch64_strip_extend (op0, true); 9862 *cost += rtx_cost (op0, VOIDmode, 9863 (enum rtx_code) GET_CODE (op0), 0, speed); 9864 return true; 9865 } 9866 9867 /* Strip any extend, leave shifts behind as we will 9868 cost them through mult_cost. */ 9869 new_op0 = aarch64_strip_extend (op0, false); 9870 9871 if (GET_CODE (new_op0) == MULT 9872 || aarch64_shift_p (GET_CODE (new_op0))) 9873 { 9874 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS, 9875 speed); 9876 return true; 9877 } 9878 9879 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed); 9880 9881 if (speed) 9882 { 9883 if (VECTOR_MODE_P (mode)) 9884 { 9885 /* Vector ADD. */ 9886 *cost += extra_cost->vect.alu; 9887 } 9888 else if (GET_MODE_CLASS (mode) == MODE_INT) 9889 { 9890 /* ADD. */ 9891 *cost += extra_cost->alu.arith; 9892 } 9893 else if (GET_MODE_CLASS (mode) == MODE_FLOAT) 9894 { 9895 /* FADD. 
*/ 9896 *cost += extra_cost->fp[mode == DFmode].addsub; 9897 } 9898 } 9899 return true; 9900 } 9901 9902 case BSWAP: 9903 *cost = COSTS_N_INSNS (1); 9904 9905 if (speed) 9906 { 9907 if (VECTOR_MODE_P (mode)) 9908 *cost += extra_cost->vect.alu; 9909 else 9910 *cost += extra_cost->alu.rev; 9911 } 9912 return false; 9913 9914 case IOR: 9915 if (aarch_rev16_p (x)) 9916 { 9917 *cost = COSTS_N_INSNS (1); 9918 9919 if (speed) 9920 { 9921 if (VECTOR_MODE_P (mode)) 9922 *cost += extra_cost->vect.alu; 9923 else 9924 *cost += extra_cost->alu.rev; 9925 } 9926 return true; 9927 } 9928 9929 if (aarch64_extr_rtx_p (x, &op0, &op1)) 9930 { 9931 *cost += rtx_cost (op0, mode, IOR, 0, speed); 9932 *cost += rtx_cost (op1, mode, IOR, 1, speed); 9933 if (speed) 9934 *cost += extra_cost->alu.shift; 9935 9936 return true; 9937 } 9938 /* Fall through. */ 9939 case XOR: 9940 case AND: 9941 cost_logic: 9942 op0 = XEXP (x, 0); 9943 op1 = XEXP (x, 1); 9944 9945 if (VECTOR_MODE_P (mode)) 9946 { 9947 if (speed) 9948 *cost += extra_cost->vect.alu; 9949 return true; 9950 } 9951 9952 if (code == AND 9953 && GET_CODE (op0) == MULT 9954 && CONST_INT_P (XEXP (op0, 1)) 9955 && CONST_INT_P (op1) 9956 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))), 9957 INTVAL (op1)) != 0) 9958 { 9959 /* This is a UBFM/SBFM. */ 9960 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed); 9961 if (speed) 9962 *cost += extra_cost->alu.bfx; 9963 return true; 9964 } 9965 9966 if (is_int_mode (mode, &int_mode)) 9967 { 9968 if (CONST_INT_P (op1)) 9969 { 9970 /* We have a mask + shift version of a UBFIZ 9971 i.e. the *andim_ashift<mode>_bfiz pattern. */ 9972 if (GET_CODE (op0) == ASHIFT 9973 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1, 9974 XEXP (op0, 1))) 9975 { 9976 *cost += rtx_cost (XEXP (op0, 0), int_mode, 9977 (enum rtx_code) code, 0, speed); 9978 if (speed) 9979 *cost += extra_cost->alu.bfx; 9980 9981 return true; 9982 } 9983 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode)) 9984 { 9985 /* We possibly get the immediate for free, this is not 9986 modelled. */ 9987 *cost += rtx_cost (op0, int_mode, 9988 (enum rtx_code) code, 0, speed); 9989 if (speed) 9990 *cost += extra_cost->alu.logical; 9991 9992 return true; 9993 } 9994 } 9995 else 9996 { 9997 rtx new_op0 = op0; 9998 9999 /* Handle ORN, EON, or BIC. */ 10000 if (GET_CODE (op0) == NOT) 10001 op0 = XEXP (op0, 0); 10002 10003 new_op0 = aarch64_strip_shift (op0); 10004 10005 /* If we had a shift on op0 then this is a logical-shift- 10006 by-register/immediate operation. Otherwise, this is just 10007 a logical operation. */ 10008 if (speed) 10009 { 10010 if (new_op0 != op0) 10011 { 10012 /* Shift by immediate. */ 10013 if (CONST_INT_P (XEXP (op0, 1))) 10014 *cost += extra_cost->alu.log_shift; 10015 else 10016 *cost += extra_cost->alu.log_shift_reg; 10017 } 10018 else 10019 *cost += extra_cost->alu.logical; 10020 } 10021 10022 /* In both cases we want to cost both operands. */ 10023 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code, 10024 0, speed); 10025 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code, 10026 1, speed); 10027 10028 return true; 10029 } 10030 } 10031 return false; 10032 10033 case NOT: 10034 x = XEXP (x, 0); 10035 op0 = aarch64_strip_shift (x); 10036 10037 if (VECTOR_MODE_P (mode)) 10038 { 10039 /* Vector NOT. */ 10040 *cost += extra_cost->vect.alu; 10041 return false; 10042 } 10043 10044 /* MVN-shifted-reg. 
*/ 10045 if (op0 != x) 10046 { 10047 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed); 10048 10049 if (speed) 10050 *cost += extra_cost->alu.log_shift; 10051 10052 return true; 10053 } 10054 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)). 10055 Handle the second form here taking care that 'a' in the above can 10056 be a shift. */ 10057 else if (GET_CODE (op0) == XOR) 10058 { 10059 rtx newop0 = XEXP (op0, 0); 10060 rtx newop1 = XEXP (op0, 1); 10061 rtx op0_stripped = aarch64_strip_shift (newop0); 10062 10063 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed); 10064 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed); 10065 10066 if (speed) 10067 { 10068 if (op0_stripped != newop0) 10069 *cost += extra_cost->alu.log_shift; 10070 else 10071 *cost += extra_cost->alu.logical; 10072 } 10073 10074 return true; 10075 } 10076 /* MVN. */ 10077 if (speed) 10078 *cost += extra_cost->alu.logical; 10079 10080 return false; 10081 10082 case ZERO_EXTEND: 10083 10084 op0 = XEXP (x, 0); 10085 /* If a value is written in SI mode, then zero extended to DI 10086 mode, the operation will in general be free as a write to 10087 a 'w' register implicitly zeroes the upper bits of an 'x' 10088 register. However, if this is 10089 10090 (set (reg) (zero_extend (reg))) 10091 10092 we must cost the explicit register move. */ 10093 if (mode == DImode 10094 && GET_MODE (op0) == SImode 10095 && outer == SET) 10096 { 10097 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed); 10098 10099 /* If OP_COST is non-zero, then the cost of the zero extend 10100 is effectively the cost of the inner operation. Otherwise 10101 we have a MOV instruction and we take the cost from the MOV 10102 itself. This is true independently of whether we are 10103 optimizing for space or time. */ 10104 if (op_cost) 10105 *cost = op_cost; 10106 10107 return true; 10108 } 10109 else if (MEM_P (op0)) 10110 { 10111 /* All loads can zero extend to any size for free. */ 10112 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed); 10113 return true; 10114 } 10115 10116 op0 = aarch64_extend_bitfield_pattern_p (x); 10117 if (op0) 10118 { 10119 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed); 10120 if (speed) 10121 *cost += extra_cost->alu.bfx; 10122 return true; 10123 } 10124 10125 if (speed) 10126 { 10127 if (VECTOR_MODE_P (mode)) 10128 { 10129 /* UMOV. */ 10130 *cost += extra_cost->vect.alu; 10131 } 10132 else 10133 { 10134 /* We generate an AND instead of UXTB/UXTH. */ 10135 *cost += extra_cost->alu.logical; 10136 } 10137 } 10138 return false; 10139 10140 case SIGN_EXTEND: 10141 if (MEM_P (XEXP (x, 0))) 10142 { 10143 /* LDRSH. */ 10144 if (speed) 10145 { 10146 rtx address = XEXP (XEXP (x, 0), 0); 10147 *cost += extra_cost->ldst.load_sign_extend; 10148 10149 *cost += 10150 COSTS_N_INSNS (aarch64_address_cost (address, mode, 10151 0, speed)); 10152 } 10153 return true; 10154 } 10155 10156 op0 = aarch64_extend_bitfield_pattern_p (x); 10157 if (op0) 10158 { 10159 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed); 10160 if (speed) 10161 *cost += extra_cost->alu.bfx; 10162 return true; 10163 } 10164 10165 if (speed) 10166 { 10167 if (VECTOR_MODE_P (mode)) 10168 *cost += extra_cost->vect.alu; 10169 else 10170 *cost += extra_cost->alu.extend; 10171 } 10172 return false; 10173 10174 case ASHIFT: 10175 op0 = XEXP (x, 0); 10176 op1 = XEXP (x, 1); 10177 10178 if (CONST_INT_P (op1)) 10179 { 10180 if (speed) 10181 { 10182 if (VECTOR_MODE_P (mode)) 10183 { 10184 /* Vector shift (immediate). 
*/ 10185 *cost += extra_cost->vect.alu; 10186 } 10187 else 10188 { 10189 /* LSL (immediate), UBMF, UBFIZ and friends. These are all 10190 aliases. */ 10191 *cost += extra_cost->alu.shift; 10192 } 10193 } 10194 10195 /* We can incorporate zero/sign extend for free. */ 10196 if (GET_CODE (op0) == ZERO_EXTEND 10197 || GET_CODE (op0) == SIGN_EXTEND) 10198 op0 = XEXP (op0, 0); 10199 10200 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed); 10201 return true; 10202 } 10203 else 10204 { 10205 if (VECTOR_MODE_P (mode)) 10206 { 10207 if (speed) 10208 /* Vector shift (register). */ 10209 *cost += extra_cost->vect.alu; 10210 } 10211 else 10212 { 10213 if (speed) 10214 /* LSLV. */ 10215 *cost += extra_cost->alu.shift_reg; 10216 10217 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0)) 10218 && CONST_INT_P (XEXP (op1, 1)) 10219 && known_eq (INTVAL (XEXP (op1, 1)), 10220 GET_MODE_BITSIZE (mode) - 1)) 10221 { 10222 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed); 10223 /* We already demanded XEXP (op1, 0) to be REG_P, so 10224 don't recurse into it. */ 10225 return true; 10226 } 10227 } 10228 return false; /* All arguments need to be in registers. */ 10229 } 10230 10231 case ROTATE: 10232 case ROTATERT: 10233 case LSHIFTRT: 10234 case ASHIFTRT: 10235 op0 = XEXP (x, 0); 10236 op1 = XEXP (x, 1); 10237 10238 if (CONST_INT_P (op1)) 10239 { 10240 /* ASR (immediate) and friends. */ 10241 if (speed) 10242 { 10243 if (VECTOR_MODE_P (mode)) 10244 *cost += extra_cost->vect.alu; 10245 else 10246 *cost += extra_cost->alu.shift; 10247 } 10248 10249 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed); 10250 return true; 10251 } 10252 else 10253 { 10254 if (VECTOR_MODE_P (mode)) 10255 { 10256 if (speed) 10257 /* Vector shift (register). */ 10258 *cost += extra_cost->vect.alu; 10259 } 10260 else 10261 { 10262 if (speed) 10263 /* ASR (register) and friends. */ 10264 *cost += extra_cost->alu.shift_reg; 10265 10266 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0)) 10267 && CONST_INT_P (XEXP (op1, 1)) 10268 && known_eq (INTVAL (XEXP (op1, 1)), 10269 GET_MODE_BITSIZE (mode) - 1)) 10270 { 10271 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed); 10272 /* We already demanded XEXP (op1, 0) to be REG_P, so 10273 don't recurse into it. */ 10274 return true; 10275 } 10276 } 10277 return false; /* All arguments need to be in registers. */ 10278 } 10279 10280 case SYMBOL_REF: 10281 10282 if (aarch64_cmodel == AARCH64_CMODEL_LARGE 10283 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC) 10284 { 10285 /* LDR. */ 10286 if (speed) 10287 *cost += extra_cost->ldst.load; 10288 } 10289 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL 10290 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC) 10291 { 10292 /* ADRP, followed by ADD. */ 10293 *cost += COSTS_N_INSNS (1); 10294 if (speed) 10295 *cost += 2 * extra_cost->alu.arith; 10296 } 10297 else if (aarch64_cmodel == AARCH64_CMODEL_TINY 10298 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC) 10299 { 10300 /* ADR. */ 10301 if (speed) 10302 *cost += extra_cost->alu.arith; 10303 } 10304 10305 if (flag_pic) 10306 { 10307 /* One extra load instruction, after accessing the GOT. */ 10308 *cost += COSTS_N_INSNS (1); 10309 if (speed) 10310 *cost += extra_cost->ldst.load; 10311 } 10312 return true; 10313 10314 case HIGH: 10315 case LO_SUM: 10316 /* ADRP/ADD (immediate). */ 10317 if (speed) 10318 *cost += extra_cost->alu.arith; 10319 return true; 10320 10321 case ZERO_EXTRACT: 10322 case SIGN_EXTRACT: 10323 /* UBFX/SBFX. 
*/ 10324 if (speed) 10325 { 10326 if (VECTOR_MODE_P (mode)) 10327 *cost += extra_cost->vect.alu; 10328 else 10329 *cost += extra_cost->alu.bfx; 10330 } 10331 10332 /* We can trust that the immediates used will be correct (there 10333 are no by-register forms), so we need only cost op0. */ 10334 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed); 10335 return true; 10336 10337 case MULT: 10338 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed); 10339 /* aarch64_rtx_mult_cost always handles recursion to its 10340 operands. */ 10341 return true; 10342 10343 case MOD: 10344 /* We can expand signed mod by power of 2 using a NEGS, two parallel 10345 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of 10346 an unconditional negate. This case should only ever be reached through 10347 the set_smod_pow2_cheap check in expmed.c. */ 10348 if (CONST_INT_P (XEXP (x, 1)) 10349 && exact_log2 (INTVAL (XEXP (x, 1))) > 0 10350 && (mode == SImode || mode == DImode)) 10351 { 10352 /* We expand to 4 instructions. Reset the baseline. */ 10353 *cost = COSTS_N_INSNS (4); 10354 10355 if (speed) 10356 *cost += 2 * extra_cost->alu.logical 10357 + 2 * extra_cost->alu.arith; 10358 10359 return true; 10360 } 10361 10362 /* Fall-through. */ 10363 case UMOD: 10364 if (speed) 10365 { 10366 /* Slighly prefer UMOD over SMOD. */ 10367 if (VECTOR_MODE_P (mode)) 10368 *cost += extra_cost->vect.alu; 10369 else if (GET_MODE_CLASS (mode) == MODE_INT) 10370 *cost += (extra_cost->mult[mode == DImode].add 10371 + extra_cost->mult[mode == DImode].idiv 10372 + (code == MOD ? 1 : 0)); 10373 } 10374 return false; /* All arguments need to be in registers. */ 10375 10376 case DIV: 10377 case UDIV: 10378 case SQRT: 10379 if (speed) 10380 { 10381 if (VECTOR_MODE_P (mode)) 10382 *cost += extra_cost->vect.alu; 10383 else if (GET_MODE_CLASS (mode) == MODE_INT) 10384 /* There is no integer SQRT, so only DIV and UDIV can get 10385 here. */ 10386 *cost += (extra_cost->mult[mode == DImode].idiv 10387 /* Slighly prefer UDIV over SDIV. */ 10388 + (code == DIV ? 1 : 0)); 10389 else 10390 *cost += extra_cost->fp[mode == DFmode].div; 10391 } 10392 return false; /* All arguments need to be in registers. */ 10393 10394 case IF_THEN_ELSE: 10395 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1), 10396 XEXP (x, 2), cost, speed); 10397 10398 case EQ: 10399 case NE: 10400 case GT: 10401 case GTU: 10402 case LT: 10403 case LTU: 10404 case GE: 10405 case GEU: 10406 case LE: 10407 case LEU: 10408 10409 return false; /* All arguments must be in registers. */ 10410 10411 case FMA: 10412 op0 = XEXP (x, 0); 10413 op1 = XEXP (x, 1); 10414 op2 = XEXP (x, 2); 10415 10416 if (speed) 10417 { 10418 if (VECTOR_MODE_P (mode)) 10419 *cost += extra_cost->vect.alu; 10420 else 10421 *cost += extra_cost->fp[mode == DFmode].fma; 10422 } 10423 10424 /* FMSUB, FNMADD, and FNMSUB are free. */ 10425 if (GET_CODE (op0) == NEG) 10426 op0 = XEXP (op0, 0); 10427 10428 if (GET_CODE (op2) == NEG) 10429 op2 = XEXP (op2, 0); 10430 10431 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1, 10432 and the by-element operand as operand 0. */ 10433 if (GET_CODE (op1) == NEG) 10434 op1 = XEXP (op1, 0); 10435 10436 /* Catch vector-by-element operations. The by-element operand can 10437 either be (vec_duplicate (vec_select (x))) or just 10438 (vec_select (x)), depending on whether we are multiplying by 10439 a vector or a scalar. 
10440 10441 Canonicalization is not very good in these cases, FMA4 will put the 10442 by-element operand as operand 0, FNMA4 will have it as operand 1. */ 10443 if (GET_CODE (op0) == VEC_DUPLICATE) 10444 op0 = XEXP (op0, 0); 10445 else if (GET_CODE (op1) == VEC_DUPLICATE) 10446 op1 = XEXP (op1, 0); 10447 10448 if (GET_CODE (op0) == VEC_SELECT) 10449 op0 = XEXP (op0, 0); 10450 else if (GET_CODE (op1) == VEC_SELECT) 10451 op1 = XEXP (op1, 0); 10452 10453 /* If the remaining parameters are not registers, 10454 get the cost to put them into registers. */ 10455 *cost += rtx_cost (op0, mode, FMA, 0, speed); 10456 *cost += rtx_cost (op1, mode, FMA, 1, speed); 10457 *cost += rtx_cost (op2, mode, FMA, 2, speed); 10458 return true; 10459 10460 case FLOAT: 10461 case UNSIGNED_FLOAT: 10462 if (speed) 10463 *cost += extra_cost->fp[mode == DFmode].fromint; 10464 return false; 10465 10466 case FLOAT_EXTEND: 10467 if (speed) 10468 { 10469 if (VECTOR_MODE_P (mode)) 10470 { 10471 /* Vector widening conversion. */ 10472 *cost += extra_cost->vect.alu; 10473 } 10474 else 10475 *cost += extra_cost->fp[mode == DFmode].widen; 10476 } 10477 return false; 10478 10479 case FLOAT_TRUNCATE: 10480 if (speed) 10481 { 10482 if (VECTOR_MODE_P (mode)) 10483 { 10484 /* Vector narrowing conversion. */ 10485 *cost += extra_cost->vect.alu; 10486 } 10487 else 10488 *cost += extra_cost->fp[mode == DFmode].narrow; 10489 } 10490 return false; 10491 10492 case FIX: 10493 case UNSIGNED_FIX: 10494 x = XEXP (x, 0); 10495 /* Strip the rounding part. They will all be implemented 10496 by the fcvt* family of instructions anyway. */ 10497 if (GET_CODE (x) == UNSPEC) 10498 { 10499 unsigned int uns_code = XINT (x, 1); 10500 10501 if (uns_code == UNSPEC_FRINTA 10502 || uns_code == UNSPEC_FRINTM 10503 || uns_code == UNSPEC_FRINTN 10504 || uns_code == UNSPEC_FRINTP 10505 || uns_code == UNSPEC_FRINTZ) 10506 x = XVECEXP (x, 0, 0); 10507 } 10508 10509 if (speed) 10510 { 10511 if (VECTOR_MODE_P (mode)) 10512 *cost += extra_cost->vect.alu; 10513 else 10514 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint; 10515 } 10516 10517 /* We can combine an fmul by a power of 2 followed by an fcvt into a single 10518 fixed-point fcvt. */ 10519 if (GET_CODE (x) == MULT 10520 && ((VECTOR_MODE_P (mode) 10521 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0) 10522 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0)) 10523 { 10524 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code, 10525 0, speed); 10526 return true; 10527 } 10528 10529 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed); 10530 return true; 10531 10532 case ABS: 10533 if (VECTOR_MODE_P (mode)) 10534 { 10535 /* ABS (vector). */ 10536 if (speed) 10537 *cost += extra_cost->vect.alu; 10538 } 10539 else if (GET_MODE_CLASS (mode) == MODE_FLOAT) 10540 { 10541 op0 = XEXP (x, 0); 10542 10543 /* FABD, which is analogous to FADD. */ 10544 if (GET_CODE (op0) == MINUS) 10545 { 10546 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed); 10547 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed); 10548 if (speed) 10549 *cost += extra_cost->fp[mode == DFmode].addsub; 10550 10551 return true; 10552 } 10553 /* Simple FABS is analogous to FNEG. */ 10554 if (speed) 10555 *cost += extra_cost->fp[mode == DFmode].neg; 10556 } 10557 else 10558 { 10559 /* Integer ABS will either be split to 10560 two arithmetic instructions, or will be an ABS 10561 (scalar), which we don't model.
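Either way we cost it as two ALU instructions.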
*/ 10562 *cost = COSTS_N_INSNS (2); 10563 if (speed) 10564 *cost += 2 * extra_cost->alu.arith; 10565 } 10566 return false; 10567 10568 case SMAX: 10569 case SMIN: 10570 if (speed) 10571 { 10572 if (VECTOR_MODE_P (mode)) 10573 *cost += extra_cost->vect.alu; 10574 else 10575 { 10576 /* FMAXNM/FMINNM/FMAX/FMIN. 10577 TODO: This may not be accurate for all implementations, but 10578 we do not model this in the cost tables. */ 10579 *cost += extra_cost->fp[mode == DFmode].addsub; 10580 } 10581 } 10582 return false; 10583 10584 case UNSPEC: 10585 /* The floating point round to integer frint* instructions. */ 10586 if (aarch64_frint_unspec_p (XINT (x, 1))) 10587 { 10588 if (speed) 10589 *cost += extra_cost->fp[mode == DFmode].roundint; 10590 10591 return false; 10592 } 10593 10594 if (XINT (x, 1) == UNSPEC_RBIT) 10595 { 10596 if (speed) 10597 *cost += extra_cost->alu.rev; 10598 10599 return false; 10600 } 10601 break; 10602 10603 case TRUNCATE: 10604 10605 /* Decompose <su>muldi3_highpart. */ 10606 if (/* (truncate:DI */ 10607 mode == DImode 10608 /* (lshiftrt:TI */ 10609 && GET_MODE (XEXP (x, 0)) == TImode 10610 && GET_CODE (XEXP (x, 0)) == LSHIFTRT 10611 /* (mult:TI */ 10612 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT 10613 /* (ANY_EXTEND:TI (reg:DI)) 10614 (ANY_EXTEND:TI (reg:DI))) */ 10615 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND 10616 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND) 10617 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND 10618 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND)) 10619 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode 10620 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode 10621 /* (const_int 64) */ 10622 && CONST_INT_P (XEXP (XEXP (x, 0), 1)) 10623 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64) 10624 { 10625 /* UMULH/SMULH. */ 10626 if (speed) 10627 *cost += extra_cost->mult[mode == DImode].extend; 10628 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0), 10629 mode, MULT, 0, speed); 10630 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0), 10631 mode, MULT, 1, speed); 10632 return true; 10633 } 10634 10635 /* Fall through. */ 10636 default: 10637 break; 10638 } 10639 10640 if (dump_file 10641 && flag_aarch64_verbose_cost) 10642 fprintf (dump_file, 10643 "\nFailed to cost RTX. Assuming default cost.\n"); 10644 10645 return true; 10646 } 10647 10648 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost 10649 calculated for X. This cost is stored in *COST. Returns true 10650 if the total cost of X was calculated. */ 10651 static bool 10652 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer, 10653 int param, int *cost, bool speed) 10654 { 10655 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed); 10656 10657 if (dump_file 10658 && flag_aarch64_verbose_cost) 10659 { 10660 print_rtl_single (dump_file, x); 10661 fprintf (dump_file, "\n%s cost: %d (%s)\n", 10662 speed ? "Hot" : "Cold", 10663 *cost, result ? "final" : "partial"); 10664 } 10665 10666 return result; 10667 } 10668 10669 static int 10670 aarch64_register_move_cost (machine_mode mode, 10671 reg_class_t from_i, reg_class_t to_i) 10672 { 10673 enum reg_class from = (enum reg_class) from_i; 10674 enum reg_class to = (enum reg_class) to_i; 10675 const struct cpu_regmove_cost *regmove_cost 10676 = aarch64_tune_params.regmove_cost; 10677 10678 /* Caller save and pointer regs are equivalent to GENERAL_REGS. 
*/ 10679 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS) 10680 to = GENERAL_REGS; 10681 10682 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS) 10683 from = GENERAL_REGS; 10684 10685 /* Moving between GPR and stack cost is the same as GP2GP. */ 10686 if ((from == GENERAL_REGS && to == STACK_REG) 10687 || (to == GENERAL_REGS && from == STACK_REG)) 10688 return regmove_cost->GP2GP; 10689 10690 /* To/From the stack register, we move via the gprs. */ 10691 if (to == STACK_REG || from == STACK_REG) 10692 return aarch64_register_move_cost (mode, from, GENERAL_REGS) 10693 + aarch64_register_move_cost (mode, GENERAL_REGS, to); 10694 10695 if (known_eq (GET_MODE_SIZE (mode), 16)) 10696 { 10697 /* 128-bit operations on general registers require 2 instructions. */ 10698 if (from == GENERAL_REGS && to == GENERAL_REGS) 10699 return regmove_cost->GP2GP * 2; 10700 else if (from == GENERAL_REGS) 10701 return regmove_cost->GP2FP * 2; 10702 else if (to == GENERAL_REGS) 10703 return regmove_cost->FP2GP * 2; 10704 10705 /* When AdvSIMD instructions are disabled it is not possible to move 10706 a 128-bit value directly between Q registers. This is handled in 10707 secondary reload. A general register is used as a scratch to move 10708 the upper DI value and the lower DI value is moved directly, 10709 hence the cost is the sum of three moves. */ 10710 if (! TARGET_SIMD) 10711 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP; 10712 10713 return regmove_cost->FP2FP; 10714 } 10715 10716 if (from == GENERAL_REGS && to == GENERAL_REGS) 10717 return regmove_cost->GP2GP; 10718 else if (from == GENERAL_REGS) 10719 return regmove_cost->GP2FP; 10720 else if (to == GENERAL_REGS) 10721 return regmove_cost->FP2GP; 10722 10723 return regmove_cost->FP2FP; 10724 } 10725 10726 static int 10727 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED, 10728 reg_class_t rclass ATTRIBUTE_UNUSED, 10729 bool in ATTRIBUTE_UNUSED) 10730 { 10731 return aarch64_tune_params.memmov_cost; 10732 } 10733 10734 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs 10735 to optimize 1.0/sqrt. */ 10736 10737 static bool 10738 use_rsqrt_p (machine_mode mode) 10739 { 10740 return (!flag_trapping_math 10741 && flag_unsafe_math_optimizations 10742 && ((aarch64_tune_params.approx_modes->recip_sqrt 10743 & AARCH64_APPROX_MODE (mode)) 10744 || flag_mrecip_low_precision_sqrt)); 10745 } 10746 10747 /* Function to decide when to use the approximate reciprocal square root 10748 builtin. */ 10749 10750 static tree 10751 aarch64_builtin_reciprocal (tree fndecl) 10752 { 10753 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl)); 10754 10755 if (!use_rsqrt_p (mode)) 10756 return NULL_TREE; 10757 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl)); 10758 } 10759 10760 /* Emit instruction sequence to compute either the approximate square root 10761 or its approximate reciprocal, depending on the flag RECP, and return 10762 whether the sequence was emitted or not. 
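The sequence starts from an FRSQRTE estimate and refines it with FRSQRTS Newton-Raphson steps: two iterations for single precision, three for double precision, and one fewer when the corresponding low-precision flag is in effect.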
*/ 10763 10764 bool 10765 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp) 10766 { 10767 machine_mode mode = GET_MODE (dst); 10768 10769 if (GET_MODE_INNER (mode) == HFmode) 10770 { 10771 gcc_assert (!recp); 10772 return false; 10773 } 10774 10775 if (!recp) 10776 { 10777 if (!(flag_mlow_precision_sqrt 10778 || (aarch64_tune_params.approx_modes->sqrt 10779 & AARCH64_APPROX_MODE (mode)))) 10780 return false; 10781 10782 if (flag_finite_math_only 10783 || flag_trapping_math 10784 || !flag_unsafe_math_optimizations 10785 || optimize_function_for_size_p (cfun)) 10786 return false; 10787 } 10788 else 10789 /* Caller assumes we cannot fail. */ 10790 gcc_assert (use_rsqrt_p (mode)); 10791 10792 machine_mode mmsk = mode_for_int_vector (mode).require (); 10793 rtx xmsk = gen_reg_rtx (mmsk); 10794 if (!recp) 10795 /* When calculating the approximate square root, compare the 10796 argument with 0.0 and create a mask. */ 10797 emit_insn (gen_rtx_SET (xmsk, 10798 gen_rtx_NEG (mmsk, 10799 gen_rtx_EQ (mmsk, src, 10800 CONST0_RTX (mode))))); 10801 10802 /* Estimate the approximate reciprocal square root. */ 10803 rtx xdst = gen_reg_rtx (mode); 10804 emit_insn (gen_aarch64_rsqrte (mode, xdst, src)); 10805 10806 /* Iterate over the series twice for SF and thrice for DF. */ 10807 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2; 10808 10809 /* Optionally iterate over the series once less for faster performance 10810 while sacrificing the accuracy. */ 10811 if ((recp && flag_mrecip_low_precision_sqrt) 10812 || (!recp && flag_mlow_precision_sqrt)) 10813 iterations--; 10814 10815 /* Iterate over the series to calculate the approximate reciprocal square 10816 root. */ 10817 rtx x1 = gen_reg_rtx (mode); 10818 while (iterations--) 10819 { 10820 rtx x2 = gen_reg_rtx (mode); 10821 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst)); 10822 10823 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2)); 10824 10825 if (iterations > 0) 10826 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1)); 10827 } 10828 10829 if (!recp) 10830 { 10831 /* Qualify the approximate reciprocal square root when the argument is 10832 0.0 by squashing the intermediary result to 0.0. */ 10833 rtx xtmp = gen_reg_rtx (mmsk); 10834 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk), 10835 gen_rtx_SUBREG (mmsk, xdst, 0))); 10836 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0)); 10837 10838 /* Calculate the approximate square root. */ 10839 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src)); 10840 } 10841 10842 /* Finalize the approximation. */ 10843 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1)); 10844 10845 return true; 10846 } 10847 10848 /* Emit the instruction sequence to compute the approximation for the division 10849 of NUM by DEN in QUO and return whether the sequence was emitted or not. */ 10850 10851 bool 10852 aarch64_emit_approx_div (rtx quo, rtx num, rtx den) 10853 { 10854 machine_mode mode = GET_MODE (quo); 10855 10856 if (GET_MODE_INNER (mode) == HFmode) 10857 return false; 10858 10859 bool use_approx_division_p = (flag_mlow_precision_div 10860 || (aarch64_tune_params.approx_modes->division 10861 & AARCH64_APPROX_MODE (mode))); 10862 10863 if (!flag_finite_math_only 10864 || flag_trapping_math 10865 || !flag_unsafe_math_optimizations 10866 || optimize_function_for_size_p (cfun) 10867 || !use_approx_division_p) 10868 return false; 10869 10870 if (!TARGET_SIMD && VECTOR_MODE_P (mode)) 10871 return false; 10872 10873 /* Estimate the approximate reciprocal. 
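This uses FRECPE; the loop below then refines the estimate with FRECPS Newton-Raphson steps.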
*/ 10874 rtx xrcp = gen_reg_rtx (mode); 10875 emit_insn (gen_aarch64_frecpe (mode, xrcp, den)); 10876 10877 /* Iterate over the series twice for SF and thrice for DF. */ 10878 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2; 10879 10880 /* Optionally iterate over the series once less for faster performance, 10881 while sacrificing the accuracy. */ 10882 if (flag_mlow_precision_div) 10883 iterations--; 10884 10885 /* Iterate over the series to calculate the approximate reciprocal. */ 10886 rtx xtmp = gen_reg_rtx (mode); 10887 while (iterations--) 10888 { 10889 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den)); 10890 10891 if (iterations > 0) 10892 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp)); 10893 } 10894 10895 if (num != CONST1_RTX (mode)) 10896 { 10897 /* As the approximate reciprocal of DEN is already calculated, only 10898 calculate the approximate division when NUM is not 1.0. */ 10899 rtx xnum = force_reg (mode, num); 10900 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum)); 10901 } 10902 10903 /* Finalize the approximation. */ 10904 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp)); 10905 return true; 10906 } 10907 10908 /* Return the number of instructions that can be issued per cycle. */ 10909 static int 10910 aarch64_sched_issue_rate (void) 10911 { 10912 return aarch64_tune_params.issue_rate; 10913 } 10914 10915 static int 10916 aarch64_sched_first_cycle_multipass_dfa_lookahead (void) 10917 { 10918 int issue_rate = aarch64_sched_issue_rate (); 10919 10920 return issue_rate > 1 && !sched_fusion ? issue_rate : 0; 10921 } 10922 10923 10924 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as 10925 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only 10926 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */ 10927 10928 static int 10929 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn, 10930 int ready_index) 10931 { 10932 return autopref_multipass_dfa_lookahead_guard (insn, ready_index); 10933 } 10934 10935 10936 /* Vectorizer cost model target hooks. */ 10937 10938 /* Implement targetm.vectorize.builtin_vectorization_cost. */ 10939 static int 10940 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost, 10941 tree vectype, 10942 int misalign ATTRIBUTE_UNUSED) 10943 { 10944 unsigned elements; 10945 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs; 10946 bool fp = false; 10947 10948 if (vectype != NULL) 10949 fp = FLOAT_TYPE_P (vectype); 10950 10951 switch (type_of_cost) 10952 { 10953 case scalar_stmt: 10954 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost; 10955 10956 case scalar_load: 10957 return costs->scalar_load_cost; 10958 10959 case scalar_store: 10960 return costs->scalar_store_cost; 10961 10962 case vector_stmt: 10963 return fp ? 
costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost; 10964 10965 case vector_load: 10966 return costs->vec_align_load_cost; 10967 10968 case vector_store: 10969 return costs->vec_store_cost; 10970 10971 case vec_to_scalar: 10972 return costs->vec_to_scalar_cost; 10973 10974 case scalar_to_vec: 10975 return costs->scalar_to_vec_cost; 10976 10977 case unaligned_load: 10978 case vector_gather_load: 10979 return costs->vec_unalign_load_cost; 10980 10981 case unaligned_store: 10982 case vector_scatter_store: 10983 return costs->vec_unalign_store_cost; 10984 10985 case cond_branch_taken: 10986 return costs->cond_taken_branch_cost; 10987 10988 case cond_branch_not_taken: 10989 return costs->cond_not_taken_branch_cost; 10990 10991 case vec_perm: 10992 return costs->vec_permute_cost; 10993 10994 case vec_promote_demote: 10995 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost; 10996 10997 case vec_construct: 10998 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype)); 10999 return elements / 2 + 1; 11000 11001 default: 11002 gcc_unreachable (); 11003 } 11004 } 11005 11006 /* Implement targetm.vectorize.add_stmt_cost. */ 11007 static unsigned 11008 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind, 11009 struct _stmt_vec_info *stmt_info, int misalign, 11010 enum vect_cost_model_location where) 11011 { 11012 unsigned *cost = (unsigned *) data; 11013 unsigned retval = 0; 11014 11015 if (flag_vect_cost_model) 11016 { 11017 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE; 11018 int stmt_cost = 11019 aarch64_builtin_vectorization_cost (kind, vectype, misalign); 11020 11021 /* Statements in an inner loop relative to the loop being 11022 vectorized are weighted more heavily. The value here is 11023 arbitrary and could potentially be improved with analysis. */ 11024 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info)) 11025 count *= 50; /* FIXME */ 11026 11027 retval = (unsigned) (count * stmt_cost); 11028 cost[where] += retval; 11029 } 11030 11031 return retval; 11032 } 11033 11034 static void initialize_aarch64_code_model (struct gcc_options *); 11035 11036 /* Parse the TO_PARSE string and put the architecture struct that it 11037 selects into RES and the architectural features into ISA_FLAGS. 11038 Return an aarch64_parse_opt_result describing the parse result. 11039 If there is an error parsing, RES and ISA_FLAGS are left unchanged. 11040 When the TO_PARSE string contains an invalid extension, 11041 a copy of the string is created and stored to INVALID_EXTENSION. */ 11042 11043 static enum aarch64_parse_opt_result 11044 aarch64_parse_arch (const char *to_parse, const struct processor **res, 11045 unsigned long *isa_flags, std::string *invalid_extension) 11046 { 11047 const char *ext; 11048 const struct processor *arch; 11049 size_t len; 11050 11051 ext = strchr (to_parse, '+'); 11052 11053 if (ext != NULL) 11054 len = ext - to_parse; 11055 else 11056 len = strlen (to_parse); 11057 11058 if (len == 0) 11059 return AARCH64_PARSE_MISSING_ARG; 11060 11061 11062 /* Loop through the list of supported ARCHes to find a match. */ 11063 for (arch = all_architectures; arch->name != NULL; arch++) 11064 { 11065 if (strlen (arch->name) == len 11066 && strncmp (arch->name, to_parse, len) == 0) 11067 { 11068 unsigned long isa_temp = arch->flags; 11069 11070 if (ext != NULL) 11071 { 11072 /* TO_PARSE string contains at least one extension. 
*/ 11073 enum aarch64_parse_opt_result ext_res 11074 = aarch64_parse_extension (ext, &isa_temp, invalid_extension); 11075 11076 if (ext_res != AARCH64_PARSE_OK) 11077 return ext_res; 11078 } 11079 /* Extension parsing was successful. Confirm the result 11080 arch and ISA flags. */ 11081 *res = arch; 11082 *isa_flags = isa_temp; 11083 return AARCH64_PARSE_OK; 11084 } 11085 } 11086 11087 /* ARCH name not found in list. */ 11088 return AARCH64_PARSE_INVALID_ARG; 11089 } 11090 11091 /* Parse the TO_PARSE string and put the CPU struct that it selects into RES 11092 and the architectural features into ISA_FLAGS. Return an aarch64_parse_opt_result 11093 describing the parse result. If there is an error parsing, RES and 11094 ISA_FLAGS are left unchanged. 11095 When the TO_PARSE string contains an invalid extension, 11096 a copy of the string is created and stored to INVALID_EXTENSION. */ 11097 11098 static enum aarch64_parse_opt_result 11099 aarch64_parse_cpu (const char *to_parse, const struct processor **res, 11100 unsigned long *isa_flags, std::string *invalid_extension) 11101 { 11102 const char *ext; 11103 const struct processor *cpu; 11104 size_t len; 11105 11106 ext = strchr (to_parse, '+'); 11107 11108 if (ext != NULL) 11109 len = ext - to_parse; 11110 else 11111 len = strlen (to_parse); 11112 11113 if (len == 0) 11114 return AARCH64_PARSE_MISSING_ARG; 11115 11116 11117 /* Loop through the list of supported CPUs to find a match. */ 11118 for (cpu = all_cores; cpu->name != NULL; cpu++) 11119 { 11120 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0) 11121 { 11122 unsigned long isa_temp = cpu->flags; 11123 11124 11125 if (ext != NULL) 11126 { 11127 /* TO_PARSE string contains at least one extension. */ 11128 enum aarch64_parse_opt_result ext_res 11129 = aarch64_parse_extension (ext, &isa_temp, invalid_extension); 11130 11131 if (ext_res != AARCH64_PARSE_OK) 11132 return ext_res; 11133 } 11134 /* Extension parsing was successful. Confirm the result 11135 cpu and ISA flags. */ 11136 *res = cpu; 11137 *isa_flags = isa_temp; 11138 return AARCH64_PARSE_OK; 11139 } 11140 } 11141 11142 /* CPU name not found in list. */ 11143 return AARCH64_PARSE_INVALID_ARG; 11144 } 11145 11146 /* Parse the TO_PARSE string and put the cpu it selects into RES. 11147 Return an aarch64_parse_opt_result describing the parse result. 11148 If the parsing fails, RES does not change. */ 11149 11150 static enum aarch64_parse_opt_result 11151 aarch64_parse_tune (const char *to_parse, const struct processor **res) 11152 { 11153 const struct processor *cpu; 11154 11155 /* Loop through the list of supported CPUs to find a match. */ 11156 for (cpu = all_cores; cpu->name != NULL; cpu++) 11157 { 11158 if (strcmp (cpu->name, to_parse) == 0) 11159 { 11160 *res = cpu; 11161 return AARCH64_PARSE_OK; 11162 } 11163 } 11164 11165 /* CPU name not found in list. */ 11166 return AARCH64_PARSE_INVALID_ARG; 11167 } 11168 11169 /* Parse TOKEN, which has length LENGTH to see if it is an option 11170 described in FLAG. If it is, return the index bit for that fusion type. 11171 If not, error (printing OPTION_NAME) and return zero.
*/ 11172 11173 static unsigned int 11174 aarch64_parse_one_option_token (const char *token, 11175 size_t length, 11176 const struct aarch64_flag_desc *flag, 11177 const char *option_name) 11178 { 11179 for (; flag->name != NULL; flag++) 11180 { 11181 if (length == strlen (flag->name) 11182 && !strncmp (flag->name, token, length)) 11183 return flag->flag; 11184 } 11185 11186 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token); 11187 return 0; 11188 } 11189 11190 /* Parse OPTION which is a dot-separated list of flags to enable. 11191 FLAGS gives the list of flags we understand, INITIAL_STATE gives any 11192 default state we inherit from the CPU tuning structures. OPTION_NAME 11193 gives the top-level option we are parsing in the -moverride string, 11194 for use in error messages. */ 11195 11196 static unsigned int 11197 aarch64_parse_boolean_options (const char *option, 11198 const struct aarch64_flag_desc *flags, 11199 unsigned int initial_state, 11200 const char *option_name) 11201 { 11202 const char separator = '.'; 11203 const char* specs = option; 11204 const char* ntoken = option; 11205 unsigned int found_flags = initial_state; 11206 11207 while ((ntoken = strchr (specs, separator))) 11208 { 11209 size_t token_length = ntoken - specs; 11210 unsigned token_ops = aarch64_parse_one_option_token (specs, 11211 token_length, 11212 flags, 11213 option_name); 11214 /* If we find "none" (or, for simplicity's sake, an error) anywhere 11215 in the token stream, reset the supported operations. So: 11216 11217 adrp+add.cmp+branch.none.adrp+add 11218 11219 would have the result of turning on only adrp+add fusion. */ 11220 if (!token_ops) 11221 found_flags = 0; 11222 11223 found_flags |= token_ops; 11224 specs = ++ntoken; 11225 } 11226 11227 /* The string ended with a trailing separator; report it as ill-formed. */ 11228 if (!(*specs)) 11229 { 11230 error ("%s string ill-formed\n", option_name); 11231 return 0; 11232 } 11233 11234 /* We still have one more token to parse. */ 11235 size_t token_length = strlen (specs); 11236 unsigned token_ops = aarch64_parse_one_option_token (specs, 11237 token_length, 11238 flags, 11239 option_name); 11240 if (!token_ops) 11241 found_flags = 0; 11242 11243 found_flags |= token_ops; 11244 return found_flags; 11245 } 11246 11247 /* Support for overriding instruction fusion. */ 11248 11249 static void 11250 aarch64_parse_fuse_string (const char *fuse_string, 11251 struct tune_params *tune) 11252 { 11253 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string, 11254 aarch64_fusible_pairs, 11255 tune->fusible_ops, 11256 "fuse="); 11257 } 11258 11259 /* Support for overriding other tuning flags. */ 11260 11261 static void 11262 aarch64_parse_tune_string (const char *tune_string, 11263 struct tune_params *tune) 11264 { 11265 tune->extra_tuning_flags 11266 = aarch64_parse_boolean_options (tune_string, 11267 aarch64_tuning_flags, 11268 tune->extra_tuning_flags, 11269 "tune="); 11270 } 11271 11272 /* Parse the sve_width tuning -moverride string in TUNE_STRING. 11273 Accept the valid SVE vector widths allowed by 11274 aarch64_sve_vector_bits_enum and use it to override sve_width 11275 in TUNE.
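For example, -moverride=sve_width=256 selects SVE_256.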
*/ 11276 11277 static void 11278 aarch64_parse_sve_width_string (const char *tune_string, 11279 struct tune_params *tune) 11280 { 11281 int width = -1; 11282 11283 int n = sscanf (tune_string, "%d", &width); 11284 if (n == EOF) 11285 { 11286 error ("invalid format for sve_width"); 11287 return; 11288 } 11289 switch (width) 11290 { 11291 case SVE_128: 11292 case SVE_256: 11293 case SVE_512: 11294 case SVE_1024: 11295 case SVE_2048: 11296 break; 11297 default: 11298 error ("invalid sve_width value: %d", width); 11299 } 11300 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width; 11301 } 11302 11303 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option 11304 we understand. If it is, extract the option string and handoff to 11305 the appropriate function. */ 11306 11307 void 11308 aarch64_parse_one_override_token (const char* token, 11309 size_t length, 11310 struct tune_params *tune) 11311 { 11312 const struct aarch64_tuning_override_function *fn 11313 = aarch64_tuning_override_functions; 11314 11315 const char *option_part = strchr (token, '='); 11316 if (!option_part) 11317 { 11318 error ("tuning string missing in option (%s)", token); 11319 return; 11320 } 11321 11322 /* Get the length of the option name. */ 11323 length = option_part - token; 11324 /* Skip the '=' to get to the option string. */ 11325 option_part++; 11326 11327 for (; fn->name != NULL; fn++) 11328 { 11329 if (!strncmp (fn->name, token, length)) 11330 { 11331 fn->parse_override (option_part, tune); 11332 return; 11333 } 11334 } 11335 11336 error ("unknown tuning option (%s)",token); 11337 return; 11338 } 11339 11340 /* A checking mechanism for the implementation of the tls size. */ 11341 11342 static void 11343 initialize_aarch64_tls_size (struct gcc_options *opts) 11344 { 11345 if (aarch64_tls_size == 0) 11346 aarch64_tls_size = 24; 11347 11348 switch (opts->x_aarch64_cmodel_var) 11349 { 11350 case AARCH64_CMODEL_TINY: 11351 /* Both the default and maximum TLS size allowed under tiny is 1M which 11352 needs two instructions to address, so we clamp the size to 24. */ 11353 if (aarch64_tls_size > 24) 11354 aarch64_tls_size = 24; 11355 break; 11356 case AARCH64_CMODEL_SMALL: 11357 /* The maximum TLS size allowed under small is 4G. */ 11358 if (aarch64_tls_size > 32) 11359 aarch64_tls_size = 32; 11360 break; 11361 case AARCH64_CMODEL_LARGE: 11362 /* The maximum TLS size allowed under large is 16E. 11363 FIXME: 16E should be 64bit, we only support 48bit offset now. */ 11364 if (aarch64_tls_size > 48) 11365 aarch64_tls_size = 48; 11366 break; 11367 default: 11368 gcc_unreachable (); 11369 } 11370 11371 return; 11372 } 11373 11374 /* Parse STRING looking for options in the format: 11375 string :: option:string 11376 option :: name=substring 11377 name :: {a-z} 11378 substring :: defined by option. */ 11379 11380 static void 11381 aarch64_parse_override_string (const char* input_string, 11382 struct tune_params* tune) 11383 { 11384 const char separator = ':'; 11385 size_t string_length = strlen (input_string) + 1; 11386 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length); 11387 char *string = string_root; 11388 strncpy (string, input_string, string_length); 11389 string[string_length - 1] = '\0'; 11390 11391 char* ntoken = string; 11392 11393 while ((ntoken = strchr (string, separator))) 11394 { 11395 size_t token_length = ntoken - string; 11396 /* Make this substring look like a string. 
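Purely as an illustration (assuming flag names the tuning tables provide), given an override string such as "fuse=adrp+add:tune=rename_fma_regs" the ':' found above is overwritten with a terminating NUL below, so STRING temporarily reads as just "fuse=adrp+add" for the call that follows, and is then advanced past the separator for the next iteration.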
*/ 11397 *ntoken = '\0'; 11398 aarch64_parse_one_override_token (string, token_length, tune); 11399 string = ++ntoken; 11400 } 11401 11402 /* One last option to parse. */ 11403 aarch64_parse_one_override_token (string, strlen (string), tune); 11404 free (string_root); 11405 } 11406 11407 11408 static void 11409 aarch64_override_options_after_change_1 (struct gcc_options *opts) 11410 { 11411 if (accepted_branch_protection_string) 11412 { 11413 opts->x_aarch64_branch_protection_string 11414 = xstrdup (accepted_branch_protection_string); 11415 } 11416 11417 /* PR 70044: We have to be careful about being called multiple times for the 11418 same function. This means all changes should be repeatable. */ 11419 11420 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer. 11421 Disable the frame pointer flag so the mid-end will not use a frame 11422 pointer in leaf functions in order to support -fomit-leaf-frame-pointer. 11423 Set x_flag_omit_frame_pointer to the special value 2 to differentiate 11424 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */ 11425 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1; 11426 if (opts->x_flag_omit_frame_pointer == 0) 11427 opts->x_flag_omit_frame_pointer = 2; 11428 11429 /* If not optimizing for size, set the default 11430 alignment to what the target wants. */ 11431 if (!opts->x_optimize_size) 11432 { 11433 if (opts->x_flag_align_loops && !opts->x_str_align_loops) 11434 opts->x_str_align_loops = aarch64_tune_params.loop_align; 11435 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps) 11436 opts->x_str_align_jumps = aarch64_tune_params.jump_align; 11437 if (opts->x_flag_align_functions && !opts->x_str_align_functions) 11438 opts->x_str_align_functions = aarch64_tune_params.function_align; 11439 } 11440 11441 /* We default to no pc-relative literal loads. */ 11442 11443 aarch64_pcrelative_literal_loads = false; 11444 11445 /* If -mpc-relative-literal-loads is set on the command line, this 11446 implies that the user asked for PC relative literal loads. */ 11447 if (opts->x_pcrelative_literal_loads == 1) 11448 aarch64_pcrelative_literal_loads = true; 11449 11450 /* In the tiny memory model it makes no sense to disallow PC relative 11451 literal pool loads. */ 11452 if (aarch64_cmodel == AARCH64_CMODEL_TINY 11453 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC) 11454 aarch64_pcrelative_literal_loads = true; 11455 11456 /* When enabling the lower precision Newton series for the square root, also 11457 enable it for the reciprocal square root, since the latter is an 11458 intermediary step for the former. */ 11459 if (flag_mlow_precision_sqrt) 11460 flag_mrecip_low_precision_sqrt = true; 11461 } 11462 11463 /* 'Unpack' up the internal tuning structs and update the options 11464 in OPTS. The caller must have set up selected_tune and selected_arch 11465 as all the other target-specific codegen decisions are 11466 derived from them. */ 11467 11468 void 11469 aarch64_override_options_internal (struct gcc_options *opts) 11470 { 11471 aarch64_tune_flags = selected_tune->flags; 11472 aarch64_tune = selected_tune->sched_core; 11473 /* Make a copy of the tuning parameters attached to the core, which 11474 we may later overwrite. 
*/ 11475 aarch64_tune_params = *(selected_tune->tune); 11476 aarch64_architecture_version = selected_arch->architecture_version; 11477 11478 if (opts->x_aarch64_override_tune_string) 11479 aarch64_parse_override_string (opts->x_aarch64_override_tune_string, 11480 &aarch64_tune_params); 11481 11482 /* This target defaults to strict volatile bitfields. */ 11483 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2)) 11484 opts->x_flag_strict_volatile_bitfields = 1; 11485 11486 if (aarch64_stack_protector_guard == SSP_GLOBAL 11487 && opts->x_aarch64_stack_protector_guard_offset_str) 11488 { 11489 error ("incompatible options %<-mstack-protector-guard=global%> and " 11490 "%<-mstack-protector-guard-offset=%s%>", 11491 aarch64_stack_protector_guard_offset_str); 11492 } 11493 11494 if (aarch64_stack_protector_guard == SSP_SYSREG 11495 && !(opts->x_aarch64_stack_protector_guard_offset_str 11496 && opts->x_aarch64_stack_protector_guard_reg_str)) 11497 { 11498 error ("both %<-mstack-protector-guard-offset%> and " 11499 "%<-mstack-protector-guard-reg%> must be used " 11500 "with %<-mstack-protector-guard=sysreg%>"); 11501 } 11502 11503 if (opts->x_aarch64_stack_protector_guard_reg_str) 11504 { 11505 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100) 11506 error ("specify a system register with a small string length."); 11507 } 11508 11509 if (opts->x_aarch64_stack_protector_guard_offset_str) 11510 { 11511 char *end; 11512 const char *str = aarch64_stack_protector_guard_offset_str; 11513 errno = 0; 11514 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0); 11515 if (!*str || *end || errno) 11516 error ("%qs is not a valid offset in %qs", str, 11517 "-mstack-protector-guard-offset="); 11518 aarch64_stack_protector_guard_offset = offs; 11519 } 11520 11521 initialize_aarch64_code_model (opts); 11522 initialize_aarch64_tls_size (opts); 11523 11524 int queue_depth = 0; 11525 switch (aarch64_tune_params.autoprefetcher_model) 11526 { 11527 case tune_params::AUTOPREFETCHER_OFF: 11528 queue_depth = -1; 11529 break; 11530 case tune_params::AUTOPREFETCHER_WEAK: 11531 queue_depth = 0; 11532 break; 11533 case tune_params::AUTOPREFETCHER_STRONG: 11534 queue_depth = max_insn_queue_index + 1; 11535 break; 11536 default: 11537 gcc_unreachable (); 11538 } 11539 11540 /* We don't mind passing in global_options_set here as we don't use 11541 the *options_set structs anyway. */ 11542 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH, 11543 queue_depth, 11544 opts->x_param_values, 11545 global_options_set.x_param_values); 11546 11547 /* Set up parameters to be used in prefetching algorithm. Do not 11548 override the defaults unless we are tuning for a core we have 11549 researched values for. 
*/ 11550 if (aarch64_tune_params.prefetch->num_slots > 0) 11551 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES, 11552 aarch64_tune_params.prefetch->num_slots, 11553 opts->x_param_values, 11554 global_options_set.x_param_values); 11555 if (aarch64_tune_params.prefetch->l1_cache_size >= 0) 11556 maybe_set_param_value (PARAM_L1_CACHE_SIZE, 11557 aarch64_tune_params.prefetch->l1_cache_size, 11558 opts->x_param_values, 11559 global_options_set.x_param_values); 11560 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0) 11561 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE, 11562 aarch64_tune_params.prefetch->l1_cache_line_size, 11563 opts->x_param_values, 11564 global_options_set.x_param_values); 11565 if (aarch64_tune_params.prefetch->l2_cache_size >= 0) 11566 maybe_set_param_value (PARAM_L2_CACHE_SIZE, 11567 aarch64_tune_params.prefetch->l2_cache_size, 11568 opts->x_param_values, 11569 global_options_set.x_param_values); 11570 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides) 11571 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES, 11572 0, 11573 opts->x_param_values, 11574 global_options_set.x_param_values); 11575 if (aarch64_tune_params.prefetch->minimum_stride >= 0) 11576 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE, 11577 aarch64_tune_params.prefetch->minimum_stride, 11578 opts->x_param_values, 11579 global_options_set.x_param_values); 11580 11581 /* Use the alternative scheduling-pressure algorithm by default. */ 11582 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL, 11583 opts->x_param_values, 11584 global_options_set.x_param_values); 11585 11586 /* If the user hasn't changed it via configure then set the default to 64 KB 11587 for the backend. */ 11588 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE, 11589 DEFAULT_STK_CLASH_GUARD_SIZE == 0 11590 ? 16 : DEFAULT_STK_CLASH_GUARD_SIZE, 11591 opts->x_param_values, 11592 global_options_set.x_param_values); 11593 11594 /* Validate the guard size. */ 11595 int guard_size = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE); 11596 11597 /* Enforce that interval is the same size as size so the mid-end does the 11598 right thing. */ 11599 maybe_set_param_value (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL, 11600 guard_size, 11601 opts->x_param_values, 11602 global_options_set.x_param_values); 11603 11604 /* The maybe_set calls won't update the value if the user has explicitly set 11605 one. Which means we need to validate that probing interval and guard size 11606 are equal. */ 11607 int probe_interval 11608 = PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL); 11609 if (guard_size != probe_interval) 11610 error ("stack clash guard size %<%d%> must be equal to probing interval " 11611 "%<%d%>", guard_size, probe_interval); 11612 11613 /* Enable sw prefetching at specified optimization level for 11614 CPUS that have prefetch. Lower optimization level threshold by 1 11615 when profiling is enabled. 
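For example, assuming a core whose tuning structure sets default_opt_level to 3, compiling at -O3 without -Os turns on -fprefetch-loop-arrays below, unless the user already passed an explicit -fprefetch-loop-arrays or -fno-prefetch-loop-arrays on the command line.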
*/ 11616 if (opts->x_flag_prefetch_loop_arrays < 0 11617 && !opts->x_optimize_size 11618 && aarch64_tune_params.prefetch->default_opt_level >= 0 11619 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level) 11620 opts->x_flag_prefetch_loop_arrays = 1; 11621 11622 if (opts->x_aarch64_arch_string == NULL) 11623 opts->x_aarch64_arch_string = selected_arch->name; 11624 if (opts->x_aarch64_cpu_string == NULL) 11625 opts->x_aarch64_cpu_string = selected_cpu->name; 11626 if (opts->x_aarch64_tune_string == NULL) 11627 opts->x_aarch64_tune_string = selected_tune->name; 11628 11629 aarch64_override_options_after_change_1 (opts); 11630 } 11631 11632 /* Print a hint with a suggestion for a core or architecture name that 11633 most closely resembles what the user passed in STR. ARCH is true if 11634 the user is asking for an architecture name. ARCH is false if the user 11635 is asking for a core name. */ 11636 11637 static void 11638 aarch64_print_hint_for_core_or_arch (const char *str, bool arch) 11639 { 11640 auto_vec<const char *> candidates; 11641 const struct processor *entry = arch ? all_architectures : all_cores; 11642 for (; entry->name != NULL; entry++) 11643 candidates.safe_push (entry->name); 11644 11645 #ifdef HAVE_LOCAL_CPU_DETECT 11646 /* Add also "native" as possible value. */ 11647 if (arch) 11648 candidates.safe_push ("native"); 11649 #endif 11650 11651 char *s; 11652 const char *hint = candidates_list_and_hint (str, s, candidates); 11653 if (hint) 11654 inform (input_location, "valid arguments are: %s;" 11655 " did you mean %qs?", s, hint); 11656 else 11657 inform (input_location, "valid arguments are: %s", s); 11658 11659 XDELETEVEC (s); 11660 } 11661 11662 /* Print a hint with a suggestion for a core name that most closely resembles 11663 what the user passed in STR. */ 11664 11665 inline static void 11666 aarch64_print_hint_for_core (const char *str) 11667 { 11668 aarch64_print_hint_for_core_or_arch (str, false); 11669 } 11670 11671 /* Print a hint with a suggestion for an architecture name that most closely 11672 resembles what the user passed in STR. */ 11673 11674 inline static void 11675 aarch64_print_hint_for_arch (const char *str) 11676 { 11677 aarch64_print_hint_for_core_or_arch (str, true); 11678 } 11679 11680 11681 /* Print a hint with a suggestion for an extension name 11682 that most closely resembles what the user passed in STR. */ 11683 11684 void 11685 aarch64_print_hint_for_extensions (const std::string &str) 11686 { 11687 auto_vec<const char *> candidates; 11688 aarch64_get_all_extension_candidates (&candidates); 11689 char *s; 11690 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates); 11691 if (hint) 11692 inform (input_location, "valid arguments are: %s;" 11693 " did you mean %qs?", s, hint); 11694 else 11695 inform (input_location, "valid arguments are: %s;", s); 11696 11697 XDELETEVEC (s); 11698 } 11699 11700 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any) 11701 specified in STR and throw errors if appropriate. Put the results if 11702 they are valid in RES and ISA_FLAGS. Return whether the option is 11703 valid. 
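For example, -mcpu=cortex-a57+crypto stores the cortex-a57 processor entry in RES and ORs the crypto feature bit into ISA_FLAGS, while a name that is not in the table produces the "unknown value" error together with a suggestion hint.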
*/ 11704 11705 static bool 11706 aarch64_validate_mcpu (const char *str, const struct processor **res, 11707 unsigned long *isa_flags) 11708 { 11709 std::string invalid_extension; 11710 enum aarch64_parse_opt_result parse_res 11711 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension); 11712 11713 if (parse_res == AARCH64_PARSE_OK) 11714 return true; 11715 11716 switch (parse_res) 11717 { 11718 case AARCH64_PARSE_MISSING_ARG: 11719 error ("missing cpu name in %<-mcpu=%s%>", str); 11720 break; 11721 case AARCH64_PARSE_INVALID_ARG: 11722 error ("unknown value %qs for %<-mcpu%>", str); 11723 aarch64_print_hint_for_core (str); 11724 break; 11725 case AARCH64_PARSE_INVALID_FEATURE: 11726 error ("invalid feature modifier %qs in %<-mcpu=%s%>", 11727 invalid_extension.c_str (), str); 11728 aarch64_print_hint_for_extensions (invalid_extension); 11729 break; 11730 default: 11731 gcc_unreachable (); 11732 } 11733 11734 return false; 11735 } 11736 11737 /* Parses CONST_STR for branch protection features specified in 11738 aarch64_branch_protect_types, and set any global variables required. Returns 11739 the parsing result and assigns LAST_STR to the last processed token from 11740 CONST_STR so that it can be used for error reporting. */ 11741 11742 static enum 11743 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str, 11744 char** last_str) 11745 { 11746 char *str_root = xstrdup (const_str); 11747 char* token_save = NULL; 11748 char *str = strtok_r (str_root, "+", &token_save); 11749 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK; 11750 if (!str) 11751 res = AARCH64_PARSE_MISSING_ARG; 11752 else 11753 { 11754 char *next_str = strtok_r (NULL, "+", &token_save); 11755 /* Reset the branch protection features to their defaults. */ 11756 aarch64_handle_no_branch_protection (NULL, NULL); 11757 11758 while (str && res == AARCH64_PARSE_OK) 11759 { 11760 const aarch64_branch_protect_type* type = aarch64_branch_protect_types; 11761 bool found = false; 11762 /* Search for this type. */ 11763 while (type && type->name && !found && res == AARCH64_PARSE_OK) 11764 { 11765 if (strcmp (str, type->name) == 0) 11766 { 11767 found = true; 11768 res = type->handler (str, next_str); 11769 str = next_str; 11770 next_str = strtok_r (NULL, "+", &token_save); 11771 } 11772 else 11773 type++; 11774 } 11775 if (found && res == AARCH64_PARSE_OK) 11776 { 11777 bool found_subtype = true; 11778 /* Loop through each token until we find one that isn't a 11779 subtype. */ 11780 while (found_subtype) 11781 { 11782 found_subtype = false; 11783 const aarch64_branch_protect_type *subtype = type->subtypes; 11784 /* Search for the subtype. */ 11785 while (str && subtype && subtype->name && !found_subtype 11786 && res == AARCH64_PARSE_OK) 11787 { 11788 if (strcmp (str, subtype->name) == 0) 11789 { 11790 found_subtype = true; 11791 res = subtype->handler (str, next_str); 11792 str = next_str; 11793 next_str = strtok_r (NULL, "+", &token_save); 11794 } 11795 else 11796 subtype++; 11797 } 11798 } 11799 } 11800 else if (!found) 11801 res = AARCH64_PARSE_INVALID_ARG; 11802 } 11803 } 11804 /* Copy the last processed token into the argument to pass it back. 11805 Used by option and attribute validation to print the offending token. */ 11806 if (last_str) 11807 { 11808 if (str) strcpy (*last_str, str); 11809 else *last_str = NULL; 11810 } 11811 if (res == AARCH64_PARSE_OK) 11812 { 11813 /* If needed, alloc the accepted string then copy in const_str. 11814 Used by override_option_after_change_1. 
*/ 11815 if (!accepted_branch_protection_string) 11816 accepted_branch_protection_string = (char *) xmalloc ( 11817 BRANCH_PROTECT_STR_MAX 11818 + 1); 11819 strncpy (accepted_branch_protection_string, const_str, 11820 BRANCH_PROTECT_STR_MAX + 1); 11821 /* Forcibly null-terminate. */ 11822 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0'; 11823 } 11824 return res; 11825 } 11826 11827 static bool 11828 aarch64_validate_mbranch_protection (const char *const_str) 11829 { 11830 char *str = (char *) xmalloc (strlen (const_str)); 11831 enum aarch64_parse_opt_result res = 11832 aarch64_parse_branch_protection (const_str, &str); 11833 if (res == AARCH64_PARSE_INVALID_ARG) 11834 error ("invalid arg %<%s%> for %<-mbranch-protection=%>", str); 11835 else if (res == AARCH64_PARSE_MISSING_ARG) 11836 error ("missing arg for %<-mbranch-protection=%>"); 11837 free (str); 11838 return res == AARCH64_PARSE_OK; 11839 } 11840 11841 /* Validate a command-line -march option. Parse the arch and extensions 11842 (if any) specified in STR and throw errors if appropriate. Put the 11843 results, if they are valid, in RES and ISA_FLAGS. Return whether the 11844 option is valid. */ 11845 11846 static bool 11847 aarch64_validate_march (const char *str, const struct processor **res, 11848 unsigned long *isa_flags) 11849 { 11850 std::string invalid_extension; 11851 enum aarch64_parse_opt_result parse_res 11852 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension); 11853 11854 if (parse_res == AARCH64_PARSE_OK) 11855 return true; 11856 11857 switch (parse_res) 11858 { 11859 case AARCH64_PARSE_MISSING_ARG: 11860 error ("missing arch name in %<-march=%s%>", str); 11861 break; 11862 case AARCH64_PARSE_INVALID_ARG: 11863 error ("unknown value %qs for %<-march%>", str); 11864 aarch64_print_hint_for_arch (str); 11865 break; 11866 case AARCH64_PARSE_INVALID_FEATURE: 11867 error ("invalid feature modifier %qs in %<-march=%s%>", 11868 invalid_extension.c_str (), str); 11869 aarch64_print_hint_for_extensions (invalid_extension); 11870 break; 11871 default: 11872 gcc_unreachable (); 11873 } 11874 11875 return false; 11876 } 11877 11878 /* Validate a command-line -mtune option. Parse the cpu 11879 specified in STR and throw errors if appropriate. Put the 11880 result, if it is valid, in RES. Return whether the option is 11881 valid. */ 11882 11883 static bool 11884 aarch64_validate_mtune (const char *str, const struct processor **res) 11885 { 11886 enum aarch64_parse_opt_result parse_res 11887 = aarch64_parse_tune (str, res); 11888 11889 if (parse_res == AARCH64_PARSE_OK) 11890 return true; 11891 11892 switch (parse_res) 11893 { 11894 case AARCH64_PARSE_MISSING_ARG: 11895 error ("missing cpu name in %<-mtune=%s%>", str); 11896 break; 11897 case AARCH64_PARSE_INVALID_ARG: 11898 error ("unknown value %qs for %<-mtune%>", str); 11899 aarch64_print_hint_for_core (str); 11900 break; 11901 default: 11902 gcc_unreachable (); 11903 } 11904 return false; 11905 } 11906 11907 /* Return the CPU corresponding to the enum CPU. 11908 If it doesn't specify a cpu, return the default. */ 11909 11910 static const struct processor * 11911 aarch64_get_tune_cpu (enum aarch64_processor cpu) 11912 { 11913 if (cpu != aarch64_none) 11914 return &all_cores[cpu]; 11915 11916 /* The & 0x3f is to extract the bottom 6 bits that encode the 11917 default cpu as selected by the --with-cpu GCC configure option 11918 in config.gcc. 
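For example, a toolchain configured with --with-cpu=cortex-a53 encodes the cortexa53 enumerator in those low bits, with any default extension flags held in the bits above them, so masking with 0x3f recovers the index into all_cores.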
11919 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS 11920 flags mechanism should be reworked to make it more sane. */ 11921 return &all_cores[TARGET_CPU_DEFAULT & 0x3f]; 11922 } 11923 11924 /* Return the architecture corresponding to the enum ARCH. 11925 If it doesn't specify a valid architecture, return the default. */ 11926 11927 static const struct processor * 11928 aarch64_get_arch (enum aarch64_arch arch) 11929 { 11930 if (arch != aarch64_no_arch) 11931 return &all_architectures[arch]; 11932 11933 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f]; 11934 11935 return &all_architectures[cpu->arch]; 11936 } 11937 11938 /* Return the VG value associated with -msve-vector-bits= value VALUE. */ 11939 11940 static poly_uint16 11941 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value) 11942 { 11943 /* For now generate vector-length agnostic code for -msve-vector-bits=128. 11944 This ensures we can clearly distinguish SVE and Advanced SIMD modes when 11945 deciding which .md file patterns to use and when deciding whether 11946 something is a legitimate address or constant. */ 11947 if (value == SVE_SCALABLE || value == SVE_128) 11948 return poly_uint16 (2, 2); 11949 else 11950 return (int) value / 64; 11951 } 11952 11953 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning 11954 and is used to parse the -m{cpu,tune,arch} strings and setup the initial 11955 tuning structs. In particular it must set selected_tune and 11956 aarch64_isa_flags that define the available ISA features and tuning 11957 decisions. It must also set selected_arch as this will be used to 11958 output the .arch asm tags for each function. */ 11959 11960 static void 11961 aarch64_override_options (void) 11962 { 11963 unsigned long cpu_isa = 0; 11964 unsigned long arch_isa = 0; 11965 aarch64_isa_flags = 0; 11966 11967 bool valid_cpu = true; 11968 bool valid_tune = true; 11969 bool valid_arch = true; 11970 11971 selected_cpu = NULL; 11972 selected_arch = NULL; 11973 selected_tune = NULL; 11974 11975 if (aarch64_branch_protection_string) 11976 aarch64_validate_mbranch_protection (aarch64_branch_protection_string); 11977 11978 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU. 11979 If either of -march or -mtune is given, they override their 11980 respective component of -mcpu. */ 11981 if (aarch64_cpu_string) 11982 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu, 11983 &cpu_isa); 11984 11985 if (aarch64_arch_string) 11986 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch, 11987 &arch_isa); 11988 11989 if (aarch64_tune_string) 11990 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune); 11991 11992 #ifdef SUBTARGET_OVERRIDE_OPTIONS 11993 SUBTARGET_OVERRIDE_OPTIONS; 11994 #endif 11995 11996 /* If the user did not specify a processor, choose the default 11997 one for them. This will be the CPU set during configuration using 11998 --with-cpu, otherwise it is "generic". */ 11999 if (!selected_cpu) 12000 { 12001 if (selected_arch) 12002 { 12003 selected_cpu = &all_cores[selected_arch->ident]; 12004 aarch64_isa_flags = arch_isa; 12005 explicit_arch = selected_arch->arch; 12006 } 12007 else 12008 { 12009 /* Get default configure-time CPU. 
*/ 12010 selected_cpu = aarch64_get_tune_cpu (aarch64_none); 12011 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6; 12012 } 12013 12014 if (selected_tune) 12015 explicit_tune_core = selected_tune->ident; 12016 } 12017 /* If both -mcpu and -march are specified check that they are architecturally 12018 compatible, warn if they're not and prefer the -march ISA flags. */ 12019 else if (selected_arch) 12020 { 12021 if (selected_arch->arch != selected_cpu->arch) 12022 { 12023 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch", 12024 all_architectures[selected_cpu->arch].name, 12025 selected_arch->name); 12026 } 12027 aarch64_isa_flags = arch_isa; 12028 explicit_arch = selected_arch->arch; 12029 explicit_tune_core = selected_tune ? selected_tune->ident 12030 : selected_cpu->ident; 12031 } 12032 else 12033 { 12034 /* -mcpu but no -march. */ 12035 aarch64_isa_flags = cpu_isa; 12036 explicit_tune_core = selected_tune ? selected_tune->ident 12037 : selected_cpu->ident; 12038 gcc_assert (selected_cpu); 12039 selected_arch = &all_architectures[selected_cpu->arch]; 12040 explicit_arch = selected_arch->arch; 12041 } 12042 12043 /* Set the arch as well as we will need it when outputing 12044 the .arch directive in assembly. */ 12045 if (!selected_arch) 12046 { 12047 gcc_assert (selected_cpu); 12048 selected_arch = &all_architectures[selected_cpu->arch]; 12049 } 12050 12051 if (!selected_tune) 12052 selected_tune = selected_cpu; 12053 12054 if (aarch64_enable_bti == 2) 12055 { 12056 #ifdef TARGET_ENABLE_BTI 12057 aarch64_enable_bti = 1; 12058 #else 12059 aarch64_enable_bti = 0; 12060 #endif 12061 } 12062 12063 /* Return address signing is currently not supported for ILP32 targets. For 12064 LP64 targets use the configured option in the absence of a command-line 12065 option for -mbranch-protection. */ 12066 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL) 12067 { 12068 #ifdef TARGET_ENABLE_PAC_RET 12069 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF; 12070 #else 12071 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE; 12072 #endif 12073 } 12074 12075 #ifndef HAVE_AS_MABI_OPTION 12076 /* The compiler may have been configured with 2.23.* binutils, which does 12077 not have support for ILP32. */ 12078 if (TARGET_ILP32) 12079 error ("assembler does not support %<-mabi=ilp32%>"); 12080 #endif 12081 12082 /* Convert -msve-vector-bits to a VG count. */ 12083 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits); 12084 12085 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32) 12086 sorry ("return address signing is only supported for %<-mabi=lp64%>"); 12087 12088 /* Make sure we properly set up the explicit options. */ 12089 if ((aarch64_cpu_string && valid_cpu) 12090 || (aarch64_tune_string && valid_tune)) 12091 gcc_assert (explicit_tune_core != aarch64_none); 12092 12093 if ((aarch64_cpu_string && valid_cpu) 12094 || (aarch64_arch_string && valid_arch)) 12095 gcc_assert (explicit_arch != aarch64_no_arch); 12096 12097 /* The pass to insert speculation tracking runs before 12098 shrink-wrapping and the latter does not know how to update the 12099 tracking status. So disable it in this case. */ 12100 if (aarch64_track_speculation) 12101 flag_shrink_wrap = 0; 12102 12103 aarch64_override_options_internal (&global_options); 12104 12105 /* Save these options as the default ones in case we push and pop them later 12106 while processing functions with potential target attributes. 
*/ 12107 target_option_default_node = target_option_current_node 12108 = build_target_option_node (&global_options); 12109 } 12110 12111 /* Implement targetm.override_options_after_change. */ 12112 12113 static void 12114 aarch64_override_options_after_change (void) 12115 { 12116 aarch64_override_options_after_change_1 (&global_options); 12117 } 12118 12119 static struct machine_function * 12120 aarch64_init_machine_status (void) 12121 { 12122 struct machine_function *machine; 12123 machine = ggc_cleared_alloc<machine_function> (); 12124 return machine; 12125 } 12126 12127 void 12128 aarch64_init_expanders (void) 12129 { 12130 init_machine_status = aarch64_init_machine_status; 12131 } 12132 12133 /* A checking mechanism for the implementation of the various code models. */ 12134 static void 12135 initialize_aarch64_code_model (struct gcc_options *opts) 12136 { 12137 if (opts->x_flag_pic) 12138 { 12139 switch (opts->x_aarch64_cmodel_var) 12140 { 12141 case AARCH64_CMODEL_TINY: 12142 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC; 12143 break; 12144 case AARCH64_CMODEL_SMALL: 12145 #ifdef HAVE_AS_SMALL_PIC_RELOCS 12146 aarch64_cmodel = (flag_pic == 2 12147 ? AARCH64_CMODEL_SMALL_PIC 12148 : AARCH64_CMODEL_SMALL_SPIC); 12149 #else 12150 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC; 12151 #endif 12152 break; 12153 case AARCH64_CMODEL_LARGE: 12154 sorry ("code model %qs with %<-f%s%>", "large", 12155 opts->x_flag_pic > 1 ? "PIC" : "pic"); 12156 break; 12157 default: 12158 gcc_unreachable (); 12159 } 12160 } 12161 else 12162 aarch64_cmodel = opts->x_aarch64_cmodel_var; 12163 } 12164 12165 /* Implement TARGET_OPTION_SAVE. */ 12166 12167 static void 12168 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts) 12169 { 12170 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string; 12171 ptr->x_aarch64_branch_protection_string 12172 = opts->x_aarch64_branch_protection_string; 12173 } 12174 12175 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions 12176 using the information saved in PTR. */ 12177 12178 static void 12179 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr) 12180 { 12181 opts->x_explicit_tune_core = ptr->x_explicit_tune_core; 12182 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); 12183 opts->x_explicit_arch = ptr->x_explicit_arch; 12184 selected_arch = aarch64_get_arch (ptr->x_explicit_arch); 12185 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string; 12186 opts->x_aarch64_branch_protection_string 12187 = ptr->x_aarch64_branch_protection_string; 12188 if (opts->x_aarch64_branch_protection_string) 12189 { 12190 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string, 12191 NULL); 12192 } 12193 12194 aarch64_override_options_internal (opts); 12195 } 12196 12197 /* Implement TARGET_OPTION_PRINT. 
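With illustrative values, the dump produced below looks like "selected tune = cortex-a57" followed by "selected arch = armv8-a+crypto".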
*/ 12198 12199 static void 12200 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr) 12201 { 12202 const struct processor *cpu 12203 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core); 12204 unsigned long isa_flags = ptr->x_aarch64_isa_flags; 12205 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch); 12206 std::string extension 12207 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags); 12208 12209 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name); 12210 fprintf (file, "%*sselected arch = %s%s\n", indent, "", 12211 arch->name, extension.c_str ()); 12212 } 12213 12214 static GTY(()) tree aarch64_previous_fndecl; 12215 12216 void 12217 aarch64_reset_previous_fndecl (void) 12218 { 12219 aarch64_previous_fndecl = NULL; 12220 } 12221 12222 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE. 12223 Used by aarch64_set_current_function and aarch64_pragma_target_parse to 12224 make sure optab availability predicates are recomputed when necessary. */ 12225 12226 void 12227 aarch64_save_restore_target_globals (tree new_tree) 12228 { 12229 if (TREE_TARGET_GLOBALS (new_tree)) 12230 restore_target_globals (TREE_TARGET_GLOBALS (new_tree)); 12231 else if (new_tree == target_option_default_node) 12232 restore_target_globals (&default_target_globals); 12233 else 12234 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts (); 12235 } 12236 12237 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions 12238 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET 12239 of the function, if such exists. This function may be called multiple 12240 times on a single function so use aarch64_previous_fndecl to avoid 12241 setting up identical state. */ 12242 12243 static void 12244 aarch64_set_current_function (tree fndecl) 12245 { 12246 if (!fndecl || fndecl == aarch64_previous_fndecl) 12247 return; 12248 12249 tree old_tree = (aarch64_previous_fndecl 12250 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl) 12251 : NULL_TREE); 12252 12253 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); 12254 12255 /* If current function has no attributes but the previous one did, 12256 use the default node. */ 12257 if (!new_tree && old_tree) 12258 new_tree = target_option_default_node; 12259 12260 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to 12261 the default have been handled by aarch64_save_restore_target_globals from 12262 aarch64_pragma_target_parse. */ 12263 if (old_tree == new_tree) 12264 return; 12265 12266 aarch64_previous_fndecl = fndecl; 12267 12268 /* First set the target options. */ 12269 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree)); 12270 12271 aarch64_save_restore_target_globals (new_tree); 12272 } 12273 12274 /* Enum describing the various ways we can handle attributes. 12275 In many cases we can reuse the generic option handling machinery. */ 12276 12277 enum aarch64_attr_opt_type 12278 { 12279 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */ 12280 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */ 12281 aarch64_attr_enum, /* Attribute sets an enum variable. */ 12282 aarch64_attr_custom /* Attribute requires a custom handling function. */ 12283 }; 12284 12285 /* All the information needed to handle a target attribute. 12286 NAME is the name of the attribute. 12287 ATTR_TYPE specifies the type of behavior of the attribute as described 12288 in the definition of enum aarch64_attr_opt_type. 
12289 ALLOW_NEG is true if the attribute supports a "no-" form. 12290 HANDLER is the function that takes the attribute string as an argument 12291 It is needed only when the ATTR_TYPE is aarch64_attr_custom. 12292 OPT_NUM is the enum specifying the option that the attribute modifies. 12293 This is needed for attributes that mirror the behavior of a command-line 12294 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or 12295 aarch64_attr_enum. */ 12296 12297 struct aarch64_attribute_info 12298 { 12299 const char *name; 12300 enum aarch64_attr_opt_type attr_type; 12301 bool allow_neg; 12302 bool (*handler) (const char *); 12303 enum opt_code opt_num; 12304 }; 12305 12306 /* Handle the ARCH_STR argument to the arch= target attribute. */ 12307 12308 static bool 12309 aarch64_handle_attr_arch (const char *str) 12310 { 12311 const struct processor *tmp_arch = NULL; 12312 std::string invalid_extension; 12313 enum aarch64_parse_opt_result parse_res 12314 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension); 12315 12316 if (parse_res == AARCH64_PARSE_OK) 12317 { 12318 gcc_assert (tmp_arch); 12319 selected_arch = tmp_arch; 12320 explicit_arch = selected_arch->arch; 12321 return true; 12322 } 12323 12324 switch (parse_res) 12325 { 12326 case AARCH64_PARSE_MISSING_ARG: 12327 error ("missing name in %<target(\"arch=\")%> pragma or attribute"); 12328 break; 12329 case AARCH64_PARSE_INVALID_ARG: 12330 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str); 12331 aarch64_print_hint_for_arch (str); 12332 break; 12333 case AARCH64_PARSE_INVALID_FEATURE: 12334 error ("invalid feature modifier %s of value (\"%s\") in " 12335 "%<target()%> pragma or attribute", invalid_extension.c_str (), str); 12336 aarch64_print_hint_for_extensions (invalid_extension); 12337 break; 12338 default: 12339 gcc_unreachable (); 12340 } 12341 12342 return false; 12343 } 12344 12345 /* Handle the argument CPU_STR to the cpu= target attribute. */ 12346 12347 static bool 12348 aarch64_handle_attr_cpu (const char *str) 12349 { 12350 const struct processor *tmp_cpu = NULL; 12351 std::string invalid_extension; 12352 enum aarch64_parse_opt_result parse_res 12353 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension); 12354 12355 if (parse_res == AARCH64_PARSE_OK) 12356 { 12357 gcc_assert (tmp_cpu); 12358 selected_tune = tmp_cpu; 12359 explicit_tune_core = selected_tune->ident; 12360 12361 selected_arch = &all_architectures[tmp_cpu->arch]; 12362 explicit_arch = selected_arch->arch; 12363 return true; 12364 } 12365 12366 switch (parse_res) 12367 { 12368 case AARCH64_PARSE_MISSING_ARG: 12369 error ("missing name in %<target(\"cpu=\")%> pragma or attribute"); 12370 break; 12371 case AARCH64_PARSE_INVALID_ARG: 12372 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str); 12373 aarch64_print_hint_for_core (str); 12374 break; 12375 case AARCH64_PARSE_INVALID_FEATURE: 12376 error ("invalid feature modifier %s of value (\"%s\") in " 12377 "%<target()%> pragma or attribute", invalid_extension.c_str (), str); 12378 aarch64_print_hint_for_extensions (invalid_extension); 12379 break; 12380 default: 12381 gcc_unreachable (); 12382 } 12383 12384 return false; 12385 } 12386 12387 /* Handle the argument STR to the branch-protection= attribute. 
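For example, "standard" and "pac-ret+leaf" are accepted spellings, while an unrecognised protection type is reported through the INVALID_ARG error below together with the offending token.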
*/ 12388 12389 static bool 12390 aarch64_handle_attr_branch_protection (const char* str) 12391 { 12392 char *err_str = (char *) xmalloc (strlen (str)); 12393 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str, 12394 &err_str); 12395 bool success = false; 12396 switch (res) 12397 { 12398 case AARCH64_PARSE_MISSING_ARG: 12399 error ("missing argument to %<target(\"branch-protection=\")%> pragma or" 12400 " attribute"); 12401 break; 12402 case AARCH64_PARSE_INVALID_ARG: 12403 error ("invalid protection type (\"%s\") in %<target(\"branch-protection" 12404 "=\")%> pragma or attribute", err_str); 12405 break; 12406 case AARCH64_PARSE_OK: 12407 success = true; 12408 /* Fall through. */ 12409 case AARCH64_PARSE_INVALID_FEATURE: 12410 break; 12411 default: 12412 gcc_unreachable (); 12413 } 12414 free (err_str); 12415 return success; 12416 } 12417 12418 /* Handle the argument STR to the tune= target attribute. */ 12419 12420 static bool 12421 aarch64_handle_attr_tune (const char *str) 12422 { 12423 const struct processor *tmp_tune = NULL; 12424 enum aarch64_parse_opt_result parse_res 12425 = aarch64_parse_tune (str, &tmp_tune); 12426 12427 if (parse_res == AARCH64_PARSE_OK) 12428 { 12429 gcc_assert (tmp_tune); 12430 selected_tune = tmp_tune; 12431 explicit_tune_core = selected_tune->ident; 12432 return true; 12433 } 12434 12435 switch (parse_res) 12436 { 12437 case AARCH64_PARSE_INVALID_ARG: 12438 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str); 12439 aarch64_print_hint_for_core (str); 12440 break; 12441 default: 12442 gcc_unreachable (); 12443 } 12444 12445 return false; 12446 } 12447 12448 /* Parse an architecture extensions target attribute string specified in STR. 12449 For example "+fp+nosimd". Show any errors if needed. Return TRUE 12450 if successful. Update aarch64_isa_flags to reflect the ISA features 12451 modified. */ 12452 12453 static bool 12454 aarch64_handle_attr_isa_flags (char *str) 12455 { 12456 enum aarch64_parse_opt_result parse_res; 12457 unsigned long isa_flags = aarch64_isa_flags; 12458 12459 /* We allow "+nothing" in the beginning to clear out all architectural 12460 features if the user wants to handpick specific features. */ 12461 if (strncmp ("+nothing", str, 8) == 0) 12462 { 12463 isa_flags = 0; 12464 str += 8; 12465 } 12466 12467 std::string invalid_extension; 12468 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension); 12469 12470 if (parse_res == AARCH64_PARSE_OK) 12471 { 12472 aarch64_isa_flags = isa_flags; 12473 return true; 12474 } 12475 12476 switch (parse_res) 12477 { 12478 case AARCH64_PARSE_MISSING_ARG: 12479 error ("missing value in %<target()%> pragma or attribute"); 12480 break; 12481 12482 case AARCH64_PARSE_INVALID_FEATURE: 12483 error ("invalid feature modifier %s of value (\"%s\") in " 12484 "%<target()%> pragma or attribute", invalid_extension.c_str (), str); 12485 break; 12486 12487 default: 12488 gcc_unreachable (); 12489 } 12490 12491 return false; 12492 } 12493 12494 /* The target attributes that we support. On top of these we also support just 12495 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is 12496 handled explicitly in aarch64_process_one_target_attr. 
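For example, __attribute__ ((target ("arch=armv8.2-a+fp16,tune=cortex-a72"))) is dispatched through the "arch" and "tune" entries in the table below, whereas a bare "+crc" string is handled before the table is ever consulted.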
*/ 12497 12498 static const struct aarch64_attribute_info aarch64_attributes[] = 12499 { 12500 { "general-regs-only", aarch64_attr_mask, false, NULL, 12501 OPT_mgeneral_regs_only }, 12502 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL, 12503 OPT_mfix_cortex_a53_835769 }, 12504 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL, 12505 OPT_mfix_cortex_a53_843419 }, 12506 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ }, 12507 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align }, 12508 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL, 12509 OPT_momit_leaf_frame_pointer }, 12510 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ }, 12511 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch, 12512 OPT_march_ }, 12513 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ }, 12514 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune, 12515 OPT_mtune_ }, 12516 { "branch-protection", aarch64_attr_custom, false, 12517 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ }, 12518 { "sign-return-address", aarch64_attr_enum, false, NULL, 12519 OPT_msign_return_address_ }, 12520 { NULL, aarch64_attr_custom, false, NULL, OPT____ } 12521 }; 12522 12523 /* Parse ARG_STR which contains the definition of one target attribute. 12524 Show appropriate errors if any or return true if the attribute is valid. */ 12525 12526 static bool 12527 aarch64_process_one_target_attr (char *arg_str) 12528 { 12529 bool invert = false; 12530 12531 size_t len = strlen (arg_str); 12532 12533 if (len == 0) 12534 { 12535 error ("malformed %<target()%> pragma or attribute"); 12536 return false; 12537 } 12538 12539 char *str_to_check = (char *) alloca (len + 1); 12540 strcpy (str_to_check, arg_str); 12541 12542 /* Skip leading whitespace. */ 12543 while (*str_to_check == ' ' || *str_to_check == '\t') 12544 str_to_check++; 12545 12546 /* We have something like __attribute__ ((target ("+fp+nosimd"))). 12547 It is easier to detect and handle it explicitly here rather than going 12548 through the machinery for the rest of the target attributes in this 12549 function. */ 12550 if (*str_to_check == '+') 12551 return aarch64_handle_attr_isa_flags (str_to_check); 12552 12553 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0) 12554 { 12555 invert = true; 12556 str_to_check += 3; 12557 } 12558 char *arg = strchr (str_to_check, '='); 12559 12560 /* If we found opt=foo then terminate STR_TO_CHECK at the '=' 12561 and point ARG to "foo". */ 12562 if (arg) 12563 { 12564 *arg = '\0'; 12565 arg++; 12566 } 12567 const struct aarch64_attribute_info *p_attr; 12568 bool found = false; 12569 for (p_attr = aarch64_attributes; p_attr->name; p_attr++) 12570 { 12571 /* If the names don't match up, or the user has given an argument 12572 to an attribute that doesn't accept one, or didn't give an argument 12573 to an attribute that expects one, fail to match. */ 12574 if (strcmp (str_to_check, p_attr->name) != 0) 12575 continue; 12576 12577 found = true; 12578 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom 12579 || p_attr->attr_type == aarch64_attr_enum; 12580 12581 if (attr_need_arg_p ^ (arg != NULL)) 12582 { 12583 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check); 12584 return false; 12585 } 12586 12587 /* If the name matches but the attribute does not allow "no-" versions 12588 then we can't match. 
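For example, "no-omit-leaf-frame-pointer" is accepted because that entry sets ALLOW_NEG, whereas "no-cmodel" is rejected here since "cmodel" does not allow a negated form.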
*/ 12589 if (invert && !p_attr->allow_neg) 12590 { 12591 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check); 12592 return false; 12593 } 12594 12595 switch (p_attr->attr_type) 12596 { 12597 /* Has a custom handler registered. 12598 For example, cpu=, arch=, tune=. */ 12599 case aarch64_attr_custom: 12600 gcc_assert (p_attr->handler); 12601 if (!p_attr->handler (arg)) 12602 return false; 12603 break; 12604 12605 /* Either set or unset a boolean option. */ 12606 case aarch64_attr_bool: 12607 { 12608 struct cl_decoded_option decoded; 12609 12610 generate_option (p_attr->opt_num, NULL, !invert, 12611 CL_TARGET, &decoded); 12612 aarch64_handle_option (&global_options, &global_options_set, 12613 &decoded, input_location); 12614 break; 12615 } 12616 /* Set or unset a bit in the target_flags. aarch64_handle_option 12617 should know what mask to apply given the option number. */ 12618 case aarch64_attr_mask: 12619 { 12620 struct cl_decoded_option decoded; 12621 /* We only need to specify the option number. 12622 aarch64_handle_option will know which mask to apply. */ 12623 decoded.opt_index = p_attr->opt_num; 12624 decoded.value = !invert; 12625 aarch64_handle_option (&global_options, &global_options_set, 12626 &decoded, input_location); 12627 break; 12628 } 12629 /* Use the option setting machinery to set an option to an enum. */ 12630 case aarch64_attr_enum: 12631 { 12632 gcc_assert (arg); 12633 bool valid; 12634 int value; 12635 valid = opt_enum_arg_to_value (p_attr->opt_num, arg, 12636 &value, CL_TARGET); 12637 if (valid) 12638 { 12639 set_option (&global_options, NULL, p_attr->opt_num, value, 12640 NULL, DK_UNSPECIFIED, input_location, 12641 global_dc); 12642 } 12643 else 12644 { 12645 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg); 12646 } 12647 break; 12648 } 12649 default: 12650 gcc_unreachable (); 12651 } 12652 } 12653 12654 /* If we reached here we either have found an attribute and validated 12655 it or didn't match any. If we matched an attribute but its arguments 12656 were malformed we will have returned false already. */ 12657 return found; 12658 } 12659 12660 /* Count how many times the character C appears in 12661 NULL-terminated string STR. */ 12662 12663 static unsigned int 12664 num_occurences_in_str (char c, char *str) 12665 { 12666 unsigned int res = 0; 12667 while (*str != '\0') 12668 { 12669 if (*str == c) 12670 res++; 12671 12672 str++; 12673 } 12674 12675 return res; 12676 } 12677 12678 /* Parse the tree in ARGS that contains the target attribute information 12679 and update the global target options space. */ 12680 12681 bool 12682 aarch64_process_target_attr (tree args) 12683 { 12684 if (TREE_CODE (args) == TREE_LIST) 12685 { 12686 do 12687 { 12688 tree head = TREE_VALUE (args); 12689 if (head) 12690 { 12691 if (!aarch64_process_target_attr (head)) 12692 return false; 12693 } 12694 args = TREE_CHAIN (args); 12695 } while (args); 12696 12697 return true; 12698 } 12699 12700 if (TREE_CODE (args) != STRING_CST) 12701 { 12702 error ("attribute %<target%> argument not a string"); 12703 return false; 12704 } 12705 12706 size_t len = strlen (TREE_STRING_POINTER (args)); 12707 char *str_to_check = (char *) alloca (len + 1); 12708 strcpy (str_to_check, TREE_STRING_POINTER (args)); 12709 12710 if (len == 0) 12711 { 12712 error ("malformed %<target()%> pragma or attribute"); 12713 return false; 12714 } 12715 12716 /* Used to catch empty spaces between commas i.e. 
12717 attribute ((target ("attr1,,attr2"))). */ 12718 unsigned int num_commas = num_occurences_in_str (',', str_to_check); 12719 12720 /* Handle multiple target attributes separated by ','. */ 12721 char *token = strtok_r (str_to_check, ",", &str_to_check); 12722 12723 unsigned int num_attrs = 0; 12724 while (token) 12725 { 12726 num_attrs++; 12727 if (!aarch64_process_one_target_attr (token)) 12728 { 12729 error ("pragma or attribute %<target(\"%s\")%> is not valid", token); 12730 return false; 12731 } 12732 12733 token = strtok_r (NULL, ",", &str_to_check); 12734 } 12735 12736 if (num_attrs != num_commas + 1) 12737 { 12738 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args)); 12739 return false; 12740 } 12741 12742 return true; 12743 } 12744 12745 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to 12746 process attribute ((target ("..."))). */ 12747 12748 static bool 12749 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int) 12750 { 12751 struct cl_target_option cur_target; 12752 bool ret; 12753 tree old_optimize; 12754 tree new_target, new_optimize; 12755 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); 12756 12757 /* If what we're processing is the current pragma string then the 12758 target option node is already stored in target_option_current_node 12759 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid 12760 having to re-parse the string. This is especially useful to keep 12761 arm_neon.h compile times down since that header contains a lot 12762 of intrinsics enclosed in pragmas. */ 12763 if (!existing_target && args == current_target_pragma) 12764 { 12765 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node; 12766 return true; 12767 } 12768 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); 12769 12770 old_optimize = build_optimization_node (&global_options); 12771 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl); 12772 12773 /* If the function changed the optimization levels as well as setting 12774 target options, start with the optimizations specified. */ 12775 if (func_optimize && func_optimize != old_optimize) 12776 cl_optimization_restore (&global_options, 12777 TREE_OPTIMIZATION (func_optimize)); 12778 12779 /* Save the current target options to restore at the end. */ 12780 cl_target_option_save (&cur_target, &global_options); 12781 12782 /* If fndecl already has some target attributes applied to it, unpack 12783 them so that we add this attribute on top of them, rather than 12784 overwriting them. */ 12785 if (existing_target) 12786 { 12787 struct cl_target_option *existing_options 12788 = TREE_TARGET_OPTION (existing_target); 12789 12790 if (existing_options) 12791 cl_target_option_restore (&global_options, existing_options); 12792 } 12793 else 12794 cl_target_option_restore (&global_options, 12795 TREE_TARGET_OPTION (target_option_current_node)); 12796 12797 ret = aarch64_process_target_attr (args); 12798 12799 /* Set up any additional state. */ 12800 if (ret) 12801 { 12802 aarch64_override_options_internal (&global_options); 12803 /* Initialize SIMD builtins if we haven't already. 12804 Set current_target_pragma to NULL for the duration so that 12805 the builtin initialization code doesn't try to tag the functions 12806 being built with the attributes specified by any current pragma, thus 12807 going into an infinite recursion. 
*/ 12808 if (TARGET_SIMD) 12809 { 12810 tree saved_current_target_pragma = current_target_pragma; 12811 current_target_pragma = NULL; 12812 aarch64_init_simd_builtins (); 12813 current_target_pragma = saved_current_target_pragma; 12814 } 12815 new_target = build_target_option_node (&global_options); 12816 } 12817 else 12818 new_target = NULL; 12819 12820 new_optimize = build_optimization_node (&global_options); 12821 12822 if (fndecl && ret) 12823 { 12824 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target; 12825 12826 if (old_optimize != new_optimize) 12827 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize; 12828 } 12829 12830 cl_target_option_restore (&global_options, &cur_target); 12831 12832 if (old_optimize != new_optimize) 12833 cl_optimization_restore (&global_options, 12834 TREE_OPTIMIZATION (old_optimize)); 12835 return ret; 12836 } 12837 12838 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are 12839 tri-bool options (yes, no, don't care) and the default value is 12840 DEF, determine whether to reject inlining. */ 12841 12842 static bool 12843 aarch64_tribools_ok_for_inlining_p (int caller, int callee, 12844 int dont_care, int def) 12845 { 12846 /* If the callee doesn't care, always allow inlining. */ 12847 if (callee == dont_care) 12848 return true; 12849 12850 /* If the caller doesn't care, always allow inlining. */ 12851 if (caller == dont_care) 12852 return true; 12853 12854 /* Otherwise, allow inlining if either the callee and caller values 12855 agree, or if the callee is using the default value. */ 12856 return (callee == caller || callee == def); 12857 } 12858 12859 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid 12860 to inline CALLEE into CALLER based on target-specific info. 12861 Make sure that the caller and callee have compatible architectural 12862 features. Then go through the other possible target attributes 12863 and see if they can block inlining. Try not to reject always_inline 12864 callees unless they are incompatible architecturally. */ 12865 12866 static bool 12867 aarch64_can_inline_p (tree caller, tree callee) 12868 { 12869 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller); 12870 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee); 12871 12872 struct cl_target_option *caller_opts 12873 = TREE_TARGET_OPTION (caller_tree ? caller_tree 12874 : target_option_default_node); 12875 12876 struct cl_target_option *callee_opts 12877 = TREE_TARGET_OPTION (callee_tree ? callee_tree 12878 : target_option_default_node); 12879 12880 /* Callee's ISA flags should be a subset of the caller's. */ 12881 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags) 12882 != callee_opts->x_aarch64_isa_flags) 12883 return false; 12884 12885 /* Allow non-strict aligned functions inlining into strict 12886 aligned ones. */ 12887 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags) 12888 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)) 12889 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags) 12890 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags))) 12891 return false; 12892 12893 bool always_inline = lookup_attribute ("always_inline", 12894 DECL_ATTRIBUTES (callee)); 12895 12896 /* If the architectural features match up and the callee is always_inline 12897 then the other attributes don't matter. 
*/ 12898 if (always_inline) 12899 return true; 12900 12901 if (caller_opts->x_aarch64_cmodel_var 12902 != callee_opts->x_aarch64_cmodel_var) 12903 return false; 12904 12905 if (caller_opts->x_aarch64_tls_dialect 12906 != callee_opts->x_aarch64_tls_dialect) 12907 return false; 12908 12909 /* Honour explicit requests to work around errata. */ 12910 if (!aarch64_tribools_ok_for_inlining_p ( 12911 caller_opts->x_aarch64_fix_a53_err835769, 12912 callee_opts->x_aarch64_fix_a53_err835769, 12913 2, TARGET_FIX_ERR_A53_835769_DEFAULT)) 12914 return false; 12915 12916 if (!aarch64_tribools_ok_for_inlining_p ( 12917 caller_opts->x_aarch64_fix_a53_err843419, 12918 callee_opts->x_aarch64_fix_a53_err843419, 12919 2, TARGET_FIX_ERR_A53_843419)) 12920 return false; 12921 12922 /* If the user explicitly specified -momit-leaf-frame-pointer for the 12923 caller and callee and they don't match up, reject inlining. */ 12924 if (!aarch64_tribools_ok_for_inlining_p ( 12925 caller_opts->x_flag_omit_leaf_frame_pointer, 12926 callee_opts->x_flag_omit_leaf_frame_pointer, 12927 2, 1)) 12928 return false; 12929 12930 /* If the callee has specific tuning overrides, respect them. */ 12931 if (callee_opts->x_aarch64_override_tune_string != NULL 12932 && caller_opts->x_aarch64_override_tune_string == NULL) 12933 return false; 12934 12935 /* If the user specified tuning override strings for the 12936 caller and callee and they don't match up, reject inlining. 12937 We just do a string compare here; we don't analyze the meaning 12938 of the string, as it would be too costly for little gain. */ 12939 if (callee_opts->x_aarch64_override_tune_string 12940 && caller_opts->x_aarch64_override_tune_string 12941 && (strcmp (callee_opts->x_aarch64_override_tune_string, 12942 caller_opts->x_aarch64_override_tune_string) != 0)) 12943 return false; 12944 12945 return true; 12946 } 12947 12948 /* Return true if SYMBOL_REF X binds locally. */ 12949 12950 static bool 12951 aarch64_symbol_binds_local_p (const_rtx x) 12952 { 12953 return (SYMBOL_REF_DECL (x) 12954 ? targetm.binds_local_p (SYMBOL_REF_DECL (x)) 12955 : SYMBOL_REF_LOCAL_P (x)); 12956 } 12957 12958 /* Return true if SYMBOL_REF X is thread local. */ 12959 static bool 12960 aarch64_tls_symbol_p (rtx x) 12961 { 12962 if (! TARGET_HAVE_TLS) 12963 return false; 12964 12965 if (GET_CODE (x) != SYMBOL_REF) 12966 return false; 12967 12968 return SYMBOL_REF_TLS_MODEL (x) != 0; 12969 } 12970 12971 /* Classify a TLS symbol into one of the TLS kinds. */ 12972 enum aarch64_symbol_type 12973 aarch64_classify_tls_symbol (rtx x) 12974 { 12975 enum tls_model tls_kind = tls_symbolic_operand_type (x); 12976 12977 switch (tls_kind) 12978 { 12979 case TLS_MODEL_GLOBAL_DYNAMIC: 12980 case TLS_MODEL_LOCAL_DYNAMIC: 12981 return TARGET_TLS_DESC ?
SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD; 12982 12983 case TLS_MODEL_INITIAL_EXEC: 12984 switch (aarch64_cmodel) 12985 { 12986 case AARCH64_CMODEL_TINY: 12987 case AARCH64_CMODEL_TINY_PIC: 12988 return SYMBOL_TINY_TLSIE; 12989 default: 12990 return SYMBOL_SMALL_TLSIE; 12991 } 12992 12993 case TLS_MODEL_LOCAL_EXEC: 12994 if (aarch64_tls_size == 12) 12995 return SYMBOL_TLSLE12; 12996 else if (aarch64_tls_size == 24) 12997 return SYMBOL_TLSLE24; 12998 else if (aarch64_tls_size == 32) 12999 return SYMBOL_TLSLE32; 13000 else if (aarch64_tls_size == 48) 13001 return SYMBOL_TLSLE48; 13002 else 13003 gcc_unreachable (); 13004 13005 case TLS_MODEL_EMULATED: 13006 case TLS_MODEL_NONE: 13007 return SYMBOL_FORCE_TO_MEM; 13008 13009 default: 13010 gcc_unreachable (); 13011 } 13012 } 13013 13014 /* Return the correct method for accessing X + OFFSET, where X is either 13015 a SYMBOL_REF or LABEL_REF. */ 13016 13017 enum aarch64_symbol_type 13018 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset) 13019 { 13020 if (GET_CODE (x) == LABEL_REF) 13021 { 13022 switch (aarch64_cmodel) 13023 { 13024 case AARCH64_CMODEL_LARGE: 13025 return SYMBOL_FORCE_TO_MEM; 13026 13027 case AARCH64_CMODEL_TINY_PIC: 13028 case AARCH64_CMODEL_TINY: 13029 return SYMBOL_TINY_ABSOLUTE; 13030 13031 case AARCH64_CMODEL_SMALL_SPIC: 13032 case AARCH64_CMODEL_SMALL_PIC: 13033 case AARCH64_CMODEL_SMALL: 13034 return SYMBOL_SMALL_ABSOLUTE; 13035 13036 default: 13037 gcc_unreachable (); 13038 } 13039 } 13040 13041 if (GET_CODE (x) == SYMBOL_REF) 13042 { 13043 if (aarch64_tls_symbol_p (x)) 13044 return aarch64_classify_tls_symbol (x); 13045 13046 switch (aarch64_cmodel) 13047 { 13048 case AARCH64_CMODEL_TINY: 13049 /* When we retrieve symbol + offset address, we have to make sure 13050 the offset does not cause overflow of the final address. But 13051 we have no way of knowing the address of symbol at compile time 13052 so we can't accurately say if the distance between the PC and 13053 symbol + offset is outside the addressible range of +/-1M in the 13054 TINY code model. So we rely on images not being greater than 13055 1M and cap the offset at 1M and anything beyond 1M will have to 13056 be loaded using an alternative mechanism. Furthermore if the 13057 symbol is a weak reference to something that isn't known to 13058 resolve to a symbol in this module, then force to memory. */ 13059 if ((SYMBOL_REF_WEAK (x) 13060 && !aarch64_symbol_binds_local_p (x)) 13061 || !IN_RANGE (offset, -1048575, 1048575)) 13062 return SYMBOL_FORCE_TO_MEM; 13063 return SYMBOL_TINY_ABSOLUTE; 13064 13065 case AARCH64_CMODEL_SMALL: 13066 /* Same reasoning as the tiny code model, but the offset cap here is 13067 4G. */ 13068 if ((SYMBOL_REF_WEAK (x) 13069 && !aarch64_symbol_binds_local_p (x)) 13070 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263), 13071 HOST_WIDE_INT_C (4294967264))) 13072 return SYMBOL_FORCE_TO_MEM; 13073 return SYMBOL_SMALL_ABSOLUTE; 13074 13075 case AARCH64_CMODEL_TINY_PIC: 13076 if (!aarch64_symbol_binds_local_p (x)) 13077 return SYMBOL_TINY_GOT; 13078 return SYMBOL_TINY_ABSOLUTE; 13079 13080 case AARCH64_CMODEL_SMALL_SPIC: 13081 case AARCH64_CMODEL_SMALL_PIC: 13082 if (!aarch64_symbol_binds_local_p (x)) 13083 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC 13084 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G); 13085 return SYMBOL_SMALL_ABSOLUTE; 13086 13087 case AARCH64_CMODEL_LARGE: 13088 /* This is alright even in PIC code as the constant 13089 pool reference is always PC relative and within 13090 the same translation unit. 
*/ 13091 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x)) 13092 return SYMBOL_SMALL_ABSOLUTE; 13093 else 13094 return SYMBOL_FORCE_TO_MEM; 13095 13096 default: 13097 gcc_unreachable (); 13098 } 13099 } 13100 13101 /* By default push everything into the constant pool. */ 13102 return SYMBOL_FORCE_TO_MEM; 13103 } 13104 13105 bool 13106 aarch64_constant_address_p (rtx x) 13107 { 13108 return (CONSTANT_P (x) && memory_address_p (DImode, x)); 13109 } 13110 13111 bool 13112 aarch64_legitimate_pic_operand_p (rtx x) 13113 { 13114 if (GET_CODE (x) == SYMBOL_REF 13115 || (GET_CODE (x) == CONST 13116 && GET_CODE (XEXP (x, 0)) == PLUS 13117 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)) 13118 return false; 13119 13120 return true; 13121 } 13122 13123 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants 13124 that should be rematerialized rather than spilled. */ 13125 13126 static bool 13127 aarch64_legitimate_constant_p (machine_mode mode, rtx x) 13128 { 13129 /* Support CSE and rematerialization of common constants. */ 13130 if (CONST_INT_P (x) 13131 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT) 13132 || GET_CODE (x) == CONST_VECTOR) 13133 return true; 13134 13135 /* Do not allow vector struct mode constants for Advanced SIMD. 13136 We could support 0 and -1 easily, but they need support in 13137 aarch64-simd.md. */ 13138 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 13139 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT)) 13140 return false; 13141 13142 /* Only accept variable-length vector constants if they can be 13143 handled directly. 13144 13145 ??? It would be possible to handle rematerialization of other 13146 constants via secondary reloads. */ 13147 if (vec_flags & VEC_ANY_SVE) 13148 return aarch64_simd_valid_immediate (x, NULL); 13149 13150 if (GET_CODE (x) == HIGH) 13151 x = XEXP (x, 0); 13152 13153 /* Accept polynomial constants that can be calculated by using the 13154 destination of a move as the sole temporary. Constants that 13155 require a second temporary cannot be rematerialized (they can't be 13156 forced to memory and also aren't legitimate constants). */ 13157 poly_int64 offset; 13158 if (poly_int_rtx_p (x, &offset)) 13159 return aarch64_offset_temporaries (false, offset) <= 1; 13160 13161 /* If an offset is being added to something else, we need to allow the 13162 base to be moved into the destination register, meaning that there 13163 are no free temporaries for the offset. */ 13164 x = strip_offset (x, &offset); 13165 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0) 13166 return false; 13167 13168 /* Do not allow const (plus (anchor_symbol, const_int)). */ 13169 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x)) 13170 return false; 13171 13172 /* Treat symbols as constants. Avoid TLS symbols as they are complex, 13173 so spilling them is better than rematerialization. */ 13174 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x)) 13175 return true; 13176 13177 /* Label references are always constant. */ 13178 if (GET_CODE (x) == LABEL_REF) 13179 return true; 13180 13181 return false; 13182 } 13183 13184 rtx 13185 aarch64_load_tp (rtx target) 13186 { 13187 if (!target 13188 || GET_MODE (target) != Pmode 13189 || !register_operand (target, Pmode)) 13190 target = gen_reg_rtx (Pmode); 13191 13192 /* Can return in any reg. 
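   The insn generated by gen_aarch64_load_tp_hard reads the thread
   pointer system register (TPIDR_EL0), so the result is an ordinary
   Pmode value; an illustrative call is simply

     rtx tp = aarch64_load_tp (NULL_RTX);   // thread pointer in a fresh pseudo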
*/ 13193 emit_insn (gen_aarch64_load_tp_hard (target)); 13194 return target; 13195 } 13196 13197 /* On AAPCS systems, this is the "struct __va_list". */ 13198 static GTY(()) tree va_list_type; 13199 13200 /* Implement TARGET_BUILD_BUILTIN_VA_LIST. 13201 Return the type to use as __builtin_va_list. 13202 13203 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as: 13204 13205 struct __va_list 13206 { 13207 void *__stack; 13208 void *__gr_top; 13209 void *__vr_top; 13210 int __gr_offs; 13211 int __vr_offs; 13212 }; */ 13213 13214 static tree 13215 aarch64_build_builtin_va_list (void) 13216 { 13217 tree va_list_name; 13218 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff; 13219 13220 /* Create the type. */ 13221 va_list_type = lang_hooks.types.make_type (RECORD_TYPE); 13222 /* Give it the required name. */ 13223 va_list_name = build_decl (BUILTINS_LOCATION, 13224 TYPE_DECL, 13225 get_identifier ("__va_list"), 13226 va_list_type); 13227 DECL_ARTIFICIAL (va_list_name) = 1; 13228 TYPE_NAME (va_list_type) = va_list_name; 13229 TYPE_STUB_DECL (va_list_type) = va_list_name; 13230 13231 /* Create the fields. */ 13232 f_stack = build_decl (BUILTINS_LOCATION, 13233 FIELD_DECL, get_identifier ("__stack"), 13234 ptr_type_node); 13235 f_grtop = build_decl (BUILTINS_LOCATION, 13236 FIELD_DECL, get_identifier ("__gr_top"), 13237 ptr_type_node); 13238 f_vrtop = build_decl (BUILTINS_LOCATION, 13239 FIELD_DECL, get_identifier ("__vr_top"), 13240 ptr_type_node); 13241 f_groff = build_decl (BUILTINS_LOCATION, 13242 FIELD_DECL, get_identifier ("__gr_offs"), 13243 integer_type_node); 13244 f_vroff = build_decl (BUILTINS_LOCATION, 13245 FIELD_DECL, get_identifier ("__vr_offs"), 13246 integer_type_node); 13247 13248 /* Tell tree-stdarg pass about our internal offset fields. 13249 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision 13250 purpose to identify whether the code is updating va_list internal 13251 offset fields through irregular way. */ 13252 va_list_gpr_counter_field = f_groff; 13253 va_list_fpr_counter_field = f_vroff; 13254 13255 DECL_ARTIFICIAL (f_stack) = 1; 13256 DECL_ARTIFICIAL (f_grtop) = 1; 13257 DECL_ARTIFICIAL (f_vrtop) = 1; 13258 DECL_ARTIFICIAL (f_groff) = 1; 13259 DECL_ARTIFICIAL (f_vroff) = 1; 13260 13261 DECL_FIELD_CONTEXT (f_stack) = va_list_type; 13262 DECL_FIELD_CONTEXT (f_grtop) = va_list_type; 13263 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type; 13264 DECL_FIELD_CONTEXT (f_groff) = va_list_type; 13265 DECL_FIELD_CONTEXT (f_vroff) = va_list_type; 13266 13267 TYPE_FIELDS (va_list_type) = f_stack; 13268 DECL_CHAIN (f_stack) = f_grtop; 13269 DECL_CHAIN (f_grtop) = f_vrtop; 13270 DECL_CHAIN (f_vrtop) = f_groff; 13271 DECL_CHAIN (f_groff) = f_vroff; 13272 13273 /* Compute its layout. */ 13274 layout_type (va_list_type); 13275 13276 return va_list_type; 13277 } 13278 13279 /* Implement TARGET_EXPAND_BUILTIN_VA_START. 
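   Illustrative example of the initialization performed below (numbers
   assume the default, uncapped va_list sizes): if the named arguments
   of a variadic callee consume three of the eight GP argument registers
   and one of the eight FP/SIMD argument registers, va_start sets

     __gr_offs = -(8 - 3) * 8  = -40
     __vr_offs = -(8 - 1) * 16 = -112

   so that __gr_top + __gr_offs and __vr_top + __vr_offs address the
   save slots of the first unnamed GP and FP/SIMD registers, while
   __stack points at the first anonymous argument passed on the stack.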
*/ 13280 static void 13281 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED) 13282 { 13283 const CUMULATIVE_ARGS *cum; 13284 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff; 13285 tree stack, grtop, vrtop, groff, vroff; 13286 tree t; 13287 int gr_save_area_size = cfun->va_list_gpr_size; 13288 int vr_save_area_size = cfun->va_list_fpr_size; 13289 int vr_offset; 13290 13291 cum = &crtl->args.info; 13292 if (cfun->va_list_gpr_size) 13293 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD, 13294 cfun->va_list_gpr_size); 13295 if (cfun->va_list_fpr_size) 13296 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn) 13297 * UNITS_PER_VREG, cfun->va_list_fpr_size); 13298 13299 if (!TARGET_FLOAT) 13300 { 13301 gcc_assert (cum->aapcs_nvrn == 0); 13302 vr_save_area_size = 0; 13303 } 13304 13305 f_stack = TYPE_FIELDS (va_list_type_node); 13306 f_grtop = DECL_CHAIN (f_stack); 13307 f_vrtop = DECL_CHAIN (f_grtop); 13308 f_groff = DECL_CHAIN (f_vrtop); 13309 f_vroff = DECL_CHAIN (f_groff); 13310 13311 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack, 13312 NULL_TREE); 13313 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop, 13314 NULL_TREE); 13315 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop, 13316 NULL_TREE); 13317 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff, 13318 NULL_TREE); 13319 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff, 13320 NULL_TREE); 13321 13322 /* Emit code to initialize STACK, which points to the next varargs stack 13323 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used 13324 by named arguments. STACK is 8-byte aligned. */ 13325 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx); 13326 if (cum->aapcs_stack_size > 0) 13327 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD); 13328 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t); 13329 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 13330 13331 /* Emit code to initialize GRTOP, the top of the GR save area. 13332 virtual_incoming_args_rtx should have been 16 byte aligned. */ 13333 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx); 13334 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t); 13335 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 13336 13337 /* Emit code to initialize VRTOP, the top of the VR save area. 13338 This address is gr_save_area_bytes below GRTOP, rounded 13339 down to the next 16-byte boundary. */ 13340 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx); 13341 vr_offset = ROUND_UP (gr_save_area_size, 13342 STACK_BOUNDARY / BITS_PER_UNIT); 13343 13344 if (vr_offset) 13345 t = fold_build_pointer_plus_hwi (t, -vr_offset); 13346 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t); 13347 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 13348 13349 /* Emit code to initialize GROFF, the offset from GRTOP of the 13350 next GPR argument. */ 13351 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff, 13352 build_int_cst (TREE_TYPE (groff), -gr_save_area_size)); 13353 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 13354 13355 /* Likewise emit code to initialize VROFF, the offset from FTOP 13356 of the next VR argument. */ 13357 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff, 13358 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size)); 13359 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL); 13360 } 13361 13362 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. 
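   The GIMPLE built below corresponds roughly to the following sketch
   (GP case shown; the FP/SIMD case uses __vr_top, __vr_offs and
   16-byte register slots):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;
     ap.__gr_offs = off + rsize;      // after rounding OFF up if needed
     if (ap.__gr_offs > 0)
       goto on_stack;
     addr = ap.__gr_top + off;        // plus any big-endian adjustment
     goto done;
   on_stack:
     addr = ap.__stack;               // plus any 16-byte realignment
     ap.__stack = (ap.__stack + size + 7) & -8;
   done:
     result = *(type *) addr;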
*/ 13363 13364 static tree 13365 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p, 13366 gimple_seq *post_p ATTRIBUTE_UNUSED) 13367 { 13368 tree addr; 13369 bool indirect_p; 13370 bool is_ha; /* is HFA or HVA. */ 13371 bool dw_align; /* double-word align. */ 13372 machine_mode ag_mode = VOIDmode; 13373 int nregs; 13374 machine_mode mode; 13375 13376 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff; 13377 tree stack, f_top, f_off, off, arg, roundup, on_stack; 13378 HOST_WIDE_INT size, rsize, adjust, align; 13379 tree t, u, cond1, cond2; 13380 13381 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false); 13382 if (indirect_p) 13383 type = build_pointer_type (type); 13384 13385 mode = TYPE_MODE (type); 13386 13387 f_stack = TYPE_FIELDS (va_list_type_node); 13388 f_grtop = DECL_CHAIN (f_stack); 13389 f_vrtop = DECL_CHAIN (f_grtop); 13390 f_groff = DECL_CHAIN (f_vrtop); 13391 f_vroff = DECL_CHAIN (f_groff); 13392 13393 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist), 13394 f_stack, NULL_TREE); 13395 size = int_size_in_bytes (type); 13396 13397 bool abi_break; 13398 align 13399 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT; 13400 13401 dw_align = false; 13402 adjust = 0; 13403 if (aarch64_vfp_is_call_or_return_candidate (mode, 13404 type, 13405 &ag_mode, 13406 &nregs, 13407 &is_ha)) 13408 { 13409 /* No frontends can create types with variable-sized modes, so we 13410 shouldn't be asked to pass or return them. */ 13411 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant (); 13412 13413 /* TYPE passed in fp/simd registers. */ 13414 if (!TARGET_FLOAT) 13415 aarch64_err_no_fpadvsimd (mode); 13416 13417 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), 13418 unshare_expr (valist), f_vrtop, NULL_TREE); 13419 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), 13420 unshare_expr (valist), f_vroff, NULL_TREE); 13421 13422 rsize = nregs * UNITS_PER_VREG; 13423 13424 if (is_ha) 13425 { 13426 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG) 13427 adjust = UNITS_PER_VREG - ag_size; 13428 } 13429 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD 13430 && size < UNITS_PER_VREG) 13431 { 13432 adjust = UNITS_PER_VREG - size; 13433 } 13434 } 13435 else 13436 { 13437 /* TYPE passed in general registers. */ 13438 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), 13439 unshare_expr (valist), f_grtop, NULL_TREE); 13440 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff), 13441 unshare_expr (valist), f_groff, NULL_TREE); 13442 rsize = ROUND_UP (size, UNITS_PER_WORD); 13443 nregs = rsize / UNITS_PER_WORD; 13444 13445 if (align > 8) 13446 { 13447 if (abi_break && warn_psabi) 13448 inform (input_location, "parameter passing for argument of type " 13449 "%qT changed in GCC 9.1", type); 13450 dw_align = true; 13451 } 13452 13453 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD 13454 && size < UNITS_PER_WORD) 13455 { 13456 adjust = UNITS_PER_WORD - size; 13457 } 13458 } 13459 13460 /* Get a local temporary for the field value. */ 13461 off = get_initialized_tmp_var (f_off, pre_p, NULL); 13462 13463 /* Emit code to branch if off >= 0. */ 13464 t = build2 (GE_EXPR, boolean_type_node, off, 13465 build_int_cst (TREE_TYPE (off), 0)); 13466 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE); 13467 13468 if (dw_align) 13469 { 13470 /* Emit: offs = (offs + 15) & -16. 
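   For instance, an offset of -24 becomes (-24 + 15) & -16 = -16, i.e.
   the (negative) register offset is rounded up to the next 16-byte
   boundary so that a 16-byte-aligned composite starts on an even
   general register.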
*/ 13471 t = build2 (PLUS_EXPR, TREE_TYPE (off), off, 13472 build_int_cst (TREE_TYPE (off), 15)); 13473 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t, 13474 build_int_cst (TREE_TYPE (off), -16)); 13475 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t); 13476 } 13477 else 13478 roundup = NULL; 13479 13480 /* Update ap.__[g|v]r_offs */ 13481 t = build2 (PLUS_EXPR, TREE_TYPE (off), off, 13482 build_int_cst (TREE_TYPE (off), rsize)); 13483 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t); 13484 13485 /* String up. */ 13486 if (roundup) 13487 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t); 13488 13489 /* [cond2] if (ap.__[g|v]r_offs > 0) */ 13490 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off), 13491 build_int_cst (TREE_TYPE (f_off), 0)); 13492 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE); 13493 13494 /* String up: make sure the assignment happens before the use. */ 13495 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2); 13496 COND_EXPR_ELSE (cond1) = t; 13497 13498 /* Prepare the trees handling the argument that is passed on the stack; 13499 the top level node will store in ON_STACK. */ 13500 arg = get_initialized_tmp_var (stack, pre_p, NULL); 13501 if (align > 8) 13502 { 13503 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */ 13504 t = fold_build_pointer_plus_hwi (arg, 15); 13505 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, 13506 build_int_cst (TREE_TYPE (t), -16)); 13507 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t); 13508 } 13509 else 13510 roundup = NULL; 13511 /* Advance ap.__stack */ 13512 t = fold_build_pointer_plus_hwi (arg, size + 7); 13513 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t, 13514 build_int_cst (TREE_TYPE (t), -8)); 13515 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t); 13516 /* String up roundup and advance. */ 13517 if (roundup) 13518 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t); 13519 /* String up with arg */ 13520 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg); 13521 /* Big-endianness related address adjustment. */ 13522 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD 13523 && size < UNITS_PER_WORD) 13524 { 13525 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg, 13526 size_int (UNITS_PER_WORD - size)); 13527 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t); 13528 } 13529 13530 COND_EXPR_THEN (cond1) = unshare_expr (on_stack); 13531 COND_EXPR_THEN (cond2) = unshare_expr (on_stack); 13532 13533 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */ 13534 t = off; 13535 if (adjust) 13536 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off, 13537 build_int_cst (TREE_TYPE (off), adjust)); 13538 13539 t = fold_convert (sizetype, t); 13540 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t); 13541 13542 if (is_ha) 13543 { 13544 /* type ha; // treat as "struct {ftype field[n];}" 13545 ... [computing offs] 13546 for (i = 0; i <nregs; ++i, offs += 16) 13547 ha.field[i] = *((ftype *)(ap.__vr_top + offs)); 13548 return ha; */ 13549 int i; 13550 tree tmp_ha, field_t, field_ptr_t; 13551 13552 /* Declare a local variable. */ 13553 tmp_ha = create_tmp_var_raw (type, "ha"); 13554 gimple_add_tmp_var (tmp_ha); 13555 13556 /* Establish the base type. 
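   For illustration, an HFA of three floats reaches this point with
   AG_MODE == SFmode and NREGS == 3, so FIELD_T becomes float_type_node
   and FIELD_PTR_T float_ptr_type_node.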
*/ 13557 switch (ag_mode) 13558 { 13559 case E_SFmode: 13560 field_t = float_type_node; 13561 field_ptr_t = float_ptr_type_node; 13562 break; 13563 case E_DFmode: 13564 field_t = double_type_node; 13565 field_ptr_t = double_ptr_type_node; 13566 break; 13567 case E_TFmode: 13568 field_t = long_double_type_node; 13569 field_ptr_t = long_double_ptr_type_node; 13570 break; 13571 case E_HFmode: 13572 field_t = aarch64_fp16_type_node; 13573 field_ptr_t = aarch64_fp16_ptr_type_node; 13574 break; 13575 case E_V2SImode: 13576 case E_V4SImode: 13577 { 13578 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode)); 13579 field_t = build_vector_type_for_mode (innertype, ag_mode); 13580 field_ptr_t = build_pointer_type (field_t); 13581 } 13582 break; 13583 default: 13584 gcc_assert (0); 13585 } 13586 13587 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */ 13588 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha); 13589 addr = t; 13590 t = fold_convert (field_ptr_t, addr); 13591 t = build2 (MODIFY_EXPR, field_t, 13592 build1 (INDIRECT_REF, field_t, tmp_ha), 13593 build1 (INDIRECT_REF, field_t, t)); 13594 13595 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */ 13596 for (i = 1; i < nregs; ++i) 13597 { 13598 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG); 13599 u = fold_convert (field_ptr_t, addr); 13600 u = build2 (MODIFY_EXPR, field_t, 13601 build2 (MEM_REF, field_t, tmp_ha, 13602 build_int_cst (field_ptr_t, 13603 (i * 13604 int_size_in_bytes (field_t)))), 13605 build1 (INDIRECT_REF, field_t, u)); 13606 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u); 13607 } 13608 13609 u = fold_convert (TREE_TYPE (f_top), tmp_ha); 13610 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u); 13611 } 13612 13613 COND_EXPR_ELSE (cond2) = t; 13614 addr = fold_convert (build_pointer_type (type), cond1); 13615 addr = build_va_arg_indirect_ref (addr); 13616 13617 if (indirect_p) 13618 addr = build_va_arg_indirect_ref (addr); 13619 13620 return addr; 13621 } 13622 13623 /* Implement TARGET_SETUP_INCOMING_VARARGS. */ 13624 13625 static void 13626 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode, 13627 tree type, int *pretend_size ATTRIBUTE_UNUSED, 13628 int no_rtl) 13629 { 13630 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v); 13631 CUMULATIVE_ARGS local_cum; 13632 int gr_saved = cfun->va_list_gpr_size; 13633 int vr_saved = cfun->va_list_fpr_size; 13634 13635 /* The caller has advanced CUM up to, but not beyond, the last named 13636 argument. Advance a local copy of CUM past the last "real" named 13637 argument, to find out how many registers are left over. */ 13638 local_cum = *cum; 13639 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true); 13640 13641 /* Found out how many registers we need to save. 13642 Honor tree-stdvar analysis results. */ 13643 if (cfun->va_list_gpr_size) 13644 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn, 13645 cfun->va_list_gpr_size / UNITS_PER_WORD); 13646 if (cfun->va_list_fpr_size) 13647 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn, 13648 cfun->va_list_fpr_size / UNITS_PER_VREG); 13649 13650 if (!TARGET_FLOAT) 13651 { 13652 gcc_assert (local_cum.aapcs_nvrn == 0); 13653 vr_saved = 0; 13654 } 13655 13656 if (!no_rtl) 13657 { 13658 if (gr_saved > 0) 13659 { 13660 rtx ptr, mem; 13661 13662 /* virtual_incoming_args_rtx should have been 16-byte aligned. 
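   Illustrative layout (offsets relative to virtual_incoming_args_rtx,
   assuming gr_saved == 5 and vr_saved == 2):

     bytes -40 .. -1    five saved X registers, 8 bytes each
     bytes -48 .. -41   padding from rounding 40 up to 16
     bytes -80 .. -49   two saved V registers, 16 bytes each

   which matches the OFF computation below.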
*/ 13663 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, 13664 - gr_saved * UNITS_PER_WORD); 13665 mem = gen_frame_mem (BLKmode, ptr); 13666 set_mem_alias_set (mem, get_varargs_alias_set ()); 13667 13668 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM, 13669 mem, gr_saved); 13670 } 13671 if (vr_saved > 0) 13672 { 13673 /* We can't use move_block_from_reg, because it will use 13674 the wrong mode, storing D regs only. */ 13675 machine_mode mode = TImode; 13676 int off, i, vr_start; 13677 13678 /* Set OFF to the offset from virtual_incoming_args_rtx of 13679 the first vector register. The VR save area lies below 13680 the GR one, and is aligned to 16 bytes. */ 13681 off = -ROUND_UP (gr_saved * UNITS_PER_WORD, 13682 STACK_BOUNDARY / BITS_PER_UNIT); 13683 off -= vr_saved * UNITS_PER_VREG; 13684 13685 vr_start = V0_REGNUM + local_cum.aapcs_nvrn; 13686 for (i = 0; i < vr_saved; ++i) 13687 { 13688 rtx ptr, mem; 13689 13690 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off); 13691 mem = gen_frame_mem (mode, ptr); 13692 set_mem_alias_set (mem, get_varargs_alias_set ()); 13693 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i)); 13694 off += UNITS_PER_VREG; 13695 } 13696 } 13697 } 13698 13699 /* We don't save the size into *PRETEND_SIZE because we want to avoid 13700 any complication of having crtl->args.pretend_args_size changed. */ 13701 cfun->machine->frame.saved_varargs_size 13702 = (ROUND_UP (gr_saved * UNITS_PER_WORD, 13703 STACK_BOUNDARY / BITS_PER_UNIT) 13704 + vr_saved * UNITS_PER_VREG); 13705 } 13706 13707 static void 13708 aarch64_conditional_register_usage (void) 13709 { 13710 int i; 13711 if (!TARGET_FLOAT) 13712 { 13713 for (i = V0_REGNUM; i <= V31_REGNUM; i++) 13714 { 13715 fixed_regs[i] = 1; 13716 call_used_regs[i] = 1; 13717 } 13718 } 13719 if (!TARGET_SVE) 13720 for (i = P0_REGNUM; i <= P15_REGNUM; i++) 13721 { 13722 fixed_regs[i] = 1; 13723 call_used_regs[i] = 1; 13724 } 13725 13726 /* When tracking speculation, we need a couple of call-clobbered registers 13727 to track the speculation state. It would be nice to just use 13728 IP0 and IP1, but currently there are numerous places that just 13729 assume these registers are free for other uses (eg pointer 13730 authentication). */ 13731 if (aarch64_track_speculation) 13732 { 13733 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1; 13734 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1; 13735 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1; 13736 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1; 13737 } 13738 } 13739 13740 /* Walk down the type tree of TYPE counting consecutive base elements. 13741 If *MODEP is VOIDmode, then set it to the first valid floating point 13742 type. If a non-floating point type is found, or if a floating point 13743 type that doesn't match a non-VOIDmode *MODEP is found, then return -1, 13744 otherwise return the count in the sub-tree. 
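   Illustrative results (the vector type name assumes arm_neon.h):

     struct { double d[4]; }        returns 4, *MODEP == DFmode
     struct { float f; double d; }  returns -1 (mixed base types)
     struct { float32x4_t a, b; }   returns 2, *MODEP == V4SImode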
*/ 13745 static int 13746 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep) 13747 { 13748 machine_mode mode; 13749 HOST_WIDE_INT size; 13750 13751 switch (TREE_CODE (type)) 13752 { 13753 case REAL_TYPE: 13754 mode = TYPE_MODE (type); 13755 if (mode != DFmode && mode != SFmode 13756 && mode != TFmode && mode != HFmode) 13757 return -1; 13758 13759 if (*modep == VOIDmode) 13760 *modep = mode; 13761 13762 if (*modep == mode) 13763 return 1; 13764 13765 break; 13766 13767 case COMPLEX_TYPE: 13768 mode = TYPE_MODE (TREE_TYPE (type)); 13769 if (mode != DFmode && mode != SFmode 13770 && mode != TFmode && mode != HFmode) 13771 return -1; 13772 13773 if (*modep == VOIDmode) 13774 *modep = mode; 13775 13776 if (*modep == mode) 13777 return 2; 13778 13779 break; 13780 13781 case VECTOR_TYPE: 13782 /* Use V2SImode and V4SImode as representatives of all 64-bit 13783 and 128-bit vector types. */ 13784 size = int_size_in_bytes (type); 13785 switch (size) 13786 { 13787 case 8: 13788 mode = V2SImode; 13789 break; 13790 case 16: 13791 mode = V4SImode; 13792 break; 13793 default: 13794 return -1; 13795 } 13796 13797 if (*modep == VOIDmode) 13798 *modep = mode; 13799 13800 /* Vector modes are considered to be opaque: two vectors are 13801 equivalent for the purposes of being homogeneous aggregates 13802 if they are the same size. */ 13803 if (*modep == mode) 13804 return 1; 13805 13806 break; 13807 13808 case ARRAY_TYPE: 13809 { 13810 int count; 13811 tree index = TYPE_DOMAIN (type); 13812 13813 /* Can't handle incomplete types nor sizes that are not 13814 fixed. */ 13815 if (!COMPLETE_TYPE_P (type) 13816 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) 13817 return -1; 13818 13819 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep); 13820 if (count == -1 13821 || !index 13822 || !TYPE_MAX_VALUE (index) 13823 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index)) 13824 || !TYPE_MIN_VALUE (index) 13825 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index)) 13826 || count < 0) 13827 return -1; 13828 13829 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index)) 13830 - tree_to_uhwi (TYPE_MIN_VALUE (index))); 13831 13832 /* There must be no padding. */ 13833 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)), 13834 count * GET_MODE_BITSIZE (*modep))) 13835 return -1; 13836 13837 return count; 13838 } 13839 13840 case RECORD_TYPE: 13841 { 13842 int count = 0; 13843 int sub_count; 13844 tree field; 13845 13846 /* Can't handle incomplete types nor sizes that are not 13847 fixed. */ 13848 if (!COMPLETE_TYPE_P (type) 13849 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) 13850 return -1; 13851 13852 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) 13853 { 13854 if (TREE_CODE (field) != FIELD_DECL) 13855 continue; 13856 13857 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep); 13858 if (sub_count < 0) 13859 return -1; 13860 count += sub_count; 13861 } 13862 13863 /* There must be no padding. */ 13864 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)), 13865 count * GET_MODE_BITSIZE (*modep))) 13866 return -1; 13867 13868 return count; 13869 } 13870 13871 case UNION_TYPE: 13872 case QUAL_UNION_TYPE: 13873 { 13874 /* These aren't very interesting except in a degenerate case. */ 13875 int count = 0; 13876 int sub_count; 13877 tree field; 13878 13879 /* Can't handle incomplete types nor sizes that are not 13880 fixed. 
*/ 13881 if (!COMPLETE_TYPE_P (type) 13882 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) 13883 return -1; 13884 13885 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field)) 13886 { 13887 if (TREE_CODE (field) != FIELD_DECL) 13888 continue; 13889 13890 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep); 13891 if (sub_count < 0) 13892 return -1; 13893 count = count > sub_count ? count : sub_count; 13894 } 13895 13896 /* There must be no padding. */ 13897 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)), 13898 count * GET_MODE_BITSIZE (*modep))) 13899 return -1; 13900 13901 return count; 13902 } 13903 13904 default: 13905 break; 13906 } 13907 13908 return -1; 13909 } 13910 13911 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector 13912 type as described in AAPCS64 \S 4.1.2. 13913 13914 See the comment above aarch64_composite_type_p for the notes on MODE. */ 13915 13916 static bool 13917 aarch64_short_vector_p (const_tree type, 13918 machine_mode mode) 13919 { 13920 poly_int64 size = -1; 13921 13922 if (type && TREE_CODE (type) == VECTOR_TYPE) 13923 size = int_size_in_bytes (type); 13924 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT 13925 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT) 13926 size = GET_MODE_SIZE (mode); 13927 13928 return known_eq (size, 8) || known_eq (size, 16); 13929 } 13930 13931 /* Return TRUE if the type, as described by TYPE and MODE, is a composite 13932 type as described in AAPCS64 \S 4.3. This includes aggregate, union and 13933 array types. The C99 floating-point complex types are also considered 13934 as composite types, according to AAPCS64 \S 7.1.1. The complex integer 13935 types, which are GCC extensions and out of the scope of AAPCS64, are 13936 treated as composite types here as well. 13937 13938 Note that MODE itself is not sufficient in determining whether a type 13939 is such a composite type or not. This is because 13940 stor-layout.c:compute_record_mode may have already changed the MODE 13941 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a 13942 structure with only one field may have its MODE set to the mode of the 13943 field. Also an integer mode whose size matches the size of the 13944 RECORD_TYPE type may be used to substitute the original mode 13945 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be 13946 solely relied on. */ 13947 13948 static bool 13949 aarch64_composite_type_p (const_tree type, 13950 machine_mode mode) 13951 { 13952 if (aarch64_short_vector_p (type, mode)) 13953 return false; 13954 13955 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE)) 13956 return true; 13957 13958 if (mode == BLKmode 13959 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT 13960 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT) 13961 return true; 13962 13963 return false; 13964 } 13965 13966 /* Return TRUE if an argument, whose type is described by TYPE and MODE, 13967 shall be passed or returned in simd/fp register(s) (providing these 13968 parameter passing registers are available). 13969 13970 Upon successful return, *COUNT returns the number of needed registers, 13971 *BASE_MODE returns the mode of the individual register and when IS_HA 13972 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous 13973 floating-point aggregate or a homogeneous short-vector aggregate.
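   Illustrative outcomes (the vector type name assumes arm_neon.h):

     double                   *COUNT == 1, *BASE_MODE == DFmode
     _Complex float           *COUNT == 2, *BASE_MODE == SFmode, *IS_HA
     struct { double d[3]; }  *COUNT == 3, *BASE_MODE == DFmode, *IS_HA
     float32x4_t              *COUNT == 1, *BASE_MODE == V4SFmode

   with the last case taken by the short-vector test rather than the
   homogeneous-aggregate walk.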
*/ 13974 13975 static bool 13976 aarch64_vfp_is_call_or_return_candidate (machine_mode mode, 13977 const_tree type, 13978 machine_mode *base_mode, 13979 int *count, 13980 bool *is_ha) 13981 { 13982 machine_mode new_mode = VOIDmode; 13983 bool composite_p = aarch64_composite_type_p (type, mode); 13984 13985 if (is_ha != NULL) *is_ha = false; 13986 13987 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT) 13988 || aarch64_short_vector_p (type, mode)) 13989 { 13990 *count = 1; 13991 new_mode = mode; 13992 } 13993 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT) 13994 { 13995 if (is_ha != NULL) *is_ha = true; 13996 *count = 2; 13997 new_mode = GET_MODE_INNER (mode); 13998 } 13999 else if (type && composite_p) 14000 { 14001 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode); 14002 14003 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS) 14004 { 14005 if (is_ha != NULL) *is_ha = true; 14006 *count = ag_count; 14007 } 14008 else 14009 return false; 14010 } 14011 else 14012 return false; 14013 14014 *base_mode = new_mode; 14015 return true; 14016 } 14017 14018 /* Implement TARGET_STRUCT_VALUE_RTX. */ 14019 14020 static rtx 14021 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED, 14022 int incoming ATTRIBUTE_UNUSED) 14023 { 14024 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM); 14025 } 14026 14027 /* Implements target hook vector_mode_supported_p. */ 14028 static bool 14029 aarch64_vector_mode_supported_p (machine_mode mode) 14030 { 14031 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 14032 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0; 14033 } 14034 14035 /* Return appropriate SIMD container 14036 for MODE within a vector of WIDTH bits. */ 14037 static machine_mode 14038 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width) 14039 { 14040 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR)) 14041 switch (mode) 14042 { 14043 case E_DFmode: 14044 return VNx2DFmode; 14045 case E_SFmode: 14046 return VNx4SFmode; 14047 case E_HFmode: 14048 return VNx8HFmode; 14049 case E_DImode: 14050 return VNx2DImode; 14051 case E_SImode: 14052 return VNx4SImode; 14053 case E_HImode: 14054 return VNx8HImode; 14055 case E_QImode: 14056 return VNx16QImode; 14057 default: 14058 return word_mode; 14059 } 14060 14061 gcc_assert (known_eq (width, 64) || known_eq (width, 128)); 14062 if (TARGET_SIMD) 14063 { 14064 if (known_eq (width, 128)) 14065 switch (mode) 14066 { 14067 case E_DFmode: 14068 return V2DFmode; 14069 case E_SFmode: 14070 return V4SFmode; 14071 case E_HFmode: 14072 return V8HFmode; 14073 case E_SImode: 14074 return V4SImode; 14075 case E_HImode: 14076 return V8HImode; 14077 case E_QImode: 14078 return V16QImode; 14079 case E_DImode: 14080 return V2DImode; 14081 default: 14082 break; 14083 } 14084 else 14085 switch (mode) 14086 { 14087 case E_SFmode: 14088 return V2SFmode; 14089 case E_HFmode: 14090 return V4HFmode; 14091 case E_SImode: 14092 return V2SImode; 14093 case E_HImode: 14094 return V4HImode; 14095 case E_QImode: 14096 return V8QImode; 14097 default: 14098 break; 14099 } 14100 } 14101 return word_mode; 14102 } 14103 14104 /* Return 128-bit container as the preferred SIMD mode for MODE. */ 14105 static machine_mode 14106 aarch64_preferred_simd_mode (scalar_mode mode) 14107 { 14108 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128; 14109 return aarch64_simd_container_mode (mode, bits); 14110 } 14111 14112 /* Return a list of possible vector sizes for the vectorizer 14113 to iterate over. 
*/ 14114 static void 14115 aarch64_autovectorize_vector_sizes (vector_sizes *sizes) 14116 { 14117 if (TARGET_SVE) 14118 sizes->safe_push (BYTES_PER_SVE_VECTOR); 14119 sizes->safe_push (16); 14120 sizes->safe_push (8); 14121 } 14122 14123 /* Implement TARGET_MANGLE_TYPE. */ 14124 14125 static const char * 14126 aarch64_mangle_type (const_tree type) 14127 { 14128 /* The AArch64 ABI documents say that "__va_list" has to be 14129 mangled as if it is in the "std" namespace. */ 14130 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type)) 14131 return "St9__va_list"; 14132 14133 /* Half-precision float. */ 14134 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16) 14135 return "Dh"; 14136 14137 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for 14138 builtin types. */ 14139 if (TYPE_NAME (type) != NULL) 14140 return aarch64_mangle_builtin_type (type); 14141 14142 /* Use the default mangling. */ 14143 return NULL; 14144 } 14145 14146 /* Find the first rtx_insn before insn that will generate an assembly 14147 instruction. */ 14148 14149 static rtx_insn * 14150 aarch64_prev_real_insn (rtx_insn *insn) 14151 { 14152 if (!insn) 14153 return NULL; 14154 14155 do 14156 { 14157 insn = prev_real_insn (insn); 14158 } 14159 while (insn && recog_memoized (insn) < 0); 14160 14161 return insn; 14162 } 14163 14164 static bool 14165 is_madd_op (enum attr_type t1) 14166 { 14167 unsigned int i; 14168 /* A number of these may be AArch32 only. */ 14169 enum attr_type mlatypes[] = { 14170 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD, 14171 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY, 14172 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD 14173 }; 14174 14175 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++) 14176 { 14177 if (t1 == mlatypes[i]) 14178 return true; 14179 } 14180 14181 return false; 14182 } 14183 14184 /* Check if there is a register dependency between a load and the insn 14185 for which we hold recog_data. */ 14186 14187 static bool 14188 dep_between_memop_and_curr (rtx memop) 14189 { 14190 rtx load_reg; 14191 int opno; 14192 14193 gcc_assert (GET_CODE (memop) == SET); 14194 14195 if (!REG_P (SET_DEST (memop))) 14196 return false; 14197 14198 load_reg = SET_DEST (memop); 14199 for (opno = 1; opno < recog_data.n_operands; opno++) 14200 { 14201 rtx operand = recog_data.operand[opno]; 14202 if (REG_P (operand) 14203 && reg_overlap_mentioned_p (load_reg, operand)) 14204 return true; 14205 14206 } 14207 return false; 14208 } 14209 14210 14211 /* When working around the Cortex-A53 erratum 835769, 14212 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate 14213 instruction and has a preceding memory instruction such that a NOP 14214 should be inserted between them. */ 14215 14216 bool 14217 aarch64_madd_needs_nop (rtx_insn* insn) 14218 { 14219 enum attr_type attr_type; 14220 rtx_insn *prev; 14221 rtx body; 14222 14223 if (!TARGET_FIX_ERR_A53_835769) 14224 return false; 14225 14226 if (!INSN_P (insn) || recog_memoized (insn) < 0) 14227 return false; 14228 14229 attr_type = get_attr_type (insn); 14230 if (!is_madd_op (attr_type)) 14231 return false; 14232 14233 prev = aarch64_prev_real_insn (insn); 14234 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN. 14235 Restore recog state to INSN to avoid state corruption. 
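   (For reference, the erratum concerns a 64-bit multiply-accumulate
   that directly follows a memory access, e.g.

      ldr  x0, [x2]
      madd x1, x3, x4, x5

   which is why aarch64_final_prescan_insn emits a NOP between the two
   when the workaround is enabled.)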
*/ 14236 extract_constrain_insn_cached (insn); 14237 14238 if (!prev || !contains_mem_rtx_p (PATTERN (prev))) 14239 return false; 14240 14241 body = single_set (prev); 14242 14243 /* If the previous insn is a memory op and there is no dependency between 14244 it and the DImode madd, emit a NOP between them. If body is NULL then we 14245 have a complex memory operation, probably a load/store pair. 14246 Be conservative for now and emit a NOP. */ 14247 if (GET_MODE (recog_data.operand[0]) == DImode 14248 && (!body || !dep_between_memop_and_curr (body))) 14249 return true; 14250 14251 return false; 14252 14253 } 14254 14255 14256 /* Implement FINAL_PRESCAN_INSN. */ 14257 14258 void 14259 aarch64_final_prescan_insn (rtx_insn *insn) 14260 { 14261 if (aarch64_madd_needs_nop (insn)) 14262 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n"); 14263 } 14264 14265 14266 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX 14267 instruction. */ 14268 14269 bool 14270 aarch64_sve_index_immediate_p (rtx base_or_step) 14271 { 14272 return (CONST_INT_P (base_or_step) 14273 && IN_RANGE (INTVAL (base_or_step), -16, 15)); 14274 } 14275 14276 /* Return true if X is a valid immediate for the SVE ADD and SUB 14277 instructions. Negate X first if NEGATE_P is true. */ 14278 14279 bool 14280 aarch64_sve_arith_immediate_p (rtx x, bool negate_p) 14281 { 14282 rtx elt; 14283 14284 if (!const_vec_duplicate_p (x, &elt) 14285 || !CONST_INT_P (elt)) 14286 return false; 14287 14288 HOST_WIDE_INT val = INTVAL (elt); 14289 if (negate_p) 14290 val = -val; 14291 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x))); 14292 14293 if (val & 0xff) 14294 return IN_RANGE (val, 0, 0xff); 14295 return IN_RANGE (val, 0, 0xff00); 14296 } 14297 14298 /* Return true if X is a valid immediate operand for an SVE logical 14299 instruction such as AND. */ 14300 14301 bool 14302 aarch64_sve_bitmask_immediate_p (rtx x) 14303 { 14304 rtx elt; 14305 14306 return (const_vec_duplicate_p (x, &elt) 14307 && CONST_INT_P (elt) 14308 && aarch64_bitmask_imm (INTVAL (elt), 14309 GET_MODE_INNER (GET_MODE (x)))); 14310 } 14311 14312 /* Return true if X is a valid immediate for the SVE DUP and CPY 14313 instructions. */ 14314 14315 bool 14316 aarch64_sve_dup_immediate_p (rtx x) 14317 { 14318 rtx elt; 14319 14320 if (!const_vec_duplicate_p (x, &elt) 14321 || !CONST_INT_P (elt)) 14322 return false; 14323 14324 HOST_WIDE_INT val = INTVAL (elt); 14325 if (val & 0xff) 14326 return IN_RANGE (val, -0x80, 0x7f); 14327 return IN_RANGE (val, -0x8000, 0x7f00); 14328 } 14329 14330 /* Return true if X is a valid immediate operand for an SVE CMP instruction. 14331 SIGNED_P says whether the operand is signed rather than unsigned. */ 14332 14333 bool 14334 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p) 14335 { 14336 rtx elt; 14337 14338 return (const_vec_duplicate_p (x, &elt) 14339 && CONST_INT_P (elt) 14340 && (signed_p 14341 ? IN_RANGE (INTVAL (elt), -16, 15) 14342 : IN_RANGE (INTVAL (elt), 0, 127))); 14343 } 14344 14345 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB 14346 instruction. Negate X first if NEGATE_P is true. 
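   In practice this accepts only the values 0.5 and 1.0 (negated first
   when NEGATE_P), matching the immediate forms of the SVE FADD and
   FSUB instructions.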
*/ 14347 14348 bool 14349 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p) 14350 { 14351 rtx elt; 14352 REAL_VALUE_TYPE r; 14353 14354 if (!const_vec_duplicate_p (x, &elt) 14355 || GET_CODE (elt) != CONST_DOUBLE) 14356 return false; 14357 14358 r = *CONST_DOUBLE_REAL_VALUE (elt); 14359 14360 if (negate_p) 14361 r = real_value_negate (&r); 14362 14363 if (real_equal (&r, &dconst1)) 14364 return true; 14365 if (real_equal (&r, &dconsthalf)) 14366 return true; 14367 return false; 14368 } 14369 14370 /* Return true if X is a valid immediate operand for an SVE FMUL 14371 instruction. */ 14372 14373 bool 14374 aarch64_sve_float_mul_immediate_p (rtx x) 14375 { 14376 rtx elt; 14377 14378 /* GCC will never generate a multiply with an immediate of 2, so there is no 14379 point testing for it (even though it is a valid constant). */ 14380 return (const_vec_duplicate_p (x, &elt) 14381 && GET_CODE (elt) == CONST_DOUBLE 14382 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)); 14383 } 14384 14385 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate 14386 for the Advanced SIMD operation described by WHICH and INSN. If INFO 14387 is nonnull, use it to describe valid immediates. */ 14388 static bool 14389 aarch64_advsimd_valid_immediate_hs (unsigned int val32, 14390 simd_immediate_info *info, 14391 enum simd_immediate_check which, 14392 simd_immediate_info::insn_type insn) 14393 { 14394 /* Try a 4-byte immediate with LSL. */ 14395 for (unsigned int shift = 0; shift < 32; shift += 8) 14396 if ((val32 & (0xff << shift)) == val32) 14397 { 14398 if (info) 14399 *info = simd_immediate_info (SImode, val32 >> shift, insn, 14400 simd_immediate_info::LSL, shift); 14401 return true; 14402 } 14403 14404 /* Try a 2-byte immediate with LSL. */ 14405 unsigned int imm16 = val32 & 0xffff; 14406 if (imm16 == (val32 >> 16)) 14407 for (unsigned int shift = 0; shift < 16; shift += 8) 14408 if ((imm16 & (0xff << shift)) == imm16) 14409 { 14410 if (info) 14411 *info = simd_immediate_info (HImode, imm16 >> shift, insn, 14412 simd_immediate_info::LSL, shift); 14413 return true; 14414 } 14415 14416 /* Try a 4-byte immediate with MSL, except for cases that MVN 14417 can handle. */ 14418 if (which == AARCH64_CHECK_MOV) 14419 for (unsigned int shift = 8; shift < 24; shift += 8) 14420 { 14421 unsigned int low = (1 << shift) - 1; 14422 if (((val32 & (0xff << shift)) | low) == val32) 14423 { 14424 if (info) 14425 *info = simd_immediate_info (SImode, val32 >> shift, insn, 14426 simd_immediate_info::MSL, shift); 14427 return true; 14428 } 14429 } 14430 14431 return false; 14432 } 14433 14434 /* Return true if replicating VAL64 is a valid immediate for the 14435 Advanced SIMD operation described by WHICH. If INFO is nonnull, 14436 use it to describe valid immediates. */ 14437 static bool 14438 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64, 14439 simd_immediate_info *info, 14440 enum simd_immediate_check which) 14441 { 14442 unsigned int val32 = val64 & 0xffffffff; 14443 unsigned int val16 = val64 & 0xffff; 14444 unsigned int val8 = val64 & 0xff; 14445 14446 if (val32 == (val64 >> 32)) 14447 { 14448 if ((which & AARCH64_CHECK_ORR) != 0 14449 && aarch64_advsimd_valid_immediate_hs (val32, info, which, 14450 simd_immediate_info::MOV)) 14451 return true; 14452 14453 if ((which & AARCH64_CHECK_BIC) != 0 14454 && aarch64_advsimd_valid_immediate_hs (~val32, info, which, 14455 simd_immediate_info::MVN)) 14456 return true; 14457 14458 /* Try using a replicated byte. 
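   For example, VAL64 == 0x2a2a2a2a2a2a2a2a satisfies
   val32 == val64 >> 32, val16 == val32 >> 16 and val8 == val16 >> 8,
   so it can be emitted as a MOVI of the byte 0x2a replicated across
   every lane.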
*/ 14459 if (which == AARCH64_CHECK_MOV 14460 && val16 == (val32 >> 16) 14461 && val8 == (val16 >> 8)) 14462 { 14463 if (info) 14464 *info = simd_immediate_info (QImode, val8); 14465 return true; 14466 } 14467 } 14468 14469 /* Try using a bit-to-bytemask. */ 14470 if (which == AARCH64_CHECK_MOV) 14471 { 14472 unsigned int i; 14473 for (i = 0; i < 64; i += 8) 14474 { 14475 unsigned char byte = (val64 >> i) & 0xff; 14476 if (byte != 0 && byte != 0xff) 14477 break; 14478 } 14479 if (i == 64) 14480 { 14481 if (info) 14482 *info = simd_immediate_info (DImode, val64); 14483 return true; 14484 } 14485 } 14486 return false; 14487 } 14488 14489 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV 14490 instruction. If INFO is nonnull, use it to describe valid immediates. */ 14491 14492 static bool 14493 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64, 14494 simd_immediate_info *info) 14495 { 14496 scalar_int_mode mode = DImode; 14497 unsigned int val32 = val64 & 0xffffffff; 14498 if (val32 == (val64 >> 32)) 14499 { 14500 mode = SImode; 14501 unsigned int val16 = val32 & 0xffff; 14502 if (val16 == (val32 >> 16)) 14503 { 14504 mode = HImode; 14505 unsigned int val8 = val16 & 0xff; 14506 if (val8 == (val16 >> 8)) 14507 mode = QImode; 14508 } 14509 } 14510 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode); 14511 if (IN_RANGE (val, -0x80, 0x7f)) 14512 { 14513 /* DUP with no shift. */ 14514 if (info) 14515 *info = simd_immediate_info (mode, val); 14516 return true; 14517 } 14518 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00)) 14519 { 14520 /* DUP with LSL #8. */ 14521 if (info) 14522 *info = simd_immediate_info (mode, val); 14523 return true; 14524 } 14525 if (aarch64_bitmask_imm (val64, mode)) 14526 { 14527 /* DUPM. */ 14528 if (info) 14529 *info = simd_immediate_info (mode, val); 14530 return true; 14531 } 14532 return false; 14533 } 14534 14535 /* Return true if OP is a valid SIMD immediate for the operation 14536 described by WHICH. If INFO is nonnull, use it to describe valid 14537 immediates. */ 14538 bool 14539 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info, 14540 enum simd_immediate_check which) 14541 { 14542 machine_mode mode = GET_MODE (op); 14543 unsigned int vec_flags = aarch64_classify_vector_mode (mode); 14544 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT)) 14545 return false; 14546 14547 scalar_mode elt_mode = GET_MODE_INNER (mode); 14548 rtx base, step; 14549 unsigned int n_elts; 14550 if (GET_CODE (op) == CONST_VECTOR 14551 && CONST_VECTOR_DUPLICATE_P (op)) 14552 n_elts = CONST_VECTOR_NPATTERNS (op); 14553 else if ((vec_flags & VEC_SVE_DATA) 14554 && const_vec_series_p (op, &base, &step)) 14555 { 14556 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT); 14557 if (!aarch64_sve_index_immediate_p (base) 14558 || !aarch64_sve_index_immediate_p (step)) 14559 return false; 14560 14561 if (info) 14562 *info = simd_immediate_info (elt_mode, base, step); 14563 return true; 14564 } 14565 else if (GET_CODE (op) == CONST_VECTOR 14566 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts)) 14567 /* N_ELTS set above. */; 14568 else 14569 return false; 14570 14571 /* Handle PFALSE and PTRUE. 
*/ 14572 if (vec_flags & VEC_SVE_PRED) 14573 return (op == CONST0_RTX (mode) 14574 || op == CONSTM1_RTX (mode)); 14575 14576 scalar_float_mode elt_float_mode; 14577 if (n_elts == 1 14578 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)) 14579 { 14580 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0); 14581 if (aarch64_float_const_zero_rtx_p (elt) 14582 || aarch64_float_const_representable_p (elt)) 14583 { 14584 if (info) 14585 *info = simd_immediate_info (elt_float_mode, elt); 14586 return true; 14587 } 14588 } 14589 14590 unsigned int elt_size = GET_MODE_SIZE (elt_mode); 14591 if (elt_size > 8) 14592 return false; 14593 14594 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require (); 14595 14596 /* Expand the vector constant out into a byte vector, with the least 14597 significant byte of the register first. */ 14598 auto_vec<unsigned char, 16> bytes; 14599 bytes.reserve (n_elts * elt_size); 14600 for (unsigned int i = 0; i < n_elts; i++) 14601 { 14602 /* The vector is provided in gcc endian-neutral fashion. 14603 For aarch64_be Advanced SIMD, it must be laid out in the vector 14604 register in reverse order. */ 14605 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN); 14606 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i); 14607 14608 if (elt_mode != elt_int_mode) 14609 elt = gen_lowpart (elt_int_mode, elt); 14610 14611 if (!CONST_INT_P (elt)) 14612 return false; 14613 14614 unsigned HOST_WIDE_INT elt_val = INTVAL (elt); 14615 for (unsigned int byte = 0; byte < elt_size; byte++) 14616 { 14617 bytes.quick_push (elt_val & 0xff); 14618 elt_val >>= BITS_PER_UNIT; 14619 } 14620 } 14621 14622 /* The immediate must repeat every eight bytes. */ 14623 unsigned int nbytes = bytes.length (); 14624 for (unsigned i = 8; i < nbytes; ++i) 14625 if (bytes[i] != bytes[i - 8]) 14626 return false; 14627 14628 /* Get the repeating 8-byte value as an integer. No endian correction 14629 is needed here because bytes is already in lsb-first order. */ 14630 unsigned HOST_WIDE_INT val64 = 0; 14631 for (unsigned int i = 0; i < 8; i++) 14632 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes] 14633 << (i * BITS_PER_UNIT)); 14634 14635 if (vec_flags & VEC_SVE_DATA) 14636 return aarch64_sve_valid_immediate (val64, info); 14637 else 14638 return aarch64_advsimd_valid_immediate (val64, info, which); 14639 } 14640 14641 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and 14642 has a step in the range of INDEX. Return the index expression if so, 14643 otherwise return null. */ 14644 rtx 14645 aarch64_check_zero_based_sve_index_immediate (rtx x) 14646 { 14647 rtx base, step; 14648 if (const_vec_series_p (x, &base, &step) 14649 && base == const0_rtx 14650 && aarch64_sve_index_immediate_p (step)) 14651 return step; 14652 return NULL_RTX; 14653 } 14654 14655 /* Check of immediate shift constants are within range. */ 14656 bool 14657 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left) 14658 { 14659 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT; 14660 if (left) 14661 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1); 14662 else 14663 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width); 14664 } 14665 14666 /* Return the bitmask CONST_INT to select the bits required by a zero extract 14667 operation of width WIDTH at bit position POS. 
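   For instance, WIDTH == 8 and POS == 16 give
   ((1 << 8) - 1) << 16 == 0xff0000, selecting bits 16 to 23.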
*/ 14668 14669 rtx 14670 aarch64_mask_from_zextract_ops (rtx width, rtx pos) 14671 { 14672 gcc_assert (CONST_INT_P (width)); 14673 gcc_assert (CONST_INT_P (pos)); 14674 14675 unsigned HOST_WIDE_INT mask 14676 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1; 14677 return GEN_INT (mask << UINTVAL (pos)); 14678 } 14679 14680 bool 14681 aarch64_mov_operand_p (rtx x, machine_mode mode) 14682 { 14683 if (GET_CODE (x) == HIGH 14684 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0)))) 14685 return true; 14686 14687 if (CONST_INT_P (x)) 14688 return true; 14689 14690 if (VECTOR_MODE_P (GET_MODE (x))) 14691 return aarch64_simd_valid_immediate (x, NULL); 14692 14693 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x)) 14694 return true; 14695 14696 if (aarch64_sve_cnt_immediate_p (x)) 14697 return true; 14698 14699 return aarch64_classify_symbolic_expression (x) 14700 == SYMBOL_TINY_ABSOLUTE; 14701 } 14702 14703 /* Return a const_int vector of VAL. */ 14704 rtx 14705 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val) 14706 { 14707 rtx c = gen_int_mode (val, GET_MODE_INNER (mode)); 14708 return gen_const_vec_duplicate (mode, c); 14709 } 14710 14711 /* Check OP is a legal scalar immediate for the MOVI instruction. */ 14712 14713 bool 14714 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode) 14715 { 14716 machine_mode vmode; 14717 14718 vmode = aarch64_simd_container_mode (mode, 64); 14719 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op)); 14720 return aarch64_simd_valid_immediate (op_v, NULL); 14721 } 14722 14723 /* Construct and return a PARALLEL RTX vector with elements numbering the 14724 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of 14725 the vector - from the perspective of the architecture. This does not 14726 line up with GCC's perspective on lane numbers, so we end up with 14727 different masks depending on our target endian-ness. The diagram 14728 below may help. We must draw the distinction when building masks 14729 which select one half of the vector. An instruction selecting 14730 architectural low-lanes for a big-endian target, must be described using 14731 a mask selecting GCC high-lanes. 14732 14733 Big-Endian Little-Endian 14734 14735 GCC 0 1 2 3 3 2 1 0 14736 | x | x | x | x | | x | x | x | x | 14737 Architecture 3 2 1 0 3 2 1 0 14738 14739 Low Mask: { 2, 3 } { 0, 1 } 14740 High Mask: { 0, 1 } { 2, 3 } 14741 14742 MODE Is the mode of the vector and NUNITS is the number of units in it. */ 14743 14744 rtx 14745 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high) 14746 { 14747 rtvec v = rtvec_alloc (nunits / 2); 14748 int high_base = nunits / 2; 14749 int low_base = 0; 14750 int base; 14751 rtx t1; 14752 int i; 14753 14754 if (BYTES_BIG_ENDIAN) 14755 base = high ? low_base : high_base; 14756 else 14757 base = high ? high_base : low_base; 14758 14759 for (i = 0; i < nunits / 2; i++) 14760 RTVEC_ELT (v, i) = GEN_INT (base + i); 14761 14762 t1 = gen_rtx_PARALLEL (mode, v); 14763 return t1; 14764 } 14765 14766 /* Check OP for validity as a PARALLEL RTX vector with elements 14767 numbering the lanes of either the high (HIGH == TRUE) or low lanes, 14768 from the perspective of the architecture. See the diagram above 14769 aarch64_simd_vect_par_cnst_half for more details. 
*/ 14770 14771 bool 14772 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode, 14773 bool high) 14774 { 14775 int nelts; 14776 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts)) 14777 return false; 14778 14779 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high); 14780 HOST_WIDE_INT count_op = XVECLEN (op, 0); 14781 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0); 14782 int i = 0; 14783 14784 if (count_op != count_ideal) 14785 return false; 14786 14787 for (i = 0; i < count_ideal; i++) 14788 { 14789 rtx elt_op = XVECEXP (op, 0, i); 14790 rtx elt_ideal = XVECEXP (ideal, 0, i); 14791 14792 if (!CONST_INT_P (elt_op) 14793 || INTVAL (elt_ideal) != INTVAL (elt_op)) 14794 return false; 14795 } 14796 return true; 14797 } 14798 14799 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and 14800 HIGH (exclusive). */ 14801 void 14802 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high, 14803 const_tree exp) 14804 { 14805 HOST_WIDE_INT lane; 14806 gcc_assert (CONST_INT_P (operand)); 14807 lane = INTVAL (operand); 14808 14809 if (lane < low || lane >= high) 14810 { 14811 if (exp) 14812 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1); 14813 else 14814 error ("lane %wd out of range %wd - %wd", lane, low, high - 1); 14815 } 14816 } 14817 14818 /* Peform endian correction on lane number N, which indexes a vector 14819 of mode MODE, and return the result as an SImode rtx. */ 14820 14821 rtx 14822 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n) 14823 { 14824 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode); 14825 } 14826 14827 /* Return TRUE if OP is a valid vector addressing mode. */ 14828 14829 bool 14830 aarch64_simd_mem_operand_p (rtx op) 14831 { 14832 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC 14833 || REG_P (XEXP (op, 0))); 14834 } 14835 14836 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */ 14837 14838 bool 14839 aarch64_sve_ld1r_operand_p (rtx op) 14840 { 14841 struct aarch64_address_info addr; 14842 scalar_mode mode; 14843 14844 return (MEM_P (op) 14845 && is_a <scalar_mode> (GET_MODE (op), &mode) 14846 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false) 14847 && addr.type == ADDRESS_REG_IMM 14848 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset)); 14849 } 14850 14851 /* Return true if OP is a valid MEM operand for an SVE LDR instruction. 14852 The conditions for STR are the same. */ 14853 bool 14854 aarch64_sve_ldr_operand_p (rtx op) 14855 { 14856 struct aarch64_address_info addr; 14857 14858 return (MEM_P (op) 14859 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), 14860 false, ADDR_QUERY_ANY) 14861 && addr.type == ADDRESS_REG_IMM); 14862 } 14863 14864 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode. 14865 We need to be able to access the individual pieces, so the range 14866 is different from LD[234] and ST[234]. 
*/ 14867 bool 14868 aarch64_sve_struct_memory_operand_p (rtx op) 14869 { 14870 if (!MEM_P (op)) 14871 return false; 14872 14873 machine_mode mode = GET_MODE (op); 14874 struct aarch64_address_info addr; 14875 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false, 14876 ADDR_QUERY_ANY) 14877 || addr.type != ADDRESS_REG_IMM) 14878 return false; 14879 14880 poly_int64 first = addr.const_offset; 14881 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR; 14882 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first) 14883 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last)); 14884 } 14885 14886 /* Emit a register copy from operand to operand, taking care not to 14887 early-clobber source registers in the process. 14888 14889 COUNT is the number of components into which the copy needs to be 14890 decomposed. */ 14891 void 14892 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode, 14893 unsigned int count) 14894 { 14895 unsigned int i; 14896 int rdest = REGNO (operands[0]); 14897 int rsrc = REGNO (operands[1]); 14898 14899 if (!reg_overlap_mentioned_p (operands[0], operands[1]) 14900 || rdest < rsrc) 14901 for (i = 0; i < count; i++) 14902 emit_move_insn (gen_rtx_REG (mode, rdest + i), 14903 gen_rtx_REG (mode, rsrc + i)); 14904 else 14905 for (i = 0; i < count; i++) 14906 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1), 14907 gen_rtx_REG (mode, rsrc + count - i - 1)); 14908 } 14909 14910 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is 14911 one of VSTRUCT modes: OI, CI, or XI. */ 14912 int 14913 aarch64_simd_attr_length_rglist (machine_mode mode) 14914 { 14915 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */ 14916 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4; 14917 } 14918 14919 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum 14920 alignment of a vector to 128 bits. SVE predicates have an alignment of 14921 16 bits. */ 14922 static HOST_WIDE_INT 14923 aarch64_simd_vector_alignment (const_tree type) 14924 { 14925 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST) 14926 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can 14927 be set for non-predicate vectors of booleans. Modes are the most 14928 direct way we have of identifying real SVE predicate types. */ 14929 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128; 14930 return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi (); 14931 } 14932 14933 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */ 14934 static poly_uint64 14935 aarch64_vectorize_preferred_vector_alignment (const_tree type) 14936 { 14937 if (aarch64_sve_data_mode_p (TYPE_MODE (type))) 14938 { 14939 /* If the length of the vector is fixed, try to align to that length, 14940 otherwise don't try to align at all. */ 14941 HOST_WIDE_INT result; 14942 if (!BITS_PER_SVE_VECTOR.is_constant (&result)) 14943 result = TYPE_ALIGN (TREE_TYPE (type)); 14944 return result; 14945 } 14946 return TYPE_ALIGN (type); 14947 } 14948 14949 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */ 14950 static bool 14951 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed) 14952 { 14953 if (is_packed) 14954 return false; 14955 14956 /* For fixed-length vectors, check that the vectorizer will aim for 14957 full-vector alignment. This isn't true for generic GCC vectors 14958 that are wider than the ABI maximum of 128 bits. 
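   For example, a generic 256-bit vector type normally only gets the ABI's
   preferred alignment of 128 bits, so its size and preferred alignment
   differ and we conservatively return false (illustrative).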
*/ 14959 poly_uint64 preferred_alignment = 14960 aarch64_vectorize_preferred_vector_alignment (type); 14961 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST 14962 && maybe_ne (wi::to_widest (TYPE_SIZE (type)), 14963 preferred_alignment)) 14964 return false; 14965 14966 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */ 14967 return true; 14968 } 14969 14970 /* Return true if the vector misalignment factor is supported by the 14971 target. */ 14972 static bool 14973 aarch64_builtin_support_vector_misalignment (machine_mode mode, 14974 const_tree type, int misalignment, 14975 bool is_packed) 14976 { 14977 if (TARGET_SIMD && STRICT_ALIGNMENT) 14978 { 14979 /* Return if movmisalign pattern is not supported for this mode. */ 14980 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing) 14981 return false; 14982 14983 /* Misalignment factor is unknown at compile time. */ 14984 if (misalignment == -1) 14985 return false; 14986 } 14987 return default_builtin_support_vector_misalignment (mode, type, misalignment, 14988 is_packed); 14989 } 14990 14991 /* If VALS is a vector constant that can be loaded into a register 14992 using DUP, generate instructions to do so and return an RTX to 14993 assign to the register. Otherwise return NULL_RTX. */ 14994 static rtx 14995 aarch64_simd_dup_constant (rtx vals) 14996 { 14997 machine_mode mode = GET_MODE (vals); 14998 machine_mode inner_mode = GET_MODE_INNER (mode); 14999 rtx x; 15000 15001 if (!const_vec_duplicate_p (vals, &x)) 15002 return NULL_RTX; 15003 15004 /* We can load this constant by using DUP and a constant in a 15005 single ARM register. This will be cheaper than a vector 15006 load. */ 15007 x = copy_to_mode_reg (inner_mode, x); 15008 return gen_vec_duplicate (mode, x); 15009 } 15010 15011 15012 /* Generate code to load VALS, which is a PARALLEL containing only 15013 constants (for vec_init) or CONST_VECTOR, efficiently into a 15014 register. Returns an RTX to copy into the register, or NULL_RTX 15015 for a PARALLEL that cannot be converted into a CONST_VECTOR. */ 15016 static rtx 15017 aarch64_simd_make_constant (rtx vals) 15018 { 15019 machine_mode mode = GET_MODE (vals); 15020 rtx const_dup; 15021 rtx const_vec = NULL_RTX; 15022 int n_const = 0; 15023 int i; 15024 15025 if (GET_CODE (vals) == CONST_VECTOR) 15026 const_vec = vals; 15027 else if (GET_CODE (vals) == PARALLEL) 15028 { 15029 /* A CONST_VECTOR must contain only CONST_INTs and 15030 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF). 15031 Only store valid constants in a CONST_VECTOR. */ 15032 int n_elts = XVECLEN (vals, 0); 15033 for (i = 0; i < n_elts; ++i) 15034 { 15035 rtx x = XVECEXP (vals, 0, i); 15036 if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) 15037 n_const++; 15038 } 15039 if (n_const == n_elts) 15040 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)); 15041 } 15042 else 15043 gcc_unreachable (); 15044 15045 if (const_vec != NULL_RTX 15046 && aarch64_simd_valid_immediate (const_vec, NULL)) 15047 /* Load using MOVI/MVNI. */ 15048 return const_vec; 15049 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX) 15050 /* Loaded using DUP. */ 15051 return const_dup; 15052 else if (const_vec != NULL_RTX) 15053 /* Load from constant pool. We cannot take advantage of single-cycle 15054 LD1 because we need a PC-relative addressing mode. */ 15055 return const_vec; 15056 else 15057 /* A PARALLEL containing something not valid inside CONST_VECTOR. 15058 We cannot construct an initializer. 
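   This happens when, for example, an element is a SYMBOL_REF: CONSTANT_P
   accepts it, but it is not valid inside a CONST_VECTOR.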
*/ 15059 return NULL_RTX; 15060 } 15061 15062 /* Expand a vector initialisation sequence, such that TARGET is 15063 initialised to contain VALS. */ 15064 15065 void 15066 aarch64_expand_vector_init (rtx target, rtx vals) 15067 { 15068 machine_mode mode = GET_MODE (target); 15069 scalar_mode inner_mode = GET_MODE_INNER (mode); 15070 /* The number of vector elements. */ 15071 int n_elts = XVECLEN (vals, 0); 15072 /* The number of vector elements which are not constant. */ 15073 int n_var = 0; 15074 rtx any_const = NULL_RTX; 15075 /* The first element of vals. */ 15076 rtx v0 = XVECEXP (vals, 0, 0); 15077 bool all_same = true; 15078 15079 /* Count the number of variable elements to initialise. */ 15080 for (int i = 0; i < n_elts; ++i) 15081 { 15082 rtx x = XVECEXP (vals, 0, i); 15083 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x))) 15084 ++n_var; 15085 else 15086 any_const = x; 15087 15088 all_same &= rtx_equal_p (x, v0); 15089 } 15090 15091 /* No variable elements, hand off to aarch64_simd_make_constant which knows 15092 how best to handle this. */ 15093 if (n_var == 0) 15094 { 15095 rtx constant = aarch64_simd_make_constant (vals); 15096 if (constant != NULL_RTX) 15097 { 15098 emit_move_insn (target, constant); 15099 return; 15100 } 15101 } 15102 15103 /* Splat a single non-constant element if we can. */ 15104 if (all_same) 15105 { 15106 rtx x = copy_to_mode_reg (inner_mode, v0); 15107 aarch64_emit_move (target, gen_vec_duplicate (mode, x)); 15108 return; 15109 } 15110 15111 enum insn_code icode = optab_handler (vec_set_optab, mode); 15112 gcc_assert (icode != CODE_FOR_nothing); 15113 15114 /* If there are only variable elements, try to optimize 15115 the insertion using dup for the most common element 15116 followed by insertions. */ 15117 15118 /* The algorithm will fill matches[*][0] with the earliest matching element, 15119 and matches[X][1] with the count of duplicate elements (if X is the 15120 earliest element which has duplicates). */ 15121 15122 if (n_var == n_elts && n_elts <= 16) 15123 { 15124 int matches[16][2] = {0}; 15125 for (int i = 0; i < n_elts; i++) 15126 { 15127 for (int j = 0; j <= i; j++) 15128 { 15129 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j))) 15130 { 15131 matches[i][0] = j; 15132 matches[j][1]++; 15133 break; 15134 } 15135 } 15136 } 15137 int maxelement = 0; 15138 int maxv = 0; 15139 for (int i = 0; i < n_elts; i++) 15140 if (matches[i][1] > maxv) 15141 { 15142 maxelement = i; 15143 maxv = matches[i][1]; 15144 } 15145 15146 /* Create a duplicate of the most common element, unless all elements 15147 are equally useless to us, in which case just immediately set the 15148 vector register using the first element. */ 15149 15150 if (maxv == 1) 15151 { 15152 /* For vectors of two 64-bit elements, we can do even better. */ 15153 if (n_elts == 2 15154 && (inner_mode == E_DImode 15155 || inner_mode == E_DFmode)) 15156 15157 { 15158 rtx x0 = XVECEXP (vals, 0, 0); 15159 rtx x1 = XVECEXP (vals, 0, 1); 15160 /* Combine can pick up this case, but handling it directly 15161 here leaves clearer RTL. 15162 15163 This is load_pair_lanes<mode>, and also gives us a clean-up 15164 for store_pair_lanes<mode>. 
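   For example, initialising a V2DF from two adjacent DFmode memory slots can
   then be done with one 128-bit load of the whole vector instead of a 64-bit
   load followed by a lane insert (illustrative).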
*/ 15165 if (memory_operand (x0, inner_mode) 15166 && memory_operand (x1, inner_mode) 15167 && !STRICT_ALIGNMENT 15168 && rtx_equal_p (XEXP (x1, 0), 15169 plus_constant (Pmode, 15170 XEXP (x0, 0), 15171 GET_MODE_SIZE (inner_mode)))) 15172 { 15173 rtx t; 15174 if (inner_mode == DFmode) 15175 t = gen_load_pair_lanesdf (target, x0, x1); 15176 else 15177 t = gen_load_pair_lanesdi (target, x0, x1); 15178 emit_insn (t); 15179 return; 15180 } 15181 } 15182 /* The subreg-move sequence below will move into lane zero of the 15183 vector register. For big-endian we want that position to hold 15184 the last element of VALS. */ 15185 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0; 15186 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement)); 15187 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode)); 15188 } 15189 else 15190 { 15191 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement)); 15192 aarch64_emit_move (target, gen_vec_duplicate (mode, x)); 15193 } 15194 15195 /* Insert the rest. */ 15196 for (int i = 0; i < n_elts; i++) 15197 { 15198 rtx x = XVECEXP (vals, 0, i); 15199 if (matches[i][0] == maxelement) 15200 continue; 15201 x = copy_to_mode_reg (inner_mode, x); 15202 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); 15203 } 15204 return; 15205 } 15206 15207 /* Initialise a vector which is part-variable. We want to first try 15208 to build those lanes which are constant in the most efficient way we 15209 can. */ 15210 if (n_var != n_elts) 15211 { 15212 rtx copy = copy_rtx (vals); 15213 15214 /* Load constant part of vector. We really don't care what goes into the 15215 parts we will overwrite, but we're more likely to be able to load the 15216 constant efficiently if it has fewer, larger, repeating parts 15217 (see aarch64_simd_valid_immediate). */ 15218 for (int i = 0; i < n_elts; i++) 15219 { 15220 rtx x = XVECEXP (vals, 0, i); 15221 if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) 15222 continue; 15223 rtx subst = any_const; 15224 for (int bit = n_elts / 2; bit > 0; bit /= 2) 15225 { 15226 /* Look in the copied vector, as more elements are const. */ 15227 rtx test = XVECEXP (copy, 0, i ^ bit); 15228 if (CONST_INT_P (test) || CONST_DOUBLE_P (test)) 15229 { 15230 subst = test; 15231 break; 15232 } 15233 } 15234 XVECEXP (copy, 0, i) = subst; 15235 } 15236 aarch64_expand_vector_init (target, copy); 15237 } 15238 15239 /* Insert the variable lanes directly. */ 15240 for (int i = 0; i < n_elts; i++) 15241 { 15242 rtx x = XVECEXP (vals, 0, i); 15243 if (CONST_INT_P (x) || CONST_DOUBLE_P (x)) 15244 continue; 15245 x = copy_to_mode_reg (inner_mode, x); 15246 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i))); 15247 } 15248 } 15249 15250 static unsigned HOST_WIDE_INT 15251 aarch64_shift_truncation_mask (machine_mode mode) 15252 { 15253 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode)) 15254 return 0; 15255 return GET_MODE_UNIT_BITSIZE (mode) - 1; 15256 } 15257 15258 /* Select a format to encode pointers in exception handling data. */ 15259 int 15260 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global) 15261 { 15262 int type; 15263 switch (aarch64_cmodel) 15264 { 15265 case AARCH64_CMODEL_TINY: 15266 case AARCH64_CMODEL_TINY_PIC: 15267 case AARCH64_CMODEL_SMALL: 15268 case AARCH64_CMODEL_SMALL_PIC: 15269 case AARCH64_CMODEL_SMALL_SPIC: 15270 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient 15271 for everything. */ 15272 type = DW_EH_PE_sdata4; 15273 break; 15274 default: 15275 /* No assumptions here. 
8-byte relocs required. */ 15276 type = DW_EH_PE_sdata8; 15277 break; 15278 } 15279 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type; 15280 } 15281 15282 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */ 15283 15284 static void 15285 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name) 15286 { 15287 if (aarch64_simd_decl_p (decl)) 15288 { 15289 fprintf (stream, "\t.variant_pcs\t"); 15290 assemble_name (stream, name); 15291 fprintf (stream, "\n"); 15292 } 15293 } 15294 15295 /* The last .arch and .tune assembly strings that we printed. */ 15296 static std::string aarch64_last_printed_arch_string; 15297 static std::string aarch64_last_printed_tune_string; 15298 15299 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used 15300 by the function fndecl. */ 15301 15302 void 15303 aarch64_declare_function_name (FILE *stream, const char* name, 15304 tree fndecl) 15305 { 15306 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl); 15307 15308 struct cl_target_option *targ_options; 15309 if (target_parts) 15310 targ_options = TREE_TARGET_OPTION (target_parts); 15311 else 15312 targ_options = TREE_TARGET_OPTION (target_option_current_node); 15313 gcc_assert (targ_options); 15314 15315 const struct processor *this_arch 15316 = aarch64_get_arch (targ_options->x_explicit_arch); 15317 15318 unsigned long isa_flags = targ_options->x_aarch64_isa_flags; 15319 std::string extension 15320 = aarch64_get_extension_string_for_isa_flags (isa_flags, 15321 this_arch->flags); 15322 /* Only update the assembler .arch string if it is distinct from the last 15323 such string we printed. */ 15324 std::string to_print = this_arch->name + extension; 15325 if (to_print != aarch64_last_printed_arch_string) 15326 { 15327 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ()); 15328 aarch64_last_printed_arch_string = to_print; 15329 } 15330 15331 /* Print the cpu name we're tuning for in the comments, might be 15332 useful to readers of the generated asm. Do it only when it changes 15333 from function to function and verbose assembly is requested. */ 15334 const struct processor *this_tune 15335 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core); 15336 15337 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name) 15338 { 15339 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n", 15340 this_tune->name); 15341 aarch64_last_printed_tune_string = this_tune->name; 15342 } 15343 15344 aarch64_asm_output_variant_pcs (stream, fndecl, name); 15345 15346 /* Don't forget the type directive for ELF. */ 15347 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function"); 15348 ASM_OUTPUT_LABEL (stream, name); 15349 15350 cfun->machine->label_is_assembled = true; 15351 } 15352 15353 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after 15354 the function label and emit a BTI if necessary. */ 15355 15356 void 15357 aarch64_print_patchable_function_entry (FILE *file, 15358 unsigned HOST_WIDE_INT patch_area_size, 15359 bool record_p) 15360 { 15361 if (cfun->machine->label_is_assembled 15362 && aarch64_bti_enabled () 15363 && !cgraph_node::get (cfun->decl)->only_called_directly_p ()) 15364 { 15365 /* Remove the BTI that follows the patch area and insert a new BTI 15366 before the patch area right after the function label. 
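   The intended layout is therefore label, BTI C, patch area, rather than
   label, patch area, BTI C, so that the instruction at the function label
   (the target of indirect branches) is a BTI.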
*/ 15367 rtx_insn *insn = next_real_nondebug_insn (get_insns ()); 15368 if (insn 15369 && INSN_P (insn) 15370 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE 15371 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C) 15372 delete_insn (insn); 15373 asm_fprintf (file, "\thint\t34 // bti c\n"); 15374 } 15375 15376 default_print_patchable_function_entry (file, patch_area_size, record_p); 15377 } 15378 15379 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */ 15380 15381 void 15382 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target) 15383 { 15384 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0); 15385 const char *value = IDENTIFIER_POINTER (target); 15386 aarch64_asm_output_variant_pcs (stream, decl, name); 15387 ASM_OUTPUT_DEF (stream, name, value); 15388 } 15389 15390 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined 15391 function symbol references. */ 15392 15393 void 15394 aarch64_asm_output_external (FILE *stream, tree decl, const char* name) 15395 { 15396 default_elf_asm_output_external (stream, decl, name); 15397 aarch64_asm_output_variant_pcs (stream, decl, name); 15398 } 15399 15400 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */ 15401 15402 static void 15403 aarch64_start_file (void) 15404 { 15405 struct cl_target_option *default_options 15406 = TREE_TARGET_OPTION (target_option_default_node); 15407 15408 const struct processor *default_arch 15409 = aarch64_get_arch (default_options->x_explicit_arch); 15410 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags; 15411 std::string extension 15412 = aarch64_get_extension_string_for_isa_flags (default_isa_flags, 15413 default_arch->flags); 15414 15415 aarch64_last_printed_arch_string = default_arch->name + extension; 15416 aarch64_last_printed_tune_string = ""; 15417 asm_fprintf (asm_out_file, "\t.arch %s\n", 15418 aarch64_last_printed_arch_string.c_str ()); 15419 15420 default_file_start (); 15421 } 15422 15423 /* Emit load exclusive. */ 15424 15425 static void 15426 aarch64_emit_load_exclusive (machine_mode mode, rtx rval, 15427 rtx mem, rtx model_rtx) 15428 { 15429 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx)); 15430 } 15431 15432 /* Emit store exclusive. */ 15433 15434 static void 15435 aarch64_emit_store_exclusive (machine_mode mode, rtx bval, 15436 rtx rval, rtx mem, rtx model_rtx) 15437 { 15438 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx)); 15439 } 15440 15441 /* Mark the previous jump instruction as unlikely. */ 15442 15443 static void 15444 aarch64_emit_unlikely_jump (rtx insn) 15445 { 15446 rtx_insn *jump = emit_jump_insn (insn); 15447 add_reg_br_prob_note (jump, profile_probability::very_unlikely ()); 15448 } 15449 15450 /* Expand a compare and swap pattern. */ 15451 15452 void 15453 aarch64_expand_compare_and_swap (rtx operands[]) 15454 { 15455 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg; 15456 machine_mode mode, r_mode; 15457 15458 bval = operands[0]; 15459 rval = operands[1]; 15460 mem = operands[2]; 15461 oldval = operands[3]; 15462 newval = operands[4]; 15463 is_weak = operands[5]; 15464 mod_s = operands[6]; 15465 mod_f = operands[7]; 15466 mode = GET_MODE (mem); 15467 15468 /* Normally the succ memory model must be stronger than fail, but in the 15469 unlikely event of fail being ACQUIRE and succ being RELEASE we need to 15470 promote succ to ACQ_REL so that we don't lose the acquire semantics. 
*/ 15471 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f))) 15472 && is_mm_release (memmodel_from_int (INTVAL (mod_s)))) 15473 mod_s = GEN_INT (MEMMODEL_ACQ_REL); 15474 15475 r_mode = mode; 15476 if (mode == QImode || mode == HImode) 15477 { 15478 r_mode = SImode; 15479 rval = gen_reg_rtx (r_mode); 15480 } 15481 15482 if (TARGET_LSE) 15483 { 15484 /* The CAS insn requires oldval and rval overlap, but we need to 15485 have a copy of oldval saved across the operation to tell if 15486 the operation is successful. */ 15487 if (reg_overlap_mentioned_p (rval, oldval)) 15488 rval = copy_to_mode_reg (r_mode, oldval); 15489 else 15490 emit_move_insn (rval, gen_lowpart (r_mode, oldval)); 15491 15492 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, 15493 newval, mod_s)); 15494 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); 15495 } 15496 else 15497 { 15498 /* The oldval predicate varies by mode. Test it and force to reg. */ 15499 insn_code code = code_for_aarch64_compare_and_swap (mode); 15500 if (!insn_data[code].operand[2].predicate (oldval, mode)) 15501 oldval = force_reg (mode, oldval); 15502 15503 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval, 15504 is_weak, mod_s, mod_f)); 15505 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM); 15506 } 15507 15508 if (r_mode != mode) 15509 rval = gen_lowpart (mode, rval); 15510 emit_move_insn (operands[1], rval); 15511 15512 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx); 15513 emit_insn (gen_rtx_SET (bval, x)); 15514 } 15515 15516 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a 15517 sequence implementing an atomic operation. */ 15518 15519 static void 15520 aarch64_emit_post_barrier (enum memmodel model) 15521 { 15522 const enum memmodel base_model = memmodel_base (model); 15523 15524 if (is_mm_sync (model) 15525 && (base_model == MEMMODEL_ACQUIRE 15526 || base_model == MEMMODEL_ACQ_REL 15527 || base_model == MEMMODEL_SEQ_CST)) 15528 { 15529 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST))); 15530 } 15531 } 15532 15533 /* Split a compare and swap pattern. */ 15534 15535 void 15536 aarch64_split_compare_and_swap (rtx operands[]) 15537 { 15538 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */ 15539 gcc_assert (epilogue_completed); 15540 15541 rtx rval, mem, oldval, newval, scratch; 15542 machine_mode mode; 15543 bool is_weak; 15544 rtx_code_label *label1, *label2; 15545 rtx x, cond; 15546 enum memmodel model; 15547 rtx model_rtx; 15548 15549 rval = operands[0]; 15550 mem = operands[1]; 15551 oldval = operands[2]; 15552 newval = operands[3]; 15553 is_weak = (operands[4] != const0_rtx); 15554 model_rtx = operands[5]; 15555 scratch = operands[7]; 15556 mode = GET_MODE (mem); 15557 model = memmodel_from_int (INTVAL (model_rtx)); 15558 15559 /* When OLDVAL is zero and we want the strong version we can emit a tighter 15560 loop: 15561 .label1: 15562 LD[A]XR rval, [mem] 15563 CBNZ rval, .label2 15564 ST[L]XR scratch, newval, [mem] 15565 CBNZ scratch, .label1 15566 .label2: 15567 CMP rval, 0. */ 15568 bool strong_zero_p = !is_weak && oldval == const0_rtx; 15569 15570 label1 = NULL; 15571 if (!is_weak) 15572 { 15573 label1 = gen_label_rtx (); 15574 emit_label (label1); 15575 } 15576 label2 = gen_label_rtx (); 15577 15578 /* The initial load can be relaxed for a __sync operation since a final 15579 barrier will be emitted to stop code hoisting. 
*/ 15580 if (is_mm_sync (model)) 15581 aarch64_emit_load_exclusive (mode, rval, mem, 15582 GEN_INT (MEMMODEL_RELAXED)); 15583 else 15584 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx); 15585 15586 if (strong_zero_p) 15587 { 15588 if (aarch64_track_speculation) 15589 { 15590 /* Emit an explicit compare instruction, so that we can correctly 15591 track the condition codes. */ 15592 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx); 15593 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx); 15594 } 15595 else 15596 x = gen_rtx_NE (VOIDmode, rval, const0_rtx); 15597 15598 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, 15599 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); 15600 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); 15601 } 15602 else 15603 { 15604 cond = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode); 15605 x = gen_rtx_NE (VOIDmode, cond, const0_rtx); 15606 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, 15607 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx); 15608 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); 15609 } 15610 15611 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx); 15612 15613 if (!is_weak) 15614 { 15615 if (aarch64_track_speculation) 15616 { 15617 /* Emit an explicit compare instruction, so that we can correctly 15618 track the condition codes. */ 15619 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx); 15620 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx); 15621 } 15622 else 15623 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx); 15624 15625 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, 15626 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx); 15627 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); 15628 } 15629 else 15630 { 15631 cond = gen_rtx_REG (CCmode, CC_REGNUM); 15632 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx); 15633 emit_insn (gen_rtx_SET (cond, x)); 15634 } 15635 15636 emit_label (label2); 15637 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL 15638 to set the condition flags. If this is not used it will be removed by 15639 later passes. */ 15640 if (strong_zero_p) 15641 { 15642 cond = gen_rtx_REG (CCmode, CC_REGNUM); 15643 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx); 15644 emit_insn (gen_rtx_SET (cond, x)); 15645 } 15646 /* Emit any final barrier needed for a __sync operation. */ 15647 if (is_mm_sync (model)) 15648 aarch64_emit_post_barrier (model); 15649 } 15650 15651 /* Split an atomic operation. */ 15652 15653 void 15654 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem, 15655 rtx value, rtx model_rtx, rtx cond) 15656 { 15657 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */ 15658 gcc_assert (epilogue_completed); 15659 15660 machine_mode mode = GET_MODE (mem); 15661 machine_mode wmode = (mode == DImode ? DImode : SImode); 15662 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx)); 15663 const bool is_sync = is_mm_sync (model); 15664 rtx_code_label *label; 15665 rtx x; 15666 15667 /* Split the atomic operation into a sequence. */ 15668 label = gen_label_rtx (); 15669 emit_label (label); 15670 15671 if (new_out) 15672 new_out = gen_lowpart (wmode, new_out); 15673 if (old_out) 15674 old_out = gen_lowpart (wmode, old_out); 15675 else 15676 old_out = new_out; 15677 value = simplify_gen_subreg (wmode, value, mode, 0); 15678 15679 /* The initial load can be relaxed for a __sync operation since a final 15680 barrier will be emitted to stop code hoisting. 
*/ 15681 if (is_sync) 15682 aarch64_emit_load_exclusive (mode, old_out, mem, 15683 GEN_INT (MEMMODEL_RELAXED)); 15684 else 15685 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx); 15686 15687 switch (code) 15688 { 15689 case SET: 15690 new_out = value; 15691 break; 15692 15693 case NOT: 15694 x = gen_rtx_AND (wmode, old_out, value); 15695 emit_insn (gen_rtx_SET (new_out, x)); 15696 x = gen_rtx_NOT (wmode, new_out); 15697 emit_insn (gen_rtx_SET (new_out, x)); 15698 break; 15699 15700 case MINUS: 15701 if (CONST_INT_P (value)) 15702 { 15703 value = GEN_INT (-INTVAL (value)); 15704 code = PLUS; 15705 } 15706 /* Fall through. */ 15707 15708 default: 15709 x = gen_rtx_fmt_ee (code, wmode, old_out, value); 15710 emit_insn (gen_rtx_SET (new_out, x)); 15711 break; 15712 } 15713 15714 aarch64_emit_store_exclusive (mode, cond, mem, 15715 gen_lowpart (mode, new_out), model_rtx); 15716 15717 if (aarch64_track_speculation) 15718 { 15719 /* Emit an explicit compare instruction, so that we can correctly 15720 track the condition codes. */ 15721 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx); 15722 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx); 15723 } 15724 else 15725 x = gen_rtx_NE (VOIDmode, cond, const0_rtx); 15726 15727 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, 15728 gen_rtx_LABEL_REF (Pmode, label), pc_rtx); 15729 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); 15730 15731 /* Emit any final barrier needed for a __sync operation. */ 15732 if (is_sync) 15733 aarch64_emit_post_barrier (model); 15734 } 15735 15736 static void 15737 aarch64_init_libfuncs (void) 15738 { 15739 /* Half-precision float operations. The compiler handles all operations 15740 with NULL libfuncs by converting to SFmode. */ 15741 15742 /* Conversions. */ 15743 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee"); 15744 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee"); 15745 15746 /* Arithmetic. */ 15747 set_optab_libfunc (add_optab, HFmode, NULL); 15748 set_optab_libfunc (sdiv_optab, HFmode, NULL); 15749 set_optab_libfunc (smul_optab, HFmode, NULL); 15750 set_optab_libfunc (neg_optab, HFmode, NULL); 15751 set_optab_libfunc (sub_optab, HFmode, NULL); 15752 15753 /* Comparisons. */ 15754 set_optab_libfunc (eq_optab, HFmode, NULL); 15755 set_optab_libfunc (ne_optab, HFmode, NULL); 15756 set_optab_libfunc (lt_optab, HFmode, NULL); 15757 set_optab_libfunc (le_optab, HFmode, NULL); 15758 set_optab_libfunc (ge_optab, HFmode, NULL); 15759 set_optab_libfunc (gt_optab, HFmode, NULL); 15760 set_optab_libfunc (unord_optab, HFmode, NULL); 15761 } 15762 15763 /* Target hook for c_mode_for_suffix. */ 15764 static machine_mode 15765 aarch64_c_mode_for_suffix (char suffix) 15766 { 15767 if (suffix == 'q') 15768 return TFmode; 15769 15770 return VOIDmode; 15771 } 15772 15773 /* We can only represent floating point constants which will fit in 15774 "quarter-precision" values. These values are characterised by 15775 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given 15776 by: 15777 15778 (-1)^s * (n/16) * 2^r 15779 15780 Where: 15781 's' is the sign bit. 15782 'n' is an integer in the range 16 <= n <= 31. 15783 'r' is an integer in the range -3 <= r <= 4. */ 15784 15785 /* Return true iff X can be represented by a quarter-precision 15786 floating point immediate operand X. Note, we cannot represent 0.0. */ 15787 bool 15788 aarch64_float_const_representable_p (rtx x) 15789 { 15790 /* This represents our current view of how many bits 15791 make up the mantissa. 
*/ 15792 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1; 15793 int exponent; 15794 unsigned HOST_WIDE_INT mantissa, mask; 15795 REAL_VALUE_TYPE r, m; 15796 bool fail; 15797 15798 if (!CONST_DOUBLE_P (x)) 15799 return false; 15800 15801 if (GET_MODE (x) == VOIDmode 15802 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST)) 15803 return false; 15804 15805 r = *CONST_DOUBLE_REAL_VALUE (x); 15806 15807 /* We cannot represent infinities, NaNs or +/-zero. We won't 15808 know if we have +zero until we analyse the mantissa, but we 15809 can reject the other invalid values. */ 15810 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r) 15811 || REAL_VALUE_MINUS_ZERO (r)) 15812 return false; 15813 15814 /* Extract exponent. */ 15815 r = real_value_abs (&r); 15816 exponent = REAL_EXP (&r); 15817 15818 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the 15819 highest (sign) bit, with a fixed binary point at bit point_pos. 15820 m1 holds the low part of the mantissa, m2 the high part. 15821 WARNING: If we ever have a representation using more than 2 * H_W_I - 1 15822 bits for the mantissa, this can fail (low bits will be lost). */ 15823 real_ldexp (&m, &r, point_pos - exponent); 15824 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2); 15825 15826 /* If the low part of the mantissa has bits set we cannot represent 15827 the value. */ 15828 if (w.ulow () != 0) 15829 return false; 15830 /* We have rejected the lower HOST_WIDE_INT, so update our 15831 understanding of how many bits lie in the mantissa and 15832 look only at the high HOST_WIDE_INT. */ 15833 mantissa = w.elt (1); 15834 point_pos -= HOST_BITS_PER_WIDE_INT; 15835 15836 /* We can only represent values with a mantissa of the form 1.xxxx. */ 15837 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1; 15838 if ((mantissa & mask) != 0) 15839 return false; 15840 15841 /* Having filtered unrepresentable values, we may now remove all 15842 but the highest 5 bits. */ 15843 mantissa >>= point_pos - 5; 15844 15845 /* We cannot represent the value 0.0, so reject it. This is handled 15846 elsewhere. */ 15847 if (mantissa == 0) 15848 return false; 15849 15850 /* Then, as bit 4 is always set, we can mask it off, leaving 15851 the mantissa in the range [0, 15]. */ 15852 mantissa &= ~(1 << 4); 15853 gcc_assert (mantissa <= 15); 15854 15855 /* GCC internally does not use IEEE754-like encoding (where normalized 15856 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c). 15857 Our mantissa values are shifted 4 places to the left relative to 15858 normalized IEEE754 so we must modify the exponent returned by REAL_EXP 15859 by 5 places to correct for GCC's representation. */ 15860 exponent = 5 - exponent; 15861 15862 return (exponent >= 0 && exponent <= 7); 15863 } 15864 15865 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC 15866 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to 15867 output MOVI/MVNI, ORR or BIC immediate. */ 15868 char* 15869 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width, 15870 enum simd_immediate_check which) 15871 { 15872 bool is_valid; 15873 static char templ[40]; 15874 const char *mnemonic; 15875 const char *shift_op; 15876 unsigned int lane_count = 0; 15877 char element_char; 15878 15879 struct simd_immediate_info info; 15880 15881 /* This will return true to show const_vector is legal for use as either 15882 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate. 
15883 It will also update INFO to show how the immediate should be generated. 15884 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */ 15885 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which); 15886 gcc_assert (is_valid); 15887 15888 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); 15889 lane_count = width / GET_MODE_BITSIZE (info.elt_mode); 15890 15891 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT) 15892 { 15893 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV); 15894 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD 15895 move immediate path. */ 15896 if (aarch64_float_const_zero_rtx_p (info.value)) 15897 info.value = GEN_INT (0); 15898 else 15899 { 15900 const unsigned int buf_size = 20; 15901 char float_buf[buf_size] = {'\0'}; 15902 real_to_decimal_for_mode (float_buf, 15903 CONST_DOUBLE_REAL_VALUE (info.value), 15904 buf_size, buf_size, 1, info.elt_mode); 15905 15906 if (lane_count == 1) 15907 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf); 15908 else 15909 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s", 15910 lane_count, element_char, float_buf); 15911 return templ; 15912 } 15913 } 15914 15915 gcc_assert (CONST_INT_P (info.value)); 15916 15917 if (which == AARCH64_CHECK_MOV) 15918 { 15919 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi"; 15920 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl"; 15921 if (lane_count == 1) 15922 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX, 15923 mnemonic, UINTVAL (info.value)); 15924 else if (info.shift) 15925 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " 15926 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count, 15927 element_char, UINTVAL (info.value), shift_op, info.shift); 15928 else 15929 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " 15930 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count, 15931 element_char, UINTVAL (info.value)); 15932 } 15933 else 15934 { 15935 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */ 15936 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr"; 15937 if (info.shift) 15938 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #" 15939 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count, 15940 element_char, UINTVAL (info.value), "lsl", info.shift); 15941 else 15942 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #" 15943 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count, 15944 element_char, UINTVAL (info.value)); 15945 } 15946 return templ; 15947 } 15948 15949 char* 15950 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode) 15951 { 15952 15953 /* If a floating point number was passed and we desire to use it in an 15954 integer mode do the conversion to integer. */ 15955 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT) 15956 { 15957 unsigned HOST_WIDE_INT ival; 15958 if (!aarch64_reinterpret_float_as_int (immediate, &ival)) 15959 gcc_unreachable (); 15960 immediate = gen_int_mode (ival, mode); 15961 } 15962 15963 machine_mode vmode; 15964 /* use a 64 bit mode for everything except for DI/DF mode, where we use 15965 a 128 bit vector mode. */ 15966 int width = GET_MODE_BITSIZE (mode) == 64 ? 
128 : 64; 15967 15968 vmode = aarch64_simd_container_mode (mode, width); 15969 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate)); 15970 return aarch64_output_simd_mov_immediate (v_op, width); 15971 } 15972 15973 /* Return the output string to use for moving immediate CONST_VECTOR 15974 into an SVE register. */ 15975 15976 char * 15977 aarch64_output_sve_mov_immediate (rtx const_vector) 15978 { 15979 static char templ[40]; 15980 struct simd_immediate_info info; 15981 char element_char; 15982 15983 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info); 15984 gcc_assert (is_valid); 15985 15986 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode)); 15987 15988 if (info.step) 15989 { 15990 snprintf (templ, sizeof (templ), "index\t%%0.%c, #" 15991 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC, 15992 element_char, INTVAL (info.value), INTVAL (info.step)); 15993 return templ; 15994 } 15995 15996 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT) 15997 { 15998 if (aarch64_float_const_zero_rtx_p (info.value)) 15999 info.value = GEN_INT (0); 16000 else 16001 { 16002 const int buf_size = 20; 16003 char float_buf[buf_size] = {}; 16004 real_to_decimal_for_mode (float_buf, 16005 CONST_DOUBLE_REAL_VALUE (info.value), 16006 buf_size, buf_size, 1, info.elt_mode); 16007 16008 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s", 16009 element_char, float_buf); 16010 return templ; 16011 } 16012 } 16013 16014 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC, 16015 element_char, INTVAL (info.value)); 16016 return templ; 16017 } 16018 16019 /* Return the asm format for a PTRUE instruction whose destination has 16020 mode MODE. SUFFIX is the element size suffix. */ 16021 16022 char * 16023 aarch64_output_ptrue (machine_mode mode, char suffix) 16024 { 16025 unsigned int nunits; 16026 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")]; 16027 if (GET_MODE_NUNITS (mode).is_constant (&nunits)) 16028 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits); 16029 else 16030 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix); 16031 return buf; 16032 } 16033 16034 /* Split operands into moves from op[1] + op[2] into op[0]. */ 16035 16036 void 16037 aarch64_split_combinev16qi (rtx operands[3]) 16038 { 16039 unsigned int dest = REGNO (operands[0]); 16040 unsigned int src1 = REGNO (operands[1]); 16041 unsigned int src2 = REGNO (operands[2]); 16042 machine_mode halfmode = GET_MODE (operands[1]); 16043 unsigned int halfregs = REG_NREGS (operands[1]); 16044 rtx destlo, desthi; 16045 16046 gcc_assert (halfmode == V16QImode); 16047 16048 if (src1 == dest && src2 == dest + halfregs) 16049 { 16050 /* No-op move. Can't split to nothing; emit something. */ 16051 emit_note (NOTE_INSN_DELETED); 16052 return; 16053 } 16054 16055 /* Preserve register attributes for variable tracking. */ 16056 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0); 16057 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs, 16058 GET_MODE_SIZE (halfmode)); 16059 16060 /* Special case of reversed high/low parts. 
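   We exchange the two halves in place with the classic three-EOR swap so
   that no scratch register is needed.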
*/ 16061 if (reg_overlap_mentioned_p (operands[2], destlo) 16062 && reg_overlap_mentioned_p (operands[1], desthi)) 16063 { 16064 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2])); 16065 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2])); 16066 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2])); 16067 } 16068 else if (!reg_overlap_mentioned_p (operands[2], destlo)) 16069 { 16070 /* Try to avoid unnecessary moves if part of the result 16071 is in the right place already. */ 16072 if (src1 != dest) 16073 emit_move_insn (destlo, operands[1]); 16074 if (src2 != dest + halfregs) 16075 emit_move_insn (desthi, operands[2]); 16076 } 16077 else 16078 { 16079 if (src2 != dest + halfregs) 16080 emit_move_insn (desthi, operands[2]); 16081 if (src1 != dest) 16082 emit_move_insn (destlo, operands[1]); 16083 } 16084 } 16085 16086 /* vec_perm support. */ 16087 16088 struct expand_vec_perm_d 16089 { 16090 rtx target, op0, op1; 16091 vec_perm_indices perm; 16092 machine_mode vmode; 16093 unsigned int vec_flags; 16094 bool one_vector_p; 16095 bool testing_p; 16096 }; 16097 16098 /* Generate a variable permutation. */ 16099 16100 static void 16101 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel) 16102 { 16103 machine_mode vmode = GET_MODE (target); 16104 bool one_vector_p = rtx_equal_p (op0, op1); 16105 16106 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode); 16107 gcc_checking_assert (GET_MODE (op0) == vmode); 16108 gcc_checking_assert (GET_MODE (op1) == vmode); 16109 gcc_checking_assert (GET_MODE (sel) == vmode); 16110 gcc_checking_assert (TARGET_SIMD); 16111 16112 if (one_vector_p) 16113 { 16114 if (vmode == V8QImode) 16115 { 16116 /* Expand the argument to a V16QI mode by duplicating it. */ 16117 rtx pair = gen_reg_rtx (V16QImode); 16118 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0)); 16119 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel)); 16120 } 16121 else 16122 { 16123 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel)); 16124 } 16125 } 16126 else 16127 { 16128 rtx pair; 16129 16130 if (vmode == V8QImode) 16131 { 16132 pair = gen_reg_rtx (V16QImode); 16133 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1)); 16134 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel)); 16135 } 16136 else 16137 { 16138 pair = gen_reg_rtx (OImode); 16139 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1)); 16140 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel)); 16141 } 16142 } 16143 } 16144 16145 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL. 16146 NELT is the number of elements in the vector. */ 16147 16148 void 16149 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel, 16150 unsigned int nelt) 16151 { 16152 machine_mode vmode = GET_MODE (target); 16153 bool one_vector_p = rtx_equal_p (op0, op1); 16154 rtx mask; 16155 16156 /* The TBL instruction does not use a modulo index, so we must take care 16157 of that ourselves. */ 16158 mask = aarch64_simd_gen_const_vector_dup (vmode, 16159 one_vector_p ? nelt - 1 : 2 * nelt - 1); 16160 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN); 16161 16162 /* For big-endian, we also need to reverse the index within the vector 16163 (but not which vector). */ 16164 if (BYTES_BIG_ENDIAN) 16165 { 16166 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. 
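   XORing each index with nelt - 1 reverses the lane numbering within one
   vector; for V8QI, for example, index 0 becomes 7 and index 3 becomes 4
   (illustrative).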
*/ 16167 if (!one_vector_p) 16168 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1); 16169 sel = expand_simple_binop (vmode, XOR, sel, mask, 16170 NULL, 0, OPTAB_LIB_WIDEN); 16171 } 16172 aarch64_expand_vec_perm_1 (target, op0, op1, sel); 16173 } 16174 16175 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */ 16176 16177 static void 16178 emit_unspec2 (rtx target, int code, rtx op0, rtx op1) 16179 { 16180 emit_insn (gen_rtx_SET (target, 16181 gen_rtx_UNSPEC (GET_MODE (target), 16182 gen_rtvec (2, op0, op1), code))); 16183 } 16184 16185 /* Expand an SVE vec_perm with the given operands. */ 16186 16187 void 16188 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel) 16189 { 16190 machine_mode data_mode = GET_MODE (target); 16191 machine_mode sel_mode = GET_MODE (sel); 16192 /* Enforced by the pattern condition. */ 16193 int nunits = GET_MODE_NUNITS (sel_mode).to_constant (); 16194 16195 /* Note: vec_perm indices are supposed to wrap when they go beyond the 16196 size of the two value vectors, i.e. the upper bits of the indices 16197 are effectively ignored. SVE TBL instead produces 0 for any 16198 out-of-range indices, so we need to modulo all the vec_perm indices 16199 to ensure they are all in range. */ 16200 rtx sel_reg = force_reg (sel_mode, sel); 16201 16202 /* Check if the sel only references the first values vector. */ 16203 if (GET_CODE (sel) == CONST_VECTOR 16204 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1)) 16205 { 16206 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg); 16207 return; 16208 } 16209 16210 /* Check if the two values vectors are the same. */ 16211 if (rtx_equal_p (op0, op1)) 16212 { 16213 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1); 16214 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel, 16215 NULL, 0, OPTAB_DIRECT); 16216 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod); 16217 return; 16218 } 16219 16220 /* Run TBL on for each value vector and combine the results. */ 16221 16222 rtx res0 = gen_reg_rtx (data_mode); 16223 rtx res1 = gen_reg_rtx (data_mode); 16224 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits); 16225 if (GET_CODE (sel) != CONST_VECTOR 16226 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1)) 16227 { 16228 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, 16229 2 * nunits - 1); 16230 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel, 16231 NULL, 0, OPTAB_DIRECT); 16232 } 16233 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg); 16234 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems, 16235 NULL, 0, OPTAB_DIRECT); 16236 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub); 16237 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT) 16238 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1))); 16239 else 16240 emit_unspec2 (target, UNSPEC_IORF, res0, res1); 16241 } 16242 16243 /* Recognize patterns suitable for the TRN instructions. */ 16244 static bool 16245 aarch64_evpc_trn (struct expand_vec_perm_d *d) 16246 { 16247 HOST_WIDE_INT odd; 16248 poly_uint64 nelt = d->perm.length (); 16249 rtx out, in0, in1, x; 16250 machine_mode vmode = d->vmode; 16251 16252 if (GET_MODE_UNIT_SIZE (vmode) > 8) 16253 return false; 16254 16255 /* Note that these are little-endian tests. 16256 We correct for big-endian later. 
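   For example, for V4SI the permutation { 0, 4, 2, 6 } matches TRN1 and
   { 1, 5, 3, 7 } matches TRN2 (illustrative).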
*/ 16257 if (!d->perm[0].is_constant (&odd) 16258 || (odd != 0 && odd != 1) 16259 || !d->perm.series_p (0, 2, odd, 2) 16260 || !d->perm.series_p (1, 2, nelt + odd, 2)) 16261 return false; 16262 16263 /* Success! */ 16264 if (d->testing_p) 16265 return true; 16266 16267 in0 = d->op0; 16268 in1 = d->op1; 16269 /* We don't need a big-endian lane correction for SVE; see the comment 16270 at the head of aarch64-sve.md for details. */ 16271 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD) 16272 { 16273 x = in0, in0 = in1, in1 = x; 16274 odd = !odd; 16275 } 16276 out = d->target; 16277 16278 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1), 16279 odd ? UNSPEC_TRN2 : UNSPEC_TRN1)); 16280 return true; 16281 } 16282 16283 /* Recognize patterns suitable for the UZP instructions. */ 16284 static bool 16285 aarch64_evpc_uzp (struct expand_vec_perm_d *d) 16286 { 16287 HOST_WIDE_INT odd; 16288 rtx out, in0, in1, x; 16289 machine_mode vmode = d->vmode; 16290 16291 if (GET_MODE_UNIT_SIZE (vmode) > 8) 16292 return false; 16293 16294 /* Note that these are little-endian tests. 16295 We correct for big-endian later. */ 16296 if (!d->perm[0].is_constant (&odd) 16297 || (odd != 0 && odd != 1) 16298 || !d->perm.series_p (0, 1, odd, 2)) 16299 return false; 16300 16301 /* Success! */ 16302 if (d->testing_p) 16303 return true; 16304 16305 in0 = d->op0; 16306 in1 = d->op1; 16307 /* We don't need a big-endian lane correction for SVE; see the comment 16308 at the head of aarch64-sve.md for details. */ 16309 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD) 16310 { 16311 x = in0, in0 = in1, in1 = x; 16312 odd = !odd; 16313 } 16314 out = d->target; 16315 16316 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1), 16317 odd ? UNSPEC_UZP2 : UNSPEC_UZP1)); 16318 return true; 16319 } 16320 16321 /* Recognize patterns suitable for the ZIP instructions. */ 16322 static bool 16323 aarch64_evpc_zip (struct expand_vec_perm_d *d) 16324 { 16325 unsigned int high; 16326 poly_uint64 nelt = d->perm.length (); 16327 rtx out, in0, in1, x; 16328 machine_mode vmode = d->vmode; 16329 16330 if (GET_MODE_UNIT_SIZE (vmode) > 8) 16331 return false; 16332 16333 /* Note that these are little-endian tests. 16334 We correct for big-endian later. */ 16335 poly_uint64 first = d->perm[0]; 16336 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt)) 16337 || !d->perm.series_p (0, 2, first, 1) 16338 || !d->perm.series_p (1, 2, first + nelt, 1)) 16339 return false; 16340 high = maybe_ne (first, 0U); 16341 16342 /* Success! */ 16343 if (d->testing_p) 16344 return true; 16345 16346 in0 = d->op0; 16347 in1 = d->op1; 16348 /* We don't need a big-endian lane correction for SVE; see the comment 16349 at the head of aarch64-sve.md for details. */ 16350 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD) 16351 { 16352 x = in0, in0 = in1, in1 = x; 16353 high = !high; 16354 } 16355 out = d->target; 16356 16357 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1), 16358 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1)); 16359 return true; 16360 } 16361 16362 /* Recognize patterns for the EXT insn. */ 16363 16364 static bool 16365 aarch64_evpc_ext (struct expand_vec_perm_d *d) 16366 { 16367 HOST_WIDE_INT location; 16368 rtx offset; 16369 16370 /* The first element always refers to the first vector. 16371 Check if the extracted indices are increasing by one. 
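   For example, for V4SI the permutation { 1, 2, 3, 4 } takes the last three
   elements of the first vector followed by the first element of the second,
   i.e. an EXT with an element offset of 1 (illustrative).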
*/ 16372 if (d->vec_flags == VEC_SVE_PRED 16373 || !d->perm[0].is_constant (&location) 16374 || !d->perm.series_p (0, 1, location, 1)) 16375 return false; 16376 16377 /* Success! */ 16378 if (d->testing_p) 16379 return true; 16380 16381 /* The case where (location == 0) is a no-op for both big- and little-endian, 16382 and is removed by the mid-end at optimization levels -O1 and higher. 16383 16384 We don't need a big-endian lane correction for SVE; see the comment 16385 at the head of aarch64-sve.md for details. */ 16386 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD) 16387 { 16388 /* After setup, we want the high elements of the first vector (stored 16389 at the LSB end of the register), and the low elements of the second 16390 vector (stored at the MSB end of the register). So swap. */ 16391 std::swap (d->op0, d->op1); 16392 /* location != 0 (above), so safe to assume (nelt - location) < nelt. 16393 to_constant () is safe since this is restricted to Advanced SIMD 16394 vectors. */ 16395 location = d->perm.length ().to_constant () - location; 16396 } 16397 16398 offset = GEN_INT (location); 16399 emit_set_insn (d->target, 16400 gen_rtx_UNSPEC (d->vmode, 16401 gen_rtvec (3, d->op0, d->op1, offset), 16402 UNSPEC_EXT)); 16403 return true; 16404 } 16405 16406 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements 16407 within each 64-bit, 32-bit or 16-bit granule. */ 16408 16409 static bool 16410 aarch64_evpc_rev_local (struct expand_vec_perm_d *d) 16411 { 16412 HOST_WIDE_INT diff; 16413 unsigned int i, size, unspec; 16414 machine_mode pred_mode; 16415 16416 if (d->vec_flags == VEC_SVE_PRED 16417 || !d->one_vector_p 16418 || !d->perm[0].is_constant (&diff)) 16419 return false; 16420 16421 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode); 16422 if (size == 8) 16423 { 16424 unspec = UNSPEC_REV64; 16425 pred_mode = VNx2BImode; 16426 } 16427 else if (size == 4) 16428 { 16429 unspec = UNSPEC_REV32; 16430 pred_mode = VNx4BImode; 16431 } 16432 else if (size == 2) 16433 { 16434 unspec = UNSPEC_REV16; 16435 pred_mode = VNx8BImode; 16436 } 16437 else 16438 return false; 16439 16440 unsigned int step = diff + 1; 16441 for (i = 0; i < step; ++i) 16442 if (!d->perm.series_p (i, step, diff - i, step)) 16443 return false; 16444 16445 /* Success! */ 16446 if (d->testing_p) 16447 return true; 16448 16449 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec); 16450 if (d->vec_flags == VEC_SVE_DATA) 16451 { 16452 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); 16453 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src), 16454 UNSPEC_MERGE_PTRUE); 16455 } 16456 emit_set_insn (d->target, src); 16457 return true; 16458 } 16459 16460 /* Recognize patterns for the REV insn, which reverses elements within 16461 a full vector. */ 16462 16463 static bool 16464 aarch64_evpc_rev_global (struct expand_vec_perm_d *d) 16465 { 16466 poly_uint64 nelt = d->perm.length (); 16467 16468 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA) 16469 return false; 16470 16471 if (!d->perm.series_p (0, 1, nelt - 1, -1)) 16472 return false; 16473 16474 /* Success! 
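   The permutation is the full reversal { nelt - 1, ..., 1, 0 } of a single
   SVE data vector, which maps onto a single REV.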
*/ 16475 if (d->testing_p) 16476 return true; 16477 16478 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV); 16479 emit_set_insn (d->target, src); 16480 return true; 16481 } 16482 16483 static bool 16484 aarch64_evpc_dup (struct expand_vec_perm_d *d) 16485 { 16486 rtx out = d->target; 16487 rtx in0; 16488 HOST_WIDE_INT elt; 16489 machine_mode vmode = d->vmode; 16490 rtx lane; 16491 16492 if (d->vec_flags == VEC_SVE_PRED 16493 || d->perm.encoding ().encoded_nelts () != 1 16494 || !d->perm[0].is_constant (&elt)) 16495 return false; 16496 16497 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode)) 16498 return false; 16499 16500 /* Success! */ 16501 if (d->testing_p) 16502 return true; 16503 16504 /* The generic preparation in aarch64_expand_vec_perm_const_1 16505 swaps the operand order and the permute indices if it finds 16506 d->perm[0] to be in the second operand. Thus, we can always 16507 use d->op0 and need not do any extra arithmetic to get the 16508 correct lane number. */ 16509 in0 = d->op0; 16510 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */ 16511 16512 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane)); 16513 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel); 16514 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select)); 16515 return true; 16516 } 16517 16518 static bool 16519 aarch64_evpc_tbl (struct expand_vec_perm_d *d) 16520 { 16521 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel; 16522 machine_mode vmode = d->vmode; 16523 16524 /* Make sure that the indices are constant. */ 16525 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts (); 16526 for (unsigned int i = 0; i < encoded_nelts; ++i) 16527 if (!d->perm[i].is_constant ()) 16528 return false; 16529 16530 if (d->testing_p) 16531 return true; 16532 16533 /* Generic code will try constant permutation twice. Once with the 16534 original mode and again with the elements lowered to QImode. 16535 So wait and don't do the selector expansion ourselves. */ 16536 if (vmode != V8QImode && vmode != V16QImode) 16537 return false; 16538 16539 /* to_constant is safe since this routine is specific to Advanced SIMD 16540 vectors. */ 16541 unsigned int nelt = d->perm.length ().to_constant (); 16542 for (unsigned int i = 0; i < nelt; ++i) 16543 /* If big-endian and two vectors we end up with a weird mixed-endian 16544 mode on NEON. Reverse the index within each word but not the word 16545 itself. to_constant is safe because we checked is_constant above. */ 16546 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN 16547 ? d->perm[i].to_constant () ^ (nelt - 1) 16548 : d->perm[i].to_constant ()); 16549 16550 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); 16551 sel = force_reg (vmode, sel); 16552 16553 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel); 16554 return true; 16555 } 16556 16557 /* Try to implement D using an SVE TBL instruction. */ 16558 16559 static bool 16560 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d) 16561 { 16562 unsigned HOST_WIDE_INT nelt; 16563 16564 /* Permuting two variable-length vectors could overflow the 16565 index range. 
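   For example, with byte elements the selector can only encode indices
   0 to 255, but two maximum-length (2048-bit) SVE vectors hold 512 byte
   elements between them (illustrative).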
*/ 16566 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt)) 16567 return false; 16568 16569 if (d->testing_p) 16570 return true; 16571 16572 machine_mode sel_mode = mode_for_int_vector (d->vmode).require (); 16573 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm); 16574 if (d->one_vector_p) 16575 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel)); 16576 else 16577 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel); 16578 return true; 16579 } 16580 16581 static bool 16582 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) 16583 { 16584 /* The pattern matching functions above are written to look for a small 16585 number to begin the sequence (0, 1, N/2). If we begin with an index 16586 from the second operand, we can swap the operands. */ 16587 poly_int64 nelt = d->perm.length (); 16588 if (known_ge (d->perm[0], nelt)) 16589 { 16590 d->perm.rotate_inputs (1); 16591 std::swap (d->op0, d->op1); 16592 } 16593 16594 if ((d->vec_flags == VEC_ADVSIMD 16595 || d->vec_flags == VEC_SVE_DATA 16596 || d->vec_flags == VEC_SVE_PRED) 16597 && known_gt (nelt, 1)) 16598 { 16599 if (aarch64_evpc_rev_local (d)) 16600 return true; 16601 else if (aarch64_evpc_rev_global (d)) 16602 return true; 16603 else if (aarch64_evpc_ext (d)) 16604 return true; 16605 else if (aarch64_evpc_dup (d)) 16606 return true; 16607 else if (aarch64_evpc_zip (d)) 16608 return true; 16609 else if (aarch64_evpc_uzp (d)) 16610 return true; 16611 else if (aarch64_evpc_trn (d)) 16612 return true; 16613 if (d->vec_flags == VEC_SVE_DATA) 16614 return aarch64_evpc_sve_tbl (d); 16615 else if (d->vec_flags == VEC_ADVSIMD) 16616 return aarch64_evpc_tbl (d); 16617 } 16618 return false; 16619 } 16620 16621 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */ 16622 16623 static bool 16624 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, 16625 rtx op1, const vec_perm_indices &sel) 16626 { 16627 struct expand_vec_perm_d d; 16628 16629 /* Check whether the mask can be applied to a single vector. */ 16630 if (sel.ninputs () == 1 16631 || (op0 && rtx_equal_p (op0, op1))) 16632 d.one_vector_p = true; 16633 else if (sel.all_from_input_p (0)) 16634 { 16635 d.one_vector_p = true; 16636 op1 = op0; 16637 } 16638 else if (sel.all_from_input_p (1)) 16639 { 16640 d.one_vector_p = true; 16641 op0 = op1; 16642 } 16643 else 16644 d.one_vector_p = false; 16645 16646 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2, 16647 sel.nelts_per_input ()); 16648 d.vmode = vmode; 16649 d.vec_flags = aarch64_classify_vector_mode (d.vmode); 16650 d.target = target; 16651 d.op0 = op0; 16652 d.op1 = op1; 16653 d.testing_p = !target; 16654 16655 if (!d.testing_p) 16656 return aarch64_expand_vec_perm_const_1 (&d); 16657 16658 rtx_insn *last = get_last_insn (); 16659 bool ret = aarch64_expand_vec_perm_const_1 (&d); 16660 gcc_assert (last == get_last_insn ()); 16661 16662 return ret; 16663 } 16664 16665 /* Generate a byte permute mask for a register of mode MODE, 16666 which has NUNITS units. */ 16667 16668 rtx 16669 aarch64_reverse_mask (machine_mode mode, unsigned int nunits) 16670 { 16671 /* We have to reverse each vector because we dont have 16672 a permuted load that can reverse-load according to ABI rules. 
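   For example, with nunits == 4 and a 4-byte element size the mask built
   below is { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }:
   each element keeps its position but has its bytes reversed.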
*/ 16673 rtx mask; 16674 rtvec v = rtvec_alloc (16); 16675 unsigned int i, j; 16676 unsigned int usize = GET_MODE_UNIT_SIZE (mode); 16677 16678 gcc_assert (BYTES_BIG_ENDIAN); 16679 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode)); 16680 16681 for (i = 0; i < nunits; i++) 16682 for (j = 0; j < usize; j++) 16683 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j); 16684 mask = gen_rtx_CONST_VECTOR (V16QImode, v); 16685 return force_reg (V16QImode, mask); 16686 } 16687 16688 /* Return true if X is a valid second operand for the SVE instruction 16689 that implements integer comparison OP_CODE. */ 16690 16691 static bool 16692 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x) 16693 { 16694 if (register_operand (x, VOIDmode)) 16695 return true; 16696 16697 switch (op_code) 16698 { 16699 case LTU: 16700 case LEU: 16701 case GEU: 16702 case GTU: 16703 return aarch64_sve_cmp_immediate_p (x, false); 16704 case LT: 16705 case LE: 16706 case GE: 16707 case GT: 16708 case NE: 16709 case EQ: 16710 return aarch64_sve_cmp_immediate_p (x, true); 16711 default: 16712 gcc_unreachable (); 16713 } 16714 } 16715 16716 /* Use predicated SVE instructions to implement the equivalent of: 16717 16718 (set TARGET OP) 16719 16720 given that PTRUE is an all-true predicate of the appropriate mode. */ 16721 16722 static void 16723 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op) 16724 { 16725 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target), 16726 gen_rtvec (2, ptrue, op), 16727 UNSPEC_MERGE_PTRUE); 16728 rtx_insn *insn = emit_set_insn (target, unspec); 16729 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op)); 16730 } 16731 16732 /* Likewise, but also clobber the condition codes. */ 16733 16734 static void 16735 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op) 16736 { 16737 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target), 16738 gen_rtvec (2, ptrue, op), 16739 UNSPEC_MERGE_PTRUE); 16740 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec)); 16741 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op)); 16742 } 16743 16744 /* Return the UNSPEC_COND_* code for comparison CODE. */ 16745 16746 static unsigned int 16747 aarch64_unspec_cond_code (rtx_code code) 16748 { 16749 switch (code) 16750 { 16751 case NE: 16752 return UNSPEC_COND_NE; 16753 case EQ: 16754 return UNSPEC_COND_EQ; 16755 case LT: 16756 return UNSPEC_COND_LT; 16757 case GT: 16758 return UNSPEC_COND_GT; 16759 case LE: 16760 return UNSPEC_COND_LE; 16761 case GE: 16762 return UNSPEC_COND_GE; 16763 default: 16764 gcc_unreachable (); 16765 } 16766 } 16767 16768 /* Emit: 16769 16770 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>)) 16771 16772 where <X> is the operation associated with comparison CODE. This form 16773 of instruction is used when (and (CODE OP0 OP1) PRED) would have different 16774 semantics, such as when PRED might not be all-true and when comparing 16775 inactive lanes could have side effects. */ 16776 16777 static void 16778 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code, 16779 rtx pred, rtx op0, rtx op1) 16780 { 16781 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred), 16782 gen_rtvec (3, pred, op0, op1), 16783 aarch64_unspec_cond_code (code)); 16784 emit_set_insn (target, unspec); 16785 } 16786 16787 /* Expand an SVE integer comparison using the SVE equivalent of: 16788 16789 (set TARGET (CODE OP0 OP1)). 
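   Roughly speaking, for CODE == LT the expansion below emits

     (parallel
       [(set TARGET (unspec [(ptrue) (lt OP0 OP1)] UNSPEC_MERGE_PTRUE))
        (clobber (reg CC))])

   where ptrue is an all-true predicate in TARGET's mode and OP1 has first
   been forced into a register unless it is a valid SVE comparison
   immediate.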
*/ 16790 16791 void 16792 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1) 16793 { 16794 machine_mode pred_mode = GET_MODE (target); 16795 machine_mode data_mode = GET_MODE (op0); 16796 16797 if (!aarch64_sve_cmp_operand_p (code, op1)) 16798 op1 = force_reg (data_mode, op1); 16799 16800 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); 16801 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1); 16802 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond); 16803 } 16804 16805 /* Emit the SVE equivalent of: 16806 16807 (set TMP1 (CODE1 OP0 OP1)) 16808 (set TMP2 (CODE2 OP0 OP1)) 16809 (set TARGET (ior:PRED_MODE TMP1 TMP2)) 16810 16811 PTRUE is an all-true predicate with the same mode as TARGET. */ 16812 16813 static void 16814 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2, 16815 rtx ptrue, rtx op0, rtx op1) 16816 { 16817 machine_mode pred_mode = GET_MODE (ptrue); 16818 rtx tmp1 = gen_reg_rtx (pred_mode); 16819 aarch64_emit_sve_ptrue_op (tmp1, ptrue, 16820 gen_rtx_fmt_ee (code1, pred_mode, op0, op1)); 16821 rtx tmp2 = gen_reg_rtx (pred_mode); 16822 aarch64_emit_sve_ptrue_op (tmp2, ptrue, 16823 gen_rtx_fmt_ee (code2, pred_mode, op0, op1)); 16824 aarch64_emit_binop (target, ior_optab, tmp1, tmp2); 16825 } 16826 16827 /* Emit the SVE equivalent of: 16828 16829 (set TMP (CODE OP0 OP1)) 16830 (set TARGET (not TMP)) 16831 16832 PTRUE is an all-true predicate with the same mode as TARGET. */ 16833 16834 static void 16835 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code, 16836 rtx op0, rtx op1) 16837 { 16838 machine_mode pred_mode = GET_MODE (ptrue); 16839 rtx tmp = gen_reg_rtx (pred_mode); 16840 aarch64_emit_sve_ptrue_op (tmp, ptrue, 16841 gen_rtx_fmt_ee (code, pred_mode, op0, op1)); 16842 aarch64_emit_unop (target, one_cmpl_optab, tmp); 16843 } 16844 16845 /* Expand an SVE floating-point comparison using the SVE equivalent of: 16846 16847 (set TARGET (CODE OP0 OP1)) 16848 16849 If CAN_INVERT_P is true, the caller can also handle inverted results; 16850 return true if the result is in fact inverted. */ 16851 16852 bool 16853 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code, 16854 rtx op0, rtx op1, bool can_invert_p) 16855 { 16856 machine_mode pred_mode = GET_MODE (target); 16857 machine_mode data_mode = GET_MODE (op0); 16858 16859 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode)); 16860 switch (code) 16861 { 16862 case UNORDERED: 16863 /* UNORDERED has no immediate form. */ 16864 op1 = force_reg (data_mode, op1); 16865 /* fall through */ 16866 case LT: 16867 case LE: 16868 case GT: 16869 case GE: 16870 case EQ: 16871 case NE: 16872 { 16873 /* There is native support for the comparison. */ 16874 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1); 16875 aarch64_emit_sve_ptrue_op (target, ptrue, cond); 16876 return false; 16877 } 16878 16879 case LTGT: 16880 /* This is a trapping operation (LT or GT). */ 16881 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1); 16882 return false; 16883 16884 case UNEQ: 16885 if (!flag_trapping_math) 16886 { 16887 /* This would trap for signaling NaNs. */ 16888 op1 = force_reg (data_mode, op1); 16889 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1); 16890 return false; 16891 } 16892 /* fall through */ 16893 case UNLT: 16894 case UNLE: 16895 case UNGT: 16896 case UNGE: 16897 if (flag_trapping_math) 16898 { 16899 /* Work out which elements are ordered. 
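   For example, UNLT (x, y) -- unordered or less than -- is computed here
   as the inverse of a GE comparison predicated on the ordered lanes: a
   result lane is set either because x and y are unordered, or because
   they are ordered and x < y.  Restricting the comparison to the ordered
   lanes is what avoids raising spurious exceptions for NaN inputs when
   -ftrapping-math is in effect.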
*/ 16900 rtx ordered = gen_reg_rtx (pred_mode); 16901 op1 = force_reg (data_mode, op1); 16902 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1); 16903 16904 /* Test the opposite condition for the ordered elements, 16905 then invert the result. */ 16906 if (code == UNEQ) 16907 code = NE; 16908 else 16909 code = reverse_condition_maybe_unordered (code); 16910 if (can_invert_p) 16911 { 16912 aarch64_emit_sve_predicated_cond (target, code, 16913 ordered, op0, op1); 16914 return true; 16915 } 16916 rtx tmp = gen_reg_rtx (pred_mode); 16917 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1); 16918 aarch64_emit_unop (target, one_cmpl_optab, tmp); 16919 return false; 16920 } 16921 break; 16922 16923 case ORDERED: 16924 /* ORDERED has no immediate form. */ 16925 op1 = force_reg (data_mode, op1); 16926 break; 16927 16928 default: 16929 gcc_unreachable (); 16930 } 16931 16932 /* There is native support for the inverse comparison. */ 16933 code = reverse_condition_maybe_unordered (code); 16934 if (can_invert_p) 16935 { 16936 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1); 16937 aarch64_emit_sve_ptrue_op (target, ptrue, cond); 16938 return true; 16939 } 16940 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1); 16941 return false; 16942 } 16943 16944 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode 16945 of the data being selected and CMP_MODE is the mode of the values being 16946 compared. */ 16947 16948 void 16949 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode, 16950 rtx *ops) 16951 { 16952 machine_mode pred_mode 16953 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode), 16954 GET_MODE_SIZE (cmp_mode)).require (); 16955 rtx pred = gen_reg_rtx (pred_mode); 16956 if (FLOAT_MODE_P (cmp_mode)) 16957 { 16958 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]), 16959 ops[4], ops[5], true)) 16960 std::swap (ops[1], ops[2]); 16961 } 16962 else 16963 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]); 16964 16965 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]); 16966 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL)); 16967 } 16968 16969 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return 16970 true. However due to issues with register allocation it is preferable 16971 to avoid tieing integer scalar and FP scalar modes. Executing integer 16972 operations in general registers is better than treating them as scalar 16973 vector operations. This reduces latency and avoids redundant int<->FP 16974 moves. So tie modes if they are either the same class, or vector modes 16975 with other vector modes, vector structs or any scalar mode. */ 16976 16977 static bool 16978 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2) 16979 { 16980 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2)) 16981 return true; 16982 16983 /* We specifically want to allow elements of "structure" modes to 16984 be tieable to the structure. This more general condition allows 16985 other rarer situations too. The reason we don't extend this to 16986 predicate modes is that there are no predicate structure modes 16987 nor any specific instructions for extracting part of a predicate 16988 register. */ 16989 if (aarch64_vector_data_mode_p (mode1) 16990 && aarch64_vector_data_mode_p (mode2)) 16991 return true; 16992 16993 /* Also allow any scalar modes with vectors. 
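   For example, this lets DImode be tied to V16QImode, while a DImode /
   DFmode pair still falls through to the final return false: the two are
   in different classes and neither is a vector mode, which is precisely
   the scalar int / FP case we want to keep apart.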
*/ 16994 if (aarch64_vector_mode_supported_p (mode1) 16995 || aarch64_vector_mode_supported_p (mode2)) 16996 return true; 16997 16998 return false; 16999 } 17000 17001 /* Return a new RTX holding the result of moving POINTER forward by 17002 AMOUNT bytes. */ 17003 17004 static rtx 17005 aarch64_move_pointer (rtx pointer, poly_int64 amount) 17006 { 17007 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount); 17008 17009 return adjust_automodify_address (pointer, GET_MODE (pointer), 17010 next, amount); 17011 } 17012 17013 /* Return a new RTX holding the result of moving POINTER forward by the 17014 size of the mode it points to. */ 17015 17016 static rtx 17017 aarch64_progress_pointer (rtx pointer) 17018 { 17019 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer))); 17020 } 17021 17022 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by 17023 MODE bytes. */ 17024 17025 static void 17026 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst, 17027 machine_mode mode) 17028 { 17029 rtx reg = gen_reg_rtx (mode); 17030 17031 /* "Cast" the pointers to the correct mode. */ 17032 *src = adjust_address (*src, mode, 0); 17033 *dst = adjust_address (*dst, mode, 0); 17034 /* Emit the memcpy. */ 17035 emit_move_insn (reg, *src); 17036 emit_move_insn (*dst, reg); 17037 /* Move the pointers forward. */ 17038 *src = aarch64_progress_pointer (*src); 17039 *dst = aarch64_progress_pointer (*dst); 17040 } 17041 17042 /* Expand movmem, as if from a __builtin_memcpy. Return true if 17043 we succeed, otherwise return false. */ 17044 17045 bool 17046 aarch64_expand_movmem (rtx *operands) 17047 { 17048 int n, mode_bits; 17049 rtx dst = operands[0]; 17050 rtx src = operands[1]; 17051 rtx base; 17052 machine_mode cur_mode = BLKmode, next_mode; 17053 bool speed_p = !optimize_function_for_size_p (cfun); 17054 17055 /* When optimizing for size, give a better estimate of the length of a 17056 memcpy call, but use the default otherwise. Moves larger than 8 bytes 17057 will always require an even number of instructions to do now. And each 17058 operation requires both a load+store, so devide the max number by 2. */ 17059 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2; 17060 17061 /* We can't do anything smart if the amount to copy is not constant. */ 17062 if (!CONST_INT_P (operands[2])) 17063 return false; 17064 17065 n = INTVAL (operands[2]); 17066 17067 /* Try to keep the number of instructions low. For all cases we will do at 17068 most two moves for the residual amount, since we'll always overlap the 17069 remainder. */ 17070 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves) 17071 return false; 17072 17073 base = copy_to_mode_reg (Pmode, XEXP (dst, 0)); 17074 dst = adjust_automodify_address (dst, VOIDmode, base, 0); 17075 17076 base = copy_to_mode_reg (Pmode, XEXP (src, 0)); 17077 src = adjust_automodify_address (src, VOIDmode, base, 0); 17078 17079 /* Convert n to bits to make the rest of the code simpler. */ 17080 n = n * BITS_PER_UNIT; 17081 17082 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes 17083 larger than TImode, but we should not use them for loads/stores here. */ 17084 const int copy_limit = GET_MODE_BITSIZE (TImode); 17085 17086 while (n > 0) 17087 { 17088 /* Find the largest mode in which to do the copy in without over reading 17089 or writing. 
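   As an illustration, for a 15-byte copy (n == 120 bits here) with
   copy_limit == 128 bits the search below picks DImode, copies 8 bytes
   and leaves n == 56 bits; the trailing-copy adjustment further down then
   backs both pointers up by one byte so that the final copy is another
   full 8-byte DImode access overlapping the first, i.e. the 15 bytes are
   handled by two overlapping 8-byte copies rather than by 8 + 4 + 2 + 1
   byte moves.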
*/ 17090 opt_scalar_int_mode mode_iter; 17091 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT) 17092 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit)) 17093 cur_mode = mode_iter.require (); 17094 17095 gcc_assert (cur_mode != BLKmode); 17096 17097 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant (); 17098 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode); 17099 17100 n -= mode_bits; 17101 17102 /* Do certain trailing copies as overlapping if it's going to be 17103 cheaper. i.e. less instructions to do so. For instance doing a 15 17104 byte copy it's more efficient to do two overlapping 8 byte copies than 17105 8 + 6 + 1. */ 17106 if (n > 0 && n <= 8 * BITS_PER_UNIT) 17107 { 17108 next_mode = smallest_mode_for_size (n, MODE_INT); 17109 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant (); 17110 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT); 17111 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT); 17112 n = n_bits; 17113 } 17114 } 17115 17116 return true; 17117 } 17118 17119 /* Split a DImode store of a CONST_INT SRC to MEM DST as two 17120 SImode stores. Handle the case when the constant has identical 17121 bottom and top halves. This is beneficial when the two stores can be 17122 merged into an STP and we avoid synthesising potentially expensive 17123 immediates twice. Return true if such a split is possible. */ 17124 17125 bool 17126 aarch64_split_dimode_const_store (rtx dst, rtx src) 17127 { 17128 rtx lo = gen_lowpart (SImode, src); 17129 rtx hi = gen_highpart_mode (SImode, DImode, src); 17130 17131 bool size_p = optimize_function_for_size_p (cfun); 17132 17133 if (!rtx_equal_p (lo, hi)) 17134 return false; 17135 17136 unsigned int orig_cost 17137 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode); 17138 unsigned int lo_cost 17139 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode); 17140 17141 /* We want to transform: 17142 MOV x1, 49370 17143 MOVK x1, 0x140, lsl 16 17144 MOVK x1, 0xc0da, lsl 32 17145 MOVK x1, 0x140, lsl 48 17146 STR x1, [x0] 17147 into: 17148 MOV w1, 49370 17149 MOVK w1, 0x140, lsl 16 17150 STP w1, w1, [x0] 17151 So we want to perform this only when we save two instructions 17152 or more. When optimizing for size, however, accept any code size 17153 savings we can. */ 17154 if (size_p && orig_cost <= lo_cost) 17155 return false; 17156 17157 if (!size_p 17158 && (orig_cost <= lo_cost + 1)) 17159 return false; 17160 17161 rtx mem_lo = adjust_address (dst, SImode, 0); 17162 if (!aarch64_mem_pair_operand (mem_lo, SImode)) 17163 return false; 17164 17165 rtx tmp_reg = gen_reg_rtx (SImode); 17166 aarch64_expand_mov_immediate (tmp_reg, lo); 17167 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode)); 17168 /* Don't emit an explicit store pair as this may not be always profitable. 17169 Let the sched-fusion logic decide whether to merge them. */ 17170 emit_move_insn (mem_lo, tmp_reg); 17171 emit_move_insn (mem_hi, tmp_reg); 17172 17173 return true; 17174 } 17175 17176 /* Generate RTL for a conditional branch with rtx comparison CODE in 17177 mode CC_MODE. The destination of the unlikely conditional branch 17178 is LABEL_REF. 
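   The jump emitted below has the form

     (set (pc) (if_then_else (CODE (reg:CC_MODE CC_REGNUM) (const_int 0))
                             (label_ref LABEL_REF)
                             (pc)))

   and goes through aarch64_emit_unlikely_jump, which marks the branch as
   unlikely to be taken.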
*/ 17179 17180 void 17181 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode, 17182 rtx label_ref) 17183 { 17184 rtx x; 17185 x = gen_rtx_fmt_ee (code, VOIDmode, 17186 gen_rtx_REG (cc_mode, CC_REGNUM), 17187 const0_rtx); 17188 17189 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x, 17190 gen_rtx_LABEL_REF (VOIDmode, label_ref), 17191 pc_rtx); 17192 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x)); 17193 } 17194 17195 /* Generate DImode scratch registers for 128-bit (TImode) addition. 17196 17197 OP1 represents the TImode destination operand 1 17198 OP2 represents the TImode destination operand 2 17199 LOW_DEST represents the low half (DImode) of TImode operand 0 17200 LOW_IN1 represents the low half (DImode) of TImode operand 1 17201 LOW_IN2 represents the low half (DImode) of TImode operand 2 17202 HIGH_DEST represents the high half (DImode) of TImode operand 0 17203 HIGH_IN1 represents the high half (DImode) of TImode operand 1 17204 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */ 17205 17206 void 17207 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest, 17208 rtx *low_in1, rtx *low_in2, 17209 rtx *high_dest, rtx *high_in1, 17210 rtx *high_in2) 17211 { 17212 *low_dest = gen_reg_rtx (DImode); 17213 *low_in1 = gen_lowpart (DImode, op1); 17214 *low_in2 = simplify_gen_subreg (DImode, op2, TImode, 17215 subreg_lowpart_offset (DImode, TImode)); 17216 *high_dest = gen_reg_rtx (DImode); 17217 *high_in1 = gen_highpart (DImode, op1); 17218 *high_in2 = simplify_gen_subreg (DImode, op2, TImode, 17219 subreg_highpart_offset (DImode, TImode)); 17220 } 17221 17222 /* Generate DImode scratch registers for 128-bit (TImode) subtraction. 17223 17224 This function differs from 'arch64_addti_scratch_regs' in that 17225 OP1 can be an immediate constant (zero). We must call 17226 subreg_highpart_offset with DImode and TImode arguments, otherwise 17227 VOIDmode will be used for the const_int which generates an internal 17228 error from subreg_size_highpart_offset which does not expect a size of zero. 17229 17230 OP1 represents the TImode destination operand 1 17231 OP2 represents the TImode destination operand 2 17232 LOW_DEST represents the low half (DImode) of TImode operand 0 17233 LOW_IN1 represents the low half (DImode) of TImode operand 1 17234 LOW_IN2 represents the low half (DImode) of TImode operand 2 17235 HIGH_DEST represents the high half (DImode) of TImode operand 0 17236 HIGH_IN1 represents the high half (DImode) of TImode operand 1 17237 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */ 17238 17239 17240 void 17241 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest, 17242 rtx *low_in1, rtx *low_in2, 17243 rtx *high_dest, rtx *high_in1, 17244 rtx *high_in2) 17245 { 17246 *low_dest = gen_reg_rtx (DImode); 17247 *low_in1 = simplify_gen_subreg (DImode, op1, TImode, 17248 subreg_lowpart_offset (DImode, TImode)); 17249 17250 *low_in2 = simplify_gen_subreg (DImode, op2, TImode, 17251 subreg_lowpart_offset (DImode, TImode)); 17252 *high_dest = gen_reg_rtx (DImode); 17253 17254 *high_in1 = simplify_gen_subreg (DImode, op1, TImode, 17255 subreg_highpart_offset (DImode, TImode)); 17256 *high_in2 = simplify_gen_subreg (DImode, op2, TImode, 17257 subreg_highpart_offset (DImode, TImode)); 17258 } 17259 17260 /* Generate RTL for 128-bit (TImode) subtraction with overflow. 
17261 17262 OP0 represents the TImode destination operand 0 17263 LOW_DEST represents the low half (DImode) of TImode operand 0 17264 LOW_IN1 represents the low half (DImode) of TImode operand 1 17265 LOW_IN2 represents the low half (DImode) of TImode operand 2 17266 HIGH_DEST represents the high half (DImode) of TImode operand 0 17267 HIGH_IN1 represents the high half (DImode) of TImode operand 1 17268 HIGH_IN2 represents the high half (DImode) of TImode operand 2 17269 UNSIGNED_P is true if the operation is being performed on unsigned 17270 values. */ 17271 void 17272 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1, 17273 rtx low_in2, rtx high_dest, rtx high_in1, 17274 rtx high_in2, bool unsigned_p) 17275 { 17276 if (low_in2 == const0_rtx) 17277 { 17278 low_dest = low_in1; 17279 high_in2 = force_reg (DImode, high_in2); 17280 if (unsigned_p) 17281 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2)); 17282 else 17283 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2)); 17284 } 17285 else 17286 { 17287 if (aarch64_plus_immediate (low_in2, DImode)) 17288 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2, 17289 GEN_INT (-INTVAL (low_in2)))); 17290 else 17291 { 17292 low_in2 = force_reg (DImode, low_in2); 17293 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2)); 17294 } 17295 high_in2 = force_reg (DImode, high_in2); 17296 17297 if (unsigned_p) 17298 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2)); 17299 else 17300 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2)); 17301 } 17302 17303 emit_move_insn (gen_lowpart (DImode, op0), low_dest); 17304 emit_move_insn (gen_highpart (DImode, op0), high_dest); 17305 17306 } 17307 17308 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */ 17309 17310 static unsigned HOST_WIDE_INT 17311 aarch64_asan_shadow_offset (void) 17312 { 17313 return (HOST_WIDE_INT_1 << 36); 17314 } 17315 17316 static rtx 17317 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq, 17318 int code, tree treeop0, tree treeop1) 17319 { 17320 machine_mode op_mode, cmp_mode, cc_mode = CCmode; 17321 rtx op0, op1; 17322 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0)); 17323 insn_code icode; 17324 struct expand_operand ops[4]; 17325 17326 start_sequence (); 17327 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL); 17328 17329 op_mode = GET_MODE (op0); 17330 if (op_mode == VOIDmode) 17331 op_mode = GET_MODE (op1); 17332 17333 switch (op_mode) 17334 { 17335 case E_QImode: 17336 case E_HImode: 17337 case E_SImode: 17338 cmp_mode = SImode; 17339 icode = CODE_FOR_cmpsi; 17340 break; 17341 17342 case E_DImode: 17343 cmp_mode = DImode; 17344 icode = CODE_FOR_cmpdi; 17345 break; 17346 17347 case E_SFmode: 17348 cmp_mode = SFmode; 17349 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1); 17350 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf; 17351 break; 17352 17353 case E_DFmode: 17354 cmp_mode = DFmode; 17355 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1); 17356 icode = cc_mode == CCFPEmode ? 
CODE_FOR_fcmpedf : CODE_FOR_fcmpdf; 17357 break; 17358 17359 default: 17360 end_sequence (); 17361 return NULL_RTX; 17362 } 17363 17364 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp); 17365 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp); 17366 if (!op0 || !op1) 17367 { 17368 end_sequence (); 17369 return NULL_RTX; 17370 } 17371 *prep_seq = get_insns (); 17372 end_sequence (); 17373 17374 create_fixed_operand (&ops[0], op0); 17375 create_fixed_operand (&ops[1], op1); 17376 17377 start_sequence (); 17378 if (!maybe_expand_insn (icode, 2, ops)) 17379 { 17380 end_sequence (); 17381 return NULL_RTX; 17382 } 17383 *gen_seq = get_insns (); 17384 end_sequence (); 17385 17386 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode, 17387 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx); 17388 } 17389 17390 static rtx 17391 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, 17392 int cmp_code, tree treeop0, tree treeop1, int bit_code) 17393 { 17394 rtx op0, op1, target; 17395 machine_mode op_mode, cmp_mode, cc_mode = CCmode; 17396 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0)); 17397 insn_code icode; 17398 struct expand_operand ops[6]; 17399 int aarch64_cond; 17400 17401 push_to_sequence (*prep_seq); 17402 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL); 17403 17404 op_mode = GET_MODE (op0); 17405 if (op_mode == VOIDmode) 17406 op_mode = GET_MODE (op1); 17407 17408 switch (op_mode) 17409 { 17410 case E_QImode: 17411 case E_HImode: 17412 case E_SImode: 17413 cmp_mode = SImode; 17414 icode = CODE_FOR_ccmpsi; 17415 break; 17416 17417 case E_DImode: 17418 cmp_mode = DImode; 17419 icode = CODE_FOR_ccmpdi; 17420 break; 17421 17422 case E_SFmode: 17423 cmp_mode = SFmode; 17424 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1); 17425 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf; 17426 break; 17427 17428 case E_DFmode: 17429 cmp_mode = DFmode; 17430 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1); 17431 icode = cc_mode == CCFPEmode ? 
CODE_FOR_fccmpedf : CODE_FOR_fccmpdf; 17432 break; 17433 17434 default: 17435 end_sequence (); 17436 return NULL_RTX; 17437 } 17438 17439 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp); 17440 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp); 17441 if (!op0 || !op1) 17442 { 17443 end_sequence (); 17444 return NULL_RTX; 17445 } 17446 *prep_seq = get_insns (); 17447 end_sequence (); 17448 17449 target = gen_rtx_REG (cc_mode, CC_REGNUM); 17450 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code); 17451 17452 if (bit_code != AND) 17453 { 17454 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev), 17455 GET_MODE (XEXP (prev, 0))), 17456 VOIDmode, XEXP (prev, 0), const0_rtx); 17457 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond); 17458 } 17459 17460 create_fixed_operand (&ops[0], XEXP (prev, 0)); 17461 create_fixed_operand (&ops[1], target); 17462 create_fixed_operand (&ops[2], op0); 17463 create_fixed_operand (&ops[3], op1); 17464 create_fixed_operand (&ops[4], prev); 17465 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond)); 17466 17467 push_to_sequence (*gen_seq); 17468 if (!maybe_expand_insn (icode, 6, ops)) 17469 { 17470 end_sequence (); 17471 return NULL_RTX; 17472 } 17473 17474 *gen_seq = get_insns (); 17475 end_sequence (); 17476 17477 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx); 17478 } 17479 17480 #undef TARGET_GEN_CCMP_FIRST 17481 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first 17482 17483 #undef TARGET_GEN_CCMP_NEXT 17484 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next 17485 17486 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports 17487 instruction fusion of some sort. */ 17488 17489 static bool 17490 aarch64_macro_fusion_p (void) 17491 { 17492 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING; 17493 } 17494 17495 17496 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR 17497 should be kept together during scheduling. */ 17498 17499 static bool 17500 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr) 17501 { 17502 rtx set_dest; 17503 rtx prev_set = single_set (prev); 17504 rtx curr_set = single_set (curr); 17505 /* prev and curr are simple SET insns i.e. no flag setting or branching. 
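   Conditional branches are deliberately excluded here; they are matched
   instead by the AARCH64_FUSE_CMP_BRANCH and AARCH64_FUSE_ALU_BRANCH
   cases further down, which pair a flag-setting or ALU instruction with
   the branch that consumes its result.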
*/ 17506 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr); 17507 17508 if (!aarch64_macro_fusion_p ()) 17509 return false; 17510 17511 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK)) 17512 { 17513 /* We are trying to match: 17514 prev (mov) == (set (reg r0) (const_int imm16)) 17515 curr (movk) == (set (zero_extract (reg r0) 17516 (const_int 16) 17517 (const_int 16)) 17518 (const_int imm16_1)) */ 17519 17520 set_dest = SET_DEST (curr_set); 17521 17522 if (GET_CODE (set_dest) == ZERO_EXTRACT 17523 && CONST_INT_P (SET_SRC (curr_set)) 17524 && CONST_INT_P (SET_SRC (prev_set)) 17525 && CONST_INT_P (XEXP (set_dest, 2)) 17526 && INTVAL (XEXP (set_dest, 2)) == 16 17527 && REG_P (XEXP (set_dest, 0)) 17528 && REG_P (SET_DEST (prev_set)) 17529 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set))) 17530 { 17531 return true; 17532 } 17533 } 17534 17535 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD)) 17536 { 17537 17538 /* We're trying to match: 17539 prev (adrp) == (set (reg r1) 17540 (high (symbol_ref ("SYM")))) 17541 curr (add) == (set (reg r0) 17542 (lo_sum (reg r1) 17543 (symbol_ref ("SYM")))) 17544 Note that r0 need not necessarily be the same as r1, especially 17545 during pre-regalloc scheduling. */ 17546 17547 if (satisfies_constraint_Ush (SET_SRC (prev_set)) 17548 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set))) 17549 { 17550 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM 17551 && REG_P (XEXP (SET_SRC (curr_set), 0)) 17552 && REGNO (XEXP (SET_SRC (curr_set), 0)) 17553 == REGNO (SET_DEST (prev_set)) 17554 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0), 17555 XEXP (SET_SRC (curr_set), 1))) 17556 return true; 17557 } 17558 } 17559 17560 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK)) 17561 { 17562 17563 /* We're trying to match: 17564 prev (movk) == (set (zero_extract (reg r0) 17565 (const_int 16) 17566 (const_int 32)) 17567 (const_int imm16_1)) 17568 curr (movk) == (set (zero_extract (reg r0) 17569 (const_int 16) 17570 (const_int 48)) 17571 (const_int imm16_2)) */ 17572 17573 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT 17574 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT 17575 && REG_P (XEXP (SET_DEST (prev_set), 0)) 17576 && REG_P (XEXP (SET_DEST (curr_set), 0)) 17577 && REGNO (XEXP (SET_DEST (prev_set), 0)) 17578 == REGNO (XEXP (SET_DEST (curr_set), 0)) 17579 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2)) 17580 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2)) 17581 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32 17582 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48 17583 && CONST_INT_P (SET_SRC (prev_set)) 17584 && CONST_INT_P (SET_SRC (curr_set))) 17585 return true; 17586 17587 } 17588 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR)) 17589 { 17590 /* We're trying to match: 17591 prev (adrp) == (set (reg r0) 17592 (high (symbol_ref ("SYM")))) 17593 curr (ldr) == (set (reg r1) 17594 (mem (lo_sum (reg r0) 17595 (symbol_ref ("SYM"))))) 17596 or 17597 curr (ldr) == (set (reg r1) 17598 (zero_extend (mem 17599 (lo_sum (reg r0) 17600 (symbol_ref ("SYM")))))) */ 17601 if (satisfies_constraint_Ush (SET_SRC (prev_set)) 17602 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set))) 17603 { 17604 rtx curr_src = SET_SRC (curr_set); 17605 17606 if (GET_CODE (curr_src) == ZERO_EXTEND) 17607 curr_src = XEXP (curr_src, 0); 17608 17609 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM 17610 && REG_P (XEXP (XEXP (curr_src, 0), 0)) 17611 && REGNO (XEXP (XEXP 
(curr_src, 0), 0)) 17612 == REGNO (SET_DEST (prev_set)) 17613 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1), 17614 XEXP (SET_SRC (prev_set), 0))) 17615 return true; 17616 } 17617 } 17618 17619 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC) 17620 && aarch_crypto_can_dual_issue (prev, curr)) 17621 return true; 17622 17623 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH) 17624 && any_condjump_p (curr)) 17625 { 17626 unsigned int condreg1, condreg2; 17627 rtx cc_reg_1; 17628 aarch64_fixed_condition_code_regs (&condreg1, &condreg2); 17629 cc_reg_1 = gen_rtx_REG (CCmode, condreg1); 17630 17631 if (reg_referenced_p (cc_reg_1, PATTERN (curr)) 17632 && prev 17633 && modified_in_p (cc_reg_1, prev)) 17634 { 17635 enum attr_type prev_type = get_attr_type (prev); 17636 17637 /* FIXME: this misses some which is considered simple arthematic 17638 instructions for ThunderX. Simple shifts are missed here. */ 17639 if (prev_type == TYPE_ALUS_SREG 17640 || prev_type == TYPE_ALUS_IMM 17641 || prev_type == TYPE_LOGICS_REG 17642 || prev_type == TYPE_LOGICS_IMM) 17643 return true; 17644 } 17645 } 17646 17647 if (prev_set 17648 && curr_set 17649 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH) 17650 && any_condjump_p (curr)) 17651 { 17652 /* We're trying to match: 17653 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm))) 17654 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0) 17655 (const_int 0)) 17656 (label_ref ("SYM")) 17657 (pc)) */ 17658 if (SET_DEST (curr_set) == (pc_rtx) 17659 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE 17660 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0)) 17661 && REG_P (SET_DEST (prev_set)) 17662 && REGNO (SET_DEST (prev_set)) 17663 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0))) 17664 { 17665 /* Fuse ALU operations followed by conditional branch instruction. */ 17666 switch (get_attr_type (prev)) 17667 { 17668 case TYPE_ALU_IMM: 17669 case TYPE_ALU_SREG: 17670 case TYPE_ADC_REG: 17671 case TYPE_ADC_IMM: 17672 case TYPE_ADCS_REG: 17673 case TYPE_ADCS_IMM: 17674 case TYPE_LOGIC_REG: 17675 case TYPE_LOGIC_IMM: 17676 case TYPE_CSEL: 17677 case TYPE_ADR: 17678 case TYPE_MOV_IMM: 17679 case TYPE_SHIFT_REG: 17680 case TYPE_SHIFT_IMM: 17681 case TYPE_BFM: 17682 case TYPE_RBIT: 17683 case TYPE_REV: 17684 case TYPE_EXTEND: 17685 return true; 17686 17687 default:; 17688 } 17689 } 17690 } 17691 17692 return false; 17693 } 17694 17695 /* Return true iff the instruction fusion described by OP is enabled. */ 17696 17697 bool 17698 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op) 17699 { 17700 return (aarch64_tune_params.fusible_ops & op) != 0; 17701 } 17702 17703 /* If MEM is in the form of [base+offset], extract the two parts 17704 of address and set to BASE and OFFSET, otherwise return false 17705 after clearing BASE and OFFSET. */ 17706 17707 bool 17708 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset) 17709 { 17710 rtx addr; 17711 17712 gcc_assert (MEM_P (mem)); 17713 17714 addr = XEXP (mem, 0); 17715 17716 if (REG_P (addr)) 17717 { 17718 *base = addr; 17719 *offset = const0_rtx; 17720 return true; 17721 } 17722 17723 if (GET_CODE (addr) == PLUS 17724 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1))) 17725 { 17726 *base = XEXP (addr, 0); 17727 *offset = XEXP (addr, 1); 17728 return true; 17729 } 17730 17731 *base = NULL_RTX; 17732 *offset = NULL_RTX; 17733 17734 return false; 17735 } 17736 17737 /* Types for scheduling fusion. 
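   The enumerators below are combined with the base register number in
   aarch64_sched_fusion_priority, so that loads or stores of the same kind
   that use the same base register receive the same FUSION_PRI and are
   therefore brought together by the scheduler, ready for the LDP/STP
   peepholes.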
*/ 17738 enum sched_fusion_type 17739 { 17740 SCHED_FUSION_NONE = 0, 17741 SCHED_FUSION_LD_SIGN_EXTEND, 17742 SCHED_FUSION_LD_ZERO_EXTEND, 17743 SCHED_FUSION_LD, 17744 SCHED_FUSION_ST, 17745 SCHED_FUSION_NUM 17746 }; 17747 17748 /* If INSN is a load or store of address in the form of [base+offset], 17749 extract the two parts and set to BASE and OFFSET. Return scheduling 17750 fusion type this INSN is. */ 17751 17752 static enum sched_fusion_type 17753 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset) 17754 { 17755 rtx x, dest, src; 17756 enum sched_fusion_type fusion = SCHED_FUSION_LD; 17757 17758 gcc_assert (INSN_P (insn)); 17759 x = PATTERN (insn); 17760 if (GET_CODE (x) != SET) 17761 return SCHED_FUSION_NONE; 17762 17763 src = SET_SRC (x); 17764 dest = SET_DEST (x); 17765 17766 machine_mode dest_mode = GET_MODE (dest); 17767 17768 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode)) 17769 return SCHED_FUSION_NONE; 17770 17771 if (GET_CODE (src) == SIGN_EXTEND) 17772 { 17773 fusion = SCHED_FUSION_LD_SIGN_EXTEND; 17774 src = XEXP (src, 0); 17775 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode) 17776 return SCHED_FUSION_NONE; 17777 } 17778 else if (GET_CODE (src) == ZERO_EXTEND) 17779 { 17780 fusion = SCHED_FUSION_LD_ZERO_EXTEND; 17781 src = XEXP (src, 0); 17782 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode) 17783 return SCHED_FUSION_NONE; 17784 } 17785 17786 if (GET_CODE (src) == MEM && REG_P (dest)) 17787 extract_base_offset_in_addr (src, base, offset); 17788 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx)) 17789 { 17790 fusion = SCHED_FUSION_ST; 17791 extract_base_offset_in_addr (dest, base, offset); 17792 } 17793 else 17794 return SCHED_FUSION_NONE; 17795 17796 if (*base == NULL_RTX || *offset == NULL_RTX) 17797 fusion = SCHED_FUSION_NONE; 17798 17799 return fusion; 17800 } 17801 17802 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook. 17803 17804 Currently we only support to fuse ldr or str instructions, so FUSION_PRI 17805 and PRI are only calculated for these instructions. For other instruction, 17806 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other 17807 type instruction fusion can be added by returning different priorities. 17808 17809 It's important that irrelevant instructions get the largest FUSION_PRI. */ 17810 17811 static void 17812 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri, 17813 int *fusion_pri, int *pri) 17814 { 17815 int tmp, off_val; 17816 rtx base, offset; 17817 enum sched_fusion_type fusion; 17818 17819 gcc_assert (INSN_P (insn)); 17820 17821 tmp = max_pri - 1; 17822 fusion = fusion_load_store (insn, &base, &offset); 17823 if (fusion == SCHED_FUSION_NONE) 17824 { 17825 *pri = tmp; 17826 *fusion_pri = tmp; 17827 return; 17828 } 17829 17830 /* Set FUSION_PRI according to fusion type and base register. */ 17831 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base); 17832 17833 /* Calculate PRI. */ 17834 tmp /= 2; 17835 17836 /* INSN with smaller offset goes first. */ 17837 off_val = (int)(INTVAL (offset)); 17838 if (off_val >= 0) 17839 tmp -= (off_val & 0xfffff); 17840 else 17841 tmp += ((- off_val) & 0xfffff); 17842 17843 *pri = tmp; 17844 return; 17845 } 17846 17847 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook. 17848 Adjust priority of sha1h instructions so they are scheduled before 17849 other SHA1 instructions. 
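   (Higher priority values are preferred when the scheduler chooses among
   ready instructions, so returning priority + 10 below nudges sha1h ahead
   of its SHA1 siblings.)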
*/ 17850 17851 static int 17852 aarch64_sched_adjust_priority (rtx_insn *insn, int priority) 17853 { 17854 rtx x = PATTERN (insn); 17855 17856 if (GET_CODE (x) == SET) 17857 { 17858 x = SET_SRC (x); 17859 17860 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H) 17861 return priority + 10; 17862 } 17863 17864 return priority; 17865 } 17866 17867 /* Given OPERANDS of consecutive load/store, check if we can merge 17868 them into ldp/stp. LOAD is true if they are load instructions. 17869 MODE is the mode of memory operands. */ 17870 17871 bool 17872 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load, 17873 machine_mode mode) 17874 { 17875 HOST_WIDE_INT offval_1, offval_2, msize; 17876 enum reg_class rclass_1, rclass_2; 17877 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2; 17878 17879 if (load) 17880 { 17881 mem_1 = operands[1]; 17882 mem_2 = operands[3]; 17883 reg_1 = operands[0]; 17884 reg_2 = operands[2]; 17885 gcc_assert (REG_P (reg_1) && REG_P (reg_2)); 17886 if (REGNO (reg_1) == REGNO (reg_2)) 17887 return false; 17888 } 17889 else 17890 { 17891 mem_1 = operands[0]; 17892 mem_2 = operands[2]; 17893 reg_1 = operands[1]; 17894 reg_2 = operands[3]; 17895 } 17896 17897 /* The mems cannot be volatile. */ 17898 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)) 17899 return false; 17900 17901 /* If we have SImode and slow unaligned ldp, 17902 check the alignment to be at least 8 byte. */ 17903 if (mode == SImode 17904 && (aarch64_tune_params.extra_tuning_flags 17905 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) 17906 && !optimize_size 17907 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT) 17908 return false; 17909 17910 /* Check if the addresses are in the form of [base+offset]. */ 17911 extract_base_offset_in_addr (mem_1, &base_1, &offset_1); 17912 if (base_1 == NULL_RTX || offset_1 == NULL_RTX) 17913 return false; 17914 extract_base_offset_in_addr (mem_2, &base_2, &offset_2); 17915 if (base_2 == NULL_RTX || offset_2 == NULL_RTX) 17916 return false; 17917 17918 /* Check if the bases are same. */ 17919 if (!rtx_equal_p (base_1, base_2)) 17920 return false; 17921 17922 /* The operands must be of the same size. */ 17923 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)), 17924 GET_MODE_SIZE (GET_MODE (mem_2)))); 17925 17926 offval_1 = INTVAL (offset_1); 17927 offval_2 = INTVAL (offset_2); 17928 /* We should only be trying this for fixed-sized modes. There is no 17929 SVE LDP/STP instruction. */ 17930 msize = GET_MODE_SIZE (mode).to_constant (); 17931 /* Check if the offsets are consecutive. */ 17932 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize)) 17933 return false; 17934 17935 /* Check if the addresses are clobbered by load. */ 17936 if (load) 17937 { 17938 if (reg_mentioned_p (reg_1, mem_1)) 17939 return false; 17940 17941 /* In increasing order, the last load can clobber the address. */ 17942 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2)) 17943 return false; 17944 } 17945 17946 /* One of the memory accesses must be a mempair operand. 17947 If it is not the first one, they need to be swapped by the 17948 peephole. 
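   (For DImode accesses, for instance, aarch64_mem_pair_operand accepts a
   base-plus-offset address whose offset is a multiple of 8 in the range
   [-512, 504], matching the signed, scaled 7-bit immediate of LDP/STP.)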
*/ 17949 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1)) 17950 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2))) 17951 return false; 17952 17953 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1))) 17954 rclass_1 = FP_REGS; 17955 else 17956 rclass_1 = GENERAL_REGS; 17957 17958 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2))) 17959 rclass_2 = FP_REGS; 17960 else 17961 rclass_2 = GENERAL_REGS; 17962 17963 /* Check if the registers are of same class. */ 17964 if (rclass_1 != rclass_2) 17965 return false; 17966 17967 return true; 17968 } 17969 17970 /* Given OPERANDS of consecutive load/store that can be merged, 17971 swap them if they are not in ascending order. */ 17972 void 17973 aarch64_swap_ldrstr_operands (rtx* operands, bool load) 17974 { 17975 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2; 17976 HOST_WIDE_INT offval_1, offval_2; 17977 17978 if (load) 17979 { 17980 mem_1 = operands[1]; 17981 mem_2 = operands[3]; 17982 } 17983 else 17984 { 17985 mem_1 = operands[0]; 17986 mem_2 = operands[2]; 17987 } 17988 17989 extract_base_offset_in_addr (mem_1, &base_1, &offset_1); 17990 extract_base_offset_in_addr (mem_2, &base_2, &offset_2); 17991 17992 offval_1 = INTVAL (offset_1); 17993 offval_2 = INTVAL (offset_2); 17994 17995 if (offval_1 > offval_2) 17996 { 17997 /* Irrespective of whether this is a load or a store, 17998 we do the same swap. */ 17999 std::swap (operands[0], operands[2]); 18000 std::swap (operands[1], operands[3]); 18001 } 18002 } 18003 18004 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a 18005 comparison between the two. */ 18006 int 18007 aarch64_host_wide_int_compare (const void *x, const void *y) 18008 { 18009 return wi::cmps (* ((const HOST_WIDE_INT *) x), 18010 * ((const HOST_WIDE_INT *) y)); 18011 } 18012 18013 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the 18014 other pointing to a REG rtx containing an offset, compare the offsets 18015 of the two pairs. 18016 18017 Return: 18018 18019 1 iff offset (X) > offset (Y) 18020 0 iff offset (X) == offset (Y) 18021 -1 iff offset (X) < offset (Y) */ 18022 int 18023 aarch64_ldrstr_offset_compare (const void *x, const void *y) 18024 { 18025 const rtx * operands_1 = (const rtx *) x; 18026 const rtx * operands_2 = (const rtx *) y; 18027 rtx mem_1, mem_2, base, offset_1, offset_2; 18028 18029 if (MEM_P (operands_1[0])) 18030 mem_1 = operands_1[0]; 18031 else 18032 mem_1 = operands_1[1]; 18033 18034 if (MEM_P (operands_2[0])) 18035 mem_2 = operands_2[0]; 18036 else 18037 mem_2 = operands_2[1]; 18038 18039 /* Extract the offsets. */ 18040 extract_base_offset_in_addr (mem_1, &base, &offset_1); 18041 extract_base_offset_in_addr (mem_2, &base, &offset_2); 18042 18043 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX); 18044 18045 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2)); 18046 } 18047 18048 /* Given OPERANDS of consecutive load/store, check if we can merge 18049 them into ldp/stp by adjusting the offset. LOAD is true if they 18050 are load instructions. MODE is the mode of memory operands. 
18051 18052 Given below consecutive stores: 18053 18054 str w1, [xb, 0x100] 18055 str w1, [xb, 0x104] 18056 str w1, [xb, 0x108] 18057 str w1, [xb, 0x10c] 18058 18059 Though the offsets are out of the range supported by stp, we can 18060 still pair them after adjusting the offset, like: 18061 18062 add scratch, xb, 0x100 18063 stp w1, w1, [scratch] 18064 stp w1, w1, [scratch, 0x8] 18065 18066 The peephole patterns detecting this opportunity should guarantee 18067 the scratch register is avaliable. */ 18068 18069 bool 18070 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load, 18071 scalar_mode mode) 18072 { 18073 const int num_insns = 4; 18074 enum reg_class rclass; 18075 HOST_WIDE_INT offvals[num_insns], msize; 18076 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns]; 18077 18078 if (load) 18079 { 18080 for (int i = 0; i < num_insns; i++) 18081 { 18082 reg[i] = operands[2 * i]; 18083 mem[i] = operands[2 * i + 1]; 18084 18085 gcc_assert (REG_P (reg[i])); 18086 } 18087 18088 /* Do not attempt to merge the loads if the loads clobber each other. */ 18089 for (int i = 0; i < 8; i += 2) 18090 for (int j = i + 2; j < 8; j += 2) 18091 if (reg_overlap_mentioned_p (operands[i], operands[j])) 18092 return false; 18093 } 18094 else 18095 for (int i = 0; i < num_insns; i++) 18096 { 18097 mem[i] = operands[2 * i]; 18098 reg[i] = operands[2 * i + 1]; 18099 } 18100 18101 /* Skip if memory operand is by itself valid for ldp/stp. */ 18102 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode)) 18103 return false; 18104 18105 for (int i = 0; i < num_insns; i++) 18106 { 18107 /* The mems cannot be volatile. */ 18108 if (MEM_VOLATILE_P (mem[i])) 18109 return false; 18110 18111 /* Check if the addresses are in the form of [base+offset]. */ 18112 extract_base_offset_in_addr (mem[i], base + i, offset + i); 18113 if (base[i] == NULL_RTX || offset[i] == NULL_RTX) 18114 return false; 18115 } 18116 18117 /* Check if the registers are of same class. */ 18118 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0])) 18119 ? FP_REGS : GENERAL_REGS; 18120 18121 for (int i = 1; i < num_insns; i++) 18122 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i]))) 18123 { 18124 if (rclass != FP_REGS) 18125 return false; 18126 } 18127 else 18128 { 18129 if (rclass != GENERAL_REGS) 18130 return false; 18131 } 18132 18133 /* Only the last register in the order in which they occur 18134 may be clobbered by the load. */ 18135 if (rclass == GENERAL_REGS && load) 18136 for (int i = 0; i < num_insns - 1; i++) 18137 if (reg_mentioned_p (reg[i], mem[i])) 18138 return false; 18139 18140 /* Check if the bases are same. */ 18141 for (int i = 0; i < num_insns - 1; i++) 18142 if (!rtx_equal_p (base[i], base[i + 1])) 18143 return false; 18144 18145 for (int i = 0; i < num_insns; i++) 18146 offvals[i] = INTVAL (offset[i]); 18147 18148 msize = GET_MODE_SIZE (mode); 18149 18150 /* Check if the offsets can be put in the right order to do a ldp/stp. */ 18151 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT), 18152 aarch64_host_wide_int_compare); 18153 18154 if (!(offvals[1] == offvals[0] + msize 18155 && offvals[3] == offvals[2] + msize)) 18156 return false; 18157 18158 /* Check that offsets are within range of each other. The ldp/stp 18159 instructions have 7 bit immediate offsets, so use 0x80. */ 18160 if (offvals[2] - offvals[0] >= msize * 0x80) 18161 return false; 18162 18163 /* The offsets must be aligned with respect to each other. 
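   For example, with msize == 4 the sorted offsets { 0, 4, 10, 14 } form
   two internally consecutive pairs but are rejected by the check below,
   since 0 % 4 != 10 % 4: no single adjusted base would leave both pairs
   at offsets that are multiples of the access size, as the scaled
   LDP/STP immediates require.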
*/ 18164 if (offvals[0] % msize != offvals[2] % msize) 18165 return false; 18166 18167 /* If we have SImode and slow unaligned ldp, 18168 check the alignment to be at least 8 byte. */ 18169 if (mode == SImode 18170 && (aarch64_tune_params.extra_tuning_flags 18171 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) 18172 && !optimize_size 18173 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT) 18174 return false; 18175 18176 return true; 18177 } 18178 18179 /* Given OPERANDS of consecutive load/store, this function pairs them 18180 into LDP/STP after adjusting the offset. It depends on the fact 18181 that the operands can be sorted so the offsets are correct for STP. 18182 MODE is the mode of memory operands. CODE is the rtl operator 18183 which should be applied to all memory operands, it's SIGN_EXTEND, 18184 ZERO_EXTEND or UNKNOWN. */ 18185 18186 bool 18187 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load, 18188 scalar_mode mode, RTX_CODE code) 18189 { 18190 rtx base, offset_1, offset_3, t1, t2; 18191 rtx mem_1, mem_2, mem_3, mem_4; 18192 rtx temp_operands[8]; 18193 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3, 18194 stp_off_upper_limit, stp_off_lower_limit, msize; 18195 18196 /* We make changes on a copy as we may still bail out. */ 18197 for (int i = 0; i < 8; i ++) 18198 temp_operands[i] = operands[i]; 18199 18200 /* Sort the operands. */ 18201 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare); 18202 18203 /* Copy the memory operands so that if we have to bail for some 18204 reason the original addresses are unchanged. */ 18205 if (load) 18206 { 18207 mem_1 = copy_rtx (temp_operands[1]); 18208 mem_2 = copy_rtx (temp_operands[3]); 18209 mem_3 = copy_rtx (temp_operands[5]); 18210 mem_4 = copy_rtx (temp_operands[7]); 18211 } 18212 else 18213 { 18214 mem_1 = copy_rtx (temp_operands[0]); 18215 mem_2 = copy_rtx (temp_operands[2]); 18216 mem_3 = copy_rtx (temp_operands[4]); 18217 mem_4 = copy_rtx (temp_operands[6]); 18218 gcc_assert (code == UNKNOWN); 18219 } 18220 18221 extract_base_offset_in_addr (mem_1, &base, &offset_1); 18222 extract_base_offset_in_addr (mem_3, &base, &offset_3); 18223 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX 18224 && offset_3 != NULL_RTX); 18225 18226 /* Adjust offset so it can fit in LDP/STP instruction. */ 18227 msize = GET_MODE_SIZE (mode); 18228 stp_off_upper_limit = msize * (0x40 - 1); 18229 stp_off_lower_limit = - msize * 0x40; 18230 18231 off_val_1 = INTVAL (offset_1); 18232 off_val_3 = INTVAL (offset_3); 18233 18234 /* The base offset is optimally half way between the two STP/LDP offsets. */ 18235 if (msize <= 4) 18236 base_off = (off_val_1 + off_val_3) / 2; 18237 else 18238 /* However, due to issues with negative LDP/STP offset generation for 18239 larger modes, for DF, DI and vector modes. we must not use negative 18240 addresses smaller than 9 signed unadjusted bits can store. This 18241 provides the most range in this case. */ 18242 base_off = off_val_1; 18243 18244 /* Adjust the base so that it is aligned with the addresses but still 18245 optimal. */ 18246 if (base_off % msize != off_val_1 % msize) 18247 /* Fix the offset, bearing in mind we want to make it bigger not 18248 smaller. */ 18249 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize; 18250 else if (msize <= 4) 18251 /* The negative range of LDP/STP is one larger than the positive range. */ 18252 base_off += msize; 18253 18254 /* Check if base offset is too big or too small. 
We can attempt to resolve 18255 this issue by setting it to the maximum value and seeing if the offsets 18256 still fit. */ 18257 if (base_off >= 0x1000) 18258 { 18259 base_off = 0x1000 - 1; 18260 /* We must still make sure that the base offset is aligned with respect 18261 to the address. But it may may not be made any bigger. */ 18262 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize; 18263 } 18264 18265 /* Likewise for the case where the base is too small. */ 18266 if (base_off <= -0x1000) 18267 { 18268 base_off = -0x1000 + 1; 18269 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize; 18270 } 18271 18272 /* Offset of the first STP/LDP. */ 18273 new_off_1 = off_val_1 - base_off; 18274 18275 /* Offset of the second STP/LDP. */ 18276 new_off_3 = off_val_3 - base_off; 18277 18278 /* The offsets must be within the range of the LDP/STP instructions. */ 18279 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit 18280 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit) 18281 return false; 18282 18283 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8], 18284 new_off_1), true); 18285 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8], 18286 new_off_1 + msize), true); 18287 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8], 18288 new_off_3), true); 18289 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8], 18290 new_off_3 + msize), true); 18291 18292 if (!aarch64_mem_pair_operand (mem_1, mode) 18293 || !aarch64_mem_pair_operand (mem_3, mode)) 18294 return false; 18295 18296 if (code == ZERO_EXTEND) 18297 { 18298 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1); 18299 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2); 18300 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3); 18301 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4); 18302 } 18303 else if (code == SIGN_EXTEND) 18304 { 18305 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1); 18306 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2); 18307 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3); 18308 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4); 18309 } 18310 18311 if (load) 18312 { 18313 operands[0] = temp_operands[0]; 18314 operands[1] = mem_1; 18315 operands[2] = temp_operands[2]; 18316 operands[3] = mem_2; 18317 operands[4] = temp_operands[4]; 18318 operands[5] = mem_3; 18319 operands[6] = temp_operands[6]; 18320 operands[7] = mem_4; 18321 } 18322 else 18323 { 18324 operands[0] = mem_1; 18325 operands[1] = temp_operands[1]; 18326 operands[2] = mem_2; 18327 operands[3] = temp_operands[3]; 18328 operands[4] = mem_3; 18329 operands[5] = temp_operands[5]; 18330 operands[6] = mem_4; 18331 operands[7] = temp_operands[7]; 18332 } 18333 18334 /* Emit adjusting instruction. */ 18335 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off))); 18336 /* Emit ldp/stp instructions. */ 18337 t1 = gen_rtx_SET (operands[0], operands[1]); 18338 t2 = gen_rtx_SET (operands[2], operands[3]); 18339 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2))); 18340 t1 = gen_rtx_SET (operands[4], operands[5]); 18341 t2 = gen_rtx_SET (operands[6], operands[7]); 18342 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2))); 18343 return true; 18344 } 18345 18346 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that 18347 it isn't worth branching around empty masked ops (including masked 18348 stores). 
*/ 18349 18350 static bool 18351 aarch64_empty_mask_is_expensive (unsigned) 18352 { 18353 return false; 18354 } 18355 18356 /* Return 1 if pseudo register should be created and used to hold 18357 GOT address for PIC code. */ 18358 18359 bool 18360 aarch64_use_pseudo_pic_reg (void) 18361 { 18362 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC; 18363 } 18364 18365 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */ 18366 18367 static int 18368 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags) 18369 { 18370 switch (XINT (x, 1)) 18371 { 18372 case UNSPEC_GOTSMALLPIC: 18373 case UNSPEC_GOTSMALLPIC28K: 18374 case UNSPEC_GOTTINYPIC: 18375 return 0; 18376 default: 18377 break; 18378 } 18379 18380 return default_unspec_may_trap_p (x, flags); 18381 } 18382 18383 18384 /* If X is a positive CONST_DOUBLE with a value that is a power of 2 18385 return the log2 of that value. Otherwise return -1. */ 18386 18387 int 18388 aarch64_fpconst_pow_of_2 (rtx x) 18389 { 18390 const REAL_VALUE_TYPE *r; 18391 18392 if (!CONST_DOUBLE_P (x)) 18393 return -1; 18394 18395 r = CONST_DOUBLE_REAL_VALUE (x); 18396 18397 if (REAL_VALUE_NEGATIVE (*r) 18398 || REAL_VALUE_ISNAN (*r) 18399 || REAL_VALUE_ISINF (*r) 18400 || !real_isinteger (r, DFmode)) 18401 return -1; 18402 18403 return exact_log2 (real_to_integer (r)); 18404 } 18405 18406 /* If X is a vector of equal CONST_DOUBLE values and that value is 18407 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */ 18408 18409 int 18410 aarch64_vec_fpconst_pow_of_2 (rtx x) 18411 { 18412 int nelts; 18413 if (GET_CODE (x) != CONST_VECTOR 18414 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts)) 18415 return -1; 18416 18417 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT) 18418 return -1; 18419 18420 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0)); 18421 if (firstval <= 0) 18422 return -1; 18423 18424 for (int i = 1; i < nelts; i++) 18425 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval) 18426 return -1; 18427 18428 return firstval; 18429 } 18430 18431 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types 18432 to float. 18433 18434 __fp16 always promotes through this hook. 18435 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that 18436 through the generic excess precision logic rather than here. */ 18437 18438 static tree 18439 aarch64_promoted_type (const_tree t) 18440 { 18441 if (SCALAR_FLOAT_TYPE_P (t) 18442 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node) 18443 return float_type_node; 18444 18445 return NULL_TREE; 18446 } 18447 18448 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */ 18449 18450 static bool 18451 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode, 18452 optimization_type opt_type) 18453 { 18454 switch (op) 18455 { 18456 case rsqrt_optab: 18457 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1); 18458 18459 default: 18460 return true; 18461 } 18462 } 18463 18464 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */ 18465 18466 static unsigned int 18467 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor, 18468 int *offset) 18469 { 18470 /* Polynomial invariant 1 == (VG / 2) - 1. */ 18471 gcc_assert (i == 1); 18472 *factor = 2; 18473 *offset = 1; 18474 return AARCH64_DWARF_VG; 18475 } 18476 18477 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE 18478 if MODE is HFmode, and punt to the generic implementation otherwise. 
*/ 18479 18480 static bool 18481 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode) 18482 { 18483 return (mode == HFmode 18484 ? true 18485 : default_libgcc_floating_mode_supported_p (mode)); 18486 } 18487 18488 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE 18489 if MODE is HFmode, and punt to the generic implementation otherwise. */ 18490 18491 static bool 18492 aarch64_scalar_mode_supported_p (scalar_mode mode) 18493 { 18494 return (mode == HFmode 18495 ? true 18496 : default_scalar_mode_supported_p (mode)); 18497 } 18498 18499 /* Set the value of FLT_EVAL_METHOD. 18500 ISO/IEC TS 18661-3 defines two values that we'd like to make use of: 18501 18502 0: evaluate all operations and constants, whose semantic type has at 18503 most the range and precision of type float, to the range and 18504 precision of float; evaluate all other operations and constants to 18505 the range and precision of the semantic type; 18506 18507 N, where _FloatN is a supported interchange floating type 18508 evaluate all operations and constants, whose semantic type has at 18509 most the range and precision of _FloatN type, to the range and 18510 precision of the _FloatN type; evaluate all other operations and 18511 constants to the range and precision of the semantic type; 18512 18513 If we have the ARMv8.2-A extensions then we support _Float16 in native 18514 precision, so we should set this to 16. Otherwise, we support the type, 18515 but want to evaluate expressions in float precision, so set this to 18516 0. */ 18517 18518 static enum flt_eval_method 18519 aarch64_excess_precision (enum excess_precision_type type) 18520 { 18521 switch (type) 18522 { 18523 case EXCESS_PRECISION_TYPE_FAST: 18524 case EXCESS_PRECISION_TYPE_STANDARD: 18525 /* We can calculate either in 16-bit range and precision or 18526 32-bit range and precision. Make that decision based on whether 18527 we have native support for the ARMv8.2-A 16-bit floating-point 18528 instructions or not. */ 18529 return (TARGET_FP_F16INST 18530 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 18531 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT); 18532 case EXCESS_PRECISION_TYPE_IMPLICIT: 18533 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16; 18534 default: 18535 gcc_unreachable (); 18536 } 18537 return FLT_EVAL_METHOD_UNPREDICTABLE; 18538 } 18539 18540 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be 18541 scheduled for speculative execution. Reject the long-running division 18542 and square-root instructions. */ 18543 18544 static bool 18545 aarch64_sched_can_speculate_insn (rtx_insn *insn) 18546 { 18547 switch (get_attr_type (insn)) 18548 { 18549 case TYPE_SDIV: 18550 case TYPE_UDIV: 18551 case TYPE_FDIVS: 18552 case TYPE_FDIVD: 18553 case TYPE_FSQRTS: 18554 case TYPE_FSQRTD: 18555 case TYPE_NEON_FP_SQRT_S: 18556 case TYPE_NEON_FP_SQRT_D: 18557 case TYPE_NEON_FP_SQRT_S_Q: 18558 case TYPE_NEON_FP_SQRT_D_Q: 18559 case TYPE_NEON_FP_DIV_S: 18560 case TYPE_NEON_FP_DIV_D: 18561 case TYPE_NEON_FP_DIV_S_Q: 18562 case TYPE_NEON_FP_DIV_D_Q: 18563 return false; 18564 default: 18565 return true; 18566 } 18567 } 18568 18569 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */ 18570 18571 static int 18572 aarch64_compute_pressure_classes (reg_class *classes) 18573 { 18574 int i = 0; 18575 classes[i++] = GENERAL_REGS; 18576 classes[i++] = FP_REGS; 18577 /* PR_REGS isn't a useful pressure class because many predicate pseudo 18578 registers need to go in PR_LO_REGS at some point during their 18579 lifetime. 
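(Most SVE instructions require their governing predicate to be in P0-P7,
i.e. PR_LO_REGS.)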
Splitting it into two halves has the effect of making
18580 all predicates count against PR_LO_REGS, so that we try whenever
18581 possible to restrict the number of live predicates to 8. This
18582 greatly reduces the amount of spilling in certain loops. */
18583 classes[i++] = PR_LO_REGS;
18584 classes[i++] = PR_HI_REGS;
18585 return i;
18586 }
18587
18588 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
18589
18590 static bool
18591 aarch64_can_change_mode_class (machine_mode from,
18592 machine_mode to, reg_class_t)
18593 {
18594 if (BYTES_BIG_ENDIAN)
18595 {
18596 bool from_sve_p = aarch64_sve_data_mode_p (from);
18597 bool to_sve_p = aarch64_sve_data_mode_p (to);
18598
18599 /* Don't allow changes between SVE data modes and non-SVE modes.
18600 See the comment at the head of aarch64-sve.md for details. */
18601 if (from_sve_p != to_sve_p)
18602 return false;
18603
18604 /* Don't allow changes in element size: lane 0 of the new vector
18605 would not then be lane 0 of the old vector. See the comment
18606 above aarch64_maybe_expand_sve_subreg_move for a more detailed
18607 description.
18608
18609 In the worst case, this forces a register to be spilled in
18610 one mode and reloaded in the other, which handles the
18611 endianness correctly. */
18612 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
18613 return false;
18614 }
18615 return true;
18616 }
18617
18618 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
18619
18620 static void
18621 aarch64_select_early_remat_modes (sbitmap modes)
18622 {
18623 /* SVE values are not normally live across a call, so it should be
18624 worth doing early rematerialization even in VL-specific mode. */
18625 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
18626 {
18627 machine_mode mode = (machine_mode) i;
18628 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18629 if (vec_flags & VEC_ANY_SVE)
18630 bitmap_set_bit (modes, i);
18631 }
18632 }
18633
18634 /* Override the default target speculation_safe_value. */
18635 static rtx
18636 aarch64_speculation_safe_value (machine_mode mode,
18637 rtx result, rtx val, rtx failval)
18638 {
18639 /* Maybe we should warn if falling back to hard barriers. They are
18640 likely to be noticeably more expensive than the alternative below. */
18641 if (!aarch64_track_speculation)
18642 return default_speculation_safe_value (mode, result, val, failval);
18643
18644 if (!REG_P (val))
18645 val = copy_to_mode_reg (mode, val);
18646
18647 if (!aarch64_reg_or_zero (failval, mode))
18648 failval = copy_to_mode_reg (mode, failval);
18649
18650 emit_insn (gen_despeculate_copy (mode, result, val, failval));
18651 return result;
18652 }
18653
18654 /* Implement TARGET_ESTIMATED_POLY_VALUE.
18655 Look into the tuning structure for an estimate.
18656 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
18657 Advanced SIMD 128 bits. */
18658
18659 static HOST_WIDE_INT
18660 aarch64_estimated_poly_value (poly_int64 val)
18661 {
18662 enum aarch64_sve_vector_bits_enum width_source
18663 = aarch64_tune_params.sve_width;
18664
18665 /* If we still don't have an estimate, use the default. */
18666 if (width_source == SVE_SCALABLE)
18667 return default_estimated_poly_value (val);
18668
18669 HOST_WIDE_INT over_128 = width_source - 128;
18670 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
18671 }
18672
18673
18674 /* Return true for types that could be supported as SIMD return or
18675 argument types.
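That is, scalar floating-point, integral or pointer types whose size is
1, 2, 4 or 8 bytes, as tested below.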
*/ 18676 18677 static bool 18678 supported_simd_type (tree t) 18679 { 18680 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t)) 18681 { 18682 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t)); 18683 return s == 1 || s == 2 || s == 4 || s == 8; 18684 } 18685 return false; 18686 } 18687 18688 /* Return true for types that currently are supported as SIMD return 18689 or argument types. */ 18690 18691 static bool 18692 currently_supported_simd_type (tree t, tree b) 18693 { 18694 if (COMPLEX_FLOAT_TYPE_P (t)) 18695 return false; 18696 18697 if (TYPE_SIZE (t) != TYPE_SIZE (b)) 18698 return false; 18699 18700 return supported_simd_type (t); 18701 } 18702 18703 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */ 18704 18705 static int 18706 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node, 18707 struct cgraph_simd_clone *clonei, 18708 tree base_type, int num) 18709 { 18710 tree t, ret_type, arg_type; 18711 unsigned int elt_bits, vec_bits, count; 18712 18713 if (!TARGET_SIMD) 18714 return 0; 18715 18716 if (clonei->simdlen 18717 && (clonei->simdlen < 2 18718 || clonei->simdlen > 1024 18719 || (clonei->simdlen & (clonei->simdlen - 1)) != 0)) 18720 { 18721 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 18722 "unsupported simdlen %d", clonei->simdlen); 18723 return 0; 18724 } 18725 18726 ret_type = TREE_TYPE (TREE_TYPE (node->decl)); 18727 if (TREE_CODE (ret_type) != VOID_TYPE 18728 && !currently_supported_simd_type (ret_type, base_type)) 18729 { 18730 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type)) 18731 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 18732 "GCC does not currently support mixed size types " 18733 "for %<simd%> functions"); 18734 else if (supported_simd_type (ret_type)) 18735 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 18736 "GCC does not currently support return type %qT " 18737 "for %<simd%> functions", ret_type); 18738 else 18739 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 18740 "unsupported return type %qT for %<simd%> functions", 18741 ret_type); 18742 return 0; 18743 } 18744 18745 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t)) 18746 { 18747 arg_type = TREE_TYPE (t); 18748 18749 if (!currently_supported_simd_type (arg_type, base_type)) 18750 { 18751 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type)) 18752 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 18753 "GCC does not currently support mixed size types " 18754 "for %<simd%> functions"); 18755 else 18756 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 18757 "GCC does not currently support argument type %qT " 18758 "for %<simd%> functions", arg_type); 18759 return 0; 18760 } 18761 } 18762 18763 clonei->vecsize_mangle = 'n'; 18764 clonei->mask_mode = VOIDmode; 18765 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type)); 18766 if (clonei->simdlen == 0) 18767 { 18768 count = 2; 18769 vec_bits = (num == 0 ? 64 : 128); 18770 clonei->simdlen = vec_bits / elt_bits; 18771 } 18772 else 18773 { 18774 count = 1; 18775 vec_bits = clonei->simdlen * elt_bits; 18776 if (vec_bits != 64 && vec_bits != 128) 18777 { 18778 warning_at (DECL_SOURCE_LOCATION (node->decl), 0, 18779 "GCC does not currently support simdlen %d for type %qT", 18780 clonei->simdlen, base_type); 18781 return 0; 18782 } 18783 } 18784 clonei->vecsize_int = vec_bits; 18785 clonei->vecsize_float = vec_bits; 18786 return count; 18787 } 18788 18789 /* Implement TARGET_SIMD_CLONE_ADJUST. 
*/ 18790 18791 static void 18792 aarch64_simd_clone_adjust (struct cgraph_node *node) 18793 { 18794 /* Add aarch64_vector_pcs target attribute to SIMD clones so they 18795 use the correct ABI. */ 18796 18797 tree t = TREE_TYPE (node->decl); 18798 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default", 18799 TYPE_ATTRIBUTES (t)); 18800 } 18801 18802 /* Implement TARGET_SIMD_CLONE_USABLE. */ 18803 18804 static int 18805 aarch64_simd_clone_usable (struct cgraph_node *node) 18806 { 18807 switch (node->simdclone->vecsize_mangle) 18808 { 18809 case 'n': 18810 if (!TARGET_SIMD) 18811 return -1; 18812 return 0; 18813 default: 18814 gcc_unreachable (); 18815 } 18816 } 18817 18818 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */ 18819 18820 static int 18821 aarch64_comp_type_attributes (const_tree type1, const_tree type2) 18822 { 18823 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type1)) 18824 != lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (type2))) 18825 return 0; 18826 return 1; 18827 } 18828 18829 /* Implement TARGET_GET_MULTILIB_ABI_NAME */ 18830 18831 static const char * 18832 aarch64_get_multilib_abi_name (void) 18833 { 18834 if (TARGET_BIG_END) 18835 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be"; 18836 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64"; 18837 } 18838 18839 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a 18840 global variable based guard use the default else 18841 return a null tree. */ 18842 static tree 18843 aarch64_stack_protect_guard (void) 18844 { 18845 if (aarch64_stack_protector_guard == SSP_GLOBAL) 18846 return default_stack_protect_guard (); 18847 18848 return NULL_TREE; 18849 } 18850 18851 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE 18852 section at the end if needed. */ 18853 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000 18854 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0) 18855 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1) 18856 void 18857 aarch64_file_end_indicate_exec_stack () 18858 { 18859 file_end_indicate_exec_stack (); 18860 18861 unsigned feature_1_and = 0; 18862 if (aarch64_bti_enabled ()) 18863 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI; 18864 18865 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE) 18866 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC; 18867 18868 if (feature_1_and) 18869 { 18870 /* Generate .note.gnu.property section. */ 18871 switch_to_section (get_section (".note.gnu.property", 18872 SECTION_NOTYPE, NULL)); 18873 18874 /* PT_NOTE header: namesz, descsz, type. 18875 namesz = 4 ("GNU\0") 18876 descsz = 16 (Size of the program property array) 18877 [(12 + padding) * Number of array elements] 18878 type = 5 (NT_GNU_PROPERTY_TYPE_0). */ 18879 assemble_align (POINTER_SIZE); 18880 assemble_integer (GEN_INT (4), 4, 32, 1); 18881 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1); 18882 assemble_integer (GEN_INT (5), 4, 32, 1); 18883 18884 /* PT_NOTE name. */ 18885 assemble_string ("GNU", 4); 18886 18887 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0: 18888 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND 18889 datasz = 4 18890 data = feature_1_and. */ 18891 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1); 18892 assemble_integer (GEN_INT (4), 4, 32, 1); 18893 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1); 18894 18895 /* Pad the size of the note to the required alignment. 
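This matches the POINTER_BYTES rounding of descsz in the note header above.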
*/
18896 assemble_align (POINTER_SIZE);
18897 }
18898 }
18899 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
18900 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
18901 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
18902
18903 /* Target-specific selftests. */
18904
18905 #if CHECKING_P
18906
18907 namespace selftest {
18908
18909 /* Selftest for the RTL loader.
18910 Verify that the RTL loader copes with a dump from
18911 print_rtx_function. This is essentially just a test that class
18912 function_reader can handle a real dump, but it also verifies
18913 that lookup_reg_by_dump_name correctly handles hard regs.
18914 The presence of hard reg names in the dump means that the test is
18915 target-specific, hence it is in this file. */
18916
18917 static void
18918 aarch64_test_loading_full_dump ()
18919 {
18920 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
18921
18922 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
18923
18924 rtx_insn *insn_1 = get_insn_by_uid (1);
18925 ASSERT_EQ (NOTE, GET_CODE (insn_1));
18926
18927 rtx_insn *insn_15 = get_insn_by_uid (15);
18928 ASSERT_EQ (INSN, GET_CODE (insn_15));
18929 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
18930
18931 /* Verify crtl->return_rtx. */
18932 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
18933 ASSERT_EQ (0, REGNO (crtl->return_rtx));
18934 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
18935 }
18936
18937 /* Run all target-specific selftests. */
18938
18939 static void
18940 aarch64_run_selftests (void)
18941 {
18942 aarch64_test_loading_full_dump ();
18943 }
18944
18945 } // namespace selftest
18946
18947 #endif /* #if CHECKING_P */
18948
18949 #undef TARGET_STACK_PROTECT_GUARD
18950 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
18951
18952 #undef TARGET_ADDRESS_COST
18953 #define TARGET_ADDRESS_COST aarch64_address_cost
18954
18955 /* This hook determines whether unnamed bitfields affect the alignment
18956 of the containing structure. The hook returns true if the structure
18957 should inherit the alignment requirements of an unnamed bitfield's
18958 type.
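For example, returning true means that a structure such as
struct { char c; int : 4; } is aligned as int rather than as char.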
*/ 18959 #undef TARGET_ALIGN_ANON_BITFIELD 18960 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true 18961 18962 #undef TARGET_ASM_ALIGNED_DI_OP 18963 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t" 18964 18965 #undef TARGET_ASM_ALIGNED_HI_OP 18966 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t" 18967 18968 #undef TARGET_ASM_ALIGNED_SI_OP 18969 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t" 18970 18971 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK 18972 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \ 18973 hook_bool_const_tree_hwi_hwi_const_tree_true 18974 18975 #undef TARGET_ASM_FILE_START 18976 #define TARGET_ASM_FILE_START aarch64_start_file 18977 18978 #undef TARGET_ASM_OUTPUT_MI_THUNK 18979 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk 18980 18981 #undef TARGET_ASM_SELECT_RTX_SECTION 18982 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section 18983 18984 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE 18985 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template 18986 18987 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY 18988 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry 18989 18990 #undef TARGET_BUILD_BUILTIN_VA_LIST 18991 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list 18992 18993 #undef TARGET_CALLEE_COPIES 18994 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false 18995 18996 #undef TARGET_CAN_ELIMINATE 18997 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate 18998 18999 #undef TARGET_CAN_INLINE_P 19000 #define TARGET_CAN_INLINE_P aarch64_can_inline_p 19001 19002 #undef TARGET_CANNOT_FORCE_CONST_MEM 19003 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem 19004 19005 #undef TARGET_CASE_VALUES_THRESHOLD 19006 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold 19007 19008 #undef TARGET_CONDITIONAL_REGISTER_USAGE 19009 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage 19010 19011 /* Only the least significant bit is used for initialization guard 19012 variables. 
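Returning true here means that only bit 0 of the guard variable is tested,
rather than the whole first byte.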
*/ 19013 #undef TARGET_CXX_GUARD_MASK_BIT 19014 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true 19015 19016 #undef TARGET_C_MODE_FOR_SUFFIX 19017 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix 19018 19019 #ifdef TARGET_BIG_ENDIAN_DEFAULT 19020 #undef TARGET_DEFAULT_TARGET_FLAGS 19021 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END) 19022 #endif 19023 19024 #undef TARGET_CLASS_MAX_NREGS 19025 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs 19026 19027 #undef TARGET_BUILTIN_DECL 19028 #define TARGET_BUILTIN_DECL aarch64_builtin_decl 19029 19030 #undef TARGET_BUILTIN_RECIPROCAL 19031 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal 19032 19033 #undef TARGET_C_EXCESS_PRECISION 19034 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision 19035 19036 #undef TARGET_EXPAND_BUILTIN 19037 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin 19038 19039 #undef TARGET_EXPAND_BUILTIN_VA_START 19040 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start 19041 19042 #undef TARGET_FOLD_BUILTIN 19043 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin 19044 19045 #undef TARGET_FUNCTION_ARG 19046 #define TARGET_FUNCTION_ARG aarch64_function_arg 19047 19048 #undef TARGET_FUNCTION_ARG_ADVANCE 19049 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance 19050 19051 #undef TARGET_FUNCTION_ARG_BOUNDARY 19052 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary 19053 19054 #undef TARGET_FUNCTION_ARG_PADDING 19055 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding 19056 19057 #undef TARGET_GET_RAW_RESULT_MODE 19058 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode 19059 #undef TARGET_GET_RAW_ARG_MODE 19060 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode 19061 19062 #undef TARGET_FUNCTION_OK_FOR_SIBCALL 19063 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall 19064 19065 #undef TARGET_FUNCTION_VALUE 19066 #define TARGET_FUNCTION_VALUE aarch64_function_value 19067 19068 #undef TARGET_FUNCTION_VALUE_REGNO_P 19069 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p 19070 19071 #undef TARGET_GIMPLE_FOLD_BUILTIN 19072 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin 19073 19074 #undef TARGET_GIMPLIFY_VA_ARG_EXPR 19075 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr 19076 19077 #undef TARGET_INIT_BUILTINS 19078 #define TARGET_INIT_BUILTINS aarch64_init_builtins 19079 19080 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS 19081 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \ 19082 aarch64_ira_change_pseudo_allocno_class 19083 19084 #undef TARGET_LEGITIMATE_ADDRESS_P 19085 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p 19086 19087 #undef TARGET_LEGITIMATE_CONSTANT_P 19088 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p 19089 19090 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT 19091 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \ 19092 aarch64_legitimize_address_displacement 19093 19094 #undef TARGET_LIBGCC_CMP_RETURN_MODE 19095 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode 19096 19097 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P 19098 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \ 19099 aarch64_libgcc_floating_mode_supported_p 19100 19101 #undef TARGET_MANGLE_TYPE 19102 #define TARGET_MANGLE_TYPE aarch64_mangle_type 19103 19104 #undef TARGET_MEMORY_MOVE_COST 19105 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost 19106 19107 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL 
19108 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul 19109 19110 #undef TARGET_MUST_PASS_IN_STACK 19111 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size 19112 19113 /* This target hook should return true if accesses to volatile bitfields 19114 should use the narrowest mode possible. It should return false if these 19115 accesses should use the bitfield container type. */ 19116 #undef TARGET_NARROW_VOLATILE_BITFIELD 19117 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false 19118 19119 #undef TARGET_OPTION_OVERRIDE 19120 #define TARGET_OPTION_OVERRIDE aarch64_override_options 19121 19122 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE 19123 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \ 19124 aarch64_override_options_after_change 19125 19126 #undef TARGET_OPTION_SAVE 19127 #define TARGET_OPTION_SAVE aarch64_option_save 19128 19129 #undef TARGET_OPTION_RESTORE 19130 #define TARGET_OPTION_RESTORE aarch64_option_restore 19131 19132 #undef TARGET_OPTION_PRINT 19133 #define TARGET_OPTION_PRINT aarch64_option_print 19134 19135 #undef TARGET_OPTION_VALID_ATTRIBUTE_P 19136 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p 19137 19138 #undef TARGET_SET_CURRENT_FUNCTION 19139 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function 19140 19141 #undef TARGET_PASS_BY_REFERENCE 19142 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference 19143 19144 #undef TARGET_PREFERRED_RELOAD_CLASS 19145 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class 19146 19147 #undef TARGET_SCHED_REASSOCIATION_WIDTH 19148 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width 19149 19150 #undef TARGET_PROMOTED_TYPE 19151 #define TARGET_PROMOTED_TYPE aarch64_promoted_type 19152 19153 #undef TARGET_SECONDARY_RELOAD 19154 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload 19155 19156 #undef TARGET_SHIFT_TRUNCATION_MASK 19157 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask 19158 19159 #undef TARGET_SETUP_INCOMING_VARARGS 19160 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs 19161 19162 #undef TARGET_STRUCT_VALUE_RTX 19163 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx 19164 19165 #undef TARGET_REGISTER_MOVE_COST 19166 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost 19167 19168 #undef TARGET_RETURN_IN_MEMORY 19169 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory 19170 19171 #undef TARGET_RETURN_IN_MSB 19172 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb 19173 19174 #undef TARGET_RTX_COSTS 19175 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper 19176 19177 #undef TARGET_SCALAR_MODE_SUPPORTED_P 19178 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p 19179 19180 #undef TARGET_SCHED_ISSUE_RATE 19181 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate 19182 19183 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD 19184 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \ 19185 aarch64_sched_first_cycle_multipass_dfa_lookahead 19186 19187 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD 19188 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \ 19189 aarch64_first_cycle_multipass_dfa_lookahead_guard 19190 19191 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS 19192 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \ 19193 aarch64_get_separate_components 19194 19195 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB 19196 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \ 19197 
aarch64_components_for_bb 19198 19199 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS 19200 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \ 19201 aarch64_disqualify_components 19202 19203 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS 19204 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \ 19205 aarch64_emit_prologue_components 19206 19207 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS 19208 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \ 19209 aarch64_emit_epilogue_components 19210 19211 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS 19212 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \ 19213 aarch64_set_handled_components 19214 19215 #undef TARGET_TRAMPOLINE_INIT 19216 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init 19217 19218 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P 19219 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p 19220 19221 #undef TARGET_VECTOR_MODE_SUPPORTED_P 19222 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p 19223 19224 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT 19225 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \ 19226 aarch64_builtin_support_vector_misalignment 19227 19228 #undef TARGET_ARRAY_MODE 19229 #define TARGET_ARRAY_MODE aarch64_array_mode 19230 19231 #undef TARGET_ARRAY_MODE_SUPPORTED_P 19232 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p 19233 19234 #undef TARGET_VECTORIZE_ADD_STMT_COST 19235 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost 19236 19237 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 19238 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \ 19239 aarch64_builtin_vectorization_cost 19240 19241 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE 19242 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode 19243 19244 #undef TARGET_VECTORIZE_BUILTINS 19245 #define TARGET_VECTORIZE_BUILTINS 19246 19247 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION 19248 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \ 19249 aarch64_builtin_vectorized_function 19250 19251 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES 19252 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \ 19253 aarch64_autovectorize_vector_sizes 19254 19255 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV 19256 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \ 19257 aarch64_atomic_assign_expand_fenv 19258 19259 /* Section anchor support. */ 19260 19261 #undef TARGET_MIN_ANCHOR_OFFSET 19262 #define TARGET_MIN_ANCHOR_OFFSET -256 19263 19264 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a 19265 byte offset; we can do much more for larger data types, but have no way 19266 to determine the size of the access. We assume accesses are aligned. */ 19267 #undef TARGET_MAX_ANCHOR_OFFSET 19268 #define TARGET_MAX_ANCHOR_OFFSET 4095 19269 19270 #undef TARGET_VECTOR_ALIGNMENT 19271 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment 19272 19273 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT 19274 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \ 19275 aarch64_vectorize_preferred_vector_alignment 19276 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE 19277 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \ 19278 aarch64_simd_vector_alignment_reachable 19279 19280 /* vec_perm support. 
*/ 19281 19282 #undef TARGET_VECTORIZE_VEC_PERM_CONST 19283 #define TARGET_VECTORIZE_VEC_PERM_CONST \ 19284 aarch64_vectorize_vec_perm_const 19285 19286 #undef TARGET_VECTORIZE_GET_MASK_MODE 19287 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode 19288 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE 19289 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \ 19290 aarch64_empty_mask_is_expensive 19291 #undef TARGET_PREFERRED_ELSE_VALUE 19292 #define TARGET_PREFERRED_ELSE_VALUE \ 19293 aarch64_preferred_else_value 19294 19295 #undef TARGET_INIT_LIBFUNCS 19296 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs 19297 19298 #undef TARGET_FIXED_CONDITION_CODE_REGS 19299 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs 19300 19301 #undef TARGET_FLAGS_REGNUM 19302 #define TARGET_FLAGS_REGNUM CC_REGNUM 19303 19304 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS 19305 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true 19306 19307 #undef TARGET_ASAN_SHADOW_OFFSET 19308 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset 19309 19310 #undef TARGET_LEGITIMIZE_ADDRESS 19311 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address 19312 19313 #undef TARGET_SCHED_CAN_SPECULATE_INSN 19314 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn 19315 19316 #undef TARGET_CAN_USE_DOLOOP_P 19317 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost 19318 19319 #undef TARGET_SCHED_ADJUST_PRIORITY 19320 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority 19321 19322 #undef TARGET_SCHED_MACRO_FUSION_P 19323 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p 19324 19325 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P 19326 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p 19327 19328 #undef TARGET_SCHED_FUSION_PRIORITY 19329 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority 19330 19331 #undef TARGET_UNSPEC_MAY_TRAP_P 19332 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p 19333 19334 #undef TARGET_USE_PSEUDO_PIC_REG 19335 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg 19336 19337 #undef TARGET_PRINT_OPERAND 19338 #define TARGET_PRINT_OPERAND aarch64_print_operand 19339 19340 #undef TARGET_PRINT_OPERAND_ADDRESS 19341 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address 19342 19343 #undef TARGET_OPTAB_SUPPORTED_P 19344 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p 19345 19346 #undef TARGET_OMIT_STRUCT_RETURN_REG 19347 #define TARGET_OMIT_STRUCT_RETURN_REG true 19348 19349 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE 19350 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \ 19351 aarch64_dwarf_poly_indeterminate_value 19352 19353 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. 
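Bit 2 corresponds to the value 4 given below.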
*/ 19354 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS 19355 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4 19356 19357 #undef TARGET_HARD_REGNO_NREGS 19358 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs 19359 #undef TARGET_HARD_REGNO_MODE_OK 19360 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok 19361 19362 #undef TARGET_MODES_TIEABLE_P 19363 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p 19364 19365 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED 19366 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \ 19367 aarch64_hard_regno_call_part_clobbered 19368 19369 #undef TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS 19370 #define TARGET_REMOVE_EXTRA_CALL_PRESERVED_REGS \ 19371 aarch64_remove_extra_call_preserved_regs 19372 19373 #undef TARGET_RETURN_CALL_WITH_MAX_CLOBBERS 19374 #define TARGET_RETURN_CALL_WITH_MAX_CLOBBERS \ 19375 aarch64_return_call_with_max_clobbers 19376 19377 #undef TARGET_CONSTANT_ALIGNMENT 19378 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment 19379 19380 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE 19381 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \ 19382 aarch64_stack_clash_protection_alloca_probe_range 19383 19384 #undef TARGET_COMPUTE_PRESSURE_CLASSES 19385 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes 19386 19387 #undef TARGET_CAN_CHANGE_MODE_CLASS 19388 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class 19389 19390 #undef TARGET_SELECT_EARLY_REMAT_MODES 19391 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes 19392 19393 #undef TARGET_SPECULATION_SAFE_VALUE 19394 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value 19395 19396 #undef TARGET_ESTIMATED_POLY_VALUE 19397 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value 19398 19399 #undef TARGET_ATTRIBUTE_TABLE 19400 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table 19401 19402 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN 19403 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \ 19404 aarch64_simd_clone_compute_vecsize_and_simdlen 19405 19406 #undef TARGET_SIMD_CLONE_ADJUST 19407 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust 19408 19409 #undef TARGET_SIMD_CLONE_USABLE 19410 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable 19411 19412 #undef TARGET_COMP_TYPE_ATTRIBUTES 19413 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes 19414 19415 #undef TARGET_GET_MULTILIB_ABI_NAME 19416 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name 19417 19418 #if CHECKING_P 19419 #undef TARGET_RUN_TARGET_SELFTESTS 19420 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests 19421 #endif /* #if CHECKING_P */ 19422 19423 struct gcc_target targetm = TARGET_INITIALIZER; 19424 19425 #include "gt-aarch64.h" 19426